
upstream 2.3.1

Branch: master
Author: R4SAS (6 years ago)
Commit: 7e513867b1
Changed files (64, with changed line counts):
  1. Makefile.am (17)
  2. README.txt (44)
  3. algos.h (47)
  4. api.cpp (2)
  5. bench.cpp (25)
  6. blake2b.cu (273)
  7. ccminer.cpp (118)
  8. ccminer.vcxproj (17)
  9. ccminer.vcxproj.filters (48)
  10. compat/ccminer-config.h (2)
  11. configure.ac (2)
  12. crypto/cn_aes.cuh (1)
  13. crypto/cn_blake.cuh (2)
  14. crypto/cn_groestl.cuh (9)
  15. crypto/cn_jh.cuh (20)
  16. crypto/cn_keccak.cuh (2)
  17. crypto/cn_skein.cuh (22)
  18. crypto/cryptolight-core.cu (86)
  19. crypto/cryptolight-cpu.cpp (45)
  20. crypto/cryptolight.cu (30)
  21. crypto/cryptolight.h (13)
  22. crypto/cryptonight-core.cu (236)
  23. crypto/cryptonight-cpu.cpp (68)
  24. crypto/cryptonight-extra.cu (169)
  25. crypto/cryptonight.cu (37)
  26. crypto/cryptonight.h (13)
  27. crypto/xmr-rpc.cpp (10)
  28. equi/equi-stratum.cpp (2)
  29. equi/equihash.cpp (3)
  30. lyra2/Lyra2.c (173)
  31. lyra2/Lyra2.h (1)
  32. lyra2/allium.cu (217)
  33. lyra2/cuda_lyra2.cu (148)
  34. lyra2/cuda_lyra2_sm2.cuh (69)
  35. lyra2/cuda_lyra2_sm5.cuh (76)
  36. lyra2/cuda_lyra2v3.cu (481)
  37. lyra2/cuda_lyra2v3_sm3.cuh (348)
  38. lyra2/lyra2RE.cu (4)
  39. lyra2/lyra2REv3.cu (183)
  40. miner.h (38)
  41. neoscrypt/cuda_neoscrypt.cu (6)
  42. phi/cuda_phi2.cu (89)
  43. phi/cuda_phi2_cubehash512.cu (319)
  44. phi/phi.cu (8)
  45. phi/phi2.cu (268)
  46. res/ccminer.rc (10)
  47. scrypt.cpp (14)
  48. scrypt/test_kernel.cu (5)
  49. scrypt/titan_kernel.cu (4)
  50. sha256/cuda_sha256q.cu (507)
  51. sha256/sha256q.cu (136)
  52. sia/sia-rpc.cpp (8)
  53. sia/sia.cu (18)
  54. util.cpp (51)
  55. x11/cuda_streebog_maxwell.cu (21)
  56. x11/cuda_x11_cubehash512.cu (54)
  57. x11/exosis.cu (497)
  58. x11/timetravel.cu (67)
  59. x12/x12.cu (18)
  60. x16/cuda_x16_echo512_64.cu (26)
  61. x16/x16r.cu (5)
  62. x16/x16s.cu (5)
  63. x17/sonoa.cu (632)
  64. x17/x17.cu (16)

Makefile.am (17 changed lines)

@@ -38,13 +38,16 @@ ccminer_SOURCES = elist.h miner.h compat.h \
lyra2/Lyra2.c lyra2/Sponge.c \
lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \
lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \
lyra2/lyra2REv3.cu lyra2/cuda_lyra2v3.cu \
lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \
lyra2/allium.cu \
Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \
Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu \
Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
Algo256/blake2s.cu sph/blake2s.c \
Algo256/bmw.cu Algo256/cuda_bmw.cu \
blake2b.cu \
crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \
crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \
crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \
@@ -58,7 +61,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \
pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \
skunk/skunk.cu skunk/cuda_skunk.cu skunk/cuda_skunk_streebog.cu \
sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu \
sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu sha256/sha256q.cu sha256/cuda_sha256q.cu \
sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
@@ -70,7 +73,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
tribus/tribus.cu tribus/cuda_echo512_final.cu \
x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu x11/exosis.cu \
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
@@ -79,8 +82,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \
x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
x16/cuda_x16_echo512_64.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
x11/phi.cu x11/cuda_streebog_maxwell.cu \
x17/x17.cu x17/hmq17.cu x17/sonoa.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu phi/cuda_phi2_cubehash512.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu \
gost/gost.cu gost/cuda_gosthash.cu
@@ -116,9 +119,11 @@ endif
ccminer_LDADD += -lcuda
nvcc_ARCH :=
#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
#nvcc_ARCH += -gencode=arch=compute_75,code=\"sm_75,compute_75\" # CUDA 10 req.
#nvcc_ARCH += -gencode=arch=compute_70,code=\"sm_70,compute_70\" # CUDA 9.1
#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" # CUDA 8
nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
#nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
#nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
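This hunk narrows the default CUDA build to a single sm_52 target: the sm_50 line is now commented out, and the newer Pascal/Volta/Turing lines (sm_61/70/75) are left as commented alternatives. A minimal, hypothetical runtime check (not part of this commit, names are illustrative) that warns when the detected GPU is older than the only architecture built by default:

// Hypothetical helper: flag GPUs below sm_52 so the user knows to
// re-enable the matching -gencode line in Makefile.am and rebuild.
#include <cstdio>
#include <cuda_runtime.h>

static void warn_if_arch_unsupported(int dev)
{
	cudaDeviceProp props;
	if (cudaGetDeviceProperties(&props, dev) != cudaSuccess)
		return;
	int sm = props.major * 10 + props.minor;  // 52 for sm_52, 35 for sm_35, ...
	if (sm < 52)
		fprintf(stderr, "GPU %d is sm_%d: uncomment the matching -gencode "
		        "line in Makefile.am before building.\n", dev, sm);
}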

README.txt (44 changed lines)

@@ -1,5 +1,5 @@
ccminer 2.2.5 (Apr 2018) "x12, x16r and x16s algos"
ccminer 2.3.1 "lyra2v3, exosis and sha256q"
---------------------------------------------------------------
***************************************************************
@@ -41,19 +41,20 @@ Keccak (Maxcoin)
Pentablake (Blake 512 x5)
1Coin Triple S
Neoscrypt (FeatherCoin)
Revolver (X11evo)
x11evo (Revolver)
phi2 (LUXCoin)
Scrypt and Scrypt:N
Scrypt-Jane (Chacha)
Sibcoin (sib)
sib (Sibcoin)
Skein (Skein + SHA)
Signatum (Skein cubehash fugue Streebog)
SonoA (Sono)
Tribus (JH, keccak, simd)
Woodcoin (Double Skein)
Vanilla (Blake256 8-rounds - double sha256)
Vertcoin Lyra2RE
Ziftrcoin (ZR5)
Vertcoin Lyra2REv3
Boolberry (Wild Keccak)
Monero (Cryptonight)
Monero (Cryptonight v7 with -a monero)
Aeon (Cryptonight-lite)
where some of these coins have a VERY NOTABLE nVidia advantage
@@ -73,19 +74,21 @@ This code is based on the pooler cpuminer and inherits
its command line interface and options.
-a, --algo=ALGO specify the algorithm to use
allium use to mine Garlic
bastion use to mine Joincoin
bitcore use to mine Bitcore's Timetravel10
blake use to mine Saffroncoin (Blake256)
blakecoin use to mine Old Blake 256
blake2s use to mine Nevacoin (Blake2-S 256)
bmw use to mine Midnight
cryptolight use to mine AEON cryptonight (MEM/2)
cryptonight use to mine XMR cryptonight, Bytecoin, Dash, DigitalNote, etc
cryptolight use to mine AEON cryptonight variant 1 (MEM/2)
cryptonight use to mine original cryptonight
c11/flax use to mine Chaincoin and Flax
decred use to mine Decred 180 bytes Blake256-14
deep use to mine Deepcoin
dmd-gr use to mine Diamond-Groestl
equihash use to mine ZEC, HUSH and KMD
exosis use to mine EXO
fresh use to mine Freshcoin
fugue256 use to mine Fuguecoin
groestl use to mine Groestlcoin
@@ -96,13 +99,16 @@ its command line interface and options.
lbry use to mine LBRY Credits
luffa use to mine Joincoin
lyra2 use to mine CryptoCoin
lyra2v2 use to mine Vertcoin
lyra2v2 use to mine Monacoin
lyra2v3 use to mine Vertcoin
lyra2z use to mine Zerocoin (XZC)
monero use to mine Monero (XMR)
myr-gr use to mine Myriad-Groest
neoscrypt use to mine FeatherCoin, Trezarcoin, Orbitcoin, etc
nist5 use to mine TalkCoin
penta use to mine Joincoin / Pentablake
phi use to mine LUXCoin
phi1612 use to mine Seraph
phi2 use to mine LUXCoin
polytimos use to mine Polytimos
quark use to mine Quarkcoin
qubit use to mine Qubit
@@ -111,11 +117,14 @@ its command line interface and options.
scrypt-jane use to mine Chacha coins like Cache and Ultracoin
s3 use to mine 1coin (ONE)
sha256t use to mine OneCoin (OC)
sha256q use to mine Pyrite
sia use to mine SIA
sib use to mine Sibcoin
skein use to mine Skeincoin
skein2 use to mine Woodcoin
skunk use to mine Signatum
sonoa use to mine Sono
stellite use to mine Stellite (a cryptonight variant)
timetravel use to mine MachineCoin
tribus use to mine Denarius
x11evo use to mine Revolver
@@ -281,6 +290,21 @@ so we can more efficiently implement new algorithms using the latest hardware
features.
>>> RELEASE HISTORY <<<
Jan. 30th 2019 v2.3.1
Handle Lyra2v3 algo
Handle sha256q algo
Handle exosis algo
Handle blake2b standard algo
June 23th 2018 v2.3
Handle phi2 header variation for smart contracts
Handle monero, stellite, graft and cryptolight variants
Handle SonoA algo
June 10th 2018 v2.2.6
New phi2 algo for LUX
New allium algo for Garlic
Apr. 02nd 2018 v2.2.5
New x16r algo for Raven
New x16s algo for Pigeon and Eden

algos.h (47 changed lines)

@@ -7,7 +7,9 @@
enum sha_algos {
ALGO_BLAKECOIN = 0,
ALGO_BLAKE,
ALGO_BLAKE2B,
ALGO_BLAKE2S,
ALGO_ALLIUM,
ALGO_BMW,
ALGO_BASTION,
ALGO_C11,
@@ -17,6 +19,7 @@ enum sha_algos {
ALGO_DECRED,
ALGO_DMD_GR,
ALGO_EQUIHASH,
ALGO_EXOSIS,
ALGO_FRESH,
ALGO_FUGUE256, /* Fugue256 */
ALGO_GOSTD,
@@ -33,6 +36,7 @@ enum sha_algos {
ALGO_LUFFA,
ALGO_LYRA2,
ALGO_LYRA2v2,
ALGO_LYRA2v3,
ALGO_LYRA2Z,
ALGO_MJOLLNIR, /* Hefty hash */
ALGO_MYR_GR,
@@ -40,6 +44,7 @@ enum sha_algos {
ALGO_NIST5,
ALGO_PENTABLAKE,
ALGO_PHI,
ALGO_PHI2,
ALGO_POLYTIMOS,
ALGO_QUARK,
ALGO_QUBIT,
@@ -47,11 +52,13 @@ enum sha_algos {
ALGO_SCRYPT_JANE,
ALGO_SHA256D,
ALGO_SHA256T,
ALGO_SHA256Q,
ALGO_SIA,
ALGO_SIB,
ALGO_SKEIN,
ALGO_SKEIN2,
ALGO_SKUNK,
ALGO_SONOA,
ALGO_S3,
ALGO_TIMETRAVEL,
ALGO_TRIBUS,
@@ -72,6 +79,9 @@ enum sha_algos {
ALGO_WHIRLPOOLX,
ALGO_WILDKECCAK,
ALGO_ZR5,
ALGO_MONERO,
ALGO_GRAFT,
ALGO_STELLITE,
ALGO_AUTO,
ALGO_COUNT
};
@@ -81,7 +91,9 @@ extern volatile enum sha_algos opt_algo;
static const char *algo_names[] = {
"blakecoin",
"blake",
"blake2b",
"blake2s",
"allium",
"bmw",
"bastion",
"c11",
@@ -91,6 +103,7 @@ static const char *algo_names[] = {
"decred",
"dmd-gr",
"equihash",
"exosis",
"fresh",
"fugue256",
"gostd",
@@ -107,6 +120,7 @@ static const char *algo_names[] = {
"luffa",
"lyra2",
"lyra2v2",
"lyra2v3",
"lyra2z",
"mjollnir",
"myr-gr",
@@ -114,6 +128,7 @@ static const char *algo_names[] = {
"nist5",
"penta",
"phi",
"phi2",
"polytimos",
"quark",
"qubit",
@@ -121,11 +136,13 @@ static const char *algo_names[] = {
"scrypt-jane",
"sha256d",
"sha256t",
"sha256q",
"sia",
"sib",
"skein",
"skein2",
"skunk",
"sonoa",
"s3",
"timetravel",
"tribus",
@@ -146,6 +163,9 @@ static const char *algo_names[] = {
"whirlpoolx",
"wildkeccak",
"zr5",
"monero",
"graft",
"stellite",
"auto", /* reserved for multi algo */
""
};
@@ -185,6 +205,8 @@ static inline int algo_to_int(char* arg)
i = ALGO_LYRA2;
else if (!strcasecmp("lyra2rev2", arg))
i = ALGO_LYRA2v2;
else if (!strcasecmp("lyra2rev3", arg))
i = ALGO_LYRA2v3;
else if (!strcasecmp("phi1612", arg))
i = ALGO_PHI;
else if (!strcasecmp("bitcoin", arg))
@@ -210,4 +232,29 @@ static inline int algo_to_int(char* arg)
return i;
}
static inline int get_cryptonight_algo(int fork)
{
int algo = ALGO_COUNT;
switch (fork) {
case 8:
algo = ALGO_GRAFT;
break;
case 7:
algo = ALGO_MONERO;
break;
case 3:
algo = ALGO_STELLITE;
break;
default:
algo = ALGO_CRYPTONIGHT;
break;
}
return algo;
}
#endif

api.cpp (2 changed lines)

@@ -257,7 +257,7 @@ static char *getpoolnfo(char *params)
static void gpuhwinfos(int gpu_id)
{
char buf[256];
char buf[512];
char pstate[8];
char* card;
struct cgpu_info *cgpu = NULL;

bench.cpp (25 changed lines)

@@ -49,9 +49,11 @@ void bench_free()
void algo_free_all(int thr_id)
{
// only initialized algos will be freed
free_allium(thr_id);
free_bastion(thr_id);
free_bitcore(thr_id);
free_blake256(thr_id);
free_blake2b(thr_id);
free_blake2s(thr_id);
free_bmw(thr_id);
free_c11(thr_id);
@@ -60,9 +62,11 @@ void algo_free_all(int thr_id)
free_decred(thr_id);
free_deep(thr_id);
free_equihash(thr_id);
free_exosis(thr_id);
free_keccak256(thr_id);
free_fresh(thr_id);
free_fugue256(thr_id);
free_gostd(thr_id);
free_groestlcoin(thr_id);
#ifdef WITH_HEAVY_ALGO
free_heavy(thr_id);
@@ -75,12 +79,14 @@ void algo_free_all(int thr_id)
free_luffa(thr_id);
free_lyra2(thr_id);
free_lyra2v2(thr_id);
free_lyra2v3(thr_id);
free_lyra2Z(thr_id);
free_myriad(thr_id);
free_neoscrypt(thr_id);
free_nist5(thr_id);
free_pentablake(thr_id);
free_phi(thr_id);
free_phi2(thr_id);
free_polytimos(thr_id);
free_quark(thr_id);
free_qubit(thr_id);
@@ -89,9 +95,10 @@ void algo_free_all(int thr_id)
free_skunk(thr_id);
free_sha256d(thr_id);
free_sha256t(thr_id);
free_gostd(thr_id);
free_sha256q(thr_id);
free_sia(thr_id);
free_sib(thr_id);
free_sonoa(thr_id);
free_s3(thr_id);
free_vanilla(thr_id);
free_veltor(thr_id);
@@ -154,6 +161,22 @@ bool bench_algo_switch_next(int thr_id)
if (algo == ALGO_SCRYPT) algo++;
if (algo == ALGO_SCRYPT_JANE) algo++;
// Set cryptonight variant
switch (algo) {
case ALGO_MONERO:
cryptonight_fork = 7;
break;
case ALGO_GRAFT:
cryptonight_fork = 8;
break;
case ALGO_STELLITE:
cryptonight_fork = 3;
break;
case ALGO_CRYPTONIGHT:
cryptonight_fork = 1;
break;
}
// free current algo memory and track mem usage
mused = cuda_available_memory(thr_id);
algo_free_all(thr_id);

blake2b.cu (273 changed lines)

@@ -0,0 +1,273 @@
/**
* Blake2-B CUDA Implementation
*
* tpruvot@github July 2016
*
*/
#include <miner.h>
#include <string.h>
#include <stdint.h>
#include <sph/blake2b.h>
#include <cuda_helper.h>
#include <cuda_vector_uint2x4.h>
#define TPB 512
#define NBN 2
static uint32_t *d_resNonces[MAX_GPUS];
__device__ uint64_t d_data[10];
static __constant__ const int8_t blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
// host mem align
#define A 64
extern "C" void blake2b_hash(void *output, const void *input)
{
uint8_t _ALIGN(A) hash[32];
blake2b_ctx ctx;
blake2b_init(&ctx, 32, NULL, 0);
blake2b_update(&ctx, input, 80);
blake2b_final(&ctx, hash);
memcpy(output, hash, 32);
}
// ----------------------------------------------------------------
__device__ __forceinline__
static void G(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16])
{
a = a + b + m[ blake2b_sigma[r][2*i] ];
((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
a = a + b + m[ blake2b_sigma[r][2*i+1] ];
((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
}
#define ROUND(r) \
G(r, 0, v[0], v[4], v[ 8], v[12], m); \
G(r, 1, v[1], v[5], v[ 9], v[13], m); \
G(r, 2, v[2], v[6], v[10], v[14], m); \
G(r, 3, v[3], v[7], v[11], v[15], m); \
G(r, 4, v[0], v[5], v[10], v[15], m); \
G(r, 5, v[1], v[6], v[11], v[12], m); \
G(r, 6, v[2], v[7], v[ 8], v[13], m); \
G(r, 7, v[3], v[4], v[ 9], v[14], m);
__global__
//__launch_bounds__(128, 8) /* to force 64 regs */
void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
{
const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce;
uint64_t m[16];
m[0] = d_data[0];
m[1] = d_data[1];
m[2] = d_data[2];
m[3] = d_data[3];
m[4] = d_data[4];
m[5] = d_data[5];
m[6] = d_data[6];
m[7] = d_data[7];
m[8] = d_data[8];
((uint32_t*)m)[18] = AS_U32(&d_data[9]);
((uint32_t*)m)[19] = nonce;
m[10] = m[11] = 0;
m[12] = m[13] = 0;
m[14] = m[15] = 0;
uint64_t v[16] = {
0x6a09e667f2bdc928, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade68281, 0x9b05688c2b3e6c1f, 0xe07c265404be4294, 0x5be0cd19137e2179
};
ROUND( 0);
ROUND( 1);
ROUND( 2);
ROUND( 3);
ROUND( 4);
ROUND( 5);
ROUND( 6);
ROUND( 7);
ROUND( 8);
ROUND( 9);
ROUND(10);
ROUND(11);
uint2 last = vectorize(v[3] ^ v[11] ^ 0xa54ff53a5f1d36f1);
if (last.y <= target2.y && last.x <= target2.x) {
resNonce[1] = resNonce[0];
resNonce[0] = nonce;
}
}
__host__
uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce)
{
uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX };
uint32_t result = UINT32_MAX;
dim3 grid((threads + TPB-1)/TPB);
dim3 block(TPB);
/* Check error on Ctrl+C or kill to prevent segfaults on exit */
if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
return result;
blake2b_gpu_hash <<<grid, block, 8>>> (threads, startNonce, d_resNonces[thr_id], target2);
cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
result = resNonces[0];
secNonce = resNonces[1];
if (secNonce == result) secNonce = UINT32_MAX;
}
return result;
}
__host__
void blake2b_setBlock(uint32_t *data)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice));
}
static bool init[MAX_GPUS] = { 0 };
int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(A) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 28 : 25;
if (device_sm[dev_id] >= 520 && is_windows()) intensity = 26;
if (device_sm[dev_id] < 350) intensity = 22;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (!init[thr_id])
{
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage (linux)
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1);
init[thr_id] = true;
}
for (int i=0; i < 20; i++)
be32enc(&endiandata[i], pdata[i]);
const uint2 target = make_uint2(ptarget[6], ptarget[7]);
blake2b_setBlock(endiandata);
do {
work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[19], target, work->nonces[1]);
*hashes_done = pdata[19] - first_nonce + throughput;
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(A) vhash[8];
work->valid_nonces = 0;
endiandata[19] = work->nonces[0];
blake2b_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work_set_target_ratio(work, vhash);
work->valid_nonces++;
pdata[19] = work->nonces[0] + 1;
} else {
gpu_increment_reject(thr_id);
}
if (work->nonces[1] != UINT32_MAX) {
endiandata[19] = work->nonces[1];
blake2b_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
work->sharediff[1] = work->sharediff[0];
work->shareratio[1] = work->shareratio[0];
xchg(work->nonces[1], work->nonces[0]);
work_set_target_ratio(work, vhash);
} else {
bn_set_target_ratio(work, vhash, 1);
}
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan start
} else {
gpu_increment_reject(thr_id);
}
}
if (work->valid_nonces) {
work->nonces[0] = cuda_swab32(work->nonces[0]);
work->nonces[1] = cuda_swab32(work->nonces[1]);
return work->valid_nonces;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_blake2b(int thr_id)
{
if (!init[thr_id])
return;
//cudaThreadSynchronize();
cudaFree(d_resNonces[thr_id]);
init[thr_id] = false;
cudaDeviceSynchronize();
}
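The new kernel keeps the BLAKE2b state in 64-bit words but performs the fixed rotations of G() on the two 32-bit halves of each word: SWAPUINT2 for the rotate by 32, ROR24/ROR16 for the byte-aligned cases, and ROR2 for the rotate by 63. This is a sketch of the assumed equivalence (the helper names above come from cuda_helper.h; the functions below are only illustrative):

// Assumed semantics: a 64-bit right rotation expressed on a uint2 that
// holds the {low, high} 32-bit halves of the word.
#include <cuda_runtime.h>

__device__ __forceinline__ uint2 ror64_by_32(uint2 v)
{
	return make_uint2(v.y, v.x);               // swapping halves == rotr 32
}

__device__ __forceinline__ uint2 ror64(uint2 v, unsigned n)
{
	uint64_t x = ((uint64_t)v.y << 32) | v.x;  // reassemble the 64-bit word
	x = (x >> n) | (x << (64u - n));           // plain rotate right by n
	return make_uint2((uint32_t)x, (uint32_t)(x >> 32));
}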

ccminer.cpp (118 changed lines)

@@ -103,6 +103,7 @@ bool submit_old = false;
bool use_syslog = false;
bool use_colors = true;
int use_pok = 0;
int use_roots = 0;
static bool opt_background = false;
bool opt_quiet = false;
int opt_maxlograte = 3;
@@ -232,27 +233,33 @@ int opt_api_mcast_port = 4068;
bool opt_stratum_stats = false;
int cryptonight_fork = 1;
static char const usage[] = "\
Usage: " PROGRAM_NAME " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO specify the hash algorithm to use\n\
allium Garlic double lyra2\n\
bastion Hefty bastion\n\
bitcore Timetravel-10\n\
blake Blake 256 (SFR)\n\
blake2b Blake2-B 512 (BCX)\n\
blake2s Blake2-S 256 (NEVA)\n\
blakecoin Fast Blake 256 (8 rounds)\n\
bmw BMW 256\n\
cryptolight AEON cryptonight (MEM/2)\n\
cryptonight XMR cryptonight\n\
cryptonight XMR cryptonight v1 (old)\n\
c11/flax X11 variant\n\
decred Decred Blake256\n\
deep Deepcoin\n\
equihash Zcash Equihash\n\
exosis Exosis timetravel\n\
dmd-gr Diamond-Groestl\n\
fresh Freshcoin (shavite 80)\n\
fugue256 Fuguecoin\n\
gostcoin GOSTcoin\n\
gostd Double GOST R 34.11\n\
graft Cryptonight v8\n\
groestl Groestlcoin\n"
#ifdef WITH_HEAVY_ALGO
" heavy Heavycoin\n"
@@ -264,18 +271,22 @@ Options:\
lbry LBRY Credits (Sha/Ripemd)\n\
luffa Joincoin\n\
lyra2 CryptoCoin\n\
lyra2v2 VertCoin\n\
lyra2v2 MonaCoin\n\
lyra2v3 Vertcoin\n\
lyra2z ZeroCoin (3rd impl)\n\
myr-gr Myriad-Groestl\n\
monero XMR cryptonight (v7)\n\
neoscrypt FeatherCoin, Phoenix, UFO...\n\
nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\
phi BHCoin\n\
phi1612 LUX initial algo, for Seraph\n\
phi2 LUX v2 with lyra2\n\
polytimos Politimos\n\
quark Quark\n\
qubit Qubit\n\
sha256d SHA256d (bitcoin)\n\
sha256t SHA256 x3\n\
sha256q SHA256 x4\n\
sia SIA (Blake2B)\n\
sib Sibcoin (X11+Streebog)\n\
scrypt Scrypt\n\
@@ -283,6 +294,8 @@ Options:\
skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Skein Cube Fugue Streebog\n\
sonoa 97 hashes based on X17 ones (Sono)\n\
stellite Cryptonight v3\n\
s3 S3 (1Coin)\n\
timetravel Machinecoin permuted x8\n\
tribus Denarius\n\
@@ -572,7 +585,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work);
void get_currentalgo(char* buf, int sz)
{
snprintf(buf, sz, "%s", algo_names[opt_algo]);
int algo = opt_algo;
if (algo == ALGO_CRYPTONIGHT)
algo = get_cryptonight_algo(cryptonight_fork);
snprintf(buf, sz, "%s", algo_names[algo]);
}
void format_hashrate(double hashrate, char *output)
@@ -698,6 +714,10 @@ static bool work_decode(const json_t *val, struct work *work)
data_size = 192;
adata_sz = 180/4;
break;
case ALGO_PHI2:
data_size = 144;
adata_sz = data_size / 4;
break;
case ALGO_NEOSCRYPT:
case ALGO_ZR5:
data_size = 80;
@@ -743,6 +763,12 @@ static bool work_decode(const json_t *val, struct work *work)
for (i = 0; i < atarget_sz; i++)
work->target[i] = le32dec(work->target + i);
if (opt_algo == ALGO_PHI2) {
for (i = 20; i < 36; i++) if (work->data[i]) {
use_roots = 1; break;
}
}
if ((opt_showdiff || opt_max_diff > 0.) && !allow_mininginfo)
calc_network_diff(work);
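For phi2 the work blob grows from the usual 80 bytes to 144 (adata_sz = 36 words): words 20..35 carry an optional 64-byte block of smart-contract roots, and use_roots is latched as soon as any of them is non-zero. A rough layout sketch inferred only from the sizes in this diff; the field names and meanings are assumptions, not taken from the LUX sources:

#include <stdint.h>

/* Assumed phi2 work layout, 144 bytes total */
struct phi2_work_blob {
	uint32_t base[20];   /* classic 80-byte header: version, prev hash,
	                        merkle root, ntime, nbits, nonce */
	uint32_t roots[16];  /* optional 64-byte smart-contract roots;
	                        use_roots = 1 when any word here is non-zero */
};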
@@ -955,6 +981,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
case ALGO_BMW:
case ALGO_SHA256D:
case ALGO_SHA256T:
case ALGO_SHA256Q:
case ALGO_VANILLA:
// fast algos require that... (todo: regen hash)
check_dups = true;
@@ -1066,6 +1093,9 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
else if (opt_algo == ALGO_DECRED) {
data_size = 192; adata_sz = 180/4;
}
else if (opt_algo == ALGO_PHI2 && use_roots) {
data_size = 144; adata_sz = 36;
}
else if (opt_algo == ALGO_SIA) {
return sia_submit(curl, pool, work);
}
@@ -1637,10 +1667,17 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
for (i = 0; i < 8; i++)
work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i];
work->data[25] = le32dec(sctx->job.ntime);
work->data[26] = le32dec(sctx->job.nbits);
work->data[28] = 0x80000000;
} else if (opt_algo == ALGO_PHI2) {
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
work->data[17] = le32dec(sctx->job.ntime);
work->data[18] = le32dec(sctx->job.nbits);
for (i = 0; i < 16; i++)
work->data[20 + i] = ((uint32_t*)sctx->job.extra)[i];
} else if (opt_algo == ALGO_SIA) {
uint32_t extra = 0;
memcpy(&extra, &sctx->job.coinbase[32], 2);
@@ -1708,6 +1745,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_SCRYPT_JANE:
work_set_target(work, sctx->job.diff / (65536.0 * opt_difficulty));
break;
case ALGO_ALLIUM:
case ALGO_DMD_GR:
case ALGO_FRESH:
case ALGO_FUGUE256:
@@ -1715,9 +1753,12 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_KECCAKC:
case ALGO_LBRY:
case ALGO_LYRA2v2:
case ALGO_LYRA2v3:
case ALGO_LYRA2Z:
case ALGO_PHI2:
case ALGO_TIMETRAVEL:
case ALGO_BITCORE:
case ALGO_EXOSIS:
case ALGO_X16R:
case ALGO_X16S:
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
@@ -2231,9 +2272,11 @@ static void *miner_thread(void *userdata)
case ALGO_DECRED:
case ALGO_SHA256D:
case ALGO_SHA256T:
case ALGO_SHA256Q:
//case ALGO_WHIRLPOOLX:
minmax = 0x40000000U;
break;
case ALGO_BLAKE2B:
case ALGO_KECCAK:
case ALGO_KECCAKC:
case ALGO_LBRY:
@@ -2244,6 +2287,7 @@ static void *miner_thread(void *userdata)
case ALGO_TRIBUS:
minmax = 0x1000000;
break;
case ALGO_ALLIUM:
case ALGO_C11:
case ALGO_DEEP:
case ALGO_HEAVY:
@@ -2251,12 +2295,15 @@ static void *miner_thread(void *userdata)
case ALGO_JHA:
case ALGO_HSR:
case ALGO_LYRA2v2:
case ALGO_LYRA2v3:
case ALGO_PHI:
case ALGO_PHI2:
case ALGO_POLYTIMOS:
case ALGO_S3:
case ALGO_SKUNK:
case ALGO_TIMETRAVEL:
case ALGO_BITCORE:
case ALGO_EXOSIS:
case ALGO_X11EVO:
case ALGO_X11:
case ALGO_X12:
@@ -2276,6 +2323,7 @@ static void *miner_thread(void *userdata)
case ALGO_NEOSCRYPT:
case ALGO_SIB:
case ALGO_SCRYPT:
case ALGO_SONOA:
case ALGO_VELTOR:
minmax = 0x80000;
break;
@@ -2335,6 +2383,9 @@ static void *miner_thread(void *userdata)
/* scan nonces for a proof-of-work hash */
switch (opt_algo) {
case ALGO_ALLIUM:
rc = scanhash_allium(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_BASTION:
rc = scanhash_bastion(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2344,6 +2395,9 @@ static void *miner_thread(void *userdata)
case ALGO_BLAKE:
rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 14);
break;
case ALGO_BLAKE2B:
rc = scanhash_blake2b(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_BLAKE2S:
rc = scanhash_blake2s(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2354,11 +2408,19 @@ static void *miner_thread(void *userdata)
rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_CRYPTOLIGHT:
rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);
break;
case ALGO_MONERO:
case ALGO_STELLITE:
case ALGO_GRAFT:
case ALGO_CRYPTONIGHT:
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
{
int cn_variant = 0;
if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork)
cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1;
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant);
break;
}
case ALGO_DECRED:
rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done);
break;
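The cryptonight variant is no longer hard-coded: it is derived from the first byte of the work blob (the block major version) relative to the fork base selected with -a monero / graft / stellite. A standalone restatement of that arithmetic, for illustration only (the helper and values below are not part of the tree):

#include <assert.h>

/* Same formula as the hunk above: versions below the fork base run the
 * original algorithm (variant 0), the fork base maps to variant 1, and
 * each later block version bumps the variant by one. */
static int cn_variant_for(int fork_base, unsigned char blob_version)
{
	if (fork_base > 1 && blob_version >= fork_base)
		return blob_version - fork_base + 1;
	return 0;
}

int main(void)
{
	assert(cn_variant_for(7, 6) == 0);  /* pre-fork blob  -> variant 0 */
	assert(cn_variant_for(7, 7) == 1);  /* monero base 7  -> variant 1 */
	assert(cn_variant_for(7, 8) == 2);  /* next version   -> variant 2 by this formula */
	assert(cn_variant_for(1, 7) == 0);  /* plain -a cryptonight keeps base 1 */
	return 0;
}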
@@ -2375,6 +2437,11 @@ static void *miner_thread(void *userdata)
rc = scanhash_fugue256(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_GOSTCOIN:
case ALGO_GOSTD:
rc = scanhash_gostd(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_GROESTL:
case ALGO_DMD_GR:
rc = scanhash_groestlcoin(thr_id, &work, max_nonce, &hashes_done);
@@ -2427,6 +2494,9 @@ static void *miner_thread(void *userdata)
case ALGO_LYRA2v2:
rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_LYRA2v3:
rc = scanhash_lyra2v3(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_LYRA2Z:
rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2442,6 +2512,9 @@ static void *miner_thread(void *userdata)
case ALGO_PHI:
rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_PHI2:
rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_POLYTIMOS:
rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2468,15 +2541,17 @@ static void *miner_thread(void *userdata)
case ALGO_SHA256T:
rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_SHA256Q:
rc = scanhash_sha256q(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_SIA:
rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_SIB:
rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_GOSTCOIN:
case ALGO_GOSTD:
rc = scanhash_gostd(thr_id, &work, max_nonce, &hashes_done);
case ALGO_SONOA:
rc = scanhash_sonoa(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_S3:
rc = scanhash_s3(thr_id, &work, max_nonce, &hashes_done);
@@ -2506,6 +2581,9 @@ static void *miner_thread(void *userdata)
case ALGO_BITCORE:
rc = scanhash_bitcore(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_EXOSIS:
rc = scanhash_exosis(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X11EVO:
rc = scanhash_x11evo(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -3121,6 +3199,26 @@ void parse_arg(int key, char *arg)
case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
}
}
// cryptonight variants
switch (opt_algo) {
case ALGO_MONERO:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 7;
break;
case ALGO_GRAFT:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 8;
break;
case ALGO_STELLITE:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 3;
break;
case ALGO_CRYPTONIGHT:
cryptonight_fork = 1;
break;
}
break;
case 'b':
p = strstr(arg, ":");

ccminer.vcxproj (17 changed lines)

@@ -435,6 +435,8 @@
<CudaCompile Include="sha256\sha256d.cu" />
<CudaCompile Include="sha256\cuda_sha256t.cu" />
<CudaCompile Include="sha256\sha256t.cu" />
<CudaCompile Include="sha256\cuda_sha256q.cu" />
<CudaCompile Include="sha256\sha256q.cu" />
<CudaCompile Include="zr5.cu" />
<CudaCompile Include="heavy\cuda_blake512.cu">
</CudaCompile>
@@ -460,6 +462,7 @@
</CudaCompile>
<CudaCompile Include="JHA\cuda_jha_keccak512.cu">
</CudaCompile>
<CudaCompile Include="blake2b.cu" />
<CudaCompile Include="Algo256\blake256.cu">
<MaxRegCount>64</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
@@ -519,12 +522,17 @@
<CudaCompile Include="qubit\luffa.cu" />
<CudaCompile Include="qubit\qubit.cu" />
<CudaCompile Include="qubit\qubit_luffa512.cu" />
<CudaCompile Include="lyra2\allium.cu" />
<CudaCompile Include="lyra2\lyra2RE.cu" />
<CudaCompile Include="lyra2\cuda_lyra2.cu" />
<CudaCompile Include="lyra2\lyra2REv2.cu" />
<CudaCompile Include="lyra2\cuda_lyra2v2.cu" />
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" />
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
<CudaCompile Include="lyra2\lyra2REv3.cu" />
<CudaCompile Include="lyra2\cuda_lyra2v3.cu" />
<ClInclude Include="lyra2\cuda_lyra2v3_sm3.cuh" />
<CudaCompile Include="lyra2\lyra2Z.cu" />
<CudaCompile Include="lyra2\cuda_lyra2Z.cu" />
<ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh" />
@@ -536,6 +544,10 @@
<CudaCompile Include="cuda_skeincoin.cu">
<MaxRegCount>48</MaxRegCount>
</CudaCompile>
<CudaCompile Include="phi\phi.cu" />
<CudaCompile Include="phi\phi2.cu" />
<CudaCompile Include="phi\cuda_phi2.cu" />
<CudaCompile Include="phi\cuda_phi2_cubehash512.cu" />
<CudaCompile Include="skunk\skunk.cu" />
<CudaCompile Include="skunk\cuda_skunk.cu">
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
@@ -566,11 +578,11 @@
<CudaCompile Include="x11\cuda_streebog_maxwell.cu" />
<CudaCompile Include="x11\c11.cu" />
<CudaCompile Include="x11\fresh.cu" />
<CudaCompile Include="x11\phi.cu" />
<CudaCompile Include="x11\sib.cu" />
<CudaCompile Include="x11\s3.cu" />
<CudaCompile Include="x11\timetravel.cu" />
<CudaCompile Include="x11\bitcore.cu" />
<CudaCompile Include="x11\exosis.cu" />
<CudaCompile Include="x11\veltor.cu" />
<CudaCompile Include="x11\x11.cu" />
<CudaCompile Include="x11\x11evo.cu" />
@@ -586,7 +598,6 @@
<CudaCompile Include="x15\x14.cu" />
<CudaCompile Include="x15\cuda_x14_shabal512.cu" />
<CudaCompile Include="x15\cuda_x15_whirlpool.cu" />
<CudaCompile Include="x17\hmq17.cu" />
<CudaCompile Include="x15\x15.cu" />
<CudaCompile Include="x15\whirlpool.cu" />
<CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu" />
@@ -599,6 +610,8 @@
<CudaCompile Include="x16\cuda_x16_echo512_64.cu">
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
</CudaCompile>
<CudaCompile Include="x17\hmq17.cu" />
<CudaCompile Include="x17\sonoa.cu" />
<CudaCompile Include="x17\x17.cu" />
<CudaCompile Include="x17\cuda_x17_haval256.cu">
</CudaCompile>

ccminer.vcxproj.filters (48 changed lines)

@@ -115,6 +115,9 @@
<Filter Include="Source Files\CUDA\tribus">
<UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\phi">
<UniqueIdentifier>{311e8d79-1612-4f0f-8591-23a592f2b2d3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\x12">
<UniqueIdentifier>{xde48d89-fx12-1323-129a-b592fe2b2de3}</UniqueIdentifier>
</Filter>
@@ -548,6 +551,9 @@
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
@@ -775,6 +781,9 @@
<CudaCompile Include="x17\hmq17.cu">
<Filter>Source Files\CUDA\x17</Filter>
</CudaCompile>
<CudaCompile Include="x17\sonoa.cu">
<Filter>Source Files\CUDA\x17</Filter>
</CudaCompile>
<CudaCompile Include="x17\x17.cu">
<Filter>Source Files\CUDA\x17</Filter>
</CudaCompile>
@@ -784,6 +793,18 @@
<CudaCompile Include="polytimos.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="phi\phi.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="phi\phi2.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="phi\cuda_phi2.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="phi\cuda_phi2_cubehash512.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="skunk\skunk.cu">
<Filter>Source Files\CUDA\skunk</Filter>
</CudaCompile>
@@ -802,9 +823,6 @@
<ClInclude Include="tribus\cuda_echo512_aes.cuh">
<Filter>Source Files\CUDA\tribus</Filter>
</ClInclude>
<CudaCompile Include="x11\phi.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\sib.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
@@ -826,6 +844,9 @@
<CudaCompile Include="x11\bitcore.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\exosis.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\veltor.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
@@ -913,6 +934,9 @@
<CudaCompile Include="Algo256\cuda_bmw.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\allium.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
@@ -925,12 +949,24 @@
<CudaCompile Include="lyra2\lyra2REv2.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2v3.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<ClInclude Include="lyra2\cuda_lyra2v3_sm3.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<CudaCompile Include="lyra2\lyra2REv3.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2Z.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\lyra2Z.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="blake2b.cu">
<Filter>Source Files\CUDA\</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\blake2s.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
@@ -958,6 +994,12 @@
<CudaCompile Include="sha256\sha256t.cu">
<Filter>Source Files\CUDA\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sha256\cuda_sha256q.cu">
<Filter>Source Files\CUDA\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sha256\sha256q.cu">
<Filter>Source Files\CUDA\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sia\sia.cu">
<Filter>Source Files\sia</Filter>
</CudaCompile>

compat/ccminer-config.h (2 changed lines)

@@ -164,7 +164,7 @@
#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */
#define PACKAGE_VERSION "2.2.5"
#define PACKAGE_VERSION "2.3.1"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be

configure.ac (2 changed lines)

@@ -1,4 +1,4 @@
AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_INIT([ccminer], [2.3.1], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

crypto/cn_aes.cuh (1 changed line)

@@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = {
*/
#define AS_U32(addr) *((uint32_t*)(addr))
#define AS_U64(addr) *((uint64_t*)(addr))
#define AS_UINT2(addr) *((uint2*)(addr))
#define AS_UINT4(addr) *((uint4*)(addr))
#define AS_UL2(addr) *((ulonglong2*)(addr))

crypto/cn_blake.cuh (2 changed lines)

@@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
}
__device__
void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out)
{
blake_state bs;
blake_state *S = (blake_state *)&bs;

crypto/cn_groestl.cuh (9 changed lines)

@@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri
for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
output[j] = s[i];
}
#if 0
for (i = 0; i < GROESTL_COLS512; i++) {
ctx->chaining[i] = 0;
}
for (i = 0; i < GROESTL_SIZE512; i++) {
ctx->buffer[i] = 0;
}
#endif
}
__device__
@@ -336,12 +337,12 @@ void cn_groestl_init(groestlHashState* ctx)
}
__device__
void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
DataLength databitlen = len << 3;
groestlHashState context;
cn_groestl_init(&context);
cn_groestl_update(&context, data, databitlen);
cn_groestl_final(&context, hashval);
cn_groestl_update(&context, (BitSequence*) data, databitlen);
cn_groestl_final(&context, (BitSequence*) hashval);
}

crypto/cn_jh.cuh (20 changed lines)

@@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
databitlen = 0;
}
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) )
{
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
index = 64-(state->datasize_in_buffer >> 3);
databitlen = databitlen - (512 - state->datasize_in_buffer);
cn_jh_F8(state);
@@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
/* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
__device__
void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval)
{
unsigned int i;
//uint32_t *bufptr = (uint32_t *)state->buffer;
@@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
} else {
/*set the rest of the bytes in the buffer to 0*/
/* set the rest of the bytes in the buffer to 0 */
if ( (state->datasize_in_buffer & 7) == 0) {
for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
} else {
@@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
cn_jh_F8(state);
}
MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32);
//MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
}
__device__
@@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen)
state->databitlen = 0;
state->datasize_in_buffer = 0;
state->hashbitlen = hashbitlen;
//memcpy(state->x, d_JH256_H0, 128);
MEMCPY8(state->x, d_JH256_H0, 128 / 8);
memcpy(state->x, d_JH256_H0, 128);
//MEMCPY8(state->x, d_JH256_H0, 128 / 8);
}
__device__
void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
const int hashbitlen = 256;
DataLength databitlen = len << 3;
@@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
cn_jh_init(&state, hashbitlen);
cn_jh_update(&state, data, databitlen);
cn_jh_final(&state, hashval);
cn_jh_final(&state, (uint8_t*) hashval);
}

crypto/cn_keccak.cuh (2 changed lines)

@@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s)
}
__device__ __forceinline__
void cn_keccak(const uint8_t * __restrict__ in, uint8_t * __restrict__ md)
void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md)
{
uint64_t st[25];

crypto/cn_skein.cuh (22 changed lines)

@@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */
#define SKEIN_256_STATE_WORDS ( 4)
#define SKEIN_512_STATE_WORDS ( 8)
#define SKEIN1024_STATE_WORDS (16)
#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)
#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
@@ -119,7 +115,7 @@ typedef struct {
} skeinHashState;
__device__
void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
void cn_skein_init(skeinHashState *state, size_t hashBitLen)
{
const uint64_t SKEIN_512_IV_256[] =
{
@@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restr
}
__device__
void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
{
if ((databitlen & 7) == 0) {
cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
}
else {
size_t bCnt = (databitlen >> 3) + 1;
uint8_t b,mask;
@@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __r
}
__device__
void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
{
uint64_t X[SKEIN_512_STATE_WORDS];
Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
@@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restric
((uint64_t *)ctx->b)[0] = (uint64_t)i;
Skein_Start_New_Type(ctx, OUT_FINAL);
cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n);
memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
}
}
__device__
void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
int hashbitlen = 256;
DataLength databitlen = len << 3;
@@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
state.statebits = 64*SKEIN_512_STATE_WORDS;
cn_skein256_init(&state, hashbitlen);
cn_skein256_update(&state, data, databitlen);
cn_skein256_final(&state, hashval);
cn_skein_init(&state, hashbitlen);
cn_skein_update(&state, data, databitlen);
cn_skein_final(&state, (uint8_t*) hashval);
}
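Besides the cn_skein256_* to cn_skein_* renames, the final-output copy now indexes hashVal as a byte pointer, so the per-block offset is written in bytes instead of 32-bit words. A tiny standalone check of that equivalence (illustrative only, not part of the tree):

#include <assert.h>
#include <stdint.h>

#define SKEIN_512_BLOCK_BYTES 64

int main(void)
{
	uint8_t out[2 * SKEIN_512_BLOCK_BYTES];
	uint8_t  *h8  = out;                /* new code: byte pointer   */
	uint32_t *h32 = (uint32_t*) out;    /* old code: word pointer   */
	const int i = 1;
	/* byte-pointer offset (new) addresses the same memory as the
	 * word-pointer offset (old) */
	assert((void*)(h8  + i * SKEIN_512_BLOCK_BYTES) ==
	       (void*)(h32 + i * SKEIN_512_BLOCK_BYTES / sizeof(uint32_t)));
	return 0;
}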

crypto/cryptolight-core.cu (86 changed lines)

@@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
if(thread < threads)
{
const int oft = thread * 52 + sub + 16; // not aligned 16!
const int oft = thread * 50 + sub + 16; // not aligned 16!
const int long_oft = (thread << LONG_SHL_IDX) + sub;
uint32_t __align__(16) key[40];
uint32_t __align__(16) text[4];
@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
}
}
// --------------------------------------------------------------------------------------------------------------
__global__
void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
{
__shared__ uint32_t __align__(16) sharedMemory[1024];
@@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int
#endif // __CUDA_ARCH__ >= 300
}
__device__ __forceinline__ void store_variant1(uint32_t* long_state)
{
uint4* Z = (uint4*) long_state;
const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
}
#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \
uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
hi += ((uint64_t *)c)[0]; \
((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
((uint64_t *)dst)[0] = hi; \
((uint64_t *)dst)[1] = lo ^ tweak; }
__global__
void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
const uint32_t longptr = thread << LONG_SHL_IDX;
uint32_t * long_state = &d_long_state[longptr];
uint64_t tweak = d_tweak[thread];
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
uint32_t* a = (uint32_t*)&A;
uint32_t* b = (uint32_t*)&B;
for (int i = start; i < end; i++)
{
uint32_t c[4];
uint32_t j = (A.x >> 2) & E2I_MASK2;
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
XOR_BLOCKS_DST(c, b, &long_state[j]);
store_variant1(&long_state[j]);
MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak);
j = (A.x >> 2) & E2I_MASK2;
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
XOR_BLOCKS_DST(b, c, &long_state[j]);
store_variant1(&long_state[j]);
MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
__global__
void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
{
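store_variant1() applies the cryptonight v1 tweak to byte 11 of the freshly written 16-byte block; the kernel reaches it through the top byte of the .z word of a uint4. A plain byte-oriented restatement for clarity, mirroring the CPU-side cryptolight_store_variant added further down in this commit (this helper itself is illustrative, not in the tree):

#include <stdint.h>

/* Equivalent of store_variant1() above on a plain byte view of the block:
 * byte 11 is the high byte of the third 32-bit word. */
static void store_variant1_bytes(uint8_t block[16])
{
	const uint8_t tmp = block[11];
	const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
	block[11] = tmp ^ ((0x75310 >> index) & 0x30);
}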
@@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
if(thread < threads)
{
const int long_oft = (thread << LONG_SHL_IDX) + sub;
const int oft = thread * 52 + sub + 16;
const int oft = thread * 50 + sub + 16;
uint32_t __align__(16) key[40];
uint32_t __align__(16) text[4];
@@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
extern int device_bfactor[MAX_GPUS];
__host__
void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
dim3 grid(blocks);
dim3 block(threads);
@@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_
int i, partcount = 1 << bfactor;
int dev_id = device_map[thr_id];
cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1);
cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
for(i = 0; i < partcount; i++)
{
cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
if (variant == 0)
cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
else
cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2);
cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
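Note on the phase-3 offset change above (thread * 52 + sub + 16 becoming thread * 50 + sub + 16): 200 bytes of raw Keccak state is exactly 50 uint32 words, no longer padded to 52. A minimal layout sketch, assuming the usual CryptoNote state union in which bytes 0..63 seed the two AES keys and bytes 64..191 ("init") seed the scratchpad; the names below are illustrative only, not miner symbols.
#include <stdint.h>
#include <assert.h>
int main(void)
{
	const int STATE_WORDS = 200 / 4;   /* 50 uint32 words of Keccak state per thread */
	const int INIT_OFT    = 64 / 4;    /* "+ 16": the init text starts at byte 64    */
	int thread = 3, sub = 8;           /* sub picks one 16-byte lane of the text     */
	int oft = thread * STATE_WORDS + sub + INIT_OFT;
	assert(oft == 3 * 50 + 8 + 16);    /* same index math as the kernel above        */
	return 0;
}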

45
crypto/cryptolight-cpu.cpp

@@ -132,14 +132,13 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
}
static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -157,17 +156,29 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}
static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
static void cryptolight_store_variant(void* state, int variant) {
if (variant == 1) {
// use variant 1 like monero since june 2018
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
}
}
static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant)
{
size_t i, j;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
@@ -186,21 +197,21 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
cryptolight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak);
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
cryptolight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
@@ -221,9 +232,15 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}
void cryptolight_hash(void* output, const void* input, int len)
void cryptolight_hash_variant(void* output, const void* input, int len, int variant)
{
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptolight_hash_ctx(output, input, len, ctx);
cryptolight_hash_ctx(output, input, len, ctx, variant);
free(ctx);
}
void cryptolight_hash(void* output, const void* input)
{
cryptolight_hash_variant(output, input, 76, 1);
}
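As a quick check of the variant-1 logic introduced above: cryptolight_store_variant() only rewrites byte 11 of the stored block, and the XOR mask it derives from the 0x75310 table is always a value masked with 0x30, so at most bits 4-5 of that byte can flip. A small self-contained sketch of the same math (tweak_byte11 is an illustrative name, not a miner function):
#include <stdint.h>
#include <stdio.h>
static uint8_t tweak_byte11(uint8_t tmp)
{
	/* identical arithmetic to cryptolight_store_variant() above */
	const uint8_t index = (uint8_t)((((tmp >> 3) & 6) | (tmp & 1)) << 1);
	return (uint8_t)(tmp ^ ((0x75310 >> index) & 0x30));
}
int main(void)
{
	for (int v = 0; v < 256; v++) {
		uint8_t out = tweak_byte11((uint8_t)v);
		if ((out ^ v) & ~0x30) { printf("unexpected flip at %02x\n", v); return 1; }
	}
	printf("tweak only ever flips bits 4-5 of byte 11\n");
	return 0;
}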

30
crypto/cryptolight.cu

@@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32;
static __thread uint32_t cn_threads = 16;
static uint32_t *d_long_state[MAX_GPUS];
static uint64_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_key1[MAX_GPUS];
static uint32_t *d_ctx_key2[MAX_GPUS];
static uint32_t *d_ctx_text[MAX_GPUS];
static uint64_t *d_ctx_tweak[MAX_GPUS];
static uint32_t *d_ctx_a[MAX_GPUS];
static uint32_t *d_ctx_b[MAX_GPUS];
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
{
int res = 0;
uint32_t throughput = 0;
@@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
const uint32_t first_nonce = *nonceptr;
uint32_t nonce = first_nonce;
int dev_id = device_map[thr_id];
if(opt_benchmark) {
ptarget[7] = 0x00ff;
@@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
if(!init[thr_id])
{
if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
device_config[thr_id] = strdup("80x32");
}
if (device_config[thr_id]) {
sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
}
const size_t alloc = MEMORY * throughput;
cryptonight_extra_cpu_init(thr_id, throughput);
cryptonight_extra_init(thr_id);
cudaMalloc(&d_long_state[thr_id], alloc);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
init[thr_id] = true;
}
@@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
const uint32_t Htarg = ptarget[7];
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
cryptonight_extra_setData(thr_id, pdata, ptarget);
cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
*hashes_done = nonce - first_nonce + throughput;
@@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
memcpy(tempdata, pdata, 76);
*tempnonceptr = resNonces[0];
cryptolight_hash(vhash, tempdata, 76);
cryptolight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
res = 1;
@@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
if(resNonces[1] != UINT32_MAX)
{
*tempnonceptr = resNonces[1];
cryptolight_hash(vhash, tempdata, 76);
cryptolight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
res++;
work->nonces[1] = resNonces[1];
@@ -157,10 +164,11 @@ void free_cryptolight(int thr_id)
cudaFree(d_ctx_key1[thr_id]);
cudaFree(d_ctx_key2[thr_id]);
cudaFree(d_ctx_text[thr_id]);
cudaFree(d_ctx_tweak[thr_id]);
cudaFree(d_ctx_a[thr_id]);
cudaFree(d_ctx_b[thr_id]);
cryptonight_extra_cpu_free(thr_id);
cryptonight_extra_free(thr_id);
cudaDeviceSynchronize();
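The TITAN V special case added above follows the launch-config convention used throughout the miner: a "BLOCKSxTHREADS" string parsed with sscanf("%ux%u"). A minimal sketch of that parsing, with parse_launch() as a hypothetical helper rather than an existing miner function:
#include <stdint.h>
#include <stdio.h>
static int parse_launch(const char *cfg, uint32_t *blocks, uint32_t *threads)
{
	/* same format string as cryptolight.cu / cryptonight.cu above */
	return sscanf(cfg, "%ux%u", blocks, threads) == 2;
}
int main(void)
{
	uint32_t blocks = 32, threads = 16;   /* defaults from cryptolight.cu */
	if (parse_launch("80x32", &blocks, &threads))
		printf("%u blocks x %u threads = %u hashes per launch\n",
		       blocks, threads, blocks * threads);
	return 0;
}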

13
crypto/cryptolight.h

@@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
exit(1);
}
}
void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
void cryptonight_extra_free(int thr_id);
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);

236
crypto/cryptonight-core.cu

@@ -2,47 +2,55 @@
#include <stdint.h>
#include <string.h>
#include <sys/time.h>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#undef __shfl
#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
#endif
#include "cryptonight.h"
#define LONG_SHL32 19 // 1<<19
#define LONG_SHL32 19 // 1<<19 (uint32_t* index)
#define LONG_SHL64 18 // 1<<18 (uint64_t* index)
#define LONG_LOOPS32 0x80000U
#define LONG_LOOPS64 0x40000U
#include "cn_aes.cuh"
__global__
//__launch_bounds__(128, 9) // 56 registers
void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1)
void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state,
uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
__shared__ uint32_t sharedMemory[1024];
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 .. 14
if(thread < threads)
{
const uint32_t long_oft = (thread << LONG_SHL64) + sub;
const uint32_t* ctx_key = &ctx_key1[thread * 40U];
uint4 keys[10];
#pragma unroll 10 // load 160 bytes
for (int i = 0; i < 10; i ++)
keys[i] = AS_UINT4(&ctx_key[i*4]);
cn_aes_gpu_init(sharedMemory);
__syncthreads();
uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]);
const uint32_t sub = (threadIdx.x & 0x7U) << 2;
uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
uint32_t __align__(8) key[40];
MEMCPY8(key, &ctx_key1[thread * 40U], 20);
uint32_t __align__(8) text[4];
MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2);
for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) {
cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys);
AS_UINT4(&long_state[long_oft + i]) = text;
for(int i = 0; i < LONG_LOOPS32; i += 32)
{
cn_aes_pseudo_round_mut(sharedMemory, text, key);
MEMCPY8(&longstate[i], text, 2);
}
}
}
// --------------------------------------------------------------------------------------------------------------
__device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand)
{
ulonglong2 product;
@@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, co
return make_ulonglong2(a.x ^ b.x, a.y ^ b.y);
}
#undef MUL_SUM_XOR_DST
__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst)
__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst)
{
ulonglong2 d = AS_UL2(far_dst);
ulonglong2 p = cuda_mul128(m, d.x);
@@ -73,8 +80,8 @@ __global__
#if __CUDA_ARCH__ >= 500
//__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */
#endif
void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
@@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2U + bfactor);
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
@@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4
MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
AS_UINT4(&long_state[j]) = C ^ B;
MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
}
if (bfactor) {
@@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
}
}
// --------------------------------------------------------------------------------------------------------------
__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
{
const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
AS_UINT4(long_state) = Z;
}
__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z)
{
const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1;
Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24);
AS_UINT4(long_state) = Z;
}
__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak)
{
ulonglong2 d = AS_UL2(far_dst);
ulonglong2 p = cuda_mul128(m, d.x);
p += AS_UL2(&a);
AS_UL2(&a) = p ^ d;
p.y = p.y ^ tweak;
AS_UL2(far_dst) = p;
}
__global__
void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2)
void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U;
const uint32_t sub = (threadIdx.x & 7U) << 1U;
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
uint64_t tweak = d_tweak[thread];
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
if(thread < threads)
uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
for (int i = start; i < end; i++) // end = 262144
{
uint4 C;
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
store_variant1(&long_state[j], C ^ B); // st.global
MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
store_variant1(&long_state[j], C ^ B);
MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
// --------------------------------------------------------------------------------------------------------------
__global__
void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t long_oft = (thread << LONG_SHL64) + sub;
const uint32_t st_oft = (thread * 26U) + sub + 8U;
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
uint64_t tweak = d_tweak[thread];
uint4 key[10];
const uint32_t* ctx_key = &ctx_key2[thread * 40U];
#pragma unroll 10 // 160 bytes
for (int i = 0; i < 10; i++)
key[i] = AS_UINT4(&ctx_key[i*4U]);
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
uint4 text = AS_UINT4(&ctx_state[st_oft]);
uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
for (int i = start; i < end; i++) // end = 262144
{
uint4 C;
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
store_variant2(&long_state[j], C ^ B); // st.global
MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U)
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
store_variant2(&long_state[j], C ^ B);
MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
// --------------------------------------------------------------------------------------------------------------
__global__
void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state,
uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2)
{
__shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
if(thread < threads)
{
const int sub = (threadIdx.x & 7) << 2;
const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
uint32_t key[40], text[4];
MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
for(int i = 0; i < LONG_LOOPS32; i += 32)
{
uint4 st = AS_UINT4(&long_state[long_oft + i]);
text = text ^ st;
cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key);
#pragma unroll
for(int j = 0; j < 4; ++j)
text[j] ^= longstate[i + j];
cn_aes_pseudo_round_mut(sharedMemory, text, key);
}
AS_UINT4(&ctx_state[st_oft]) = text;
MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
}
}
// --------------------------------------------------------------------------------------------------------------
extern int device_bfactor[MAX_GPUS];
__host__
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
dim3 grid(blocks);
dim3 block(threads);
//dim3 block2(threads << 1);
dim3 block4(threads << 2);
dim3 block8(threads << 3);
const uint32_t bfactor = (uint32_t) device_bfactor[thr_id];
const uint32_t partcount = 1 << bfactor;
const uint16_t bfactor = (uint16_t) device_bfactor[thr_id];
const uint32_t partcount = 1U << bfactor;
const uint32_t throughput = (uint32_t) (blocks*threads);
const int bsleep = bfactor ? 100 : 0;
const int dev_id = device_map[thr_id];
cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1);
cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
for (uint32_t i = 0; i < partcount; i++)
{
dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
if (variant == 0)
cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
else if (variant == 1 || cryptonight_fork == 8)
monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
else if (variant == 2 && cryptonight_fork == 3)
stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2);
//cudaDeviceSynchronize();
//exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
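cryptonight_core_cuda() above splits the main loop by --bfactor: each phase-2 launch only runs ITER >> (2 + bfactor) iterations, and the host sleeps briefly between the 2^bfactor partial launches so a display GPU stays responsive. A host-only sketch of that scheduling, where launch_phase2() stands in for the real <<<grid, block>>> call:
#include <stdint.h>
#include <stdio.h>
#define ITER (1U << 20)   /* same constant as cryptonight.h */
static void launch_phase2(uint32_t partidx, uint32_t start, uint32_t end)
{
	/* placeholder for monero_gpu_phase2<<<grid, b>>>(..., partidx, ...) */
	printf("part %u: iterations [%u..%u)\n", partidx, start, end);
}
int main(void)
{
	uint16_t bfactor = 6;                        /* typical desktop setting   */
	uint32_t partcount = 1U << bfactor;
	uint32_t batchsize = ITER >> (2 + bfactor);  /* as in the phase-2 kernels */
	for (uint32_t i = 0; i < partcount; i++)
		launch_phase2(i, i * batchsize, (i + 1) * batchsize);
	/* 64 parts x 4096 iterations = 262144, the full ITER >> 2 loop */
	return 0;
}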

68
crypto/cryptonight-cpu.cpp

@@ -130,14 +130,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
}
static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -155,17 +155,34 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}
static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
static void cryptonight_store_variant(void* state, int variant) {
if (variant == 1 || cryptonight_fork == 8) {
// monero and graft
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
} else if (variant == 2 && cryptonight_fork == 3) {
// stellite
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30);
}
}
static void cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant)
{
size_t i, j;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
@@ -184,21 +201,21 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak);
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
@@ -219,9 +236,34 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}
void cryptonight_hash(void* output, const void* input, size_t len)
void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant)
{
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptonight_hash_ctx(output, input, len, ctx);
cryptonight_hash_ctx(output, input, len, ctx, variant);
free(ctx);
}
void cryptonight_hash(void* output, const void* input)
{
cryptonight_fork = 1;
cryptonight_hash_variant(output, input, 76, 0);
}
void graft_hash(void* output, const void* input)
{
cryptonight_fork = 8;
cryptonight_hash_variant(output, input, 76, 1);
}
void monero_hash(void* output, const void* input)
{
cryptonight_fork = 7;
cryptonight_hash_variant(output, input, 76, 1);
}
void stellite_hash(void* output, const void* input)
{
cryptonight_fork = 3;
cryptonight_hash_variant(output, input, 76, 2);
}
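The CPU path above leans on mul128() for the 64x64->128-bit multiply at the heart of the CryptoNight inner loop; that helper is defined elsewhere in the tree. A minimal portable equivalent, assuming a compiler with unsigned __int128 (GCC/Clang), shown only to illustrate the semantics:
#include <stdint.h>
#include <stdio.h>
static uint64_t mul128_sketch(uint64_t a, uint64_t b, uint64_t *hi)
{
	unsigned __int128 r = (unsigned __int128)a * b;
	*hi = (uint64_t)(r >> 64);   /* high 64 bits */
	return (uint64_t)r;          /* low 64 bits  */
}
int main(void)
{
	uint64_t hi, lo = mul128_sketch(0xFFFFFFFFFFFFFFFFULL, 2, &hi);
	/* expect hi = 1, lo = 0xFFFFFFFFFFFFFFFE */
	printf("hi=%llu lo=%llu\n", (unsigned long long)hi, (unsigned long long)lo);
	return 0;
}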

169
crypto/cryptonight-extra.cu

@@ -7,15 +7,15 @@
#include <miner.h>
#include <cuda_helper.h>
#include "cryptonight.h"
typedef uint8_t BitSequence;
typedef uint64_t DataLength;
#include "cryptonight.h"
static uint32_t *d_input[MAX_GPUS] = { 0 };
static uint32_t *d_input[MAX_GPUS];
static uint32_t *d_target[MAX_GPUS];
static uint32_t *d_result[MAX_GPUS];
typedef uint8_t BitSequence;
typedef uint32_t DataLength;
#include "cn_keccak.cuh"
#include "cn_blake.cuh"
#include "cn_groestl.cuh"
@@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = {
__device__ __forceinline__
void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
{
const uint32_t aes_gf[] = {
const uint32_t aes_gf[10] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
};
MEMSET4(key, 0, 40);
MEMCPY4(key, data, 8);
#pragma unroll
for(int i = 8; i < 40; i++)
{
@@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __res
}
__global__
void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2)
void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce,
uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint32_t ctx_state[50];
uint64_t ctx_state[25];
uint32_t ctx_a[4];
uint32_t ctx_b[4];
uint32_t ctx_key1[40];
@@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict
uint32_t input[19];
MEMCPY4(input, d_input, 19);
*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
cn_keccak((uint8_t *)input, (uint8_t *)ctx_state);
cryptonight_aes_set_key(ctx_key1, ctx_state);
cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25);
MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4);
MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4);
MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40);
MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40);
}
}
__global__
void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]);
uint64_t state[25];
#pragma unroll
for(int i = 0; i < 25; i++)
state[i] = ctx_state[i];
uint32_t nonce = startNonce + thread;
*(((uint8_t *)input) + 39) = nonce & 0xff;
*(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff;
*(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff;
*(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff;
cn_keccakf2(state);
cn_keccak(input, ctx_state);
MEMCPY4(&d_ctx_state[thread * 50U], ctx_state, 50);
// to reduce the final kernel stack frame, cut algos in 2 kernels
// ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%)
switch (((uint8_t*)state)[0] & 0x03)
{
case 0: {
uint32_t hash[8];
cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
((uint32_t*)ctx_state)[0] = 0;
((uint32_t*)ctx_state)[6] = hash[6];
((uint32_t*)ctx_state)[7] = hash[7];
break;
}
case 1: {
uint32_t hash[8];
cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
((uint32_t*)ctx_state)[0] = 0;
((uint32_t*)ctx_state)[6] = hash[6];
((uint32_t*)ctx_state)[7] = hash[7];
break;
}
default: {
#pragma unroll
for(int i = 0; i < 25; i++)
ctx_state[i] = state[i];
}
cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0]));
cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4]));
MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40);
MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40);
XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a);
XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b);
MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4);
MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4);
if (variant) {
uint2 tweak = AS_UINT2(&ctx_state[24]);
//tweak.x ^= (input[8] >> 24) | (input[9] << 8);
tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543);
tweak.y ^= __byte_perm(input[9], input[10], 0x6543);
MEMCPY4(&d_ctx_tweak[thread], &tweak, 2);
}
}
}
__global__
void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
const uint32_t* d_target, uint32_t * resNonces)
void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target,
uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if(thread < threads)
{
uint64_t* const state = &d_ctx_state[thread * 26U];
uint32_t *ctx_state = &d_ctx_state[thread * 50U];
uint32_t hash[8];
switch(((uint8_t *)state)[0] & 0x03)
{
case 0: {
uint32_t* h32 = (uint32_t*)state;
hash[6] = h32[6];
hash[7] = h32[7];
break;
}
case 2: {
cn_jh256((uint8_t*)state, 200, hash);
break;
}
case 3: {
cn_skein((uint8_t*)state, 200, hash);
break;
}
}
uint32_t state[50];
#pragma unroll 25
for(int i = 0; i < 50; i+=2)
AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]);
cn_keccakf2((uint64_t *)state);
int branch = ((uint8_t *)state)[0] & 0x03;
if(branch == 0)
cn_blake((const uint8_t *)state, 200, hash);
if(branch == 1)
cn_groestl((const uint8_t *)state, 200, hash);
if(branch == 2)
cn_jh((const uint8_t *)state, 200, hash);
if(branch == 3)
cn_skein((const uint8_t *)state, 200, hash);
if(hash[7] <= d_target[1] && hash[6] <= d_target[0])
{
@@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, ui
}
__host__
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget)
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget)
{
uint32_t *pTargetIn = (uint32_t*) ptarget;
cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
void cryptonight_extra_init(int thr_id)
{
cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t));
cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t));
cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t));
cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t));
cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t));
cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cryptonight_extra_gpu_prepare <<<grid, block >>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2);
cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state)
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state)
{
uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_extra_gpu_keccak <<<grid, block >>> (threads, (uint32_t*)d_ctx_state);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_extra_gpu_final <<<grid, block >>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_free(int thr_id)
void cryptonight_extra_free(int thr_id)
{
if (d_input[thr_id]) {
cudaFree(d_input[thr_id]);
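In cryptonight_extra_gpu_prepare() above, the two __byte_perm(..., 0x6543) calls build the variant-1 tweak from adjacent 32-bit header words: together they read the unaligned 64-bit value at byte offset 35 of the block header (which covers the 32-bit nonce at bytes 39..42) and XOR it with Keccak state word 24, matching the CPU-side expression in cryptonight-cpu.cpp. A plain-C sketch of the equivalent computation; all values are placeholders:
#include <stdint.h>
#include <string.h>
#include <stdio.h>
int main(void)
{
	uint8_t header[76] = { 0 };
	uint64_t keccak_w24 = 0x1122334455667788ULL;  /* stand-in for state.hs.w[24] */
	uint32_t nonce = 0xDEADBEEF;
	memcpy(header + 39, &nonce, 4);               /* nonce lives at bytes 39..42 */
	uint64_t in35;
	memcpy(&in35, header + 35, 8);                /* unaligned-safe 64-bit load  */
	uint64_t tweak = in35 ^ keccak_w24;
	printf("tweak = %016llx\n", (unsigned long long)tweak);
	return 0;
}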

37
crypto/cryptonight.cu

@@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false;
gpulog(p, thr, fmt, ##__VA_ARGS__)
static uint64_t *d_long_state[MAX_GPUS];
static uint64_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_key1[MAX_GPUS];
static uint32_t *d_ctx_key2[MAX_GPUS];
static uint32_t *d_ctx_text[MAX_GPUS];
static uint64_t *d_ctx_tweak[MAX_GPUS];
static uint32_t *d_ctx_a[MAX_GPUS];
static uint32_t *d_ctx_b[MAX_GPUS];
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
{
int res = 0;
uint32_t throughput = 0;
@@ -49,6 +50,13 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id],
mem, device_mpcount[dev_id]);
if (!device_config[thr_id]) {
if(strcmp(device_name[dev_id], "TITAN V") == 0)
device_config[thr_id] = strdup("80x24");
if(strstr(device_name[dev_id], "V100"))
device_config[thr_id] = strdup("80x24");
}
if (device_config[thr_id]) {
int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -70,7 +78,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
exit(1);
}
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@@ -78,12 +86,12 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
CUDA_LOG_ERROR();
}
const size_t alloc = MEMORY * throughput;
cryptonight_extra_cpu_init(thr_id, throughput);
const size_t alloc = MEMORY * size_t(throughput);
cryptonight_extra_init(thr_id);
cudaMalloc(&d_long_state[thr_id], alloc);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16)
cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -95,6 +103,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
exit_if_cudaerror(thr_id, __FILE__, __LINE__);
gpu_init_shown = true;
init[thr_id] = true;
@@ -107,10 +117,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
const uint32_t Htarg = ptarget[7];
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
cryptonight_extra_setData(thr_id, pdata, ptarget);
cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
*hashes_done = nonce - first_nonce + throughput;
@@ -121,7 +131,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
memcpy(tempdata, pdata, 76);
*tempnonceptr = resNonces[0];
cryptonight_hash(vhash, tempdata, 76);
cryptonight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
res = 1;
@@ -131,7 +141,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
if(resNonces[1] != UINT32_MAX)
{
*tempnonceptr = resNonces[1];
cryptonight_hash(vhash, tempdata, 76);
cryptonight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
res++;
work->nonces[1] = resNonces[1];
@@ -174,10 +184,11 @@ void free_cryptonight(int thr_id)
cudaFree(d_ctx_key1[thr_id]);
cudaFree(d_ctx_key2[thr_id]);
cudaFree(d_ctx_text[thr_id]);
cudaFree(d_ctx_tweak[thr_id]);
cudaFree(d_ctx_a[thr_id]);
cudaFree(d_ctx_b[thr_id]);
cryptonight_extra_cpu_free(thr_id);
cryptonight_extra_free(thr_id);
cudaDeviceSynchronize();
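A rough device-memory budget for the buffers allocated in scanhash_cryptonight() above, assuming the 80x24 launch config suggested for TITAN V / V100 class cards: the 2 MiB scratchpads dominate, and the per-thread contexts (state, keys, a/b, tweak) are comparatively tiny. A small worked calculation:
#include <stdint.h>
#include <stdio.h>
#define MEMORY (1U << 21)   /* 2 MiB scratchpad per hash, as in cryptonight.h */
int main(void)
{
	uint32_t blocks = 80, threads = 24;
	uint64_t n = (uint64_t)blocks * threads;           /* 1920 concurrent hashes  */
	uint64_t scratch = (uint64_t)MEMORY * n;           /* d_long_state allocation */
	uint64_t ctx = n * (50*4 + 2*40*4 + 2*4*4 + 8);    /* state + keys + a/b + tweak */
	printf("scratchpads: %llu MiB, contexts: %llu KiB\n",
	       (unsigned long long)(scratch >> 20),        /* expect 3840 MiB */
	       (unsigned long long)(ctx >> 10));           /* roughly 1 MiB   */
	return 0;
}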

13
crypto/cryptonight.h

@@ -20,7 +20,6 @@ struct uint3 blockDim;
#define __umul64hi(a,b) a*b
#endif
#define MEMORY (1U << 21) // 2 MiB / 2097152 B
#define ITER (1U << 20) // 1048576
#define E2I_MASK 0x1FFFF0u
@@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
exit(1);
}
}
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
void cryptonight_extra_init(int thr_id);
void cryptonight_extra_free(int thr_id);
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state);

10
crypto/xmr-rpc.cpp

@@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
}
else if (opt_algo == ALGO_CRYPTOLIGHT) {
int variant = 1;
uint32_t nonce = work->nonces[idnonce];
noncestr = bin2hex((unsigned char*) &nonce, 4);
last_found_nonce = nonce;
cryptolight_hash(hash, data, 76);
//if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
// variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
cryptolight_hash_variant(hash, data, 76, variant);
work_set_target_ratio(work, (uint32_t*) hash);
}
else if (opt_algo == ALGO_CRYPTONIGHT) {
int variant = 0;
uint32_t nonce = work->nonces[idnonce];
noncestr = bin2hex((unsigned char*) &nonce, 4);
last_found_nonce = nonce;
cryptonight_hash(hash, data, 76);
if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
cryptonight_hash_variant(hash, data, 76, variant);
work_set_target_ratio(work, (uint32_t*) hash);
}
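The two-line change above derives the PoW variant from the block blob itself: once cryptonight_fork holds the chain's first variant-1 major version, any blob whose leading byte is at or past it maps to blob[0] - fork + 1. A tiny sketch of that mapping (pick_variant is an illustrative name, not a miner function):
#include <stdio.h>
static int pick_variant(unsigned char blob_major, unsigned char fork_version)
{
	/* same condition as the rpc2_stratum_submit() change above */
	if (fork_version > 1 && blob_major >= fork_version)
		return blob_major - fork_version + 1;
	return 0;
}
int main(void)
{
	/* with the fork version set to 7: v6 blobs -> 0, v7 -> 1, v8 -> 2 */
	printf("%d %d %d\n", pick_variant(6, 7), pick_variant(7, 7), pick_variant(8, 7));
	return 0;
}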

2
equi/equi-stratum.cpp

@@ -101,7 +101,7 @@ bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params)
target_be[31-i] = target_bin[i];
if (target_bin[i]) filled++;
}
memcpy(sctx->job.claim, target_be, 32); // hack, unused struct field
memcpy(sctx->job.extra, target_be, 32);
pthread_mutex_lock(&stratum_work_lock);
sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be);

3
equi/equihash.cpp

@@ -183,8 +183,7 @@ extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_non
return -1;
}
size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024);
gpus_intensity[thr_id] = (uint32_t) solvers[thr_id]->throughput;
api_set_throughput(thr_id, gpus_intensity[thr_id]);
api_set_throughput(thr_id, (uint32_t) solvers[thr_id]->throughput);
gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz);
cuda_get_arch(thr_id);
init[thr_id] = true;

173
lyra2/Lyra2.c

@@ -212,3 +212,176 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa
return 0;
}
int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols)
{
//============================= Basic variables ============================//
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t instance = 0;
//==========================================================================/
//========== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES;
size_t sz = (size_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = malloc(sz);
if (wholeMatrix == NULL) {
return -1;
}
memset(wholeMatrix, 0, sz);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
if (memMatrix == NULL) {
return -1;
}
//Places the pointers in the correct positions
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++) {
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
//==========================================================================/
//============= Getting the password + salt + basil padded with 10*1 ===============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)));
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//==========================================================================/
//======================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
uint64_t state[16];
initState(state);
//==========================================================================/
//================================ Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++) {
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window where visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
//==========================================================================/
//============================ Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++) {
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
do {
//Selects a pseudorandom index row* (the only change in REv3)
//------------------------------------------------------------------------------------------
instance = state[instance & 0xF];
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
//rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//------------------------------------------------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
} while (row != 0);
}
//============================ Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, memMatrix[rowa]);
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
//========================= Freeing the memory =============================//
free(memMatrix);
free(wholeMatrix);
return 0;
}
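Side note on the row-selection comments kept in the Wandering Phase above: the masked form rowa = rand & (nRows-1) is only equivalent to the generic modulo form when nRows is a power of two, which holds for the parameters used in this tree (8 for allium/lyra2RE, 4 for lyra2REv3). A minimal host-side sketch of that equivalence (illustrative only, not part of the miner):
#include <assert.h>
#include <stdint.h>
/* For a power-of-two nRows, masking with (nRows - 1) equals reduction mod nRows. */
int main(void)
{
	const uint64_t nRows = 8;                     /* 4 in lyra2REv3, 8 in allium/lyra2RE */
	for (uint64_t rnd = 0; rnd < 100000; rnd++)
		assert((rnd & (nRows - 1)) == (rnd % nRows));
	return 0;
}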

1
lyra2/Lyra2.h

@@ -38,5 +38,6 @@ typedef unsigned char byte;
#endif
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#endif /* LYRA2_H_ */
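For reference, the call shapes used elsewhere in this commit (see lyra2REv3.cu and allium.cu below): both entry points derive a 256-bit key from 256-bit password/salt buffers, with timeCost 1 and a small square matrix. A condensed sketch of those calls:
uint32_t hashA[8], hashB[8];                        /* 32-byte (256-bit) buffers */
LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);  /* lyra2REv3: 4x4 matrix */
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);    /* allium / lyra2RE: 8x8 matrix */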

217
lyra2/allium.cu

@@ -0,0 +1,217 @@
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_keccak.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_skein.h"
#include "sph/sph_groestl.h"
#include "lyra2/Lyra2.h"
}
#include <miner.h>
#include <cuda_helper.h>
static uint64_t* d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];
extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
//extern void keccak256_sm3_init(int thr_id, uint32_t threads);
//extern void keccak256_sm3_free(int thr_id);
extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti);
extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
extern void groestl256_setTarget(const void *ptarget);
extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
extern uint32_t groestl256_getSecNonce(int thr_id, int num);
extern "C" void allium_hash(void *state, const void *input)
{
uint32_t hashA[8], hashB[8];
sph_blake256_context ctx_blake;
sph_keccak256_context ctx_keccak;
sph_cubehash256_context ctx_cube;
sph_skein256_context ctx_skein;
sph_groestl256_context ctx_groestl;
sph_blake256_set_rounds(14);
sph_blake256_init(&ctx_blake);
sph_blake256(&ctx_blake, input, 80);
sph_blake256_close(&ctx_blake, hashA);
sph_keccak256_init(&ctx_keccak);
sph_keccak256(&ctx_keccak, hashA, 32);
sph_keccak256_close(&ctx_keccak, hashB);
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashA, 32);
sph_cubehash256_close(&ctx_cube, hashB);
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_skein256_init(&ctx_skein);
sph_skein256(&ctx_skein, hashA, 32);
sph_skein256_close(&ctx_skein, hashB);
sph_groestl256_init(&ctx_groestl);
sph_groestl256(&ctx_groestl, hashB, 32);
sph_groestl256_close(&ctx_groestl, hashA);
memcpy(state, hashA, 32);
}
static bool init[MAX_GPUS] = { 0 };
static __thread uint32_t throughput = 0;
extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
if (opt_benchmark)
ptarget[7] = 0x00ff;
static __thread bool gtx750ti;
if (!init[thr_id])
{
int dev_id = device_map[thr_id];
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16;
if (device_sm[device_map[thr_id]] == 500) intensity = 15;
throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, dev_id);
if (strstr(props.name, "750 Ti")) gtx750ti = true;
else gtx750ti = false;
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
blake256_cpu_init(thr_id, throughput);
//keccak256_sm3_init(thr_id, throughput);
skein256_cpu_init(thr_id, throughput);
groestl256_cpu_init(thr_id, throughput);
//cuda_get_arch(thr_id);
if (device_sm[dev_id] >= 500)
{
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
init[thr_id] = true;
}
uint32_t _ALIGN(128) endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
blake256_cpu_setBlock_80(pdata);
groestl256_setTarget(ptarget);
do {
int order = 0;
//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
allium_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
work->nonces[1] = groestl256_getSecNonce(thr_id, 1);
if (work->nonces[1] != UINT32_MAX) {
be32enc(&endiandata[19], work->nonces[1]);
allium_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_allium(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
cudaFree(d_matrix[thr_id]);
//keccak256_sm3_free(thr_id);
groestl256_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}
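A rough sense of scale for the defaults chosen above (a sketch, not a tuning recommendation): intensity 17 maps to 1 << 17 concurrent nonces per launch, and d_hash stores one 256-bit intermediate hash per nonce.
const int intensity = 17;                              /* SM >= 5.0, non-Windows default */
const uint32_t throughput = 1U << intensity;           /* 131072 nonces per kernel launch */
const size_t d_hash_bytes = (size_t)32 * throughput;   /* 4 MiB allocated with cudaMalloc */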

148
lyra2/cuda_lyra2.cu

@@ -1,6 +1,7 @@
/**
* Lyra2 (v1) cuda implementation based on djm34 work
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
* tpruvot@github 2018 for phi2 double lyra2-32 support
*/
#include <stdio.h>
@@ -228,9 +229,7 @@ void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
{
uint2 state1[3];
#if __CUDA_ARCH__ > 500
#pragma unroll
#endif
#pragma unroll
for (int i = 0; i < Nrow; i++)
{
ST4S(0, Ncol - i - 1, state, thread, threads);
@@ -305,7 +304,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
#pragma unroll
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
@@ -334,7 +333,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
LD4S(state1, rowOut, i, thread, threads);
#pragma unroll
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
@@ -409,14 +408,12 @@ __constant__ uint2x4 blake2b_IV[2] = {
};
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
@@ -436,10 +433,9 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
__global__
__launch_bounds__(TPB52, 1)
void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
@@ -481,14 +477,12 @@ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_has
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint28 state[4];
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
@@ -501,8 +495,58 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
g_hash[thread + threads * 1] = state[0].y;
g_hash[thread + threads * 2] = state[0].z;
g_hash[thread + threads * 3] = state[0].w;
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *psrc = (uint2*)(&d_hash_512[offset]);
state[0].x = state[1].x = __ldg(&psrc[0]);
state[0].y = state[1].y = __ldg(&psrc[1]);
state[0].z = state[1].z = __ldg(&psrc[2]);
state[0].w = state[1].w = __ldg(&psrc[3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<24; i++)
round_lyra(state);
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
{
// This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *pdst = (uint2*)(&d_hash_512[offset]);
pdst[0] = state[0].x;
pdst[1] = state[0].y;
pdst[2] = state[0].z;
pdst[3] = state[0].w;
} //thread
}
#else
#if __CUDA_ARCH__ < 500
@@ -510,9 +554,11 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
/* for unsupported SM arch */
__device__ void* DMatrix;
#endif
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
#endif
__host__
@@ -523,7 +569,7 @@ void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
}
__host__
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti)
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx750ti)
{
int dev_id = device_map[thr_id % MAX_GPUS];
@@ -544,11 +590,9 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
if (cuda_arch[dev_id] >= 520)
{
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash);
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash);
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
}
else if (cuda_arch[dev_id] >= 500)
{
@@ -561,12 +605,58 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
// suitable amount to adjust for 10warp
shared_mem = 6144;
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash);
lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
}
else
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash);
}
__host__
void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti)
{
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
if (cuda_arch[dev_id] >= 520) tpb = TPB52;
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb);
dim3 block1(4, tpb >> 2);
dim3 grid2((threads + 64 - 1) / 64);
dim3 block2(64);
if (cuda_arch[dev_id] >= 520)
{
const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152;
lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
}
else if (cuda_arch[dev_id] >= 500)
{
size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps
lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
}
else {
// alternative method for SM 3.x
hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
}
}
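The addressing used by the new lyra2_gpu_hash_64_1/_3 kernels, in one place: d_hash_512 holds one 512-bit hash (eight uint2 words) per thread, and each of the two rounds touches one 256-bit half, which is what offset = 8*thread + round*4 encodes. A small illustrative helper (hypothetical, not in the source):
/* Index (in uint2 units) of the 256-bit half processed for a given thread and round. */
static size_t lyra32_offset(uint32_t thread, uint32_t round /* 0 or 1 */)
{
	return (size_t)8 * thread + (size_t)4 * round;
}
/* thread 0: words 0..3 (round 0) and 4..7 (round 1); thread 1 starts at word 8. */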

69
lyra2/cuda_lyra2_sm2.cuh

@@ -3,7 +3,7 @@
#ifdef __INTELLISENSE__
/* just for vstudio code colors, only uncomment that temporary, dont commit it */
//#undef __CUDA_ARCH__
//#define __CUDA_ARCH__ 500
//#define __CUDA_ARCH__ 300
#endif
#include "cuda_helper.h"
@@ -131,7 +131,7 @@ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut,
}
__global__ __launch_bounds__(TPB30, 1)
void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@@ -224,5 +224,68 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h
#else
/* if __CUDA_ARCH__ < 200 .. host */
__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {}
#endif
// -------------------------------------------------------------------------------------------------------------------------
// lyra2 can't be used as-is in 512-bit hash chains; thanks to djm for these offsets, in use since the first lyra2 algo...
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350
__global__ __launch_bounds__(128, 8)
void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const size_t offset = (size_t) 16 * thread + (round * 8U);
uint2 *psrc = (uint2*) (&d_hash64[offset]);
uint2 *pdst = (uint2*) (&d_hash_lyra[thread]);
pdst[threads*0] = __ldg(&psrc[0]);
pdst[threads*1] = __ldg(&psrc[1]);
pdst[threads*2] = __ldg(&psrc[2]);
pdst[threads*3] = __ldg(&psrc[3]);
}
}
__global__ __launch_bounds__(128, 8)
void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const size_t offset = (size_t) 16 * thread + (round * 8U);
uint2 *psrc = (uint2*) (&d_hash_lyra[thread]);
uint2 *pdst = (uint2*) (&d_hash64[offset]);
pdst[0] = psrc[0];
pdst[1] = psrc[threads*1];
pdst[2] = psrc[threads*2];
pdst[3] = psrc[threads*3];
}
}
#else
/* if __CUDA_ARCH__ < 200 .. host */
__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
#endif
__host__
void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
hash64_to_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
}
__host__
void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
hash64_from_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
}

76
lyra2/cuda_lyra2_sm5.cuh

@@ -589,15 +589,14 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint2x4 blake2b_IV[2] = {
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
};
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
@@ -622,14 +621,13 @@ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha
}
__global__ __launch_bounds__(TPB50, 1)
void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
if (thread < threads)
{
uint2 state[4];
state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
@@ -662,14 +660,13 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
@@ -685,9 +682,68 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
{
const uint2x4 blake2b_IV[2] = {
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
};
// This kernel loads 2x 256-bits hashes from 512-bits chain offsets in 2 steps
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *psrc = (uint2*)(&d_hash_512[offset]);
state[0].x = state[1].x = __ldg(&psrc[0]);
state[0].y = state[1].y = __ldg(&psrc[1]);
state[0].z = state[1].z = __ldg(&psrc[2]);
state[0].w = state[1].w = __ldg(&psrc[3]);
state[1] = state[0];
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<24; i++)
round_lyra(state);
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
{
// This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 steps
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *pdst = (uint2*)(&d_hash_512[offset]);
pdst[0] = state[0].x;
pdst[1] = state[0].y;
pdst[2] = state[0].z;
pdst[3] = state[0].w;
}
}
#else
/* if __CUDA_ARCH__ != 500 .. host */
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
#endif
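The blake2b_IV tables used in these kernels are the standard BLAKE2b initialization vector, stored as (low, high) 32-bit halves of each 64-bit word for the uint2 representation. A quick host-side check of that packing (hypothetical helper, not part of the source):
#include <stdint.h>
#include <assert.h>
static uint64_t pack_lo_hi(uint32_t lo, uint32_t hi) { return ((uint64_t)hi << 32) | lo; }
int main(void)
{
	assert(pack_lo_hi(0xf3bcc908u, 0x6a09e667u) == 0x6a09e667f3bcc908ULL);  /* BLAKE2b IV[0] */
	assert(pack_lo_hi(0x84caa73bu, 0xbb67ae85u) == 0xbb67ae8584caa73bULL);  /* BLAKE2b IV[1] */
	return 0;
}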

481
lyra2/cuda_lyra2v3.cu

@@ -0,0 +1,481 @@
/**
* Lyra2 (v3) CUDA Implementation
*
* Based on VTC sources
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "cuda_helper.h"
#include "cuda_lyra2v3_sm3.cuh"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 500
#endif
#define TPB 32
#if __CUDA_ARCH__ >= 500
#include "cuda_lyra2_vectors.h"
#define Nrow 4
#define Ncol 4
#define memshift 3
__device__ uint2x4 *DMatrix;
__device__ __forceinline__ uint2 LD4S(const int index)
{
extern __shared__ uint2 shared_mem[];
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
extern __shared__ uint2 shared_mem[];
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}
__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__
void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = SWAPUINT2(d);
c += d; b ^= c; b = ROR2(b, 24);
a += b; d ^= a; d = ROR2(d, 16);
c += d; b ^= c; b = ROR2(b, 63);
}
__device__ __forceinline__
void round_lyra_v5(uint2x4 s[4])
{
Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z);
}
__device__ __forceinline__
void round_lyra_v5(uint2 s[4])
{
Gfunc_v5(s[0], s[1], s[2], s[3]);
s[1] = shuffle2(s[1], threadIdx.x + 1, 4);
s[2] = shuffle2(s[2], threadIdx.x + 2, 4);
s[3] = shuffle2(s[3], threadIdx.x + 3, 4);
Gfunc_v5(s[0], s[1], s[2], s[3]);
s[1] = shuffle2(s[1], threadIdx.x + 3, 4);
s[2] = shuffle2(s[2], threadIdx.x + 2, 4);
s[3] = shuffle2(s[3], threadIdx.x + 1, 4);
}
__device__ __forceinline__
void reduceDuplexRowSetup2(uint2 state[4])
{
uint2 state1[Ncol][3], state0[Ncol][3], state2[3];
int i, j;
#pragma unroll
for (int i = 0; i < Ncol; i++)
{
#pragma unroll
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] = state[j];
round_lyra_v5(state);
}
//#pragma unroll 4
for (i = 0; i < Ncol; i++)
{
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state0[i][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] = state0[i][j];
#pragma unroll
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] ^= state[j];
}
for (i = 0; i < Ncol; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[i][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state2[j] = state1[i][j];
#pragma unroll
for (j = 0; j < 3; j++)
state2[j] ^= state[j];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state0[i][0] ^= Data2;
state0[i][1] ^= Data0;
state0[i][2] ^= Data1;
} else {
state0[i][0] ^= Data0;
state0[i][1] ^= Data1;
state0[i][2] ^= Data2;
}
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s0 + j, state0[i][j]);
#pragma unroll
for (j = 0; j < 3; j++)
state0[i][j] = state2[j];
}
for (i = 0; i < Ncol; i++)
{
const uint32_t s1 = memshift * Ncol * 1 + i*memshift;
const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[Ncol - i - 1][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] ^= state[j];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s3 + j, state0[Ncol - i - 1][j]);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state1[i][0] ^= Data2;
state1[i][1] ^= Data0;
state1[i][2] ^= Data1;
} else {
state1[i][0] ^= Data0;
state1[i][1] ^= Data1;
state1[i][2] ^= Data2;
}
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s1 + j, state1[i][j]);
}
}
__device__
void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4])
{
uint2 state1[3], state2[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
const uint32_t ps3 = memshift * Ncol * rowOut;
for (int i = 0; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
const uint32_t s3 = ps3 + i*memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = LD4S(s1 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(s2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra_v5(state);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
} else {
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s3 + j, LD4S(s3 + j) ^ state[j]);
}
}
__device__
void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4])
{
const int rowIn = 2;
const int rowOut = 3;
int i, j;
uint2 last[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] = LD4S(ps2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= LD4S(ps1 + j) + last[j];
round_lyra_v5(state);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
} else {
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == rowOut)
{
#pragma unroll
for (j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (i = 1; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= LD4S(s1 + j) + LD4S(s2 + j);
round_lyra_v5(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
const uint2x4 blake2b_IV[2] = {
0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL,
0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL,
0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL,
0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL
};
const uint2x4 Mask[2] = {
0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL,
0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL,
0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL,
0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL
};
uint2x4 state[4];
if (thread < threads)
{
state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]);
state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
state[0] ^= Mask[0];
state[1] ^= Mask[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0];
DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1];
DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2];
DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3];
}
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_2(uint32_t threads)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
reduceDuplexRowSetup2(state);
uint32_t rowa;
int prev = 3;
unsigned int instance = 0;
for (int i = 0; i < 3; i++)
{
instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4);
rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3;
//rowa = __shfl(state[0].x, 0, 4) & 3;
reduceDuplexRowt2(prev, rowa, i, state);
prev = i;
}
instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4);
rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3;
//rowa = __shfl(state[0].x, 0, 4) & 3;
reduceDuplexRowt2x4(rowa, state);
((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0];
((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1];
((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2];
((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3];
}
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint2x4 state[4];
if (thread < threads)
{
state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]);
state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]);
state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]);
state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra_v5(state);
outputHash[thread + threads * 0] = state[0].x;
outputHash[thread + threads * 1] = state[0].y;
outputHash[thread + threads * 2] = state[0].z;
outputHash[thread + threads * 3] = state[0].w;
}
}
#else
#include "cuda_helper.h"
#if __CUDA_ARCH__ < 200
__device__ void* DMatrix;
#endif
__global__ void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {}
__global__ void lyra2v3_gpu_hash_32_2(uint32_t threads) {}
__global__ void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {}
#endif
__host__
void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
cuda_get_arch(thr_id);
// just assign the device pointer allocated in main loop
cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}
__host__
void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order)
{
int dev_id = device_map[thr_id % MAX_GPUS];
if (device_sm[dev_id] >= 500) {
const uint32_t tpb = TPB;
dim3 grid2((threads + tpb - 1) / tpb);
dim3 block2(tpb);
dim3 grid4((threads * 4 + tpb - 1) / tpb);
dim3 block4(4, tpb / 4);
lyra2v3_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash);
lyra2v3_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads);
lyra2v3_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash);
} else {
uint32_t tpb = 16;
if (cuda_arch[dev_id] >= 350) tpb = TPB35;
else if (cuda_arch[dev_id] >= 300) tpb = TPB30;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
lyra2v3_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash);
}
}
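For the SM 5.0+ launch above, the dynamic shared memory handed to lyra2v3_gpu_hash_32_2 is 48 uint2 words per thread, i.e. the Nrow(4) x Ncol(4) x memshift(3) slots of the shared matrix. With TPB = 32 as defined in this file, that works out as follows (simple arithmetic, shown as a sketch):
const uint32_t tpb = 32;                        /* TPB in this file */
const size_t shmem = 48 * sizeof(uint2) * tpb;  /* 48 * 8 * 32 = 12288 bytes per block */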

348
lyra2/cuda_lyra2v3_sm3.cuh

@@ -0,0 +1,348 @@
/* SM 2/3/3.5 Variant for lyra2REv3 */
#ifdef __INTELLISENSE__
/* just for vstudio code colors, only uncomment that temporary, dont commit it */
//#undef __CUDA_ARCH__
//#define __CUDA_ARCH__ 500
#endif
#define TPB20 64
#define TPB30 64
#define TPB35 64
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500
#include "cuda_lyra2_vectors.h"
#define Nrow 4
#define Ncol 4
#define vectype ulonglong4
#define memshift 4
__device__ vectype *DMatrix;
static __device__ __forceinline__
void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
{
a += b; d ^= a; d = ROTR64(d, 32);
c += d; b ^= c; b = ROTR64(b, 24);
a += b; d ^= a; d = ROTR64(d, 16);
c += d; b ^= c; b = ROTR64(b, 63);
}
static __device__ __forceinline__
void round_lyra_v35(vectype* s)
{
Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
}
static __device__ __forceinline__
void reduceDuplexV3(vectype state[4], uint32_t thread)
{
vectype state1[3];
uint32_t ps1 = (Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread);
#pragma unroll 4
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i *memshift;
uint32_t s2 = ps2 - Nrow * i *memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state1[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread)
{
vectype state2[3], state1[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread);
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow*i*memshift;
uint32_t s2 = ps2 + Nrow*i*memshift;
uint32_t s3 = ps3 - Nrow*i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1 )[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2 )[j]);
for (int j = 0; j < 3; j++) {
vectype tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
round_lyra_v35(state);
for (int j = 0; j < 3; j++) {
state1[j] ^= state[j];
(DMatrix + s3)[j] = state1[j];
}
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread)
{
vectype state1[3], state2[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread);
#pragma nounroll
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i*memshift;
uint32_t s2 = ps2 + Nrow * i*memshift;
uint32_t s3 = ps3 + Nrow * i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2)[j]);
for (int j = 0; j < 3; j++)
state1[j] += state2[j];
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
if (rowInOut != rowOut) {
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
for (int j = 0; j < 3; j++)
(DMatrix + s3)[j] ^= state[j];
} else {
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
}
#if __CUDA_ARCH__ >= 300
__global__ __launch_bounds__(TPB35, 1)
void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
if (threadIdx.x == 0) {
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0,
0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000
);
}
if (thread < threads)
{
((uint2*)state)[0] = __ldg(&outputHash[thread]);
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);
state[1] = state[0];
state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0);
state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= shuffle4(((vectype*)padding)[0], 0);
state[1] ^= shuffle4(((vectype*)padding)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
unsigned int instance = 0;
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
//rowa = ((uint2*)state)[0].x & 3;
instance = ((uint2*)state)[instance & 0xf].x;
rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#elif __CUDA_ARCH__ >= 200
__global__ __launch_bounds__(TPB20, 1)
void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
);
if (thread < threads)
{
((uint2*)state)[0] = outputHash[thread];
((uint2*)state)[1] = outputHash[thread + threads];
((uint2*)state)[2] = outputHash[thread + 2 * threads];
((uint2*)state)[3] = outputHash[thread + 3 * threads];
state[1] = state[0];
state[2] = ((vectype*)blake2b_IV)[0];
state[3] = ((vectype*)blake2b_IV)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= ((vectype*)padding)[0];
state[1] ^= ((vectype*)padding)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
uint instance = 0;
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
// rowa = ((uint2*)state)[0].x & 3;
instance = ((uint2*)state)[instance & 0xf].x;
rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#endif
#else
/* host & sm5+ */
__global__ void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
#endif
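In this SM 2/3 fallback the matrix lives in global memory and is addressed in ulonglong4 (32-byte) units; each thread's slice spans Nrow * Ncol * memshift = 4 * 4 * 4 vectors, which is the figure behind the larger pre-SM5 matrix_sz in lyra2REv3.cu. A quick sizing sketch:
/* per-thread DMatrix footprint for this variant (vectype = ulonglong4, 32 bytes) */
const size_t per_thread = 4 * 4 * 4 * sizeof(ulonglong4);  /* 64 vectors = 2048 bytes */
/* same figure as matrix_sz = 16 * sizeof(uint64_t) * 4 * 4 used for pre-SM5 devices */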

4
lyra2/lyra2RE.cu

@@ -26,7 +26,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti);
extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
@@ -130,7 +130,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
*hashes_done = pdata[19] - first_nonce + throughput;

183
lyra2/lyra2REv3.cu

@@ -0,0 +1,183 @@
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_cubehash.h"
#include "lyra2/Lyra2.h"
}
#include <miner.h>
#include <cuda_helper.h>
static uint64_t *d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];
extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
extern void lyra2v3_setTarget(const void *pTargetIn);
extern void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);
extern void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2v3_cpu_hash_32_targ(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern void bmw256_setTarget(const void *ptarget);
extern void bmw256_cpu_init(int thr_id, uint32_t threads);
extern void bmw256_cpu_free(int thr_id);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern "C" void lyra2v3_hash(void *state, const void *input)
{
uint32_t hashA[8], hashB[8];
sph_blake256_context ctx_blake;
sph_cubehash256_context ctx_cube;
sph_bmw256_context ctx_bmw;
sph_blake256_set_rounds(14);
sph_blake256_init(&ctx_blake);
sph_blake256(&ctx_blake, input, 80);
sph_blake256_close(&ctx_blake, hashA);
LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashB, 32);
sph_cubehash256_close(&ctx_cube, hashA);
LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
sph_bmw256_init(&ctx_bmw);
sph_bmw256(&ctx_bmw, hashB, 32);
sph_bmw256_close(&ctx_bmw, hashA);
memcpy(state, hashA, 32);
}
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
if (strstr(device_name[dev_id], "GTX 1")) intensity = 20;
if (strstr(device_name[dev_id], "RTX 20")) intensity = 20;
uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
ptarget[7] = 0x000f;
if (!init[thr_id])
{
size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
blake256_cpu_init(thr_id, throughput);
bmw256_cpu_init(thr_id, throughput);
cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256
// SM 3 implementation requires a bit more memory
if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
lyra2v3_cpu_init(thr_id, throughput, d_matrix[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
api_set_throughput(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
blake256_cpu_setBlock_80(pdata);
bmw256_setTarget(ptarget);
do {
int order = 0;
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
memset(work->nonces, 0, sizeof(work->nonces));
bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces);
*hashes_done = pdata[19] - first_nonce + throughput;
if (work->nonces[0] != 0)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
lyra2v3_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
lyra2v3_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart && !abort_flag);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_lyra2v3(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
cudaFree(d_matrix[thr_id]);
init[thr_id] = false;
cudaDeviceSynchronize();
}
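To make the matrix_sz branches in scanhash_lyra2v3 concrete (plain arithmetic on the constants above, as a sketch): the SM 5.0+ path reserves 1536 bytes per thread, the pre-SM5 path 2048 bytes, and at intensity 20 (the non-Windows default above) the device allocations come out as:
size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;       /* 1536 bytes/thread (SM 5.0+) */
/* pre-SM5 path: 16 * sizeof(uint64_t) * 4 * 4 = 2048 bytes/thread */
uint32_t throughput = 1U << 20;                          /* intensity 20 */
size_t matrix_bytes = matrix_sz * (size_t)throughput;    /* 1536 * 1048576 = 1.5 GiB */
size_t hash_bytes = (size_t)32 * throughput;             /* 32 MiB for d_hash */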

38
miner.h

@@ -274,13 +274,15 @@ void gostd(void *output, const void *input, size_t len);
struct work;
extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds);
extern int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -298,26 +300,31 @@ extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsi
extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256q(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skunk(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds);
extern int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -341,9 +348,11 @@ extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonc
/* free device allocated memory per algo */
void algo_free_all(int thr_id);
extern void free_allium(int thr_id);
extern void free_bastion(int thr_id);
extern void free_bitcore(int thr_id);
extern void free_blake256(int thr_id);
extern void free_blake2b(int thr_id);
extern void free_blake2s(int thr_id);
extern void free_bmw(int thr_id);
extern void free_c11(int thr_id);
@@ -352,6 +361,7 @@ extern void free_cryptonight(int thr_id);
extern void free_decred(int thr_id);
extern void free_deep(int thr_id);
extern void free_equihash(int thr_id);
extern void free_exosis(int thr_id);
extern void free_keccak256(int thr_id);
extern void free_fresh(int thr_id);
extern void free_fugue256(int thr_id);
@@ -366,23 +376,27 @@ extern void free_lbry(int thr_id);
extern void free_luffa(int thr_id);
extern void free_lyra2(int thr_id);
extern void free_lyra2v2(int thr_id);
extern void free_lyra2v3(int thr_id);
extern void free_lyra2Z(int thr_id);
extern void free_myriad(int thr_id);
extern void free_neoscrypt(int thr_id);
extern void free_nist5(int thr_id);
extern void free_pentablake(int thr_id);
extern void free_phi(int thr_id);
extern void free_phi2(int thr_id);
extern void free_polytimos(int thr_id);
extern void free_quark(int thr_id);
extern void free_qubit(int thr_id);
extern void free_sha256d(int thr_id);
extern void free_sha256t(int thr_id);
extern void free_sha256q(int thr_id);
extern void free_sia(int thr_id);
extern void free_sib(int thr_id);
extern void free_skeincoin(int thr_id);
extern void free_skein2(int thr_id);
extern void free_skunk(int thr_id);
extern void free_s3(int thr_id);
extern void free_sonoa(int thr_id);
extern void free_timetravel(int thr_id);
extern void free_tribus(int thr_id);
extern void free_bitcore(int thr_id);
@@ -574,6 +588,8 @@ extern uint32_t device_plimit[MAX_GPUS];
extern uint32_t gpus_intensity[MAX_GPUS];
extern int opt_cudaschedule;
extern int cryptonight_fork;
// cuda.cpp
int cuda_num_devices();
void cuda_devicenames();
@@ -668,7 +684,7 @@ struct stratum_job {
unsigned char version[4];
unsigned char nbits[4];
unsigned char ntime[4];
unsigned char claim[32]; // lbry
unsigned char extra[64]; // like lbry claimtrie
bool clean;
unsigned char nreward[2];
uint32_t height;
@@ -890,14 +906,19 @@ void applog_hash64(void *hash);
void applog_compare_hash(void *hash, void *hash_ref);
void print_hash_tests(void);
void allium_hash(void *state, const void *input);
void bastionhash(void* output, const unsigned char* input);
void blake256hash(void *output, const void *input, int8_t rounds);
void blake2b_hash(void *output, const void *input);
void blake2s_hash(void *output, const void *input);
void bmw_hash(void *state, const void *input);
void c11hash(void *output, const void *input);
void cryptolight_hash(void* output, const void* input, int len);
void cryptonight_hash(void* output, const void* input, size_t len);
void cryptolight_hash_variant(void* output, const void* input, int len, int variant);
void cryptolight_hash(void* output, const void* input);
void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);
void cryptonight_hash(void* output, const void* input);
void monero_hash(void* output, const void* input);
void stellite_hash(void* output, const void* input);
void decred_hash(void *state, const void *input);
void deephash(void *state, const void *input);
void luffa_hash(void *state, const void *input);
@@ -914,12 +935,14 @@ void jha_hash(void *output, const void *input);
void lbry_hash(void *output, const void *input);
void lyra2re_hash(void *state, const void *input);
void lyra2v2_hash(void *state, const void *input);
void lyra2v3_hash(void *state, const void *input);
void lyra2Z_hash(void *state, const void *input);
void myriadhash(void *state, const void *input);
void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input);
void phihash(void *output, const void *input);
void phi_hash(void *output, const void *input);
void phi2_hash(void *output, const void *input);
void polytimos_hash(void *output, const void *input);
void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input);
@@ -927,6 +950,8 @@ void scrypthash(void* output, const void* input);
void scryptjane_hash(void* output, const void* input);
void sha256d_hash(void *output, const void *input);
void sha256t_hash(void *output, const void *input);
void sha256q_hash(void *output, const void *input);
void sia_blake2b_hash(void *output, const void *input);
void sibhash(void *output, const void *input);
void skeincoinhash(void *output, const void *input);
void skein2hash(void *output, const void *input);
@@ -934,6 +959,7 @@ void skunk_hash(void *state, const void *input);
void s3hash(void *output, const void *input);
void timetravel_hash(void *output, const void *input);
void bitcore_hash(void *output, const void *input);
void exosis_hash(void *output, const void *input);
void tribus_hash(void *output, const void *input);
void veltorhash(void *output, const void *input);
void wcoinhash(void *state, const void *input);

6
neoscrypt/cuda_neoscrypt.cu

@@ -179,7 +179,7 @@ static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2)
idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \
a += b; d = rotateL(d^a, 16); \
c += d; b = rotateR(b^c, 12); \
idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \
idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \
a += b; d = rotateR(d^a, 8); \
c += d; b = rotateR(b^c, 7); \
}
@@ -392,7 +392,7 @@ void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const ui
idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \
a += b; d = __byte_perm(d^a, 0, 0x1032); \
c += d; b = rotateR(b^c, 12); \
idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \
idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \
a += b; d = __byte_perm(d^a, 0, 0x0321); \
c += d; b = rotateR(b^c, 7); \
}
@@ -1260,7 +1260,7 @@ uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const sal
idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \
a += b; d = ROTR32(d^a,16); \
c += d; b = ROTR32(b^c, 12); \
idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \
idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; a += key[idx]; \
a += b; d = ROTR32(d^a,8); \
c += d; b = ROTR32(b^c, 7); \
}

89
phi/cuda_phi2.cu

@@ -0,0 +1,89 @@
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
__global__ __launch_bounds__(128, 8)
void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
uint4 *psrc = (uint4*) (&d_hash[offset]);
d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1;
if (d_NonceBranch[thread]) return;
if (d_branch2) {
uint4 *pdst = (uint4*)(&d_branch2[offset]);
uint4 data;
data = psrc[0]; pdst[0] = data;
data = psrc[1]; pdst[1] = data;
data = psrc[2]; pdst[2] = data;
data = psrc[3]; pdst[3] = data;
}
}
}
__global__ __launch_bounds__(128, 8)
void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads && !d_NonceBranch[thread])
{
const uint32_t offset = thread * 16U;
uint4 *psrc = (uint4*) (&d_branch2[offset]);
uint4 *pdst = (uint4*) (&d_hash[offset]);
uint4 data;
data = psrc[0]; pdst[0] = data;
data = psrc[1]; pdst[1] = data;
data = psrc[2]; pdst[2] = data;
data = psrc[3]; pdst[3] = data;
}
}
__global__
void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t offset = thread * 16U;
uint2 *psrc = (uint2*) (&d_hash[offset]);
uint2 *pdst = (uint2*) (&d_hash[offset]);
uint2 data;
data = psrc[4]; pdst[0] ^= data;
data = psrc[5]; pdst[1] ^= data;
data = psrc[6]; pdst[2] ^= data;
data = psrc[7]; pdst[3] ^= data;
}
}
__host__
uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// extract algo permutation hashes to a second branch buffer
phi_filter_gpu <<<grid, block>>> (threads, inpHashes, d_br2, d_nonces);
return threads;
}
__host__
void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// put back second branch hashes to the common buffer d_hash
phi_merge_gpu <<<grid, block>>> (threads, outpHashes, d_br2, d_nonces);
}
__host__
void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
phi_final_compress_gpu <<<grid, block>>> (threads, d_hashes);
}

319
phi/cuda_phi2_cubehash512.cu

@@ -0,0 +1,319 @@
/* phi2 cubehash-512 144-bytes input (80 + 64) */
#include <cuda_helper.h>
#include <cuda_vectors.h>
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
#if __CUDA_ARCH__ < 350
#define LROT(x,bits) ((x << bits) | (x >> (32 - bits)))
#else
#define LROT(x, bits) __funnelshift_l(x, x, bits)
#endif
#define ROTATEUPWARDS7(a) LROT(a,7)
#define ROTATEUPWARDS11(a) LROT(a,11)
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
#ifdef NO_MIDSTATE
__device__ __constant__
static const uint32_t c_IV_512[32] = {
0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
};
#endif
__device__ __forceinline__
static void rrounds(uint32_t x[2][2][2][2][2])
{
int r;
int j;
int k;
int l;
int m;
//#pragma unroll 16
for (r = 0;r < CUBEHASH_ROUNDS;++r) {
/* "add x_0jklm into x_1jklmn modulo 2^32" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 7 bits" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
/* "swap x_00klm with x_01klm" */
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[0][0][k][l][m],x[0][1][k][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jk0m with x_1jk1m" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[1][j][k][0][m],x[1][j][k][1][m])
/* "add x_0jklm into x_1jklm modulo 2^32" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 11 bits" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
/* "swap x_0j0lm with x_0j1lm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[0][j][0][l][m],x[0][j][1][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jkl0 with x_1jkl1" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
SWAP(x[1][j][k][l][0],x[1][j][k][l][1])
}
}
__device__ __forceinline__
static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2])
{
// read 32 bytes input from global mem with uint2 chunks
AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]);
AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]);
AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]);
AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]);
}
__device__ __forceinline__
static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2])
{
// used to write final hash to global mem
AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]);
AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]);
AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]);
AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]);
AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]);
AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]);
AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]);
AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]);
}
#define Init(x) \
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]);
__device__ __forceinline__
static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data)
{
/* "xor the block into the first b bytes of the state" */
block_tox(data, x);
/* "and then transform the state invertibly through r identical rounds" */
rrounds(x);
}
__device__ __forceinline__
static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
{
/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (int i = 0; i < 10; i++) rrounds(x);
/* "output the first h/8 bytes of the state" */
hash_fromx(hashval, x);
}
__host__ void phi2_cubehash512_cpu_init(int thr_id, uint32_t threads) { }
/***************************************************/
/**
* Timetravel and x16 CUBEHASH-80 CUDA implementation
* by tpruvot@github - Jan 2017 / May 2018
*/
__constant__ static uint32_t c_midstate128[32];
__constant__ static uint32_t c_PaddedMessage_144[36];
#undef SPH_C32
#undef SPH_C64
#undef SPH_T32
#undef SPH_T64
#include "sph/sph_cubehash.h"
__host__
void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata)
{
sph_cubehash512_context ctx_cubehash;
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64);
#ifndef NO_MIDSTATE
cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice);
#endif
cudaMemcpyToSymbol(c_PaddedMessage_144, endiandata, sizeof(c_PaddedMessage_144), 0, cudaMemcpyHostToDevice);
}
__global__
void cubehash512_gpu_hash_144(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNounce + thread;
uint32_t message[8];
uint32_t x[2][2][2][2][2];
#ifdef NO_MIDSTATE
Init(x);
// first 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[0]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[4]);
Update32(x, message);
// second 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[8]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[12]);
Update32(x, message);
#else
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]);
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]);
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]);
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]);
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]);
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]);
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]);
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]);
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]);
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]);
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]);
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]);
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]);
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]);
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]);
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]);
#endif
// nonce + state root
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[16]);
message[3] = cuda_swab32(nonce);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[20]); // state
Update32(x, message);
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[24]); // state
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[28]); // utxo
Update32(x, message);
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[32]); // utxo
message[4] = 0x80;
message[5] = 0;
message[6] = 0;
message[7] = 0;
Update32(x, message);
uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]);
Final(x, output);
}
}
__host__
void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
cubehash512_gpu_hash_144 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash);
}
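The kernel above consumes the 144-byte phi2 header (the classic 80-byte header plus 64 bytes of state/UTXO roots) as 32-byte CubeHash blocks, with the nonce in header word 19 and a single 0x80 padding byte closing the final partial block. Below is a minimal host-side sketch of that layout; it is an illustration only, not part of the patch, and the constants are restated here as assumptions drawn from the kernel.

#include <cstdio>
#include <cstddef>

int main()
{
    const size_t header_bytes = 144;      // 80-byte header + 64 bytes of roots
    const size_t block_bytes  = 32;       // CubeHash16/32 block size (b = 32)
    const size_t nonce_offset = 19 * 4;   // the nonce is 32-bit word 19 of the header

    const size_t full_blocks = header_bytes / block_bytes;        // 4 full 32-byte blocks
    const size_t remainder   = header_bytes % block_bytes;        // 16 trailing bytes
    const size_t nonce_block = nonce_offset / block_bytes;        // block holding the nonce
    const size_t nonce_word  = (nonce_offset % block_bytes) / 4;  // word index in that block

    printf("blocks: %zu full + %zu trailing bytes\n", full_blocks, remainder);
    printf("nonce : block %zu, word %zu (matches message[3] = cuda_swab32(nonce) above)\n",
           nonce_block, nonce_word);
    printf("pad   : byte %zu of the last block is 0x80 (message[4] in the kernel)\n",
           remainder);
    return 0;
}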

8
x11/phi.cu → phi/phi.cu

@@ -19,7 +19,7 @@ extern "C" {
#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x11.h"
#include "x11/cuda_x11.h"
extern void skein512_cpu_setBlock_80(void *pdata);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
@@ -38,7 +38,7 @@ extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash,
static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS];
extern "C" void phihash(void *output, const void *input)
extern "C" void phi_hash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[128] = { 0 };
@@ -162,7 +162,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
uint32_t _ALIGN(64) vhash[8];
if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce;
be32enc(&endiandata[19], work->nonces[0]);
phihash(vhash, endiandata);
phi_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
@@ -173,7 +173,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
if (work->nonces[1] != UINT32_MAX) {
work->nonces[1] += startNonce;
be32enc(&endiandata[19], work->nonces[1]);
phihash(vhash, endiandata);
phi_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;

268
phi/phi2.cu

@@ -0,0 +1,268 @@
//
// PHI2 algo (with smart contracts header)
// CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein
//
// Implemented by tpruvot in May 2018
//
extern "C" {
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_streebog.h"
#include "sph/sph_echo.h"
#include "lyra2/Lyra2.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "x11/cuda_x11.h"
#include <stdio.h>
#include <memory.h>
extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
extern void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata);
extern void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti);
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter);
extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter);
extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces);
extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces);
extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes);
static uint64_t* d_matrix[MAX_GPUS];
static uint32_t* d_hash_512[MAX_GPUS];
static uint64_t* d_hash_256[MAX_GPUS];
static uint32_t* d_hash_br2[MAX_GPUS];
static uint32_t* d_nonce_br[MAX_GPUS];
static bool has_roots;
extern "C" void phi2_hash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[64];
unsigned char _ALIGN(128) hashA[64];
unsigned char _ALIGN(128) hashB[64];
sph_cubehash512_context ctx_cubehash;
sph_jh512_context ctx_jh;
sph_gost512_context ctx_gost;
sph_echo512_context ctx_echo;
sph_skein512_context ctx_skein;
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, input, has_roots ? 144 : 80);
sph_cubehash512_close(&ctx_cubehash, (void*)hashB);
LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8);
LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8);
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, (const void*)hashA, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
if (hash[0] & 1) {
sph_gost512_init(&ctx_gost);
sph_gost512(&ctx_gost, (const void*)hash, 64);
sph_gost512_close(&ctx_gost, (void*)hash);
} else {
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
}
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
for (int i=0; i<32; i++)
hash[i] ^= hash[i+32];
memcpy(output, hash, 32);
}
//#define _DEBUG
#define _DEBUG_PREFIX "phi-"
#include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
static __thread bool gtx750ti = false;
extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
if (device_sm[dev_id] == 500) intensity = 15;
if (device_sm[dev_id] == 600) intensity = 17;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // for shared mem
if (opt_benchmark)
ptarget[7] = 0xff;
if (!init[thr_id])
{
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL);
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1);
if (use_compat_kernels[thr_id]) {
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1);
}
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
quark_jh512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
has_roots = false;
uint32_t endiandata[36];
for (int k = 0; k < 36; k++) {
be32enc(&endiandata[k], pdata[k]);
if (k >= 20 && pdata[k]) has_roots = true;
}
cuda_check_cpu_setTarget(ptarget);
if (has_roots)
cubehash512_setBlock_144(thr_id, endiandata);
else
cubehash512_setBlock_80(thr_id, endiandata);
do {
int order = 0;
if (has_roots)
cubehash512_cuda_hash_144(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
else
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
order++;
TRACE("cube ");
lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti);
order++;
TRACE("lyra ");
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++);
TRACE("jh ");
order++;
if (!use_compat_kernels[thr_id]) {
phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]);
phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
} else {
// todo: nonces vector to reduce amount of hashes to compute
phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]);
streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order);
phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]);
}
TRACE("mix ");
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++);
TRACE("skein ");
phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]);
TRACE("xor ");
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
phi2_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
phi2_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
if (pdata[19] > max_nonce) pdata[19] = max_nonce;
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_phi2(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_matrix[thr_id]);
cudaFree(d_hash_512[thr_id]);
cudaFree(d_hash_256[thr_id]);
cudaFree(d_nonce_br[thr_id]);
if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}
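Two CPU-side details of phi2_hash above are mirrored by the device code: the low bit of the JH-512 output selects the Gost (Streebog) branch versus the double Echo-512 branch (phi_filter_gpu stores exactly this bit per nonce), and the final step folds the 64-byte state to 32 bytes by XORing the upper half into the lower half (phi_final_compress_gpu). A small standalone sketch of just those two pieces follows, as an illustration and not part of the patch.

#include <cstdint>
#include <cstdio>

// Branch rule: odd first byte -> Gost/Streebog path, even -> Echo-512 twice.
static bool takes_gost_branch(const uint8_t hash64[64]) { return (hash64[0] & 1) != 0; }

// Final compression: XOR the upper 32 bytes into the lower 32 bytes.
static void fold_64_to_32(uint8_t out[32], const uint8_t hash64[64])
{
    for (int i = 0; i < 32; i++) out[i] = hash64[i] ^ hash64[i + 32];
}

int main()
{
    uint8_t h[64];
    for (int i = 0; i < 64; i++) h[i] = (uint8_t) i;   // dummy 64-byte intermediate hash
    uint8_t out[32];
    fold_64_to_32(out, h);
    printf("branch=%s out[0]=0x%02x\n", takes_gost_branch(h) ? "gost" : "echo*2", out[0]);
    return 0;
}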

10
res/ccminer.rc

@@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,2,5,0
PRODUCTVERSION 2,2,5,0
FILEVERSION 2,3,1,0
PRODUCTVERSION 2,3,1,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
@@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
VALUE "FileVersion", "2.2.5"
VALUE "LegalCopyright", "Copyright (C) 2018"
VALUE "FileVersion", "2.3.1"
VALUE "LegalCopyright", "Copyright (C) 2019"
VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "2.2.5"
VALUE "ProductVersion", "2.3.1"
END
END
BLOCK "VarFileInfo"

14
scrypt.cpp

@@ -50,7 +50,17 @@ using namespace Concurrency;
#if _MSC_VER > 1800
#undef _THROW1
#if __cplusplus < 201101L
#define _THROW1(x) throw(std::bad_alloc)
#else
#define _THROW1(x) noexcept(false)
#endif
#elif !defined(_MSC_VER)
#if __cplusplus < 201101L
#define _THROW1(x) throw(std::bad_alloc)
#else
#define _THROW1(x) noexcept(false)
#endif
#endif
// A thin wrapper around the builtin __m128i type
@@ -63,9 +73,9 @@ public:
void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); }
void operator delete[](void *p) { _aligned_free(p); }
#else
void * operator new(size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
void operator delete(void *p) { free(p); }
void * operator new[](size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
void operator delete[](void *p) { free(p); }
#endif
uint32x4_t() { };
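The _THROW1 redefinition above is needed because dynamic exception specifications such as throw(std::bad_alloc) were deprecated in C++11 and removed in C++17, so newer compilers only accept noexcept(false) on these aligned operator new overloads. Here is a standalone sketch of the same conditional, an illustration under POSIX and C++11-or-newer assumptions rather than the project's actual header.

#include <cstddef>
#include <cstdlib>   // posix_memalign, free (POSIX assumption)
#include <new>

#if __cplusplus < 201103L
#define THROW_BAD_ALLOC throw(std::bad_alloc)   // pre-C++11 dynamic exception spec
#else
#define THROW_BAD_ALLOC noexcept(false)         // C++11 and later
#endif

struct aligned16
{
    // 16-byte aligned allocation, failing with std::bad_alloc like the wrapper above
    void* operator new(std::size_t size) THROW_BAD_ALLOC
    {
        void* p = nullptr;
        if (posix_memalign(&p, 16, size) != 0) throw std::bad_alloc();
        return p;
    }
    void operator delete(void* p) { free(p); }
    char data[16];
};

int main() { delete new aligned16(); return 0; }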

5
scrypt/test_kernel.cu

@@ -47,7 +47,7 @@ texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V;
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3);
static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) {
static __device__ uint4& operator^=(uint4& left, const uint4& right) {
left.x ^= right.x;
left.y ^= right.y;
left.z ^= right.z;
@@ -55,7 +55,7 @@ static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) {
return left;
}
static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) {
static __device__ uint4& operator+=(uint4& left, const uint4& right) {
left.x += right.x;
left.y += right.y;
left.z += right.z;
@@ -63,7 +63,6 @@ static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) {
return left;
}
/* write_keys writes the 8 keys being processed by a warp to the global
* scratchpad. To effectively use memory bandwidth, it performs the writes
* (and reads, for read_keys) 128 bytes at a time per memory location

4
scrypt/titan_kernel.cu

@@ -50,7 +50,7 @@ __constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP)-1
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3);
static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) {
static __device__ uint4& operator ^= (uint4& left, const uint4& right) {
left.x ^= right.x;
left.y ^= right.y;
left.z ^= right.z;
@@ -58,7 +58,7 @@ static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right)
return left;
}
static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) {
static __device__ uint4& operator += (uint4& left, const uint4& right) {
left.x += right.x;
left.y += right.y;
left.z += right.z;

507
sha256/cuda_sha256q.cu

@@ -0,0 +1,507 @@
/*
* sha256(-q) CUDA implementation.
* pyritepirate 2018
* tpruvot 2017
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include <cuda_helper.h>
#include <miner.h>
__constant__ static uint32_t __align__(8) c_midstate76[8];
__constant__ static uint32_t __align__(8) c_dataEnd80[4];
const __constant__ uint32_t __align__(8) c_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
__constant__ static uint32_t __align__(8) c_K[64];
__constant__ static uint32_t __align__(8) c_target[2];
__device__ uint64_t d_target[1];
static uint32_t* d_resNonces[MAX_GPUS] = { 0 };
// ------------------------------------------------------------------------------------------------
static const uint32_t cpu_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
static const uint32_t cpu_K[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define ROTR ROTR32
__host__
static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__host__
static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__host__
static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]);
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]);
sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]);
sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]);
sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]);
sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]);
sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]);
sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]);
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]);
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]);
sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]);
sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]);
sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]);
sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]);
sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]);
sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
for (int i=0; i<3; i++)
{
sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
#define xor3b(a,b,c) (a ^ b ^ c)
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22));
}
__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25));
}
__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3));
}
__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10));
}
__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c)
{
uint32_t result;
asm("{\n\t"
".reg .u32 m,n,o;\n\t"
"and.b32 m, %1, %2;\n\t"
" or.b32 n, %1, %2;\n\t"
"and.b32 o, n, %3;\n\t"
" or.b32 %0, m, o ;\n\t"
"}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c)
);
return result;
}
__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) {
uint2 result;
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(result.y), "=r"(result.x) : "l"(v));
return result;
}
__device__
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__device__
static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1);
uint32_t ssg20 = ssg2_0(inx3);
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a,b,c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__device__
static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]);
sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]);
sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]);
sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]);
sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]);
sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]);
sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]);
sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]);
sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]);
sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]);
sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]);
sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]);
sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]);
sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]);
sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]);
sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
#pragma unroll
for (int i=0; i<3; i++)
{
sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
__device__
static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]);
sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]);
sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]);
sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]);
sha2_step1(e,f,g,h, a,b,c,d, in[ 4], Kshared[ 4]);
sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]);
sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]);
sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]);
sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]);
sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]);
sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]);
sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]);
sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]);
sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]);
sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]);
sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]);
#pragma unroll
for (int i=0; i<2; i++)
{
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]);
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]);
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]);
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]);
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]);
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]);
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]);
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]);
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]);
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]);
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]);
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]);
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]);
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]);
sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]);
sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]);
}
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]);
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]);
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]);
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]);
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]);
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]);
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]);
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]);
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]);
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]);
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]);
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]);
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]);
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]);
state[6] += g;
state[7] += h;
}
__device__ __forceinline__
uint64_t cuda_swab32ll(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
}
__global__
/*__launch_bounds__(256,3)*/
void sha256q_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
__shared__ uint32_t s_K[64*4];
//s_K[thread & 63] = c_K[thread & 63];
if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t dat[16];
AS_UINT2(dat) = AS_UINT2(c_dataEnd80);
dat[ 2] = c_dataEnd80[2];
dat[ 3] = nonce;
dat[ 4] = 0x80000000;
dat[15] = 0x280;
#pragma unroll
for (int i=5; i<15; i++) dat[i] = 0;
uint32_t buf[8];
#pragma unroll
for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]);
//for (int i=0; i<8; i++) buf[i] = c_midstate76[i];
sha256_round_body(dat, buf, s_K);
// second sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_body(dat, buf, s_K);
// third sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_body(dat, buf, s_K);
// last sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_last(dat, buf, s_K);
// valid nonces
uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
if (high <= c_target[0]) {
//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
resNonces[1] = atomicExch(resNonces, nonce);
//d_target[0] = high;
}
}
}
__host__
void sha256q_init(int thr_id)
{
cuda_get_arch(thr_id);
cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t)));
}
__host__
void sha256q_free(int thr_id)
{
if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
}
__host__
void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
{
uint32_t _ALIGN(64) in[16], buf[8], end[4];
for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);
for (int i=0;i<8;i++) buf[i] = cpu_H256[i];
for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]);
sha256_round_body_host(in, buf, cpu_K);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
}
__host__
void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
{
const uint32_t threadsperblock = 128;
dim3 grid(threads/threadsperblock);
dim3 block(threadsperblock);
CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)));
cudaThreadSynchronize();
sha256q_gpu_hash_shared <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]);
cudaThreadSynchronize();
CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
if (resNonces[0] == resNonces[1]) {
resNonces[1] = UINT32_MAX;
}
}
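sha256q_setBlock_80 above hashes the first 64 header bytes once on the CPU and uploads the resulting 8-word midstate (c_midstate76), so every GPU thread starts from that state and only processes the final 16-byte chunk (the last three fixed words from c_dataEnd80 plus its own nonce) and padding before the three follow-up SHA-256 passes. The following host-side sketch of the midstate idea uses OpenSSL; it is an illustration only, and the word order may differ from what the kernel stores.

#include <openssl/sha.h>
#include <cstdint>
#include <cstdio>

int main()
{
    uint8_t header[80] = { 0 };        // dummy 80-byte block header
    SHA256_CTX ctx;
    SHA256_Init(&ctx);
    SHA256_Update(&ctx, header, 64);   // compress the first full 64-byte block only
    // ctx.h now holds the midstate: the per-work constant a miner uploads once,
    // instead of recomputing it for every nonce on the device.
    for (int i = 0; i < 8; i++) printf("%08x ", (uint32_t) ctx.h[i]);
    printf("\n");
    return 0;
}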

136
sha256/sha256q.cu

@@ -0,0 +1,136 @@
/**
* SHA256 4x
* by pyritepirate - 2018
* by tpruvot@github - 2017
*/
#include <miner.h>
#include <cuda_helper.h>
#include <openssl/sha.h>
// CPU Check
extern "C" void sha256q_hash(void *output, const void *input)
{
unsigned char _ALIGN(64) hash[64];
SHA256_CTX sha256;
SHA256_Init(&sha256);
SHA256_Update(&sha256, (unsigned char *)input, 80);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final((unsigned char *)output, &sha256);
}
static bool init[MAX_GPUS] = { 0 };
extern void sha256q_init(int thr_id);
extern void sha256q_free(int thr_id);
extern void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget);
extern void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces);
extern "C" int scanhash_sha256q(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 23);
if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce));
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x03;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
sha256q_init(thr_id);
init[thr_id] = true;
}
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
sha256q_setBlock_80(endiandata, ptarget);
do {
// Hash with CUDA
*hashes_done = pdata[19] - first_nonce + throughput;
sha256q_hash_80(thr_id, throughput, pdata[19], work->nonces);
if (work->nonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
endiandata[19] = swab32(work->nonces[0]);
sha256q_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
if (work->nonces[1] != UINT32_MAX) {
endiandata[19] = swab32(work->nonces[1]);
sha256q_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces++;
bn_set_target_ratio(work, vhash, 1);
}
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1;
}
return work->valid_nonces;
}
else if (vhash[7] > ptarget[7]) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_sha256q(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
sha256q_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}

8
sia/sia-rpc.cpp

@@ -74,10 +74,10 @@ char* sia_getheader(CURL *curl, struct pool_infos *pool)
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char data[256] = { 0 };
char url[512];
char url[512*3];
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll
snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s", //&longpoll
pool->url, pool->user, pool->pass);
if (opt_protocol)
@@ -148,7 +148,7 @@ bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work)
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char buf[256] = { 0 };
char url[512];
char url[512*3];
if (opt_protocol)
applog_hex(work->data, 80);
@@ -156,7 +156,7 @@ bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work)
//applog_hex(&work->data[10], 4);
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s",
snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s",
pool->url, pool->user, pool->pass);
if (opt_protocol)
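The two changes above enlarge the URL buffer and switch the hard-coded length to sizeof(url), so the truncation limit can never drift from the declaration again. A small self-contained sketch of that pattern (the literal strings are placeholders for the pool fields):

#include <stdio.h>

int main(void)
{
    // Placeholders standing in for pool->url / pool->user / pool->pass.
    const char *purl = "http://localhost:9980", *addr = "addr", *worker = "rig1";
    char url[512 * 3];
    // snprintf always NUL-terminates and returns the length it wanted to
    // write, so n >= sizeof(url) flags truncation even though the buffer
    // itself stays safe.
    int n = snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s",
                     purl, addr, worker);
    if (n < 0 || (size_t)n >= sizeof(url))
        fprintf(stderr, "sia: request URL truncated\n");
    return 0;
}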

18
sia/sia.cu

@@ -40,7 +40,7 @@ static __constant__ const int8_t blake2b_sigma[12][16] = {
// host mem align
#define A 64
extern "C" void blake2b_hash(void *output, const void *input)
extern "C" void sia_blake2b_hash(void *output, const void *input)
{
uint8_t _ALIGN(A) hash[32];
blake2b_ctx ctx;
@@ -102,7 +102,7 @@ static void H(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, u
__global__
//__launch_bounds__(128, 8) /* to force 64 regs */
void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
void sia_blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
{
const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce;
__shared__ uint64_t s_target;
@@ -154,7 +154,7 @@ void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_
}
__host__
uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce)
uint32_t sia_blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce)
{
uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX };
uint32_t result = UINT32_MAX;
@@ -166,7 +166,7 @@ uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint3
if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
return result;
blake2b_gpu_hash <<<grid, block, 8>>> (threads, startNonce, d_resNonces[thr_id], target2);
sia_blake2b_gpu_hash <<<grid, block, 8>>> (threads, startNonce, d_resNonces[thr_id], target2);
cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
@@ -178,7 +178,7 @@ uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint3
}
__host__
void blake2b_setBlock(uint32_t *data)
void sia_blake2b_setBlock(uint32_t *data)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice));
}
@@ -224,10 +224,10 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon
const uint2 target = make_uint2(ptarget[6], ptarget[7]);
blake2b_setBlock(inputdata);
sia_blake2b_setBlock(inputdata);
do {
work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]);
work->nonces[0] = sia_blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]);
*hashes_done = pdata[8] - first_nonce + throughput;
@@ -235,7 +235,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon
{
work->valid_nonces = 0;
inputdata[8] = work->nonces[0];
blake2b_hash(hash, inputdata);
sia_blake2b_hash(hash, inputdata);
if (swab32(hash[0]) <= Htarg) {
// sia hash target is reversed (start of hash)
swab256(vhashcpu, hash);
@@ -250,7 +250,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon
if (work->nonces[1] != UINT32_MAX) {
inputdata[8] = work->nonces[1];
blake2b_hash(hash, inputdata);
sia_blake2b_hash(hash, inputdata);
if (swab32(hash[0]) <= Htarg) {
swab256(vhashcpu, hash);
if (fulltest(vhashcpu, ptarget)) {
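Sia accepts a share based on the start of the byte-reversed hash rather than the usual little-endian tail, which is why the code above pre-checks swab32(hash[0]) against Htarg before running fulltest() on the fully reversed words. A minimal sketch of that acceptance test, with local stand-ins for the swab/fulltest helpers:

#include <stdint.h>
#include <stdbool.h>

static uint32_t bswap32(uint32_t v) { return __builtin_bswap32(v); }

// Local stand-in for swab256(): byte-reverse a 32-byte hash end-to-end.
static void bswap256(uint32_t *dst, const uint32_t *src)
{
    for (int i = 0; i < 8; i++)
        dst[i] = bswap32(src[7 - i]);
}

// Sia-style check: word 0 of the raw hash (byte-swapped) is compared to
// target word 7 as a quick filter, then the reversed hash is compared
// from the most significant word down, the same way fulltest() does.
static bool sia_share_ok(const uint32_t hash[8], const uint32_t target[8])
{
    if (bswap32(hash[0]) > target[7])
        return false;
    uint32_t rev[8];
    bswap256(rev, hash);
    for (int i = 7; i >= 0; i--) {
        if (rev[i] > target[i]) return false;
        if (rev[i] < target[i]) return true;
    }
    return true; // exactly equal still meets the target
}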

51
util.cpp

@@ -616,7 +616,7 @@ err_out:
json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req,
bool longpoll_scan, bool longpoll, int *curl_err)
{
char userpass[512];
char userpass[768];
// todo, malloc and store that in pool array
snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user,
strlen(pool->pass)?':':'\0', pool->pass);
@@ -627,7 +627,7 @@ json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req,
/* called only from longpoll thread, we have the lp_url */
json_t *json_rpc_longpoll(CURL *curl, char *lp_url, struct pool_infos *pool, const char *req, int *curl_err)
{
char userpass[512];
char userpass[768];
snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user,
strlen(pool->pass)?':':'\0', pool->pass);
@@ -1442,7 +1442,7 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
{
const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
const char *claim = NULL, *nreward = NULL;
const char *extradata = NULL, *nreward = NULL;
size_t coinb1_size, coinb2_size;
bool clean, ret = false;
int merkle_count, i, p=0;
@@ -1452,7 +1452,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
int ntime;
char algo[64] = { 0 };
get_currentalgo(algo, sizeof(algo));
bool has_claim = !strcasecmp(algo, "lbry");
bool has_claim = !strcmp(algo, "lbry");
bool has_roots = !strcmp(algo, "phi2") && json_array_size(params) == 10;
if (sctx->is_equihash) {
return equi_stratum_notify(sctx, params);
@@ -1461,11 +1462,17 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
job_id = json_string_value(json_array_get(params, p++));
prevhash = json_string_value(json_array_get(params, p++));
if (has_claim) {
claim = json_string_value(json_array_get(params, p++));
if (!claim || strlen(claim) != 64) {
extradata = json_string_value(json_array_get(params, p++));
if (!extradata || strlen(extradata) != 64) {
applog(LOG_ERR, "Stratum notify: invalid claim parameter");
goto out;
}
} else if (has_roots) {
extradata = json_string_value(json_array_get(params, p++));
if (!extradata || strlen(extradata) != 128) {
applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter");
goto out;
}
}
coinb1 = json_string_value(json_array_get(params, p++));
coinb2 = json_string_value(json_array_get(params, p++));
@@ -1529,7 +1536,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
free(sctx->job.job_id);
sctx->job.job_id = strdup(job_id);
hex2bin(sctx->job.prevhash, prevhash, 32);
if (has_claim) hex2bin(sctx->job.claim, claim, 32);
if (has_claim) hex2bin(sctx->job.extra, extradata, 32);
if (has_roots) hex2bin(sctx->job.extra, extradata, 64);
sctx->job.height = getblocheight(sctx);
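The length checks above translate directly into the byte counts stored in sctx->job.extra: 64 hex digits become the 32-byte lbry claim root, 128 hex digits the 64-byte phi2 UTXO roots. A tiny local decoder sketch (a stand-in for the project's hex2bin, not its actual signature) that makes the hex-to-byte mapping explicit:

#include <stdint.h>
#include <string.h>
#include <stdbool.h>

static int nibble(char c)
{
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

// Decode exactly `len` bytes (2*len hex digits), as the notify handler
// expects: len = 32 for the lbry claim field, len = 64 for phi2 roots.
static bool hex2bin_sketch(uint8_t *out, const char *hex, size_t len)
{
    if (strlen(hex) != 2 * len) return false;
    for (size_t i = 0; i < len; i++) {
        int hi = nibble(hex[2*i]), lo = nibble(hex[2*i + 1]);
        if (hi < 0 || lo < 0) return false;
        out[i] = (uint8_t)((hi << 4) | lo);
    }
    return true;
}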
@@ -2164,6 +2172,9 @@ void print_hash_tests(void)
printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n");
allium_hash(&hash[0], &buf[0]);
printpfx("allium", hash);
bastionhash(&hash[0], &buf[0]);
printpfx("bastion", hash);
@@ -2173,6 +2184,9 @@ void print_hash_tests(void)
blake256hash(&hash[0], &buf[0], 14);
printpfx("blake", hash);
blake2b_hash(&hash[0], &buf[0]);
printpfx("blake2b", hash);
blake2s_hash(&hash[0], &buf[0]);
printpfx("blake2s", hash);
@@ -2182,10 +2196,10 @@ void print_hash_tests(void)
c11hash(&hash[0], &buf[0]);
printpfx("c11", hash);
cryptolight_hash(&hash[0], &buf[0], 76);
cryptolight_hash(&hash[0], &buf[0]);
printpfx("cryptolight", hash);
cryptonight_hash(&hash[0], &buf[0], 76);
cryptonight_hash(&hash[0], &buf[0]);
printpfx("cryptonight", hash);
memset(buf, 0, 180);
@@ -2232,9 +2246,15 @@ void print_hash_tests(void)
lyra2v2_hash(&hash[0], &buf[0]);
printpfx("lyra2v2", hash);
lyra2v3_hash(&hash[0], &buf[0]);
printpfx("lyra2v3", hash);
lyra2Z_hash(&hash[0], &buf[0]);
printpfx("lyra2z", hash);
monero_hash(&hash[0], &buf[0]);
printpfx("monero", hash);
myriadhash(&hash[0], &buf[0]);
printpfx("myriad", hash);
@@ -2247,7 +2267,7 @@ void print_hash_tests(void)
pentablakehash(&hash[0], &buf[0]);
printpfx("pentablake", hash);
phihash(&hash[0], &buf[0]);
phi2_hash(&hash[0], &buf[0]);
printpfx("phi", hash);
polytimos_hash(&hash[0], &buf[0]);
@@ -2271,7 +2291,10 @@ void print_hash_tests(void)
sha256t_hash(&hash[0], &buf[0]);
printpfx("sha256t", hash);
blake2b_hash(&hash[0], &buf[0]);
sha256q_hash(&hash[0], &buf[0]);
printpfx("sha256q", hash);
sia_blake2b_hash(&hash[0], &buf[0]);
printpfx("sia", hash);
sibhash(&hash[0], &buf[0]);
@@ -2289,6 +2312,9 @@ void print_hash_tests(void)
skunk_hash(&hash[0], &buf[0]);
printpfx("skunk", hash);
stellite_hash(&hash[0], &buf[0]);
printpfx("stellite", hash);
s3hash(&hash[0], &buf[0]);
printpfx("S3", hash);
@@ -2298,6 +2324,9 @@ void print_hash_tests(void)
bitcore_hash(&hash[0], &buf[0]);
printpfx("bitcore", hash);
exosis_hash(&hash[0], &buf[0]);
printpfx("exosis", hash);
blake256hash(&hash[0], &buf[0], 8);
printpfx("vanilla", hash);

21
x11/cuda_streebog_maxwell.cu

@@ -207,7 +207,7 @@ __launch_bounds__(TPB, 3)
#else
__launch_bounds__(TPB, 3)
#endif
void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
void streebog_gpu_hash_64_sm5(uint64_t *g_hash, uint32_t* const d_filter, const uint32_t filter_val)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint2 buf[8], t[8], temp[8], K0[8], hash[8];
@@ -222,13 +222,16 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);
//__threadfence_block();
__syncthreads();
if (d_filter && d_filter[thread] != filter_val) return;
uint64_t* inout = &g_hash[thread<<3];
*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
__threadfence_block();
K0[0] = vectorize(0x74a5d4ce2efc83b3);
#pragma unroll 8
@@ -301,9 +304,17 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
}
__host__
void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash)
void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *g_hash)
{
dim3 grid((threads + TPB-1) / TPB);
dim3 block(TPB);
streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, NULL, 0);
}
__host__
void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter)
{
dim3 grid((threads + TPB-1) / TPB);
dim3 block(TPB);
streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash);
streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, d_filter, 1);
}
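The new d_filter/filter_val arguments let one streebog kernel serve both the plain x11 chain (filter NULL, every nonce processed) and the phi2 branch path (only nonces whose filter entry matches). A hedged CUDA sketch of that early-exit dispatch pattern, with hypothetical names and a placeholder transform in place of the real round function:

#include <cuda_runtime.h>
#include <stdint.h>

__global__ void branch_kernel_sketch(const uint32_t threads, uint64_t *g_hash,
                                     const uint32_t *d_filter, const uint32_t filter_val)
{
    const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
    if (thread >= threads) return;
    // phi2-style branch filter: threads assigned to the other branch bail out.
    if (d_filter && d_filter[thread] != filter_val) return;
    uint64_t *inout = &g_hash[thread << 3];   // one 64-byte hash per nonce
    for (int i = 0; i < 8; i++)
        inout[i] ^= 1ULL;                     // placeholder transform only
}

// Host side: a NULL filter processes every nonce (plain x11 use), while a
// real per-nonce filter buffer restricts the pass to one phi2 branch.
__host__ static void launch_unfiltered(uint32_t threads, uint64_t *g_hash)
{
    dim3 block(256), grid((threads + 255) / 256);
    branch_kernel_sketch <<<grid, block>>> (threads, g_hash, NULL, 0);
}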

54
x11/cuda_x11_cubehash512.cu

@@ -259,16 +259,32 @@ void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { }
/***************************************************/
#define WANT_CUBEHASH80
#ifdef WANT_CUBEHASH80
/**
* Timetravel and x16 CUBEHASH-80 CUDA implementation
* by tpruvot@github - Jan 2017 / May 2018
*/
__constant__
static uint32_t c_PaddedMessage80[20];
__constant__ static uint32_t c_midstate128[32];
__constant__ static uint32_t c_PaddedMessage80[20];
#undef SPH_C32
#undef SPH_C64
#undef SPH_T32
#undef SPH_T64
#include "sph/sph_cubehash.h"
__host__
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata)
{
sph_cubehash512_context ctx_cubehash;
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64);
#ifndef NO_MIDSTATE
cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_PaddedMessage80, &endiandata[16], 16, 0, cudaMemcpyHostToDevice);
#else
cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
#endif
}
__global__
@@ -278,11 +294,11 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
if (thread < threads)
{
const uint32_t nonce = startNounce + thread;
uint32_t message[8];
uint32_t x[2][2][2][2][2];
#ifdef NO_MIDSTATE
Init(x);
uint32_t message[8];
// first 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]);
@@ -293,8 +309,31 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]);
Update32(x, message);
// last 16 bytes + Padding
// last 16 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]);
#else
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]);
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]);
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]);
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]);
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]);
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]);
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]);
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]);
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]);
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]);
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]);
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]);
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]);
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]);
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]);
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]);
// last 16 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
#endif
// nonce + Padding
message[3] = cuda_swab32(nonce);
message[4] = 0x80;
message[5] = 0;
@@ -317,4 +356,3 @@ void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const ui
cubehash512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash);
}
#endif
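The midstate path introduced above relies on the first 64 of the 80 header bytes being constant for a given work item: the host absorbs them once with sph_cubehash512 and uploads the resulting 32-word state, so each GPU thread only has to inject the last 16 bytes, its nonce and the padding. A host-side sketch of that split using plain buffers instead of the __constant__ symbols:

#include <string.h>
#include <stdint.h>
#include "sph/sph_cubehash.h"

// Hedged sketch of cubehash512_setBlock_80's midstate precomputation.
static void cubehash80_midstate_sketch(const uint32_t *endiandata,
                                       uint32_t midstate[32], uint32_t tail[4])
{
    sph_cubehash512_context ctx;
    sph_cubehash512_init(&ctx);
    sph_cubehash512(&ctx, endiandata, 64);   // bytes 0..63: constant per block
    memcpy(midstate, ctx.state, 128);        // 32 x 32-bit CubeHash state words
    memcpy(tail, &endiandata[16], 16);       // bytes 64..79: the nonce lives here
}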

497
x11/exosis.cu

@@ -0,0 +1,497 @@
/**
* Timetravel (exosis) CUDA implementation
* by tpruvot@github, exosis
*/
#include <stdio.h>
#include <memory.h>
#include <unistd.h>
#define HASH_FUNC_BASE_TIMESTAMP 1538556426U
#define HASH_FUNC_COUNT 8
#define HASH_FUNC_COUNT_PERMUTATIONS 40320U
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x11.h"
static uint32_t *d_hash[MAX_GPUS];
enum Algo {
BLAKE = 0,
BMW,
GROESTL,
SKEIN,
JH,
KECCAK,
LUFFA,
CUBEHASH,
MAX_ALGOS_COUNT
};
static const char* algo_strings[] = {
"blake",
"bmw512",
"groestl",
"skein",
"jh512",
"keccak",
"luffa",
"cube",
NULL
};
inline void swap8(uint8_t *a, uint8_t *b)
{
uint8_t t = *a;
*a = *b;
*b = t;
}
inline void initPerm(uint8_t n[], int count)
{
for (int i = 0; i < count; i++)
n[i] = i;
}
static int nextPerm(uint8_t n[], int count)
{
int tail, i, j;
if (count <= 1)
return 0;
for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--);
tail = i;
if (tail > 0) {
for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--);
swap8(&n[tail - 1], &n[j]);
}
for (i = tail, j = count - 1; i<j; i++, j--)
swap8(&n[i], &n[j]);
return (tail != 0);
}
static void getAlgoString(char *str, int seq)
{
uint8_t algoList[HASH_FUNC_COUNT];
char *sptr;
initPerm(algoList, HASH_FUNC_COUNT);
for (int k = 0; k < seq; k++) {
nextPerm(algoList, HASH_FUNC_COUNT);
}
sptr = str;
for (int j = 0; j < HASH_FUNC_COUNT; j++) {
if (algoList[j] >= 10)
sprintf(sptr, "%c", 'A' + (algoList[j] - 10));
else
sprintf(sptr, "%u", (uint32_t) algoList[j]);
sptr++;
}
*sptr = '\0';
}
static __thread uint32_t s_ntime = 0;
static uint32_t s_sequence = UINT32_MAX;
static uint8_t s_firstalgo = 0xFF;
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP
static inline uint32_t getCurrentAlgoSeq(uint32_t ntime)
{
// unlike x11evo, the permutation changes often (with ntime)
return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS;
}
// To finish...
static void get_travel_order(uint32_t ntime, char *permstr)
{
uint32_t seq = getCurrentAlgoSeq(ntime);
if (s_sequence != seq) {
getAlgoString(permstr, seq);
s_sequence = seq;
}
}
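Concretely, getCurrentAlgoSeq() picks the (ntime - base)-th lexicographic permutation of the digits 0..7, modulo 8! = 40320, and getAlgoString() materialises it by stepping nextPerm() that many times. A small standalone sketch that prints the first few orders (seq 0 -> 01234567, seq 1 -> 01234576, seq 2 -> 01234657):

#include <stdio.h>
#include <stdint.h>

// Same next-lexicographic-permutation step as nextPerm() above.
static int next_perm(uint8_t n[], int count)
{
    int i, j, tail;
    for (i = count - 1; i > 0 && n[i - 1] >= n[i]; i--);
    tail = i;
    if (tail > 0) {
        for (j = count - 1; j > tail && n[j] <= n[tail - 1]; j--);
        uint8_t t = n[tail - 1]; n[tail - 1] = n[j]; n[j] = t;
    }
    for (i = tail, j = count - 1; i < j; i++, j--) {
        uint8_t t = n[i]; n[i] = n[j]; n[j] = t;
    }
    return tail != 0;
}

int main(void)
{
    uint8_t order[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    for (int seq = 0; seq < 3; seq++) {
        for (int k = 0; k < 8; k++) printf("%u", order[k]);
        printf("  <- hash order for seq %d\n", seq);
        next_perm(order, 8);
    }
    return 0;
}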
// CPU Hash
extern "C" void exosis_hash(void *output, const void *input)
{
uint32_t _ALIGN(64) hash[64/4] = { 0 };
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_skein512_context ctx_skein;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_luffa512_context ctx_luffa1;
sph_cubehash512_context ctx_cubehash1;
if (s_sequence == UINT32_MAX) {
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17];
get_travel_order(ntime, hashOrder);
}
void *in = (void*) input;
int size = 80;
const int hashes = (int) strlen(hashOrder);
for (int i = 0; i < hashes; i++)
{
const char elem = hashOrder[i];
uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo) {
case BLAKE:
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, in, size);
sph_blake512_close(&ctx_blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, in, size);
sph_bmw512_close(&ctx_bmw, hash);
break;
case GROESTL:
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, in, size);
sph_groestl512_close(&ctx_groestl, hash);
break;
case SKEIN:
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, in, size);
sph_skein512_close(&ctx_skein, hash);
break;
case JH:
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, in, size);
sph_jh512_close(&ctx_jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, in, size);
sph_keccak512_close(&ctx_keccak, hash);
break;
case LUFFA:
sph_luffa512_init(&ctx_luffa1);
sph_luffa512(&ctx_luffa1, in, size);
sph_luffa512_close(&ctx_luffa1, hash);
break;
case CUBEHASH:
sph_cubehash512_init(&ctx_cubehash1);
sph_cubehash512(&ctx_cubehash1, in, size);
sph_cubehash512_close(&ctx_cubehash1, hash);
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
static uint32_t get_next_time(uint32_t ntime, char* curOrder)
{
char nextOrder[HASH_FUNC_COUNT + 1] = { 0 };
uint32_t secs = 15;
do {
uint32_t nseq = getCurrentAlgoSeq(ntime+secs);
getAlgoString(nextOrder, nseq);
secs += 15;
} while (curOrder[0] == nextOrder[0]);
return secs;
}
//#define _DEBUG
#define _DEBUG_PREFIX "tt-"
#include "cuda_debug.cuh"
void quark_bmw512_cpu_setBlock_80(void *pdata);
void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void skein512_cpu_setBlock_80(void *pdata);
void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
void qubit_luffa512_cpu_setBlock_80(void *pdata);
void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
// if (opt_benchmark) pdata[17] = swab32(0x5886a4be); // TO DEBUG GROESTL 80
if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) {
uint32_t ntime = swab32(work->data[17]);
get_travel_order(ntime, hashOrder);
s_ntime = pdata[17];
if (opt_debug && !thr_id) {
applog(LOG_DEBUG, "exosis hash order %s (%08x)", hashOrder, ntime);
}
}
if (opt_benchmark)
ptarget[7] = 0x5;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes)
x11_luffa512_cpu_init(thr_id, throughput);
x11_cubehash512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
cuda_check_cpu_setTarget(ptarget);
const int hashes = (int) strlen(hashOrder);
const char first = hashOrder[0];
const uint8_t algo80 = first >= 'A' ? first - 'A' + 10 : first - '0';
if (algo80 != s_firstalgo) {
s_firstalgo = algo80;
applog(LOG_INFO, "Exosis first algo is now %s", algo_strings[algo80 % HASH_FUNC_COUNT]);
}
switch (algo80) {
case BLAKE:
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
break;
case BMW:
quark_bmw512_cpu_setBlock_80(endiandata);
break;
case GROESTL:
groestl512_setBlock_80(thr_id, endiandata);
break;
case SKEIN:
skein512_cpu_setBlock_80((void*)endiandata);
break;
case JH:
jh512_setBlock_80(thr_id, endiandata);
break;
case KECCAK:
keccak512_setBlock_80(thr_id, endiandata);
break;
case LUFFA:
qubit_luffa512_cpu_setBlock_80((void*)endiandata);
break;
case CUBEHASH:
cubehash512_setBlock_80(thr_id, endiandata);
break;
default: {
uint32_t next = get_next_time(swab32(s_ntime), hashOrder);
if (!thr_id)
applog(LOG_WARNING, "kernel %c unimplemented, next in %u min", first, next/60);
sleep(next > 30 ? 60 : 10);
return -1;
}
}
do {
int order = 0;
// Hash with CUDA
switch (algo80) {
case BLAKE:
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("blake80:");
break;
case BMW:
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("bmw80 :");
break;
case GROESTL:
groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("grstl80:");
break;
case SKEIN:
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
TRACE("skein80:");
break;
case JH:
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("jh51280:");
break;
case KECCAK:
keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("kecck80:");
break;
case LUFFA:
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("luffa80:");
break;
case CUBEHASH:
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("cube 80:");
break;
}
for (int i = 1; i < hashes; i++)
{
const char elem = hashOrder[i];
const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo64) {
case BLAKE:
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("blake :");
break;
case BMW:
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("bmw :");
break;
case GROESTL:
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("groestl:");
break;
case SKEIN:
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("skein :");
break;
case JH:
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("jh512 :");
break;
case KECCAK:
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :");
break;
case LUFFA:
x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("luffa :");
break;
case CUBEHASH:
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("cube :");
break;
}
}
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
const uint32_t Htarg = ptarget[7];
be32enc(&endiandata[19], work->nonces[0]);
exosis_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
pdata[19] = work->nonces[0];
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
exosis_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
}
pdata[19] = max(pdata[19], work->nonces[1]) + 1;
}
return work->valid_nonces;
} else if (vhash[7] > Htarg) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_exosis(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}

67
x11/timetravel.cu

@@ -20,11 +20,6 @@ extern "C" {
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#if HASH_FUNC_COUNT > 8
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#endif
}
#include "miner.h"
@@ -42,11 +37,6 @@ enum Algo {
KECCAK,
LUFFA,
CUBEHASH,
#if HASH_FUNC_COUNT > 8
SHAVITE,
SIMD,
ECHO,
#endif
MAX_ALGOS_COUNT
};
@@ -153,11 +143,6 @@ extern "C" void timetravel_hash(void *output, const void *input)
sph_keccak512_context ctx_keccak;
sph_luffa512_context ctx_luffa1;
sph_cubehash512_context ctx_cubehash1;
#if HASH_FUNC_COUNT > 8
sph_shavite512_context ctx_shavite1;
sph_simd512_context ctx_simd1;
sph_echo512_context ctx_echo1;
#endif
if (s_sequence == UINT32_MAX) {
uint32_t *data = (uint32_t*) input;
@@ -175,11 +160,6 @@ extern "C" void timetravel_hash(void *output, const void *input)
const char elem = hashOrder[i];
uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
if (i > 0) {
in = (void*) hash;
size = 64;
}
switch (algo) {
case BLAKE:
sph_blake512_init(&ctx_blake);
@@ -195,7 +175,6 @@ extern "C" void timetravel_hash(void *output, const void *input)
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, in, size);
sph_groestl512_close(&ctx_groestl, hash);
//applog_hex((void*)hash, 32);
break;
case SKEIN:
sph_skein512_init(&ctx_skein);
@@ -222,24 +201,10 @@ extern "C" void timetravel_hash(void *output, const void *input)
sph_cubehash512(&ctx_cubehash1, in, size);
sph_cubehash512_close(&ctx_cubehash1, hash);
break;
#if HASH_FUNC_COUNT > 8
case SHAVITE:
sph_shavite512_init(&ctx_shavite1);
sph_shavite512(&ctx_shavite1, in, size);
sph_shavite512_close(&ctx_shavite1, hash);
break;
case SIMD:
sph_simd512_init(&ctx_simd1);
sph_simd512(&ctx_simd1, in, size);
sph_simd512_close(&ctx_simd1, hash);
break;
case ECHO:
sph_echo512_init(&ctx_echo1);
sph_echo512(&ctx_echo1, in, size);
sph_echo512_close(&ctx_echo1, hash);
break;
#endif
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
@@ -330,13 +295,7 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n
qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes)
x11_luffa512_cpu_init(thr_id, throughput);
x11_cubehash512_cpu_init(thr_id, throughput);
#if HASH_FUNC_COUNT > 8
x11_shavite512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
}
#endif
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1);
@@ -471,20 +430,6 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("cube :");
break;
#if HASH_FUNC_COUNT > 8
case SHAVITE:
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("simd :");
break;
case ECHO:
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("echo :");
break;
#endif
}
}
@@ -544,9 +489,7 @@ extern "C" void free_timetravel(int thr_id)
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
#if HASH_FUNC_COUNT > 8
x11_simd512_cpu_free(thr_id);
#endif
cuda_check_cpu_free(thr_id);
init[thr_id] = false;

18
x12/x12.cu

@@ -22,6 +22,8 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS];
extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -96,13 +98,15 @@ extern "C" void x12hash(void *output, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19;
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8;
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@@ -111,7 +115,7 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -120,13 +124,17 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
x11_luffaCubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
}
x11_echo512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
@@ -156,7 +164,11 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else {
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
}
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

26
x16/cuda_x16_echo512_64.cu

@@ -79,11 +79,12 @@ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W,
}
__global__ __launch_bounds__(128, 5) /* will force 80 registers */
static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val)
{
__shared__ uint32_t sharedMemory[4][256];
aes_gpu_init128(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t k0;
@@ -91,6 +92,9 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
uint32_t hash[16];
if (thread < threads)
{
// phi2 filter (2 hash chain branches)
if (d_filter && d_filter[thread] != filter_val) return;
uint32_t *Hash = &g_hash[thread<<4];
*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
@@ -99,8 +103,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
__syncthreads();
const uint32_t P[48] = {
0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//8-12
@@ -217,7 +219,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
W[48 + i + 4] = a ^ cd ^ bcx;
W[48 + i + 8] = d ^ ab ^ cdx;
W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
}
for (int k = 1; k < 10; k++)
@@ -237,12 +238,23 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
}
__host__
void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash);
x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, d_hash, NULL, 0);
}
__host__
void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, g_hash, d_filter, 0);
}

5
x16/x16r.cu

@@ -250,7 +250,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -491,8 +491,9 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
case ECHO:
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else
else {
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
}
TRACE("echo :");
break;
case HAMSI:

5
x16/x16s.cu

@@ -248,7 +248,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce,
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -489,8 +489,9 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce,
case ECHO:
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else
else {
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
}
TRACE("echo :");
break;
case HAMSI:

632
x17/sonoa.cu

@@ -0,0 +1,632 @@
/**
* X17-based SONOA algorithm
**/
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
#include "sph/sph_whirlpool.h"
#include "sph/sph_sha2.h"
#include "sph/sph_haval.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "x11/cuda_x11.h"
#define NBN 2
static uint32_t *d_hash[MAX_GPUS];
extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_free(int thr_id);
extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_free(int thr_id);
extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen);
// CPU Hash Validation
extern "C" void sonoa_hash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[64];
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
sph_hamsi512_context ctx_hamsi;
sph_fugue512_context ctx_fugue;
sph_shabal512_context ctx_shabal;
sph_whirlpool_context ctx_whirlpool;
sph_sha512_context ctx_sha512;
sph_haval256_5_context ctx_haval;
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*)hash);
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512_init(&ctx_luffa);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512_init(&ctx_simd);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512_init(&ctx_hamsi);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
sph_whirlpool_close(&ctx_whirlpool, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
sph_whirlpool_close(&ctx_whirlpool, (void*)hash);
sph_sha512_init(&ctx_sha512);
sph_sha512(&ctx_sha512, (const void*)hash, 64);
sph_sha512_close(&ctx_sha512, (void*)hash);
sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
sph_whirlpool_close(&ctx_whirlpool, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
sph_whirlpool_close(&ctx_whirlpool, (void*)hash);
sph_sha512(&ctx_sha512, (const void*)hash, 64);
sph_sha512_close(&ctx_sha512, (void*)hash);
sph_haval256_5_init(&ctx_haval);
sph_haval256_5(&ctx_haval, (const void*)hash, 64);
sph_haval256_5_close(&ctx_haval, (void*)hash);
memcpy(output, hash, 32);
}
#define x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash) \
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \
if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \
else x16_echo512_cpu_hash_64(thr_id, throughput, d_hash)
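For readability of the repeated sonoa rounds below, the macro above folds the SIMD step and the arch-dependent ECHO step into one line; each use of x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]) expands to:

x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (use_compat_kernels[thr_id])
    x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else
    x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);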
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
uint32_t default_throughput = 1 << 18;
if (device_sm[dev_id] <= 500) default_throughput = 1 << 18;
else if (device_sm[dev_id] <= 520) default_throughput = 1 << 18;
else if (device_sm[dev_id] > 520) default_throughput = (1 << 19) + (1 << 18);
uint32_t throughput = cuda_default_throughput(thr_id, default_throughput);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
throughput &= 0xFFFFFF00;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00ff;
if (!init[thr_id])
{
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}
gpulog(LOG_INFO,thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
x11_luffaCubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
x15_whirlpool_cpu_init(thr_id, throughput, 0);
x17_sha512_cpu_init(thr_id, throughput);
x17_haval256_cpu_init(thr_id, throughput);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput));
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
int warn = 0;
uint32_t _ALIGN(64) endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 256); order++;
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
sonoa_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
work->valid_nonces = 1;
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
sonoa_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!warn) {
warn++;
pdata[19] = work->nonces[0] + 1;
continue;
} else {
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
warn = 0;
}
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
extern "C" void free_sonoa(int thr_id)
{
if (!init[thr_id])
return;
cudaDeviceSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);
x15_whirlpool_cpu_free(thr_id);
cudaDeviceSynchronize();
init[thr_id] = false;
}
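
A note for readers scanning the long kernel sequence in scanhash_sonoa above: every *_cpu_hash_64 launch rewrites the 64-byte per-nonce state in d_hash[thr_id] in place, so the whole algorithm is a linear chain of in-place transforms ended by one candidate scan (cuda_check_hash) and a CPU-side re-hash (sonoa_hash) of any candidate. A minimal host-side sketch of that pattern, assuming hypothetical names stage_fn and run_chain used only for illustration (this is not ccminer code):

#include <cstdint>
#include <vector>
#include <functional>

// Hypothetical sketch: each stage mutates a 64-byte per-nonce state in
// place, mirroring one *_cpu_hash_64 kernel launch in the loop above.
using stage_fn = std::function<void(uint32_t state[16])>; // 16 x 32-bit words

static bool run_chain(const std::vector<stage_fn> &chain,
                      uint32_t state[16], uint32_t Htarg)
{
	for (const stage_fn &stage : chain)
		stage(state);          // analogous to one kernel launch (order++)
	// cheap pre-test on the top word, like vhash[7] <= Htarg above;
	// the real code then runs the full target comparison (fulltest)
	return state[7] <= Htarg;
}

The chain is deliberately not uniform: SONOA repeats the X11-style core several times and appends hamsi, fugue, shabal, whirlpool, sha512 and haval in the later passes, which is why the same kernels recur above with a growing tail.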

16
x17/x17.cu

@@ -32,6 +32,8 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS];
extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -151,12 +153,14 @@ extern "C" void x17hash(void *output, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8;
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@@ -166,7 +170,7 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -174,6 +178,11 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
@@ -183,7 +192,6 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
x11_luffaCubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
@@ -220,7 +228,11 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else {
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
}
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
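
The compat-kernel switch shown in this hunk follows the same pattern as sonoa.cu above: cuda_get_arch() records the device's compute capability, and pre-Maxwell boards (arch < 500) fall back to the legacy x11_echo512 kernel while newer ones use the dedicated 64-byte x16_echo512 variant. A condensed sketch of that gate, assuming the externs and variables already declared in scanhash_x17 (the helper name echo512_stage is hypothetical and not part of ccminer):

// Hypothetical condensation of the arch gate above, not a drop-in replacement.
// Relies on use_compat_kernels[], x11_echo512_cpu_hash_64() and
// x16_echo512_cpu_hash_64() as declared in this file.
static void echo512_stage(int thr_id, uint32_t throughput, uint32_t startNonce,
                          uint32_t *d_hash, int &order)
{
	if (use_compat_kernels[thr_id])
		x11_echo512_cpu_hash_64(thr_id, throughput, startNonce, NULL, d_hash, order++);
	else {
		x16_echo512_cpu_hash_64(thr_id, throughput, d_hash);
		order++;
	}
}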
