handle cryptonight light variant
Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
This commit is contained in:
parent
39aad5a003
commit
93adb56c8e
@ -40,7 +40,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
|
|||||||
Algo256/blake2s.cu sph/blake2s.c \
|
Algo256/blake2s.cu sph/blake2s.c \
|
||||||
Algo256/bmw.cu Algo256/cuda_bmw.cu \
|
Algo256/bmw.cu Algo256/cuda_bmw.cu \
|
||||||
crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \
|
crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \
|
||||||
crypto/cryptonight.cu crypto/cuda_cryptonight_core.cu crypto/cuda_cryptonight_extra.cu \
|
crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \
|
||||||
|
crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \
|
||||||
crypto/cryptonight-cpu.cpp crypto/oaes_lib.cpp crypto/aesb.cpp crypto/cpu/c_keccak.c \
|
crypto/cryptonight-cpu.cpp crypto/oaes_lib.cpp crypto/aesb.cpp crypto/cpu/c_keccak.c \
|
||||||
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
|
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
|
||||||
JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
|
JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
|
||||||
@ -107,7 +108,10 @@ Algo256/blake256.o: Algo256/blake256.cu
|
|||||||
Algo256/cuda_bmw.o: Algo256/cuda_bmw.cu
|
Algo256/cuda_bmw.o: Algo256/cuda_bmw.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
|
||||||
|
|
||||||
crypto/cuda_cryptonight_extra.o: crypto/cuda_cryptonight_extra.cu
|
crypto/cryptonight-core.o: crypto/cryptonight-core.cu
|
||||||
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
|
||||||
|
|
||||||
|
crypto/cryptonight-extra.o: crypto/cryptonight-extra.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) -o $@ -c $<
|
||||||
|
|
||||||
heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
|
heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
|
||||||
|
17
README.txt
17
README.txt
@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
ccminer 2.0 (December 2016) "Boolberry's WildKeccak RPC 2.0"
|
ccminer 2.0 (January 2016) "Cryptonight & other RPC 2.0 algos"
|
||||||
---------------------------------------------------------------
|
---------------------------------------------------------------
|
||||||
|
|
||||||
***************************************************************
|
***************************************************************
|
||||||
@ -8,8 +8,12 @@ If you find this tool useful and like to support its continuous
|
|||||||
|
|
||||||
tpruvot@github:
|
tpruvot@github:
|
||||||
BTC : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
|
BTC : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
|
||||||
DRK : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3
|
DCR : DsUCcACGcyP8McNMRXQwbtpDxaVUYLDQDeU
|
||||||
ZRC : ZEcubH2xp2mpuwxMjy7wZThr5AzLGu3mqT
|
LBC : bKe6pLqELL3HHSbpJXxSdn5RrY2bfrkRhF
|
||||||
|
|
||||||
|
Alexis:
|
||||||
|
BTC : 14EgXD7fPYD4sHBXWUi46VeiTVXNq765B8
|
||||||
|
XVC : Vr5oCen8NrY6ekBWFaaWjCUFBH4dyiS57W
|
||||||
|
|
||||||
DJM34:
|
DJM34:
|
||||||
BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze
|
BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze
|
||||||
@ -52,6 +56,8 @@ Vanilla (Blake256 8-rounds - double sha256)
|
|||||||
Vertcoin Lyra2RE
|
Vertcoin Lyra2RE
|
||||||
Ziftrcoin (ZR5)
|
Ziftrcoin (ZR5)
|
||||||
Boolberry (Wild Keccak)
|
Boolberry (Wild Keccak)
|
||||||
|
Monero (Cryptonight)
|
||||||
|
Aeon (Cryptonight-light)
|
||||||
|
|
||||||
where some of these coins have a VERY NOTABLE nVidia advantage
|
where some of these coins have a VERY NOTABLE nVidia advantage
|
||||||
over competing AMD (OpenCL Only) implementations.
|
over competing AMD (OpenCL Only) implementations.
|
||||||
@ -74,6 +80,8 @@ its command line interface and options.
|
|||||||
blakecoin use to mine Old Blake 256
|
blakecoin use to mine Old Blake 256
|
||||||
blake2s use to mine Nevacoin (Blake2-S 256)
|
blake2s use to mine Nevacoin (Blake2-S 256)
|
||||||
bmw use to mine Midnight
|
bmw use to mine Midnight
|
||||||
|
cryptolight use to mine AEON cryptonight (MEM/2)
|
||||||
|
cryptonight use to mine XMR cryptonight
|
||||||
c11/flax use to mine Chaincoin and Flax
|
c11/flax use to mine Chaincoin and Flax
|
||||||
decred use to mine Decred 180 bytes Blake256-14
|
decred use to mine Decred 180 bytes Blake256-14
|
||||||
deep use to mine Deepcoin
|
deep use to mine Deepcoin
|
||||||
@ -186,9 +194,10 @@ Scrypt specific options:
|
|||||||
--no-autotune disable auto-tuning of kernel launch parameters
|
--no-autotune disable auto-tuning of kernel launch parameters
|
||||||
|
|
||||||
|
|
||||||
Boolberry specific:
|
XMR and Wildkeccak specific:
|
||||||
-l, --launch-config gives the launch configuration for each kernel
|
-l, --launch-config gives the launch configuration for each kernel
|
||||||
in a comma separated list, one per device.
|
in a comma separated list, one per device.
|
||||||
|
Wildkeccak specific:
|
||||||
-k, --scratchpad url Url used to download the scratchpad cache.
|
-k, --scratchpad url Url used to download the scratchpad cache.
|
||||||
|
|
||||||
|
|
||||||
|
4
algos.h
4
algos.h
@ -10,6 +10,7 @@ enum sha_algos {
|
|||||||
ALGO_BLAKE2S,
|
ALGO_BLAKE2S,
|
||||||
ALGO_BMW,
|
ALGO_BMW,
|
||||||
ALGO_C11,
|
ALGO_C11,
|
||||||
|
ALGO_CRYPTOLIGHT,
|
||||||
ALGO_CRYPTONIGHT,
|
ALGO_CRYPTONIGHT,
|
||||||
ALGO_DEEP,
|
ALGO_DEEP,
|
||||||
ALGO_DECRED,
|
ALGO_DECRED,
|
||||||
@ -63,6 +64,7 @@ static const char *algo_names[] = {
|
|||||||
"blake2s",
|
"blake2s",
|
||||||
"bmw",
|
"bmw",
|
||||||
"c11",
|
"c11",
|
||||||
|
"cryptolight",
|
||||||
"cryptonight",
|
"cryptonight",
|
||||||
"deep",
|
"deep",
|
||||||
"decred",
|
"decred",
|
||||||
@ -123,6 +125,8 @@ static inline int algo_to_int(char* arg)
|
|||||||
// some aliases...
|
// some aliases...
|
||||||
if (!strcasecmp("all", arg))
|
if (!strcasecmp("all", arg))
|
||||||
i = ALGO_AUTO;
|
i = ALGO_AUTO;
|
||||||
|
else if (!strcasecmp("cryptonight-light", arg))
|
||||||
|
i = ALGO_CRYPTOLIGHT;
|
||||||
else if (!strcasecmp("flax", arg))
|
else if (!strcasecmp("flax", arg))
|
||||||
i = ALGO_C11;
|
i = ALGO_C11;
|
||||||
else if (!strcasecmp("diamond", arg))
|
else if (!strcasecmp("diamond", arg))
|
||||||
|
@ -48,6 +48,7 @@ void algo_free_all(int thr_id)
|
|||||||
free_blake2s(thr_id);
|
free_blake2s(thr_id);
|
||||||
free_bmw(thr_id);
|
free_bmw(thr_id);
|
||||||
free_c11(thr_id);
|
free_c11(thr_id);
|
||||||
|
free_cryptolight(thr_id);
|
||||||
free_cryptonight(thr_id);
|
free_cryptonight(thr_id);
|
||||||
free_decred(thr_id);
|
free_decred(thr_id);
|
||||||
free_deep(thr_id);
|
free_deep(thr_id);
|
||||||
|
18
ccminer.cpp
18
ccminer.cpp
@ -223,6 +223,8 @@ Options:\n\
|
|||||||
blake2s Blake2-S 256 (NEVA)\n\
|
blake2s Blake2-S 256 (NEVA)\n\
|
||||||
blakecoin Fast Blake 256 (8 rounds)\n\
|
blakecoin Fast Blake 256 (8 rounds)\n\
|
||||||
bmw BMW 256\n\
|
bmw BMW 256\n\
|
||||||
|
cryptolight AEON cryptonight (MEM/2)\n\
|
||||||
|
cryptonight XMR cryptonight\n\
|
||||||
c11/flax X11 variant\n\
|
c11/flax X11 variant\n\
|
||||||
decred Decred Blake256\n\
|
decred Decred Blake256\n\
|
||||||
deep Deepcoin\n\
|
deep Deepcoin\n\
|
||||||
@ -620,6 +622,7 @@ static bool work_decode(const json_t *val, struct work *work)
|
|||||||
data_size = 80;
|
data_size = 80;
|
||||||
adata_sz = data_size / 4;
|
adata_sz = data_size / 4;
|
||||||
break;
|
break;
|
||||||
|
case ALGO_CRYPTOLIGHT:
|
||||||
case ALGO_CRYPTONIGHT:
|
case ALGO_CRYPTONIGHT:
|
||||||
case ALGO_WILDKECCAK:
|
case ALGO_WILDKECCAK:
|
||||||
return rpc2_job_decode(val, work);
|
return rpc2_job_decode(val, work);
|
||||||
@ -1767,7 +1770,7 @@ static void *miner_thread(void *userdata)
|
|||||||
nonceptr = (uint32_t*) (((char*)work.data) + 1);
|
nonceptr = (uint32_t*) (((char*)work.data) + 1);
|
||||||
wcmpoft = 2;
|
wcmpoft = 2;
|
||||||
wcmplen = 32;
|
wcmplen = 32;
|
||||||
} else if (opt_algo == ALGO_CRYPTONIGHT) {
|
} else if (opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_CRYPTONIGHT) {
|
||||||
nonceptr = (uint32_t*) (((char*)work.data) + 39);
|
nonceptr = (uint32_t*) (((char*)work.data) + 39);
|
||||||
wcmplen = 39;
|
wcmplen = 39;
|
||||||
}
|
}
|
||||||
@ -1802,7 +1805,7 @@ static void *miner_thread(void *userdata)
|
|||||||
extrajob = false;
|
extrajob = false;
|
||||||
if (stratum_gen_work(&stratum, &g_work))
|
if (stratum_gen_work(&stratum, &g_work))
|
||||||
g_work_time = time(NULL);
|
g_work_time = time(NULL);
|
||||||
if (opt_algo == ALGO_CRYPTONIGHT)
|
if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT)
|
||||||
nonceptr[0] += 0x100000;
|
nonceptr[0] += 0x100000;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -1845,7 +1848,7 @@ static void *miner_thread(void *userdata)
|
|||||||
wcmplen -= 4;
|
wcmplen -= 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opt_algo == ALGO_CRYPTONIGHT) {
|
if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) {
|
||||||
uint32_t oldpos = nonceptr[0];
|
uint32_t oldpos = nonceptr[0];
|
||||||
if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
|
if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
|
||||||
memcpy(&work, &g_work, sizeof(struct work));
|
memcpy(&work, &g_work, sizeof(struct work));
|
||||||
@ -2097,6 +2100,7 @@ static void *miner_thread(void *userdata)
|
|||||||
case ALGO_VELTOR:
|
case ALGO_VELTOR:
|
||||||
minmax = 0x80000;
|
minmax = 0x80000;
|
||||||
break;
|
break;
|
||||||
|
case ALGO_CRYPTOLIGHT:
|
||||||
case ALGO_CRYPTONIGHT:
|
case ALGO_CRYPTONIGHT:
|
||||||
case ALGO_SCRYPT_JANE:
|
case ALGO_SCRYPT_JANE:
|
||||||
minmax = 0x1000;
|
minmax = 0x1000;
|
||||||
@ -2160,6 +2164,9 @@ static void *miner_thread(void *userdata)
|
|||||||
case ALGO_C11:
|
case ALGO_C11:
|
||||||
rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
|
rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
|
||||||
break;
|
break;
|
||||||
|
case ALGO_CRYPTOLIGHT:
|
||||||
|
rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
|
||||||
|
break;
|
||||||
case ALGO_CRYPTONIGHT:
|
case ALGO_CRYPTONIGHT:
|
||||||
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
|
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
|
||||||
break;
|
break;
|
||||||
@ -2306,6 +2313,7 @@ static void *miner_thread(void *userdata)
|
|||||||
// todo: update all algos to use work->nonces and pdata[19] as counter
|
// todo: update all algos to use work->nonces and pdata[19] as counter
|
||||||
switch (opt_algo) {
|
switch (opt_algo) {
|
||||||
case ALGO_BLAKE2S:
|
case ALGO_BLAKE2S:
|
||||||
|
case ALGO_CRYPTOLIGHT:
|
||||||
case ALGO_CRYPTONIGHT:
|
case ALGO_CRYPTONIGHT:
|
||||||
case ALGO_DECRED:
|
case ALGO_DECRED:
|
||||||
case ALGO_LBRY:
|
case ALGO_LBRY:
|
||||||
@ -2830,7 +2838,7 @@ static void show_usage_and_exit(int status)
|
|||||||
if (opt_algo == ALGO_SCRYPT || opt_algo == ALGO_SCRYPT_JANE) {
|
if (opt_algo == ALGO_SCRYPT || opt_algo == ALGO_SCRYPT_JANE) {
|
||||||
printf(scrypt_usage);
|
printf(scrypt_usage);
|
||||||
}
|
}
|
||||||
if (opt_algo == ALGO_WILDKECCAK || opt_algo == ALGO_CRYPTONIGHT) {
|
if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_WILDKECCAK) {
|
||||||
printf(xmr_usage);
|
printf(xmr_usage);
|
||||||
}
|
}
|
||||||
proper_exit(status);
|
proper_exit(status);
|
||||||
@ -3685,7 +3693,7 @@ int main(int argc, char *argv[])
|
|||||||
allow_mininginfo = false;
|
allow_mininginfo = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opt_algo == ALGO_CRYPTONIGHT) {
|
if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) {
|
||||||
rpc2_init();
|
rpc2_init();
|
||||||
if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0");
|
if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0");
|
||||||
}
|
}
|
||||||
|
@ -235,6 +235,7 @@
|
|||||||
<ClCompile Include="crypto\xmr-rpc.cpp" />
|
<ClCompile Include="crypto\xmr-rpc.cpp" />
|
||||||
<ClCompile Include="crypto\aesb.cpp" />
|
<ClCompile Include="crypto\aesb.cpp" />
|
||||||
<ClCompile Include="crypto\oaes_lib.cpp" />
|
<ClCompile Include="crypto\oaes_lib.cpp" />
|
||||||
|
<ClCompile Include="crypto\cryptolight-cpu.cpp" />
|
||||||
<ClCompile Include="crypto\cryptonight-cpu.cpp" />
|
<ClCompile Include="crypto\cryptonight-cpu.cpp" />
|
||||||
<ClCompile Include="crypto\cpu\c_keccak.c" />
|
<ClCompile Include="crypto\cpu\c_keccak.c" />
|
||||||
<ClCompile Include="nvapi.cpp" />
|
<ClCompile Include="nvapi.cpp" />
|
||||||
@ -265,13 +266,15 @@
|
|||||||
<CudaCompile Include="Algo256\cuda_bmw.cu">
|
<CudaCompile Include="Algo256\cuda_bmw.cu">
|
||||||
<MaxRegCount>76</MaxRegCount>
|
<MaxRegCount>76</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="crypto\cryptonight.cu">
|
<CudaCompile Include="crypto\cryptolight.cu" />
|
||||||
<MaxRegCount>128</MaxRegCount>
|
<CudaCompile Include="crypto\cryptolight-core.cu">
|
||||||
|
<MaxRegCount>64</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="crypto\cuda_cryptonight_core.cu">
|
<CudaCompile Include="crypto\cryptonight.cu" />
|
||||||
<MaxRegCount>128</MaxRegCount>
|
<CudaCompile Include="crypto\cryptonight-core.cu">
|
||||||
|
<MaxRegCount>64</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="crypto\cuda_cryptonight_extra.cu">
|
<CudaCompile Include="crypto\cryptonight-extra.cu">
|
||||||
<MaxRegCount>255</MaxRegCount>
|
<MaxRegCount>255</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<ClInclude Include="crypto\cn_aes.cuh" />
|
<ClInclude Include="crypto\cn_aes.cuh" />
|
||||||
@ -335,8 +338,9 @@
|
|||||||
<ClInclude Include="compat\unistd.h" />
|
<ClInclude Include="compat\unistd.h" />
|
||||||
<ClInclude Include="compat\winansi.h" />
|
<ClInclude Include="compat\winansi.h" />
|
||||||
<ClInclude Include="compat\ccminer-config.h" />
|
<ClInclude Include="compat\ccminer-config.h" />
|
||||||
<ClInclude Include="crypto\mman.h" />
|
<ClInclude Include="crypto\cryptolight.h" />
|
||||||
<ClInclude Include="crypto\cryptonight.h" />
|
<ClInclude Include="crypto\cryptonight.h" />
|
||||||
|
<ClInclude Include="crypto\mman.h" />
|
||||||
<ClInclude Include="crypto\wildkeccak.h" />
|
<ClInclude Include="crypto\wildkeccak.h" />
|
||||||
<ClInclude Include="crypto\xmr-rpc.h" />
|
<ClInclude Include="crypto\xmr-rpc.h" />
|
||||||
<ClInclude Include="cuda_groestlcoin.h" />
|
<ClInclude Include="cuda_groestlcoin.h" />
|
||||||
|
@ -823,13 +823,19 @@
|
|||||||
<CudaCompile Include="sia\sia.cu">
|
<CudaCompile Include="sia\sia.cu">
|
||||||
<Filter>Source Files\sia</Filter>
|
<Filter>Source Files\sia</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="crypto\cryptolight.cu">
|
||||||
|
<Filter>Source Files\CUDA\xmr</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="crypto\cryptolight-core.cu">
|
||||||
|
<Filter>Source Files\CUDA\xmr</Filter>
|
||||||
|
</CudaCompile>
|
||||||
<CudaCompile Include="crypto\cryptonight.cu">
|
<CudaCompile Include="crypto\cryptonight.cu">
|
||||||
<Filter>Source Files\CUDA\xmr</Filter>
|
<Filter>Source Files\CUDA\xmr</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="crypto\cuda_cryptonight_core.cu">
|
<CudaCompile Include="crypto\cryptonight-core.cu">
|
||||||
<Filter>Source Files\CUDA\xmr</Filter>
|
<Filter>Source Files\CUDA\xmr</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="crypto\cuda_cryptonight_extra.cu">
|
<CudaCompile Include="crypto\cryptonight-extra.cu">
|
||||||
<Filter>Source Files\CUDA\xmr</Filter>
|
<Filter>Source Files\CUDA\xmr</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="crypto\wildkeccak.cu">
|
<CudaCompile Include="crypto\wildkeccak.cu">
|
||||||
@ -851,4 +857,4 @@
|
|||||||
<Filter>Ressources</Filter>
|
<Filter>Ressources</Filter>
|
||||||
</Text>
|
</Text>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
274
crypto/cryptolight-core.cu
Normal file
274
crypto/cryptolight-core.cu
Normal file
@ -0,0 +1,274 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include <cuda.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
#include "cryptolight.h"
|
||||||
|
#define LONG_SHL_IDX 18
|
||||||
|
#define LONG_LOOPS32 0x40000
|
||||||
|
|
||||||
|
#ifdef WIN32 /* todo: --interactive */
|
||||||
|
static __thread int cn_bfactor = 8;
|
||||||
|
static __thread int cn_bsleep = 100;
|
||||||
|
#else
|
||||||
|
static __thread int cn_bfactor = 0;
|
||||||
|
static __thread int cn_bsleep = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "cn_aes.cuh"
|
||||||
|
|
||||||
|
#define MUL_SUM_XOR_DST(a,c,dst) { \
|
||||||
|
uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
|
||||||
|
hi += ((uint64_t *)c)[0]; \
|
||||||
|
((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
|
||||||
|
((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
|
||||||
|
((uint64_t *)dst)[0] = hi; \
|
||||||
|
((uint64_t *)dst)[1] = lo; }
|
||||||
|
|
||||||
|
__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
|
||||||
|
{
|
||||||
|
*product_hi = __umul64hi(multiplier, multiplicand);
|
||||||
|
return(multiplier * multiplicand);
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void cryptolight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
|
||||||
|
{
|
||||||
|
__shared__ uint32_t sharedMemory[1024];
|
||||||
|
|
||||||
|
cn_aes_gpu_init(sharedMemory);
|
||||||
|
|
||||||
|
const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
|
||||||
|
const int sub = (threadIdx.x & 7) << 2;
|
||||||
|
|
||||||
|
if(thread < threads)
|
||||||
|
{
|
||||||
|
uint32_t key[40], text[4];
|
||||||
|
|
||||||
|
MEMCPY8(key, ctx_key1 + thread * 40, 20);
|
||||||
|
MEMCPY8(text, ctx_state + thread * 50 + sub + 16, 2);
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
for(int i = 0; i < LONG_LOOPS32; i += 32)
|
||||||
|
{
|
||||||
|
cn_aes_pseudo_round_mut(sharedMemory, text, key);
|
||||||
|
MEMCPY8(&long_state[(thread << LONG_SHL_IDX) + sub + i], text, 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
|
||||||
|
{
|
||||||
|
__shared__ uint32_t sharedMemory[1024];
|
||||||
|
|
||||||
|
cn_aes_gpu_init(sharedMemory);
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#if 0 && __CUDA_ARCH__ >= 300
|
||||||
|
|
||||||
|
const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
|
||||||
|
const int sub = threadIdx.x & 3;
|
||||||
|
|
||||||
|
if(thread < threads)
|
||||||
|
{
|
||||||
|
const int batchsize = ITER >> (2 + bfactor);
|
||||||
|
const int start = partidx * batchsize;
|
||||||
|
const int end = start + batchsize;
|
||||||
|
uint32_t * __restrict__ long_state = &d_long_state[thread << LONG_SHL_IDX];
|
||||||
|
uint32_t * __restrict__ ctx_a = d_ctx_a + thread * 4;
|
||||||
|
uint32_t * __restrict__ ctx_b = d_ctx_b + thread * 4;
|
||||||
|
uint32_t a, b, c, x[4];
|
||||||
|
uint32_t t1[4], t2[4], res;
|
||||||
|
uint64_t reshi, reslo;
|
||||||
|
int j;
|
||||||
|
|
||||||
|
a = ctx_a[sub];
|
||||||
|
b = ctx_b[sub];
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(int i = start; i < end; ++i)
|
||||||
|
{
|
||||||
|
//j = ((uint32_t *)a)[0] & 0xFFFF0;
|
||||||
|
j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2;
|
||||||
|
|
||||||
|
//cn_aes_single_round(sharedMemory, &long_state[j], c, a);
|
||||||
|
x[0] = long_state[j + sub];
|
||||||
|
x[1] = __shfl((int)x[0], sub + 1, 4);
|
||||||
|
x[2] = __shfl((int)x[0], sub + 2, 4);
|
||||||
|
x[3] = __shfl((int)x[0], sub + 3, 4);
|
||||||
|
c = a ^
|
||||||
|
t_fn0(x[0] & 0xff) ^
|
||||||
|
t_fn1((x[1] >> 8) & 0xff) ^
|
||||||
|
t_fn2((x[2] >> 16) & 0xff) ^
|
||||||
|
t_fn3((x[3] >> 24) & 0xff);
|
||||||
|
|
||||||
|
//XOR_BLOCKS_DST(c, b, &long_state[j]);
|
||||||
|
long_state[j + sub] = c ^ b;
|
||||||
|
|
||||||
|
//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0xFFFF0]);
|
||||||
|
j = (__shfl((int)c, 0, 4) & E2I_MASK1) >> 2;
|
||||||
|
#pragma unroll
|
||||||
|
for(int k = 0; k < 2; k++)
|
||||||
|
t1[k] = __shfl((int)c, k, 4);
|
||||||
|
#pragma unroll
|
||||||
|
for(int k = 0; k < 4; k++)
|
||||||
|
t2[k] = __shfl((int)a, k, 4);
|
||||||
|
asm(
|
||||||
|
"mad.lo.u64 %0, %2, %3, %4;\n\t"
|
||||||
|
"mad.hi.u64 %1, %2, %3, %5;\n\t"
|
||||||
|
: "=l"(reslo), "=l"(reshi)
|
||||||
|
: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)long_state)[j >> 1]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
|
||||||
|
res = (sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0);
|
||||||
|
a = long_state[j + sub] ^ res;
|
||||||
|
long_state[j + sub] = res;
|
||||||
|
|
||||||
|
//j = ((uint32_t *)a)[0] & 0xFFFF0;
|
||||||
|
j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2;
|
||||||
|
|
||||||
|
//cn_aes_single_round(sharedMemory, &long_state[j], b, a);
|
||||||
|
x[0] = long_state[j + sub];
|
||||||
|
x[1] = __shfl((int)x[0], sub + 1, 4);
|
||||||
|
x[2] = __shfl((int)x[0], sub + 2, 4);
|
||||||
|
x[3] = __shfl((int)x[0], sub + 3, 4);
|
||||||
|
b = a ^
|
||||||
|
t_fn0(x[0] & 0xff) ^
|
||||||
|
t_fn1((x[1] >> 8) & 0xff) ^
|
||||||
|
t_fn2((x[2] >> 16) & 0xff) ^
|
||||||
|
t_fn3((x[3] >> 24) & 0xff);
|
||||||
|
|
||||||
|
//XOR_BLOCKS_DST(b, c, &long_state[j]);
|
||||||
|
long_state[j + sub] = c ^ b;
|
||||||
|
|
||||||
|
//MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0xFFFF0]);
|
||||||
|
j = (__shfl((int)b, 0, 4) & E2I_MASK1) >> 2;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for(int k = 0; k < 2; k++)
|
||||||
|
t1[k] = __shfl((int)b, k, 4);
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for(int k = 0; k < 4; k++)
|
||||||
|
t2[k] = __shfl((int)a, k, 4);
|
||||||
|
asm(
|
||||||
|
"mad.lo.u64 %0, %2, %3, %4;\n\t"
|
||||||
|
"mad.hi.u64 %1, %2, %3, %5;\n\t"
|
||||||
|
: "=l"(reslo), "=l"(reshi)
|
||||||
|
: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)long_state)[j >> 1]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
|
||||||
|
res = (sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0);
|
||||||
|
a = long_state[j + sub] ^ res;
|
||||||
|
long_state[j + sub] = res;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(bfactor > 0)
|
||||||
|
{
|
||||||
|
ctx_a[sub] = a;
|
||||||
|
ctx_b[sub] = b;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // __CUDA_ARCH__ < 300
|
||||||
|
|
||||||
|
const int thread = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
if(thread < threads)
|
||||||
|
{
|
||||||
|
const int batchsize = ITER >> (2 + bfactor);
|
||||||
|
const int start = partidx * batchsize;
|
||||||
|
const int end = start + batchsize;
|
||||||
|
const off_t longptr = (off_t) thread << LONG_SHL_IDX;
|
||||||
|
uint32_t * long_state = &d_long_state[longptr];
|
||||||
|
uint32_t * ctx_a = &d_ctx_a[thread * 4];
|
||||||
|
uint32_t * ctx_b = &d_ctx_b[thread * 4];
|
||||||
|
uint32_t a[4], b[4];
|
||||||
|
|
||||||
|
MEMCPY8(a, ctx_a, 2);
|
||||||
|
MEMCPY8(b, ctx_b, 2);
|
||||||
|
|
||||||
|
for(int i = start; i < end; i++) // end = 262144
|
||||||
|
{
|
||||||
|
uint32_t c[4];
|
||||||
|
uint32_t j = (a[0] >> 2) & E2I_MASK2;
|
||||||
|
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
|
||||||
|
XOR_BLOCKS_DST(c, b, &long_state[j]);
|
||||||
|
MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & E2I_MASK2]);
|
||||||
|
|
||||||
|
j = (a[0] >> 2) & E2I_MASK2;
|
||||||
|
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
|
||||||
|
XOR_BLOCKS_DST(b, c, &long_state[j]);
|
||||||
|
MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & E2I_MASK2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(bfactor > 0)
|
||||||
|
{
|
||||||
|
MEMCPY8(ctx_a, a, 2);
|
||||||
|
MEMCPY8(ctx_b, b, 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // __CUDA_ARCH__ >= 300
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void cryptolight_core_gpu_phase3(int threads, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2)
|
||||||
|
{
|
||||||
|
__shared__ uint32_t sharedMemory[1024];
|
||||||
|
|
||||||
|
cn_aes_gpu_init(sharedMemory);
|
||||||
|
|
||||||
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
|
||||||
|
int sub = (threadIdx.x & 7) << 2;
|
||||||
|
|
||||||
|
if(thread < threads)
|
||||||
|
{
|
||||||
|
uint32_t key[40], text[4];
|
||||||
|
MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
|
||||||
|
MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
for(int i = 0; i < LONG_LOOPS32; i += 32)
|
||||||
|
{
|
||||||
|
#pragma unroll
|
||||||
|
for(int j = 0; j < 4; j++)
|
||||||
|
text[j] ^= long_state[(thread << LONG_SHL_IDX) + sub + i + j];
|
||||||
|
|
||||||
|
cn_aes_pseudo_round_mut(sharedMemory, text, key);
|
||||||
|
}
|
||||||
|
|
||||||
|
MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
|
||||||
|
{
|
||||||
|
dim3 grid(blocks);
|
||||||
|
dim3 block(threads);
|
||||||
|
dim3 block4(threads << 2);
|
||||||
|
dim3 block8(threads << 3);
|
||||||
|
|
||||||
|
const int bfactor = cn_bfactor; // device_bfactor[thr_id];
|
||||||
|
const int bsleep = cn_bsleep; //device_bsleep[thr_id];
|
||||||
|
|
||||||
|
int i, partcount = 1 << bfactor;
|
||||||
|
int dev_id = device_map[thr_id];
|
||||||
|
|
||||||
|
cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
if(partcount > 1) usleep(bsleep);
|
||||||
|
|
||||||
|
for(i = 0; i < partcount; i++)
|
||||||
|
{
|
||||||
|
cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
if(partcount > 1) usleep(bsleep);
|
||||||
|
}
|
||||||
|
|
||||||
|
cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
}
|
229
crypto/cryptolight-cpu.cpp
Normal file
229
crypto/cryptolight-cpu.cpp
Normal file
@ -0,0 +1,229 @@
|
|||||||
|
#include <miner.h>
|
||||||
|
#include <memory.h>
|
||||||
|
|
||||||
|
#include "oaes_lib.h"
|
||||||
|
#include "cryptolight.h"
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
#include <sph/sph_blake.h>
|
||||||
|
#include <sph/sph_groestl.h>
|
||||||
|
#include <sph/sph_jh.h>
|
||||||
|
#include <sph/sph_skein.h>
|
||||||
|
#include "cpu/c_keccak.h"
|
||||||
|
}
|
||||||
|
|
||||||
|
struct cryptonight_ctx {
|
||||||
|
uint8_t long_state[MEMORY];
|
||||||
|
union cn_slow_hash_state state;
|
||||||
|
uint8_t text[INIT_SIZE_BYTE];
|
||||||
|
uint8_t a[AES_BLOCK_SIZE];
|
||||||
|
uint8_t b[AES_BLOCK_SIZE];
|
||||||
|
uint8_t c[AES_BLOCK_SIZE];
|
||||||
|
oaes_ctx* aes_ctx;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void do_blake_hash(const void* input, int len, void* output)
|
||||||
|
{
|
||||||
|
uchar hash[32];
|
||||||
|
sph_blake256_context ctx;
|
||||||
|
sph_blake256_set_rounds(14);
|
||||||
|
sph_blake256_init(&ctx);
|
||||||
|
sph_blake256(&ctx, input, len);
|
||||||
|
sph_blake256_close(&ctx, hash);
|
||||||
|
memcpy(output, hash, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void do_groestl_hash(const void* input, int len, void* output)
|
||||||
|
{
|
||||||
|
uchar hash[32];
|
||||||
|
sph_groestl256_context ctx;
|
||||||
|
sph_groestl256_init(&ctx);
|
||||||
|
sph_groestl256(&ctx, input, len);
|
||||||
|
sph_groestl256_close(&ctx, hash);
|
||||||
|
memcpy(output, hash, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void do_jh_hash(const void* input, int len, void* output)
|
||||||
|
{
|
||||||
|
uchar hash[64];
|
||||||
|
sph_jh256_context ctx;
|
||||||
|
sph_jh256_init(&ctx);
|
||||||
|
sph_jh256(&ctx, input, len);
|
||||||
|
sph_jh256_close(&ctx, hash);
|
||||||
|
memcpy(output, hash, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void do_skein_hash(const void* input, int len, void* output)
|
||||||
|
{
|
||||||
|
uchar hash[32];
|
||||||
|
sph_skein256_context ctx;
|
||||||
|
sph_skein256_init(&ctx);
|
||||||
|
sph_skein256(&ctx, input, len);
|
||||||
|
sph_skein256_close(&ctx, hash);
|
||||||
|
memcpy(output, hash, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
// todo: use sph if possible
|
||||||
|
static void keccak_hash_permutation(union hash_state *state) {
|
||||||
|
keccakf((uint64_t*)state, 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void keccak_hash_process(union hash_state *state, const uint8_t *buf, int count) {
|
||||||
|
keccak1600(buf, (int)count, (uint8_t*)state);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
|
||||||
|
extern "C" int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
|
||||||
|
extern "C" int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
|
||||||
|
extern "C" int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
|
||||||
|
|
||||||
|
static void (* const extra_hashes[4])(const void*, int, void *) = {
|
||||||
|
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
|
||||||
|
};
|
||||||
|
|
||||||
|
static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
|
||||||
|
{
|
||||||
|
// multiplier = ab = a * 2^32 + b
|
||||||
|
// multiplicand = cd = c * 2^32 + d
|
||||||
|
// ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
|
||||||
|
uint64_t a = hi_dword(multiplier);
|
||||||
|
uint64_t b = lo_dword(multiplier);
|
||||||
|
uint64_t c = hi_dword(multiplicand);
|
||||||
|
uint64_t d = lo_dword(multiplicand);
|
||||||
|
|
||||||
|
uint64_t ac = a * c;
|
||||||
|
uint64_t ad = a * d;
|
||||||
|
uint64_t bc = b * c;
|
||||||
|
uint64_t bd = b * d;
|
||||||
|
|
||||||
|
uint64_t adbc = ad + bc;
|
||||||
|
uint64_t adbc_carry = adbc < ad ? 1 : 0;
|
||||||
|
|
||||||
|
// multiplier * multiplicand = product_hi * 2^64 + product_lo
|
||||||
|
uint64_t product_lo = bd + (adbc << 32);
|
||||||
|
uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
|
||||||
|
*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
|
||||||
|
|
||||||
|
return product_lo;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t e2i(const uint8_t* a) {
|
||||||
|
//const uint32_t mask = (MEMORY / AES_BLOCK_SIZE - 1);
|
||||||
|
//return (*((uint64_t*) a) / AES_BLOCK_SIZE) & mask;
|
||||||
|
return *((uint64_t*) a) & 0xFFFF0; /* mask * AES_BLOCK_SIZE */
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) {
|
||||||
|
((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void sum_half_blocks(uint8_t* a, const uint8_t* b) {
|
||||||
|
((uint64_t*) a)[0] += ((uint64_t*) b)[0];
|
||||||
|
((uint64_t*) a)[1] += ((uint64_t*) b)[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void sum_half_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
|
||||||
|
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] + ((uint64_t*) b)[0];
|
||||||
|
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] + ((uint64_t*) b)[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, uint8_t* dst) {
|
||||||
|
((uint64_t*) dst)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) dst) + ((uint64_t*) c)[1];
|
||||||
|
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
|
||||||
|
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
|
||||||
|
hi += ((uint64_t*) c)[0];
|
||||||
|
|
||||||
|
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
|
||||||
|
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
|
||||||
|
((uint64_t*) dst)[0] = hi;
|
||||||
|
((uint64_t*) dst)[1] = lo;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void copy_block(uint8_t* dst, const uint8_t* src) {
|
||||||
|
((uint64_t*) dst)[0] = ((uint64_t*) src)[0];
|
||||||
|
((uint64_t*) dst)[1] = ((uint64_t*) src)[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void xor_blocks(uint8_t* a, const uint8_t* b) {
|
||||||
|
((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
|
||||||
|
((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
|
||||||
|
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
|
||||||
|
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
|
||||||
|
{
|
||||||
|
size_t i, j;
|
||||||
|
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
|
||||||
|
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
|
||||||
|
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
|
||||||
|
|
||||||
|
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
|
||||||
|
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||||
|
#undef RND
|
||||||
|
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
|
||||||
|
RND(0);
|
||||||
|
RND(1);
|
||||||
|
RND(2);
|
||||||
|
RND(3);
|
||||||
|
RND(4);
|
||||||
|
RND(5);
|
||||||
|
RND(6);
|
||||||
|
RND(7);
|
||||||
|
memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
|
||||||
|
}
|
||||||
|
|
||||||
|
xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
|
||||||
|
xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
|
||||||
|
|
||||||
|
for (i = 0; likely(i < ITER / 4); ++i) {
|
||||||
|
j = e2i(ctx->a);
|
||||||
|
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
|
||||||
|
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
|
||||||
|
|
||||||
|
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
|
||||||
|
|
||||||
|
j = e2i(ctx->a);
|
||||||
|
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
|
||||||
|
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
|
||||||
|
|
||||||
|
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
|
||||||
|
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
|
||||||
|
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
|
||||||
|
#undef RND
|
||||||
|
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
|
||||||
|
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
|
||||||
|
RND(0);
|
||||||
|
RND(1);
|
||||||
|
RND(2);
|
||||||
|
RND(3);
|
||||||
|
RND(4);
|
||||||
|
RND(5);
|
||||||
|
RND(6);
|
||||||
|
RND(7);
|
||||||
|
}
|
||||||
|
memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
|
||||||
|
keccak_hash_permutation(&ctx->state.hs);
|
||||||
|
|
||||||
|
int extra_algo = ctx->state.hs.b[0] & 3;
|
||||||
|
extra_hashes[extra_algo](&ctx->state, 200, output);
|
||||||
|
if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
|
||||||
|
|
||||||
|
oaes_free((OAES_CTX **) &ctx->aes_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cryptolight_hash(void* output, const void* input, int len)
|
||||||
|
{
|
||||||
|
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
|
||||||
|
cryptolight_hash_ctx(output, input, len, ctx);
|
||||||
|
free(ctx);
|
||||||
|
}
|
166
crypto/cryptolight.cu
Normal file
166
crypto/cryptolight.cu
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
|
||||||
|
#include "cryptolight.h"
|
||||||
|
|
||||||
|
extern char *device_config[MAX_GPUS]; // -l 32x16
|
||||||
|
|
||||||
|
static __thread uint32_t cn_blocks = 32;
|
||||||
|
static __thread uint32_t cn_threads = 16;
|
||||||
|
|
||||||
|
static uint32_t *d_long_state[MAX_GPUS];
|
||||||
|
static uint32_t *d_ctx_state[MAX_GPUS];
|
||||||
|
static uint32_t *d_ctx_key1[MAX_GPUS];
|
||||||
|
static uint32_t *d_ctx_key2[MAX_GPUS];
|
||||||
|
static uint32_t *d_ctx_text[MAX_GPUS];
|
||||||
|
static uint32_t *d_ctx_a[MAX_GPUS];
|
||||||
|
static uint32_t *d_ctx_b[MAX_GPUS];
|
||||||
|
|
||||||
|
static bool init[MAX_GPUS] = { 0 };
|
||||||
|
|
||||||
|
extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
||||||
|
{
|
||||||
|
int res = 0;
|
||||||
|
uint32_t throughput = 0;
|
||||||
|
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint8_t *pdata = (uint8_t*) work->data;
|
||||||
|
uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
|
||||||
|
const uint32_t first_nonce = *nonceptr;
|
||||||
|
uint32_t nonce = first_nonce;
|
||||||
|
|
||||||
|
if(opt_benchmark) {
|
||||||
|
ptarget[7] = 0x00ff;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!init[thr_id])
|
||||||
|
{
|
||||||
|
if (device_config[thr_id]) {
|
||||||
|
sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
|
||||||
|
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
|
||||||
|
gpulog(LOG_INFO, thr_id, "Using %u x %u kernel launch config, %u threads",
|
||||||
|
cn_blocks, cn_threads, throughput);
|
||||||
|
} else {
|
||||||
|
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
|
||||||
|
if (throughput != cn_blocks*cn_threads && cn_threads) {
|
||||||
|
cn_blocks = throughput / cn_threads;
|
||||||
|
throughput = cn_threads * cn_blocks;
|
||||||
|
}
|
||||||
|
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u threads (%ux%u)",
|
||||||
|
throughput2intensity(throughput), throughput, cn_blocks, cn_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) {
|
||||||
|
gpulog(LOG_ERR, thr_id, "THE 32bit VERSION CAN'T ALLOCATE MORE THAN 4GB OF MEMORY!");
|
||||||
|
gpulog(LOG_ERR, thr_id, "PLEASE REDUCE THE NUMBER OF THREADS OR BLOCKS");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaSetDevice(device_map[thr_id]);
|
||||||
|
if (opt_cudaschedule == -1 && gpu_threads == 1) {
|
||||||
|
cudaDeviceReset();
|
||||||
|
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
|
||||||
|
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
|
||||||
|
CUDA_LOG_ERROR();
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t alloc = MEMORY * throughput;
|
||||||
|
cryptonight_extra_cpu_init(thr_id, throughput);
|
||||||
|
|
||||||
|
cudaMalloc(&d_long_state[thr_id], alloc);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
cudaMalloc(&d_ctx_key2[thr_id], 40 * sizeof(uint32_t) * throughput);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
cudaMalloc(&d_ctx_text[thr_id], 32 * sizeof(uint32_t) * throughput);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
cudaMalloc(&d_ctx_a[thr_id], 4 * sizeof(uint32_t) * throughput);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
|
||||||
|
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
|
||||||
|
|
||||||
|
init[thr_id] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
|
||||||
|
|
||||||
|
cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
|
||||||
|
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
|
||||||
|
cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
|
||||||
|
cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
|
||||||
|
|
||||||
|
*hashes_done = nonce - first_nonce + throughput;
|
||||||
|
|
||||||
|
if(resNonces[0] != UINT32_MAX)
|
||||||
|
{
|
||||||
|
uint32_t vhash[8];
|
||||||
|
uint32_t tempdata[19];
|
||||||
|
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
|
||||||
|
memcpy(tempdata, pdata, 76);
|
||||||
|
*tempnonceptr = resNonces[0];
|
||||||
|
cryptolight_hash(vhash, tempdata, 76);
|
||||||
|
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
|
||||||
|
{
|
||||||
|
res = 1;
|
||||||
|
work->nonces[0] = resNonces[0];
|
||||||
|
work_set_target_ratio(work, vhash);
|
||||||
|
// second nonce
|
||||||
|
if(resNonces[1] != UINT32_MAX)
|
||||||
|
{
|
||||||
|
*tempnonceptr = resNonces[1];
|
||||||
|
cryptolight_hash(vhash, tempdata, 76);
|
||||||
|
if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
|
||||||
|
res++;
|
||||||
|
work->nonces[1] = resNonces[1];
|
||||||
|
} else if (vhash[7] > Htarg) {
|
||||||
|
gpulog(LOG_WARNING, thr_id, "result for second nonce %08x does not validate on CPU!", resNonces[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto done;
|
||||||
|
} else if (vhash[7] > Htarg) {
|
||||||
|
gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", resNonces[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((uint64_t) throughput + nonce >= max_nonce - 127) {
|
||||||
|
nonce = max_nonce;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
nonce += throughput;
|
||||||
|
gpulog(LOG_DEBUG, thr_id, "nonce %08x", nonce);
|
||||||
|
|
||||||
|
} while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + nonce);
|
||||||
|
|
||||||
|
done:
|
||||||
|
gpulog(LOG_DEBUG, thr_id, "nonce %08x exit", nonce);
|
||||||
|
work->valid_nonces = res;
|
||||||
|
*nonceptr = nonce;
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_cryptolight(int thr_id)
|
||||||
|
{
|
||||||
|
if (!init[thr_id])
|
||||||
|
return;
|
||||||
|
|
||||||
|
cudaFree(d_long_state[thr_id]);
|
||||||
|
cudaFree(d_ctx_state[thr_id]);
|
||||||
|
cudaFree(d_ctx_key1[thr_id]);
|
||||||
|
cudaFree(d_ctx_key2[thr_id]);
|
||||||
|
cudaFree(d_ctx_text[thr_id]);
|
||||||
|
cudaFree(d_ctx_a[thr_id]);
|
||||||
|
cudaFree(d_ctx_b[thr_id]);
|
||||||
|
|
||||||
|
cryptonight_extra_cpu_free(thr_id);
|
||||||
|
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
|
||||||
|
init[thr_id] = false;
|
||||||
|
}
|
141
crypto/cryptolight.h
Normal file
141
crypto/cryptolight.h
Normal file
@ -0,0 +1,141 @@
|
|||||||
|
#pragma once
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <miner.h>
|
||||||
|
|
||||||
|
#ifdef __INTELLISENSE__
|
||||||
|
/* avoid red underlining */
|
||||||
|
#define __CUDA_ARCH__ 520
|
||||||
|
struct uint3 {
|
||||||
|
unsigned int x, y, z;
|
||||||
|
};
|
||||||
|
struct uint3 threadIdx;
|
||||||
|
struct uint3 blockIdx;
|
||||||
|
struct uint3 blockDim;
|
||||||
|
#define atomicExch(p,y) (*p) = y
|
||||||
|
#define __funnelshift_r(a,b,c) 1
|
||||||
|
#define __syncthreads()
|
||||||
|
#define asm(x)
|
||||||
|
#define __shfl(a,b,c) 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MEMORY (1UL << 20) /* 1 MiB - 1048576 */
|
||||||
|
#define ITER (1UL << 19) /* 512k */
|
||||||
|
#define E2I_MASK1 0xFFFF0 /* MEMORY / AES_BLOCK_SIZE - 1 = 0xFFFF */
|
||||||
|
#define E2I_MASK2 0x3FFFC /* 0xFFFF0 >> 2 */
|
||||||
|
|
||||||
|
#define AES_BLOCK_SIZE 16
|
||||||
|
#define AES_KEY_SIZE 32
|
||||||
|
#define INIT_SIZE_BLK 8
|
||||||
|
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B
|
||||||
|
|
||||||
|
#define AES_RKEY_LEN 4
|
||||||
|
#define AES_COL_LEN 4
|
||||||
|
#define AES_ROUND_BASE 7
|
||||||
|
|
||||||
|
#ifndef HASH_SIZE
|
||||||
|
#define HASH_SIZE 32
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef HASH_DATA_AREA
|
||||||
|
#define HASH_DATA_AREA 136
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define hi_dword(x) (x >> 32)
|
||||||
|
#define lo_dword(x) (x & 0xFFFFFFFF)
|
||||||
|
|
||||||
|
#define C32(x) ((uint32_t)(x ## U))
|
||||||
|
#define T32(x) ((x) & C32(0xFFFFFFFF))
|
||||||
|
|
||||||
|
#ifndef ROTL64
|
||||||
|
#if __CUDA_ARCH__ >= 350
|
||||||
|
__forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) {
|
||||||
|
uint2 result;
|
||||||
|
if(offset >= 32) {
|
||||||
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||||
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||||
|
} else {
|
||||||
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||||
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||||
|
}
|
||||||
|
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||||
|
}
|
||||||
|
#define ROTL64(x, n) (cuda_ROTL64(x, n))
|
||||||
|
#else
|
||||||
|
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef ROTL32
|
||||||
|
#if __CUDA_ARCH__ < 350
|
||||||
|
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||||
|
#else
|
||||||
|
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef ROTR32
|
||||||
|
#if __CUDA_ARCH__ < 350
|
||||||
|
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||||
|
#else
|
||||||
|
#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MEMSET8(dst,what,cnt) { \
|
||||||
|
int i_memset8; \
|
||||||
|
uint64_t *out_memset8 = (uint64_t *)(dst); \
|
||||||
|
for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \
|
||||||
|
out_memset8[i_memset8] = (what); }
|
||||||
|
|
||||||
|
#define MEMSET4(dst,what,cnt) { \
|
||||||
|
int i_memset4; \
|
||||||
|
uint32_t *out_memset4 = (uint32_t *)(dst); \
|
||||||
|
for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \
|
||||||
|
out_memset4[i_memset4] = (what); }
|
||||||
|
|
||||||
|
#define MEMCPY8(dst,src,cnt) { \
|
||||||
|
int i_memcpy8; \
|
||||||
|
uint64_t *in_memcpy8 = (uint64_t *)(src); \
|
||||||
|
uint64_t *out_memcpy8 = (uint64_t *)(dst); \
|
||||||
|
for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \
|
||||||
|
out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; }
|
||||||
|
|
||||||
|
#define MEMCPY4(dst,src,cnt) { \
|
||||||
|
int i_memcpy4; \
|
||||||
|
uint32_t *in_memcpy4 = (uint32_t *)(src); \
|
||||||
|
uint32_t *out_memcpy4 = (uint32_t *)(dst); \
|
||||||
|
for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \
|
||||||
|
out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }
|
||||||
|
|
||||||
|
#define XOR_BLOCKS_DST(x,y,z) { \
|
||||||
|
((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
|
||||||
|
((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }
|
||||||
|
|
||||||
|
union hash_state {
|
||||||
|
uint8_t b[200];
|
||||||
|
uint64_t w[25];
|
||||||
|
};
|
||||||
|
|
||||||
|
union cn_slow_hash_state {
|
||||||
|
union hash_state hs;
|
||||||
|
struct {
|
||||||
|
uint8_t k[64];
|
||||||
|
uint8_t init[INIT_SIZE_BYTE];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaGetLastError();
|
||||||
|
if(err != cudaSuccess) {
|
||||||
|
gpulog(LOG_ERR, thr_id, "%s %s line %d", cudaGetErrorString(err), src, line);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
|
||||||
|
|
||||||
|
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
|
||||||
|
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
|
||||||
|
void cryptonight_extra_cpu_free(int thr_id);
|
||||||
|
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
|
||||||
|
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint32_t *d_ctx_state);
|
@ -8,14 +8,15 @@
|
|||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
#include "cryptonight.h"
|
#include "cryptonight.h"
|
||||||
|
#define LONG_SHL_IDX 19
|
||||||
|
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
// to prevent ui freeze
|
// to prevent ui freeze
|
||||||
int cn_bfactor = 8;
|
static __thread int cn_bfactor = 8;
|
||||||
int cn_bsleep = 100;
|
static __thread int cn_bsleep = 100;
|
||||||
#else
|
#else
|
||||||
int cn_bfactor = 0;
|
static __thread int cn_bfactor = 0;
|
||||||
int cn_bsleep = 0;
|
static __thread int cn_bsleep = 0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "cn_aes.cuh"
|
#include "cn_aes.cuh"
|
||||||
@ -94,7 +95,7 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int
|
|||||||
for(int i = start; i < end; ++i)
|
for(int i = start; i < end; ++i)
|
||||||
{
|
{
|
||||||
//j = ((uint32_t *)a)[0] & 0x1FFFF0;
|
//j = ((uint32_t *)a)[0] & 0x1FFFF0;
|
||||||
j = (__shfl((int)a, 0, 4) & 0x1FFFF0) >> 2;
|
j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2;
|
||||||
|
|
||||||
//cn_aes_single_round(sharedMemory, &long_state[j], c, a);
|
//cn_aes_single_round(sharedMemory, &long_state[j], c, a);
|
||||||
x[0] = long_state[j + sub];
|
x[0] = long_state[j + sub];
|
||||||
@ -111,7 +112,7 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int
|
|||||||
long_state[j + sub] = c ^ b;
|
long_state[j + sub] = c ^ b;
|
||||||
|
|
||||||
//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
|
//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
|
||||||
j = (__shfl((int)c, 0, 4) & 0x1FFFF0) >> 2;
|
j = (__shfl((int)c, 0, 4) & E2I_MASK1) >> 2;
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for(int k = 0; k < 2; k++)
|
for(int k = 0; k < 2; k++)
|
||||||
t1[k] = __shfl((int)c, k, 4);
|
t1[k] = __shfl((int)c, k, 4);
|
||||||
@ -128,7 +129,7 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int
|
|||||||
long_state[j + sub] = res;
|
long_state[j + sub] = res;
|
||||||
|
|
||||||
//j = ((uint32_t *)a)[0] & 0x1FFFF0;
|
//j = ((uint32_t *)a)[0] & 0x1FFFF0;
|
||||||
j = (__shfl((int)a, 0, 4) & 0x1FFFF0) >> 2;
|
j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2;
|
||||||
|
|
||||||
//cn_aes_single_round(sharedMemory, &long_state[j], b, a);
|
//cn_aes_single_round(sharedMemory, &long_state[j], b, a);
|
||||||
x[0] = long_state[j + sub];
|
x[0] = long_state[j + sub];
|
||||||
@ -145,7 +146,7 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int
|
|||||||
long_state[j + sub] = c ^ b;
|
long_state[j + sub] = c ^ b;
|
||||||
|
|
||||||
//MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
|
//MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
|
||||||
j = (__shfl((int)b, 0, 4) & 0x1FFFF0) >> 2;
|
j = (__shfl((int)b, 0, 4) & E2I_MASK1) >> 2;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for(int k = 0; k < 2; k++)
|
for(int k = 0; k < 2; k++)
|
||||||
@ -192,15 +193,15 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int
|
|||||||
for(int i = start; i < end; i++) // end = 262144
|
for(int i = start; i < end; i++) // end = 262144
|
||||||
{
|
{
|
||||||
uint32_t c[4];
|
uint32_t c[4];
|
||||||
uint32_t j = (a[0] >> 2) & 0x7FFFC;
|
uint32_t j = (a[0] >> 2) & E2I_MASK2;
|
||||||
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
|
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
|
||||||
XOR_BLOCKS_DST(c, b, &long_state[j]);
|
XOR_BLOCKS_DST(c, b, &long_state[j]);
|
||||||
MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & 0x7FFFC]);
|
MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & E2I_MASK2]);
|
||||||
|
|
||||||
j = (a[0] >> 2) & 0x7FFFC;
|
j = (a[0] >> 2) & E2I_MASK2;
|
||||||
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
|
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
|
||||||
XOR_BLOCKS_DST(b, c, &long_state[j]);
|
XOR_BLOCKS_DST(b, c, &long_state[j]);
|
||||||
MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & 0x7FFFC]);
|
MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & E2I_MASK2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(bfactor > 0)
|
if(bfactor > 0)
|
@ -81,7 +81,7 @@ static void (* const extra_hashes[4])(const void*, size_t, void *) = {
|
|||||||
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
|
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
|
||||||
};
|
};
|
||||||
|
|
||||||
uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
|
static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
|
||||||
{
|
{
|
||||||
// multiplier = ab = a * 2^32 + b
|
// multiplier = ab = a * 2^32 + b
|
||||||
// multiplicand = cd = c * 2^32 + d
|
// multiplicand = cd = c * 2^32 + d
|
||||||
|
@ -7,13 +7,10 @@
|
|||||||
|
|
||||||
#include <miner.h>
|
#include <miner.h>
|
||||||
#include <cuda_helper.h>
|
#include <cuda_helper.h>
|
||||||
//#include <cuda.h>
|
|
||||||
//#include <cuda_runtime.h>
|
|
||||||
|
|
||||||
#include "cryptonight.h"
|
#include "cryptonight.h"
|
||||||
|
|
||||||
typedef unsigned char BitSequence;
|
typedef uint8_t BitSequence;
|
||||||
typedef unsigned long long DataLength;
|
typedef uint64_t DataLength;
|
||||||
|
|
||||||
static uint32_t *d_input[MAX_GPUS] = { 0 };
|
static uint32_t *d_input[MAX_GPUS] = { 0 };
|
||||||
static uint32_t *d_target[MAX_GPUS];
|
static uint32_t *d_target[MAX_GPUS];
|
@ -1,12 +1,4 @@
|
|||||||
#include <ctype.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#include <cuda.h>
|
|
||||||
#include <cuda_runtime.h>
|
|
||||||
|
|
||||||
#include <miner.h>
|
|
||||||
#include "cryptonight.h"
|
#include "cryptonight.h"
|
||||||
|
|
||||||
extern char *device_config[MAX_GPUS]; // -l 32x16
|
extern char *device_config[MAX_GPUS]; // -l 32x16
|
||||||
|
@ -21,6 +21,9 @@ struct uint3 blockDim;
|
|||||||
|
|
||||||
#define MEMORY (1 << 21) // 2 MiB / 2097152 B
|
#define MEMORY (1 << 21) // 2 MiB / 2097152 B
|
||||||
#define ITER (1 << 20) // 1048576
|
#define ITER (1 << 20) // 1048576
|
||||||
|
#define E2I_MASK1 0x1FFFF0
|
||||||
|
#define E2I_MASK2 (0x1FFFF0 >> 2)
|
||||||
|
|
||||||
#define AES_BLOCK_SIZE 16
|
#define AES_BLOCK_SIZE 16
|
||||||
#define AES_KEY_SIZE 32
|
#define AES_KEY_SIZE 32
|
||||||
#define INIT_SIZE_BLK 8
|
#define INIT_SIZE_BLK 8
|
||||||
|
@ -38,6 +38,9 @@ double target_to_diff_rpc2(uint32_t* target)
|
|||||||
// simplified to get 1.0 for 1000
|
// simplified to get 1.0 for 1000
|
||||||
return (double) (UINT32_MAX / target[7]) / 1000;
|
return (double) (UINT32_MAX / target[7]) / 1000;
|
||||||
}
|
}
|
||||||
|
else if (opt_algo == ALGO_CRYPTOLIGHT && target[7]) {
|
||||||
|
return (double) (UINT32_MAX / target[7]) / 1000;
|
||||||
|
}
|
||||||
else if (opt_algo == ALGO_WILDKECCAK) {
|
else if (opt_algo == ALGO_WILDKECCAK) {
|
||||||
return target_to_diff(target) * 1000;
|
return target_to_diff(target) * 1000;
|
||||||
}
|
}
|
||||||
@ -530,6 +533,15 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
|
|||||||
work_set_target_ratio(work, (uint32_t*) hash);
|
work_set_target_ratio(work, (uint32_t*) hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
else if (opt_algo == ALGO_CRYPTOLIGHT) {
|
||||||
|
uint32_t nonce;
|
||||||
|
memcpy(&nonce, &data[39], 4);
|
||||||
|
noncestr = bin2hex((unsigned char*) &nonce, 4);
|
||||||
|
last_found_nonce = nonce;
|
||||||
|
cryptolight_hash(hash, data, 76);
|
||||||
|
work_set_target_ratio(work, (uint32_t*) hash);
|
||||||
|
}
|
||||||
|
|
||||||
else if (opt_algo == ALGO_CRYPTONIGHT) {
|
else if (opt_algo == ALGO_CRYPTONIGHT) {
|
||||||
uint32_t nonce;
|
uint32_t nonce;
|
||||||
memcpy(&nonce, &data[39], 4);
|
memcpy(&nonce, &data[39], 4);
|
||||||
@ -1268,15 +1280,6 @@ int rpc2_stratum_thread_stuff(struct pool_infos* pool)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if getjob supported
|
|
||||||
if(0 && opt_algo == ALGO_CRYPTONIGHT) {
|
|
||||||
if(!rpc2_stratum_request_job(&stratum)) {
|
|
||||||
stratum_disconnect(&stratum);
|
|
||||||
applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
|
|
||||||
sleep(opt_fail_pause);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* save every 12 hours */
|
/* save every 12 hours */
|
||||||
if ((time(NULL) - prev_save) > 12*3600) {
|
if ((time(NULL) - prev_save) > 12*3600) {
|
||||||
store_scratchpad_to_file(false);
|
store_scratchpad_to_file(false);
|
||||||
|
3
miner.h
3
miner.h
@ -267,6 +267,7 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce,
|
|||||||
extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
|
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
@ -316,6 +317,7 @@ extern void free_blake256(int thr_id);
|
|||||||
extern void free_blake2s(int thr_id);
|
extern void free_blake2s(int thr_id);
|
||||||
extern void free_bmw(int thr_id);
|
extern void free_bmw(int thr_id);
|
||||||
extern void free_c11(int thr_id);
|
extern void free_c11(int thr_id);
|
||||||
|
extern void free_cryptolight(int thr_id);
|
||||||
extern void free_cryptonight(int thr_id);
|
extern void free_cryptonight(int thr_id);
|
||||||
extern void free_decred(int thr_id);
|
extern void free_decred(int thr_id);
|
||||||
extern void free_deep(int thr_id);
|
extern void free_deep(int thr_id);
|
||||||
@ -805,6 +807,7 @@ void blake256hash(void *output, const void *input, int8_t rounds);
|
|||||||
void blake2s_hash(void *output, const void *input);
|
void blake2s_hash(void *output, const void *input);
|
||||||
void bmw_hash(void *state, const void *input);
|
void bmw_hash(void *state, const void *input);
|
||||||
void c11hash(void *output, const void *input);
|
void c11hash(void *output, const void *input);
|
||||||
|
void cryptolight_hash(void* output, const void* input, int len);
|
||||||
void cryptonight_hash(void* output, const void* input, size_t len);
|
void cryptonight_hash(void* output, const void* input, size_t len);
|
||||||
void decred_hash(void *state, const void *input);
|
void decred_hash(void *state, const void *input);
|
||||||
void deephash(void *state, const void *input);
|
void deephash(void *state, const void *input);
|
||||||
|
@ -255,6 +255,7 @@ bool pool_switch(int thr_id, int pooln)
|
|||||||
stratum = p->stratum;
|
stratum = p->stratum;
|
||||||
stratum.pooln = cur_pooln;
|
stratum.pooln = cur_pooln;
|
||||||
stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT);
|
stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT);
|
||||||
|
stratum.rpc2 |= p->algo == ALGO_CRYPTOLIGHT;
|
||||||
|
|
||||||
// unlock the stratum thread
|
// unlock the stratum thread
|
||||||
tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url));
|
tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url));
|
||||||
@ -278,6 +279,7 @@ bool pool_switch(int thr_id, int pooln)
|
|||||||
}
|
}
|
||||||
|
|
||||||
stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT);
|
stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT);
|
||||||
|
stratum.rpc2 |= p->algo == ALGO_CRYPTOLIGHT;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
3
util.cpp
3
util.cpp
@ -2141,6 +2141,9 @@ void print_hash_tests(void)
|
|||||||
c11hash(&hash[0], &buf[0]);
|
c11hash(&hash[0], &buf[0]);
|
||||||
printpfx("c11", hash);
|
printpfx("c11", hash);
|
||||||
|
|
||||||
|
cryptolight_hash(&hash[0], &buf[0], 76);
|
||||||
|
printpfx("cryptolight", hash);
|
||||||
|
|
||||||
cryptonight_hash(&hash[0], &buf[0], 76);
|
cryptonight_hash(&hash[0], &buf[0], 76);
|
||||||
printpfx("cryptonight", hash);
|
printpfx("cryptonight", hash);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user