Browse Source

import and adapt scrypt from cudaminer project

scrypt-jane under work...
2upstream
Tanguy Pruvot 9 years ago
parent
commit
9dc78da2ee
  1. 13
      Makefile.am
  2. 65
      ccminer.cpp
  3. 25
      ccminer.vcxproj
  4. 47
      ccminer.vcxproj.filters
  5. 11
      miner.h
  6. 626
      scrypt-jane.cpp
  7. 756
      scrypt.c
  8. 1097
      scrypt.cpp
  9. 454
      scrypt/blake.cu
  10. 28
      scrypt/code/scrypt-conf.h
  11. 58
      scrypt/code/scrypt-jane-chacha.h
  12. 69
      scrypt/code/scrypt-jane-mix_chacha.h
  13. 32
      scrypt/code/scrypt-jane-portable-x86.h
  14. 284
      scrypt/code/scrypt-jane-portable.h
  15. 67
      scrypt/code/scrypt-jane-romix-basic.h
  16. 179
      scrypt/code/scrypt-jane-romix-template.h
  17. 1
      scrypt/code/scrypt-jane-romix.h
  18. 907
      scrypt/fermi_kernel.cu
  19. 28
      scrypt/fermi_kernel.h
  20. 837
      scrypt/keccak.cu
  21. 8
      scrypt/keccak.h
  22. 781
      scrypt/kepler_kernel.cu
  23. 29
      scrypt/kepler_kernel.h
  24. 1488
      scrypt/nv_kernel.cu
  25. 36
      scrypt/nv_kernel.h
  26. 1723
      scrypt/nv_kernel2.cu
  27. 36
      scrypt/nv_kernel2.h
  28. 939
      scrypt/salsa_kernel.cu
  29. 135
      scrypt/salsa_kernel.h
  30. 29
      scrypt/scrypt-jane.h
  31. 638
      scrypt/sha2.c
  32. 441
      scrypt/sha256.cu
  33. 10
      scrypt/sha256.h
  34. 781
      scrypt/test_kernel.cu
  35. 30
      scrypt/test_kernel.h
  36. 731
      scrypt/titan_kernel.cu
  37. 26
      scrypt/titan_kernel.h
  38. 3
      util.cpp

13
Makefile.am

@ -18,7 +18,7 @@ bin_PROGRAMS = ccminer @@ -18,7 +18,7 @@ bin_PROGRAMS = ccminer
ccminer_SOURCES = elist.h miner.h compat.h \
compat/inttypes.h compat/stdbool.h compat/unistd.h \
compat/sys/time.h compat/getopt/getopt.h \
crc32.c hefty1.c scrypt.c \
crc32.c hefty1.c \
ccminer.cpp util.cpp \
api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \
heavy/heavy.cu \
@ -57,6 +57,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -57,6 +57,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
x11/s3.cu
# scrypt
ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \
scrypt/blake.cu scrypt/keccak.cu scrypt/sha256.cu \
scrypt/salsa_kernel.cu scrypt/test_kernel.cu \
scrypt/fermi_kernel.cu scrypt/kepler_kernel.cu \
scrypt/nv_kernel.cu scrypt/nv_kernel2.cu scrypt/titan_kernel.cu
if HAVE_NVML
nvml_defs = -DUSE_WRAPNVML
nvml_libs = -ldl
@ -118,6 +125,10 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu @@ -118,6 +125,10 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
# This kernel also needs an older SM to be able to autotune kernels
scrypt/salsa_kernel.o: scrypt/salsa_kernel.cu
$(NVCC) $(nvcc_FLAGS) -gencode=arch=compute_20,code=\"sm_21,compute_20\" --maxrregcount=80 -o $@ -c $<
skein.o: skein.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<

65
ccminer.cpp

@ -102,6 +102,8 @@ enum sha_algos { @@ -102,6 +102,8 @@ enum sha_algos {
ALGO_PLUCK,
ALGO_QUARK,
ALGO_QUBIT,
ALGO_SCRYPT,
ALGO_SCRYPT_JANE,
ALGO_SKEIN,
ALGO_SKEIN2,
ALGO_S3,
@ -137,6 +139,8 @@ static const char *algo_names[] = { @@ -137,6 +139,8 @@ static const char *algo_names[] = {
"pluck",
"quark",
"qubit",
"scrypt",
"scrypt-jane",
"skein",
"skein2",
"s3",
@ -184,6 +188,20 @@ char * device_name[MAX_GPUS]; @@ -184,6 +188,20 @@ char * device_name[MAX_GPUS];
short device_map[MAX_GPUS] = { 0 };
long device_sm[MAX_GPUS] = { 0 };
uint32_t gpus_intensity[MAX_GPUS] = { 0 };
int device_interactive[MAX_GPUS] = { 0 };
int device_batchsize[MAX_GPUS] = { 0 };
int device_backoff[MAX_GPUS] = { 0 };
int device_lookup_gap[MAX_GPUS] = { 0 };
int device_texturecache[MAX_GPUS] = { 0 };
int device_singlememory[MAX_GPUS] = { 0 };
char *device_config[MAX_GPUS] = { 0 };
int opt_nfactor = 0;
int parallel = 2;
bool autotune = true;
bool abort_flag = false;
char *jane_params = NULL;
char *rpc_user = NULL;
static char *rpc_pass;
static char *rpc_userpass = NULL;
@ -255,6 +273,8 @@ Options:\n\ @@ -255,6 +273,8 @@ Options:\n\
pluck SupCoin\n\
quark Quark\n\
qubit Qubit\n\
scrypt Scrypt\n\
scrypt-jane Scrypt-jane Chacha\n\
skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
s3 S3 (1Coin)\n\
@ -439,6 +459,7 @@ void get_currentalgo(char* buf, int sz) @@ -439,6 +459,7 @@ void get_currentalgo(char* buf, int sz)
*/
void proper_exit(int reason)
{
abort_flag = true;
cuda_devicereset();
if (check_dups)
@ -1173,6 +1194,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) @@ -1173,6 +1194,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
switch (opt_algo) {
case ALGO_JACKPOT:
case ALGO_PLUCK:
case ALGO_SCRYPT:
case ALGO_SCRYPT_JANE:
diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
break;
case ALGO_DMD_GR:
@ -1386,6 +1409,8 @@ static void *miner_thread(void *userdata) @@ -1386,6 +1409,8 @@ static void *miner_thread(void *userdata)
minmax = 0x400000;
break;
case ALGO_LYRA2:
case ALGO_SCRYPT:
case ALGO_SCRYPT_JANE:
minmax = 0x100000;
break;
case ALGO_PLUCK:
@ -1526,6 +1551,16 @@ static void *miner_thread(void *userdata) @@ -1526,6 +1551,16 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
case ALGO_SCRYPT:
rc = scanhash_scrypt(thr_id, work.data, work.target, NULL,
max_nonce, &hashes_done, &tv_start, &tv_end);
break;
case ALGO_SCRYPT_JANE:
rc = scanhash_scrypt_jane(thr_id, work.data, work.target, NULL,
max_nonce, &hashes_done, &tv_start, &tv_end);
break;
case ALGO_SKEIN:
rc = scanhash_skeincoin(thr_id, work.data, work.target,
max_nonce, &hashes_done);
@ -1942,15 +1977,29 @@ void parse_arg(int key, char *arg) @@ -1942,15 +1977,29 @@ void parse_arg(int key, char *arg)
switch(key) {
case 'a':
p = strstr(arg, ":"); // optional factor
if (p) *p = '\0';
for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
if (algo_names[i] &&
!strcmp(arg, algo_names[i])) {
if (algo_names[i] && !strcasecmp(arg, algo_names[i])) {
opt_algo = (enum sha_algos)i;
break;
}
}
if (i == ARRAY_SIZE(algo_names))
show_usage_and_exit(1);
if (p) {
opt_nfactor = atoi(p + 1);
if (opt_algo == ALGO_SCRYPT_JANE) {
free(jane_params);
jane_params = strdup(p+1);
}
}
if (!opt_nfactor) {
switch (opt_algo) {
case ALGO_SCRYPT: opt_nfactor = 9; break;
case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
}
}
break;
case 'b':
p = strstr(arg, ":");
@ -2404,6 +2453,8 @@ int main(int argc, char *argv[]) @@ -2404,6 +2453,8 @@ int main(int argc, char *argv[])
rpc_pass = strdup("");
rpc_url = strdup("");
jane_params = strdup("");
pthread_mutex_init(&applog_lock, NULL);
// number of cpus for thread affinity
@ -2423,9 +2474,17 @@ int main(int argc, char *argv[]) @@ -2423,9 +2474,17 @@ int main(int argc, char *argv[])
if (num_cpus < 1)
num_cpus = 1;
// default thread to device map
for (i = 0; i < MAX_GPUS; i++) {
device_map[i] = i;
device_name[i] = NULL;
// for future use, maybe
device_interactive[i] = -1;
device_batchsize[i] = 1024;
device_backoff[i] = is_windows() ? 12 : 2;
device_lookup_gap[i] = 1;
device_texturecache[i] = -1;
device_singlememory[i] = -1;
device_config[i] = NULL;
}
// number of gpus

25
ccminer.vcxproj

@ -250,6 +250,8 @@ @@ -250,6 +250,8 @@
<TreatWChar_tAsBuiltInType>false</TreatWChar_tAsBuiltInType>
<Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
</ClCompile>
<ClCompile Include="scrypt-jane.cpp" />
<ClCompile Include="scrypt.cpp" />
<ClCompile Include="util.cpp" />
<ClCompile Include="fuguecoin.cpp" />
<ClCompile Include="groestlcoin.cpp" />
@ -261,10 +263,6 @@ @@ -261,10 +263,6 @@
<ClCompile Include="crc32.c" />
<ClCompile Include="hefty1.c" />
<ClCompile Include="myriadgroestl.cpp" />
<ClCompile Include="scrypt.c">
<Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
<AdditionalOptions>/Tp %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<ClCompile Include="lyra2\Lyra2.c" />
<ClCompile Include="lyra2\Sponge.c" />
<ClCompile Include="sph\aes_helper.c" />
@ -322,6 +320,7 @@ @@ -322,6 +320,7 @@
<ClInclude Include="miner.h" />
<ClInclude Include="nvml.h" />
<ClInclude Include="res\resource.h" />
<ClInclude Include="scrypt\salsa_kernel.h" />
<ClInclude Include="sph\sph_blake.h" />
<ClInclude Include="sph\sph_bmw.h" />
<ClInclude Include="sph\sph_cubehash.h" />
@ -352,6 +351,22 @@ @@ -352,6 +351,22 @@
<CudaCompile Include="cuda_myriadgroestl.cu" />
<CudaCompile Include="cuda_nist5.cu">
</CudaCompile>
<CudaCompile Include="scrypt\blake.cu" />
<CudaCompile Include="scrypt\fermi_kernel.cu">
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_20,sm_21;compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52</CodeGeneration>
</CudaCompile>
<CudaCompile Include="scrypt\keccak.cu" />
<CudaCompile Include="scrypt\kepler_kernel.cu" />
<CudaCompile Include="scrypt\nv_kernel.cu" />
<CudaCompile Include="scrypt\nv_kernel2.cu">
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_35,sm_35;compute_50,sm_50;compute_52,sm_52</CodeGeneration>
</CudaCompile>
<CudaCompile Include="scrypt\salsa_kernel.cu">
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_20,sm_21</CodeGeneration>
</CudaCompile>
<CudaCompile Include="scrypt\sha256.cu" />
<CudaCompile Include="scrypt\test_kernel.cu" />
<CudaCompile Include="scrypt\titan_kernel.cu" />
<CudaCompile Include="zr5.cu" />
<CudaCompile Include="heavy\cuda_blake512.cu">
</CudaCompile>
@ -510,4 +525,4 @@ @@ -510,4 +525,4 @@
<Target Name="AfterClean">
<Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
</Target>
</Project>
</Project>

47
ccminer.vcxproj.filters

@ -73,6 +73,9 @@ @@ -73,6 +73,9 @@
<Filter Include="Ressources">
<UniqueIdentifier>{f5117ccb-a70d-411a-b7ea-d6faed230bc7}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\scrypt">
<UniqueIdentifier>{c26f5b02-37b5-4420-a4e8-ee1ad517dc95}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="compat\jansson\dump.c">
@ -111,9 +114,6 @@ @@ -111,9 +114,6 @@
<ClCompile Include="hefty1.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="scrypt.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="fuguecoin.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -225,6 +225,12 @@ @@ -225,6 +225,12 @@
<ClCompile Include="lyra2\Sponge.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
<ClCompile Include="scrypt.cpp">
<Filter>Source Files\CUDA\scrypt</Filter>
</ClCompile>
<ClCompile Include="scrypt-jane.cpp">
<Filter>Source Files\CUDA\scrypt</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="compat.h">
@ -377,6 +383,9 @@ @@ -377,6 +383,9 @@
<ClInclude Include="res\resource.h">
<Filter>Ressources</Filter>
</ClInclude>
<ClInclude Include="scrypt\salsa_kernel.h">
<Filter>Source Files\CUDA\scrypt</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CudaCompile Include="cuda.cpp">
@ -580,6 +589,36 @@ @@ -580,6 +589,36 @@
<CudaCompile Include="skein2.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\blake.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\fermi_kernel.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\keccak.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\kepler_kernel.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\nv_kernel.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\nv_kernel2.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\salsa_kernel.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\sha256.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\test_kernel.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
<CudaCompile Include="scrypt\titan_kernel.cu">
<Filter>Source Files\CUDA\scrypt</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<Image Include="res\ccminer.ico">
@ -596,4 +635,4 @@ @@ -596,4 +635,4 @@
<Filter>Ressources</Filter>
</Text>
</ItemGroup>
</Project>
</Project>

11
miner.h

@ -272,8 +272,6 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); @@ -272,8 +272,6 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
extern unsigned char *scrypt_buffer_alloc();
extern int scanhash_deep(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
@ -343,8 +341,12 @@ extern int scanhash_qubit(int thr_id, uint32_t *pdata, @@ -343,8 +341,12 @@ extern int scanhash_qubit(int thr_id, uint32_t *pdata,
unsigned long *hashes_done);
extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done);
const uint32_t *ptarget, unsigned char *scratchbuf, uint32_t max_nonce,
unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end);
extern int scanhash_scrypt_jane(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, unsigned char *scratchbuf, uint32_t max_nonce,
unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end);
extern int scanhash_skeincoin(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
@ -683,6 +685,7 @@ void pentablakehash(void *output, const void *input); @@ -683,6 +685,7 @@ void pentablakehash(void *output, const void *input);
void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input);
void scrypthash(void* output, const void* input);
void skeincoinhash(void *output, const void *input);
void skein2hash(void *output, const void *input);
void s3hash(void *output, const void *input);

626
scrypt-jane.cpp

@ -0,0 +1,626 @@ @@ -0,0 +1,626 @@
/*
scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
Public Domain or MIT License, whichever is easier
*/
#include "miner.h"
#include "scrypt/scrypt-jane.h"
#include "scrypt/code/scrypt-jane-portable.h"
#include "scrypt/code/scrypt-jane-romix.h"
#include "scrypt/keccak.h"
#include "scrypt/salsa_kernel.h"
/* Upper bounds for the scrypt parameters. Each value is an exponent, as the
 * per-line notes spell out, not the parameter itself. */
#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */
#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
#define scrypt_maxr scrypt_r_32kb /* 32kb */
#define scrypt_maxp 25 /* (1 << 25) = ~33 million */
// ---------------------------- BEGIN keccak functions ------------------------------------
/* Keccak-512 parameters; everything below is derived from the 64-byte digest size. */
#define SCRYPT_HASH "Keccak-512"
#define SCRYPT_HASH_DIGEST_SIZE 64
/* width of the permutation state, in bits */
#define SCRYPT_KECCAK_F 1600
/* capacity in bits: twice the digest length */
#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */
/* rate in bits: what is left of the state width after the capacity */
#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */
/* bytes consumed by one keccak_block call (576 / 8 = 72) */
#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8)
/* storage for one complete digest */
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
/* Incremental hashing context shared by scrypt_hash_init/update/finish. */
typedef struct scrypt_hash_state_t {
uint64_t state[SCRYPT_KECCAK_F / 64]; /* 25 words of 64 bits, permuted in place by keccak_block */
uint32_t leftover; /* number of bytes currently waiting in buffer */
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; /* input that does not yet make up a full block */
} scrypt_hash_state;
/* One constant per round; keccak_block XORs entry i into s[0] at the end of round i. */
static const uint64_t keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
0x8000000080008081ull, 0x8000000000008009ull,
0x000000000000008aull, 0x0000000000000088ull,
0x0000000080008009ull, 0x000000008000000aull,
0x000000008000808bull, 0x800000000000008bull,
0x8000000000008089ull, 0x8000000000008003ull,
0x8000000000008002ull, 0x8000000000000080ull,
0x000000000000800aull, 0x800000008000000aull,
0x8000000080008081ull, 0x8000000000008080ull,
0x0000000080000001ull, 0x8000000080008008ull
};
/*
 * Absorb one input block into the hash state and permute it.
 *
 * S  - hashing context; only S->state is read and written here.
 * in - exactly SCRYPT_HASH_BLOCK_SIZE bytes of input, read eight bytes at a
 *      time through U8TO64_LE.
 *
 * The input words are XORed into the first SCRYPT_HASH_BLOCK_SIZE / 8 state
 * words, then 24 rounds (theta, rho/pi, chi, iota) are applied in place.
 */
static void
keccak_block(scrypt_hash_state *S, const uint8_t *in) {
size_t i;
uint64_t *s = S->state, t[5], u[5], v, w;
/* absorb input */
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8)
s[i] ^= U8TO64_LE(in);
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROTL64(t[1], 1);
u[1] = t[0] ^ ROTL64(t[2], 1);
u[2] = t[1] ^ ROTL64(t[3], 1);
u[3] = t[2] ^ ROTL64(t[4], 1);
u[4] = t[3] ^ ROTL64(t[0], 1);
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
/* rho pi: b[..] = rotl(a[..], ..) */
/* the chain below rotates in place: s[1] is saved in v first and only
 * consumed by the final assignment, so the statement order matters */
v = s[ 1];
s[ 1] = ROTL64(s[ 6], 44);
s[ 6] = ROTL64(s[ 9], 20);
s[ 9] = ROTL64(s[22], 61);
s[22] = ROTL64(s[14], 39);
s[14] = ROTL64(s[20], 18);
s[20] = ROTL64(s[ 2], 62);
s[ 2] = ROTL64(s[12], 43);
s[12] = ROTL64(s[13], 25);
s[13] = ROTL64(s[19], 8);
s[19] = ROTL64(s[23], 56);
s[23] = ROTL64(s[15], 41);
s[15] = ROTL64(s[ 4], 27);
s[ 4] = ROTL64(s[24], 14);
s[24] = ROTL64(s[21], 2);
s[21] = ROTL64(s[ 8], 55);
s[ 8] = ROTL64(s[16], 45);
s[16] = ROTL64(s[ 5], 36);
s[ 5] = ROTL64(s[ 3], 28);
s[ 3] = ROTL64(s[18], 21);
s[18] = ROTL64(s[17], 15);
s[17] = ROTL64(s[11], 10);
s[11] = ROTL64(s[ 7], 6);
s[ 7] = ROTL64(s[10], 3);
s[10] = ROTL64( v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
/* v and w keep the first two words of each row, which are overwritten
 * before the last two words of the row need them */
v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
/* iota: a[0,0] ^= round constant */
s[0] ^= keccak_round_constants[i];
}
}
/* Put a hashing context into its starting state: all state words, the
 * leftover count and the staging buffer are zeroed. */
static void
scrypt_hash_init(scrypt_hash_state *S)
{
	memset(S, 0, sizeof(scrypt_hash_state));
}
/*
 * Feed inlen bytes from in into the hash.
 *
 * Whole blocks are passed straight to keccak_block; a trailing partial block
 * is parked in S->buffer (its length in S->leftover) until a later update or
 * the finish call completes it.
 */
static void
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen)
{
	const uint8_t *cursor = in;
	size_t remaining = inlen;

	/* first top up whatever an earlier call left in the staging buffer */
	if (S->leftover != 0) {
		size_t room = SCRYPT_HASH_BLOCK_SIZE - S->leftover;
		size_t take = (inlen > room) ? room : inlen;

		memcpy(&S->buffer[S->leftover], cursor, take);
		S->leftover += (uint32_t)take;
		/* still not a full block: nothing more to do yet */
		if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
			return;
		cursor += take;
		remaining -= take;
		keccak_block(S, S->buffer);
	}

	/* then absorb every complete block directly from the caller's memory */
	for (; remaining >= SCRYPT_HASH_BLOCK_SIZE;
	     cursor += SCRYPT_HASH_BLOCK_SIZE, remaining -= SCRYPT_HASH_BLOCK_SIZE)
		keccak_block(S, cursor);

	/* finally stash the tail, if any, for the next call */
	S->leftover = (uint32_t)remaining;
	if (remaining != 0)
		memcpy(S->buffer, cursor, remaining);
}
/*
 * Pad the pending input, absorb the final block and write the digest.
 *
 * hash receives SCRYPT_HASH_DIGEST_SIZE bytes, taken from the leading state
 * words via U64TO8_LE.
 */
static void
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash)
{
	const size_t used = S->leftover;
	size_t word;

	/* padding: 0x01 right after the data, zeros, and 0x80 OR-ed into the last byte */
	S->buffer[used] = 0x01;
	memset(&S->buffer[used + 1], 0, SCRYPT_HASH_BLOCK_SIZE - (used + 1));
	S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80;
	keccak_block(S, S->buffer);

	for (word = 0; word < SCRYPT_HASH_DIGEST_SIZE / 8; word++)
		U64TO8_LE(&hash[word * 8], S->state[word]);
}
// ---------------------------- END keccak functions ------------------------------------
// ---------------------------- BEGIN PBKDF2 functions ------------------------------------
/* HMAC context: two independent hash states, one seeded with the key XOR 0x36
 * (inner) and one with the key XOR 0x5c (outer) by scrypt_hmac_init. */
typedef struct scrypt_hmac_state_t {
scrypt_hash_state inner, outer;
} scrypt_hmac_state;
/* One-shot convenience wrapper: hash the mlen bytes at m into hash. */
static void
scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen)
{
	scrypt_hash_state ctx;

	scrypt_hash_init(&ctx);
	scrypt_hash_update(&ctx, m, mlen);
	scrypt_hash_finish(&ctx, hash);
}
/* hmac */
/*
 * Start an HMAC computation keyed with key[0..keylen).
 *
 * A key longer than one hash block is replaced by its digest; shorter keys
 * are zero-padded to the block size. The padded key is then XORed with 0x36
 * and fed to the inner hash, and XORed with 0x5c and fed to the outer hash.
 */
static void
scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen)
{
	uint8_t block_key[SCRYPT_HASH_BLOCK_SIZE];
	size_t n;

	memset(block_key, 0, sizeof(block_key));
	scrypt_hash_init(&st->inner);
	scrypt_hash_init(&st->outer);

	if (keylen > SCRYPT_HASH_BLOCK_SIZE)
		scrypt_hash(block_key, key, keylen);   /* oversized key: use its hash */
	else
		memcpy(block_key, key, keylen);        /* key fits: use it as-is */

	/* inner = h((key ^ 0x36) || ...) */
	for (n = 0; n < SCRYPT_HASH_BLOCK_SIZE; n++)
		block_key[n] ^= 0x36;
	scrypt_hash_update(&st->inner, block_key, SCRYPT_HASH_BLOCK_SIZE);

	/* outer = h((key ^ 0x5c) || ...); one XOR undoes 0x36 and applies 0x5c */
	for (n = 0; n < SCRYPT_HASH_BLOCK_SIZE; n++)
		block_key[n] ^= (0x36 ^ 0x5c);
	scrypt_hash_update(&st->outer, block_key, SCRYPT_HASH_BLOCK_SIZE);
}
/* Append message bytes to the HMAC; only the inner hash sees them. */
static void
scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen)
{
	scrypt_hash_state *inner = &st->inner;

	scrypt_hash_update(inner, m, mlen);
}
/*
 * Complete the HMAC and write it to mac:
 * mac = h(outer || h(inner || message)).
 */
static void
scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac)
{
	scrypt_hash_digest inner_digest;

	/* close the inner hash over everything fed through scrypt_hmac_update */
	scrypt_hash_finish(&st->inner, inner_digest);

	/* the outer hash absorbs that digest and produces the final value */
	scrypt_hash_update(&st->outer, inner_digest, SCRYPT_HASH_DIGEST_SIZE);
	scrypt_hash_finish(&st->outer, mac);
}
/*
* Special version where N = 1
* - mikaelh
*/
/*
 * PBKDF2 with a fixed iteration count of one.
 *
 * password/password_len - HMAC key
 * salt/salt_len         - salt, followed internally by a big-endian block counter
 * out/bytes             - destination and number of bytes to produce
 *
 * Output block i (counting from 1) is hmac(password, salt || be32(i)). With a
 * single iteration there is nothing to XOR-accumulate, so each HMAC result is
 * copied straight to the output; the last block is truncated to fit.
 *
 * bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which it
 * always is under scrypt.
 */
static void
scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes)
{
	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
	scrypt_hash_digest ti;
	uint8_t be[4];
	uint32_t i, blocks;

	/* hmac(password, ...) */
	scrypt_hmac_init(&hmac_pw, password, password_len);

	/* hmac(password, salt...) -- shared prefix of every block, computed once */
	hmac_pw_salt = hmac_pw;
	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);

	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
	for (i = 1; i <= blocks; i++) {
		/* the final block may be shorter than a full digest */
		const size_t chunk = (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes;

		/* U1 = hmac(password, salt || be(i)) */
		U32TO8_BE(be, i);
		work = hmac_pw_salt;
		scrypt_hmac_update(&work, be, 4);
		scrypt_hmac_finish(&work, ti);

		memcpy(out, ti, chunk);
		out += chunk;
		bytes -= chunk;   /* ends at exactly 0 instead of wrapping around */
	}
}
// ---------------------------- END PBKDF2 functions ------------------------------------
/* Built-in fatal error handler: print the message on stderr and terminate
 * the process with exit status 1. */
static void
scrypt_fatal_error_default(const char *msg)
{
	fprintf(stderr, "%s\n", msg);
	exit(1);
}
/* Handler invoked on unrecoverable errors in this file; starts out as
 * scrypt_fatal_error_default. */
static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
/* Install fn as the fatal error handler. No check is made here, so passing a
 * null pointer leaves later error paths calling through null. */
void
scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
scrypt_fatal_error = fn;
}
/* Result of scrypt_alloc: mem is the pointer that scrypt_free releases, ptr is
 * the address callers actually use. */
typedef struct scrypt_aligned_alloc_t {
uint8_t *mem, *ptr;
} scrypt_aligned_alloc;
#if defined(SCRYPT_TEST_SPEED)
/* Bump allocator used for speed tests: one arena, handed out sequentially. */
static uint8_t *mem_base = (uint8_t *)0;
static size_t mem_bump = 0;

/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
static scrypt_aligned_alloc
scrypt_alloc(uint64_t size)
{
	scrypt_aligned_alloc aa;

	/* the arena is created on first use and its start rounded up to SCRYPT_BLOCK_BYTES */
	if (!mem_base) {
		uint8_t *raw = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));

		if (!raw)
			scrypt_fatal_error("scrypt: out of memory");
		mem_base = (uint8_t *)(((size_t)raw + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
	}

	/* hand out the next slice; no bounds check is performed */
	aa.ptr = aa.mem = mem_base + mem_bump;
	mem_bump += (size_t)size;
	return aa;
}
/* Speed-test variant: nothing is returned to the system; the bump offset is
 * simply rewound so the whole arena can be reused. */
static void
scrypt_free(scrypt_aligned_alloc *aa)
{
	(void)aa;
	mem_bump = 0;
}
#else
/*
 * Allocate size bytes whose start is aligned to SCRYPT_BLOCK_BYTES.
 *
 * The request is over-allocated by SCRYPT_BLOCK_BYTES - 1 so that the usable
 * pointer (ptr) can be rounded up; mem keeps the address malloc returned.
 * Failures are reported through scrypt_fatal_error.
 */
static scrypt_aligned_alloc
scrypt_alloc(uint64_t size)
{
	static const size_t max_alloc = (size_t)-1;
	const uint64_t padded = size + (SCRYPT_BLOCK_BYTES - 1);
	scrypt_aligned_alloc aa;

	if (padded > max_alloc)
		scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");

	aa.mem = (uint8_t *)malloc((size_t)padded);
	if (!aa.mem)
		scrypt_fatal_error("scrypt: out of memory");

	aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
	return aa;
}
/* Release a buffer obtained from scrypt_alloc; the pointer malloc returned
 * (mem) is freed, not the aligned one. */
static void
scrypt_free(scrypt_aligned_alloc *aa)
{
	uint8_t *base = aa->mem;

	free(base);
}
#endif
// yacoin: increasing Nfactor gradually
/*
 * Derive the scrypt-jane Nfactor for a block timestamp.
 *
 * nTimestamp - block time, compared against the chain start time of the coin
 *              selected through the global jane_params string.
 *
 * jane_params may be:
 *   - empty: Yacoin defaults are used;
 *   - a coin symbol (matched case-sensitively) or coin name (case-insensitive)
 *     from the table below;
 *   - "start,minN,maxN": explicit chain start time and bounds;
 *   - a single number: returned directly as the Nfactor, without bounding
 *     against minN/maxN.
 * Anything else is logged and falls back to the Yacoin defaults.
 *
 * Returns the Nfactor clamped to [minN, maxN]; minN when nTimestamp is not
 * later than the chain start.
 *
 * NO WARRANTY FOR CORRECTNESS of the table. Look for the int64 nChainStartTime
 * constant in the src/main.cpp file of the official wallet clients as well as
 * the const unsigned char minNfactor and const unsigned char maxNfactor.
 */
unsigned char GetNfactor(unsigned int nTimestamp) {
	struct jane_coin {
		const char *symbol;   /* compared with strcmp */
		const char *name;     /* compared with strcasecmp */
		unsigned int start;   /* chain start time */
		unsigned int minN;
		unsigned int maxN;
	};
	static const struct jane_coin coins[] = {
		{ "YAC",  "Yacoin",             1367991200,  4, 30 },
		{ "YBC",  "YBCoin",             1372386273,  4, 30 },
		{ "ZZC",  "ZZCoin",             1375817223, 12, 30 },
		{ "FEC",  "FreeCoin",           1375801200,  6, 32 },
		{ "ONC",  "OneCoin",            1371119462,  6, 30 },
		{ "QQC",  "QQCoin",             1387769316,  4, 30 },
		{ "GPL",  "GoldPressedLatinum", 1377557832,  4, 30 },
		{ "MRC",  "MicroCoin",          1389028879,  4, 30 },
		{ "APC",  "AppleCoin",          1384720832,  4, 30 },
		{ "CPR",  "Copperbars",         1376184687,  4, 30 },
		{ "CACH", "CacheCoin",          1388949883,  4, 30 },
		{ "UTC",  "UltraCoin",          1388361600,  4, 30 },
		{ "VEL",  "VelocityCoin",       1387769316,  4, 30 },
		{ "ITC",  "InternetCoin",       1388385602,  4, 30 },
		{ "RAD",  "RadioactiveCoin",    1389196388,  4, 30 }
	};
	const size_t ncoins = sizeof(coins) / sizeof(coins[0]);

	unsigned int Nfactor = 0;
	// Yacoin defaults
	unsigned int Ntimestamp = coins[0].start;
	unsigned int minN = coins[0].minN;
	unsigned int maxN = coins[0].maxN;

	if (jane_params && jane_params[0] != '\0') {
		bool known = false;
		for (size_t c = 0; c < ncoins; c++) {
			if (!strcmp(jane_params, coins[c].symbol) || !strcasecmp(jane_params, coins[c].name)) {
				Ntimestamp = coins[c].start;
				minN = coins[c].minN;
				maxN = coins[c].maxN;
				known = true;
				break;
			}
		}
		if (!known) {
			if (sscanf(jane_params, "%u,%u,%u", &Ntimestamp, &minN, &maxN) != 3) {
				if (sscanf(jane_params, "%u", &Nfactor) == 1)
					return (unsigned char)Nfactor; // skip bounding against minN, maxN
				applog(LOG_INFO, "Unable to parse scrypt-jane parameters: '%s'. Defaulting to Yacoin.", jane_params);
			}
		}
	}

	// determination based on the constants determined above
	if (nTimestamp <= Ntimestamp)
		return (unsigned char)minN;

	// Signed on purpose: the formula below is negative for roughly the first
	// 18 hours after the chain start and must be clamped to 0. With an
	// unsigned type the subtraction of 2320 wraps to a huge value instead.
	int64_t s = (int64_t)nTimestamp - (int64_t)Ntimestamp;
	int l = 0;
	while ((s >> 1) > 3) {
		l += 1;
		s >>= 1;
	}
	s &= 3;

	int n = (int)((l * 170 + s * 25 - 2320) / 100);
	if (n < 0) n = 0;
	if (n > 255)
		printf("GetNfactor(%u) - something wrong(n == %d)\n", nTimestamp, n);

	Nfactor = (unsigned int)n;
	if (Nfactor < minN) return (unsigned char)minN;
	if (Nfactor > maxN) return (unsigned char)maxN;
	return (unsigned char)Nfactor;
}
/* Reverse the byte order of a 32-bit value. The argument is expanded four
 * times, so it must not have side effects. */
#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
| (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
/* Nfactor recorded by the last scanhash_scrypt_jane call that saw it change;
 * shared by all threads and not protected by any lock. */
static int s_Nfactor = 0;
int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf,
uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
{
const uint32_t Htarg = ptarget[7];
if (s_Nfactor == 0 && strlen(jane_params) > 0)
applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params);
int Nfactor = GetNfactor(bswap_32x4(pdata[17]));
if (Nfactor > scrypt_maxN) {
scrypt_fatal_error("scrypt: N out of range");
}
if (Nfactor != s_Nfactor)
{
// all of this isn't very thread-safe...
opt_nfactor = (1 << (Nfactor + 1));
applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, opt_nfactor);
if (s_Nfactor != 0) {
// handle N-factor increase at runtime
// by adjusting the lookup_gap by factor 2
if (s_Nfactor == Nfactor-1)
for (int i=0; i < 8; ++i)
device_lookup_gap[i] *= 2;
}
s_Nfactor = Nfactor;
}
int throughput = cuda_throughput(thr_id);
if(throughput == 0)
return -1;
gettimeofday(tv_start, NULL);
uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] };
uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) };
uint32_t n = pdata[19];
/* byte swap pdata into data[0]/[1] arrays */
for (int k=0; k<2; ++k) {
for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]);
for(int i=1;i<throughput;++i) memcpy(&data[k][20*i], &data[k][0], 20*sizeof(uint32_t));
}
if (parallel == 2) prepare_keccak512(thr_id, pdata);
scrypt_aligned_alloc Xbuf[2] = { scrypt_alloc(128 * throughput), scrypt_alloc(128 * throughput) };
scrypt_aligned_alloc Vbuf = scrypt_alloc((uint64_t)opt_nfactor * 128);
scrypt_aligned_alloc Ybuf = scrypt_alloc(128);
uint32_t nonce[2];
uint32_t* cuda_X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) };
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
#endif
int cur = 0, nxt = 1;
int iteration = 0;
do {
nonce[nxt] = n;
if (parallel < 2)
{
for(int i=0;i<throughput;++i) {
uint32_t tmp_nonce = n++;
data[nxt][20*i + 19] = bswap_32x4(tmp_nonce);
}
for(int i=0;i<throughput;++i)
scrypt_pbkdf2_1((unsigned char *)&data[nxt][20*i], 80, (unsigned char *)&data[nxt][20*i], 80, Xbuf[nxt].ptr + 128 * i, 128);
memcpy(cuda_X[nxt], Xbuf[nxt].ptr, 128 * throughput);
cuda_scrypt_serialize(thr_id, nxt);
cuda_scrypt_HtoD(thr_id, cuda_X[nxt], nxt);
cuda_scrypt_core(thr_id, nxt, opt_nfactor);
cuda_scrypt_done(thr_id, nxt);
cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt, false);
cuda_scrypt_flush(thr_id, nxt);
if(!cuda_scrypt_sync(thr_id, cur))
{
return -1;
}
memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput);
for(int i=0;i<throughput;++i)
scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)(&hash[cur][8*i]), 32);
#define VERIFY_ALL 0
#if VERIFY_ALL
{
/* 2: X = ROMix(X) */
for(int i=0;i<throughput;++i)
scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)Ybuf.ptr, (scrypt_mix_word_t *)Vbuf.ptr, N);
unsigned int err = 0;
for(int i=0;i<throughput;++i) {
unsigned char *ref = (Xbuf[cur].ptr + 128 * i);
unsigned char *dat = (unsigned char*)(cuda_X[cur] + 32 * i);
if (memcmp(ref, dat, 128) != 0)
{
err++;
#if 0
uint32_t *ref32 = (uint32_t*) ref;
uint32_t *dat32 = (uint32_t*) dat;
for (int j=0; j<32; ++j) {
if (ref32[j] != dat32[j])
fprintf(stderr, "ref32[i=%d][j=%d] = $%08x / $%08x\n", i, j, ref32[j], dat32[j]);
}
#endif
}
}
if (err > 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput);
}
#endif
} else {
n += throughput;
cuda_scrypt_serialize(thr_id, nxt);
pre_keccak512(thr_id, nxt, nonce[nxt], throughput);
cuda_scrypt_core(thr_id, nxt, opt_nfactor);
cuda_scrypt_flush(thr_id, nxt);
post_keccak512(thr_id, nxt, nonce[nxt], throughput);
cuda_scrypt_done(thr_id, nxt);
cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
if(!cuda_scrypt_sync(thr_id, cur))
{
return -1;
}
}
if(iteration > 0)
{
for(int i=0;i<throughput;++i) {
volatile unsigned char *hashc = (unsigned char *)(&hash[cur][8*i]);
if (hash[cur][8*i+7] <= Htarg && fulltest(&hash[cur][8*i], ptarget))
{
uint32_t _ALIGN(64) thash[8], tdata[20];
uint32_t tmp_nonce = nonce[cur] + i;
for(int z=0;z<20;z++)
tdata[z] = bswap_32x4(pdata[z]);
tdata[19] = bswap_32x4(tmp_nonce);
scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128);
scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), opt_nfactor);
scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32);
if (memcmp(thash, &hash[cur][8*i], 32) == 0)
{
//applog(LOG_INFO, "GPU #%d: %s result validates on CPU.", device_map[thr_id], device_name[thr_id]);
*hashes_done = n - pdata[19];
pdata[19] = tmp_nonce;
scrypt_free(&Vbuf);
scrypt_free(&Ybuf);
scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
delete[] data[0]; delete[] data[1];
gettimeofday(tv_end, NULL);
return 1;
} else {
applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur);
}
}
}
}
cur = (cur+1)&1;
nxt = (nxt+1)&1;
++iteration;
} while (n <= max_nonce && !work_restart[thr_id].restart);
scrypt_free(&Vbuf);
scrypt_free(&Ybuf);
scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
delete[] data[0]; delete[] data[1];
*hashes_done = n - pdata[19];
pdata[19] = n;
gettimeofday(tv_end, NULL);
return 0;
}

756
scrypt.c

@ -1,756 +0,0 @@ @@ -1,756 +0,0 @@
/*
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include "cpuminer-config.h"
#include "miner.h"
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
/*
 * Pre-built tail blocks for the fixed-size SHA-256 messages hashed by the
 * scalar HMAC/PBKDF2 helpers below. Every table begins with the 0x80000000
 * terminator word and ends with a length word.
 */
/* Appended after the last 4 words of the 80-byte key in HMAC_SHA256_80_init;
 * final word 0x280 = 640 = 80 * 8. */
static const uint32_t keypad[12] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
/* Copied to ibuf + 5 in PBKDF2_SHA256_80_128, i.e. after 4 salt words and the
 * block counter; final word 0x4a0 = 1184 = (64 + 80 + 4) * 8. */
static const uint32_t innerpad[11] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
/* Copied behind an 8-word digest to form the outer-hash block;
 * final word 0x300 = 768 = (64 + 32) * 8. */
static const uint32_t outerpad[8] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
/* Last inner block of PBKDF2_SHA256_128_32: word 0 is the value 1 (presumably
 * the PBKDF2 block index), then the terminator;
 * final word 0x620 = 1568 = (64 + 128 + 4) * 8. */
static const uint32_t finalblk[16] = {
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
/*
 * Prepare the inner (tstate) and outer (ostate) SHA-256 states of an HMAC
 * keyed with an 80-byte key.
 *
 * key    - 20 words; only words 16..19 are read here.
 * tstate - in: midstate over the first 16 key words;
 *          out: state after absorbing the 0x36-masked key digest.
 * ostate - out: state after absorbing the 0x5c-masked key digest.
 */
static inline void HMAC_SHA256_80_init(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t key_digest[8];
	uint32_t block[16];
	int w;

	/* finish hashing the key: its last 4 words followed by the padding */
	memcpy(&block[0], &key[16], 4 * sizeof(uint32_t));
	memcpy(&block[4], keypad, 12 * sizeof(uint32_t));
	sha256_transform(tstate, block, 0);
	memcpy(key_digest, tstate, sizeof(key_digest));

	/* outer state: digest words masked with 0x5c, remainder is bare mask */
	sha256_init(ostate);
	for (w = 0; w < 16; w++)
		block[w] = (w < 8 ? key_digest[w] : 0u) ^ 0x5c5c5c5c;
	sha256_transform(ostate, block, 0);

	/* inner state: same layout with the 0x36 mask */
	sha256_init(tstate);
	for (w = 0; w < 16; w++)
		block[w] = (w < 8 ? key_digest[w] : 0u) ^ 0x36363636;
	sha256_transform(tstate, block, 0);
}
static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
uint32_t istate[8], ostate2[8];
uint32_t ibuf[16], obuf[16];
int i, j;
memcpy(istate, tstate, 32);
sha256_transform(istate, salt, 0);
memcpy(ibuf, salt + 16, 16);
memcpy(ibuf + 5, innerpad, 44);
memcpy(obuf + 8, outerpad, 32);
for (i = 0; i < 4; i++) {
memcpy(obuf, istate, 32);
ibuf[4] = i + 1;
sha256_transform(obuf, ibuf, 0);
memcpy(ostate2, ostate, 32);
sha256_transform(ostate2, obuf, 0);
for (j = 0; j < 8; j++)
output[8 * i + j] = swab32(ostate2[j]);
}
}
/*
 * Compress a 128-byte salt to a 32-byte (8-word) result using the HMAC
 * states left by HMAC_SHA256_80_init.
 *
 * tstate and ostate are updated in place (they are consumed by this call).
 * The two salt blocks are transformed with the third sha256_transform
 * argument set to 1, the fixed final block with 0.
 */
static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
	const uint32_t *salt, uint32_t *output)
{
	uint32_t outer_blk[16];
	int w;

	/* inner hash: two salt blocks, then counter + padding */
	sha256_transform(tstate, &salt[0], 1);
	sha256_transform(tstate, &salt[16], 1);
	sha256_transform(tstate, finalblk, 0);

	/* outer hash over the inner digest plus padding */
	memcpy(&outer_blk[0], tstate, 8 * sizeof(uint32_t));
	memcpy(&outer_blk[8], outerpad, 8 * sizeof(uint32_t));
	sha256_transform(ostate, outer_blk, 0);

	for (w = 0; w < 8; w++)
		output[w] = swab32(ostate[w]);
}
#if HAVE_SHA256_4WAY
/*
 * 4-way versions of the padding tables: each row holds one padding word
 * repeated four times, matching the word-interleaved layout the *_4way
 * helpers below operate on (word w of lane k sits at index 4 * w + k).
 */
/* Key tail padding, copied to pad + 4 * 4 in HMAC_SHA256_80_init_4way. */
static const uint32_t keypad_4way[4 * 12] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000280, 0x00000280, 0x00000280, 0x00000280
};
/* Inner block padding, copied to ibuf + 4 * 5 in PBKDF2_SHA256_80_128_4way. */
static const uint32_t innerpad_4way[4 * 11] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
};
/* Outer block padding, placed behind the 8 digest words of each lane. */
static const uint32_t outerpad_4way[4 * 8] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000300, 0x00000300, 0x00000300, 0x00000300
};
/* Final inner block of PBKDF2_SHA256_128_32_4way; 16-byte aligned because it
 * is passed directly to sha256_transform_4way. */
static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = {
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000620, 0x00000620, 0x00000620, 0x00000620
};
/*
 * 4-lane counterpart of HMAC_SHA256_80_init. All buffers are
 * word-interleaved: word w of lane k lives at index 4 * w + k.
 *
 * key    - 4 x 20 interleaved words; only words 16..19 of each lane are read.
 * tstate - in: per-lane midstate of the first 64 key bytes;
 *          out: inner HMAC state.
 * ostate - out: outer HMAC state.
 */
static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t key_digest[4 * 8] __attribute__((aligned(16)));
	uint32_t block[4 * 16] __attribute__((aligned(16)));
	int w;

	/* finish hashing the keys: last 4 words per lane, then the padding */
	memcpy(&block[0], &key[4 * 16], 4 * 4 * sizeof(uint32_t));
	memcpy(&block[4 * 4], keypad_4way, 4 * 12 * sizeof(uint32_t));
	sha256_transform_4way(tstate, block, 0);
	memcpy(key_digest, tstate, sizeof(key_digest));

	/* outer state: 0x5c mask */
	sha256_init_4way(ostate);
	for (w = 0; w < 4 * 16; w++)
		block[w] = (w < 4 * 8 ? key_digest[w] : 0u) ^ 0x5c5c5c5c;
	sha256_transform_4way(ostate, block, 0);

	/* inner state: 0x36 mask */
	sha256_init_4way(tstate);
	for (w = 0; w < 4 * 16; w++)
		block[w] = (w < 4 * 8 ? key_digest[w] : 0u) ^ 0x36363636;
	sha256_transform_4way(tstate, block, 0);
}
/*
 * 4-lane counterpart of PBKDF2_SHA256_80_128: expands four interleaved
 * 80-byte salts into four interleaved 128-byte outputs (4 blocks of
 * 4 * 8 words, one block per counter value 1..4).
 *
 * salt and output may be the same buffer (the callers in this file do that):
 * everything needed from salt is read before the first store to output.
 */
static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t inner[4 * 8] __attribute__((aligned(16)));
	uint32_t outer[4 * 8] __attribute__((aligned(16)));
	uint32_t in_blk[4 * 16] __attribute__((aligned(16)));
	uint32_t out_blk[4 * 16] __attribute__((aligned(16)));
	uint32_t *dst = output;
	int blk, lane, w;

	/* absorb the first salt block once for all four output blocks */
	memcpy(inner, tstate, sizeof(inner));
	sha256_transform_4way(inner, salt, 0);

	/* constant parts of the second inner block and of the outer block */
	memcpy(&in_blk[0], &salt[4 * 16], 4 * 4 * sizeof(uint32_t));
	memcpy(&in_blk[4 * 5], innerpad_4way, 4 * 11 * sizeof(uint32_t));
	memcpy(&out_blk[4 * 8], outerpad_4way, 4 * 8 * sizeof(uint32_t));

	for (blk = 1; blk <= 4; blk++, dst += 4 * 8) {
		memcpy(out_blk, inner, sizeof(inner));
		for (lane = 0; lane < 4; lane++)
			in_blk[4 * 4 + lane] = blk;	/* block counter, every lane */
		sha256_transform_4way(out_blk, in_blk, 0);

		memcpy(outer, ostate, sizeof(outer));
		sha256_transform_4way(outer, out_blk, 0);

		for (w = 0; w < 4 * 8; w++)
			dst[w] = swab32(outer[w]);
	}
}
/*
 * 4-lane counterpart of PBKDF2_SHA256_128_32: reduces four interleaved
 * 128-byte salts to four interleaved 32-byte results.
 *
 * tstate and ostate are updated in place. salt and output may alias: both
 * salt blocks are consumed before output is written.
 */
static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
	uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t outer_blk[4 * 16] __attribute__((aligned(16)));
	int w;

	/* inner hash: two salt blocks, then the fixed counter/padding block */
	sha256_transform_4way(tstate, &salt[0], 1);
	sha256_transform_4way(tstate, &salt[4 * 16], 1);
	sha256_transform_4way(tstate, finalblk_4way, 0);

	/* outer hash over the inner digests plus padding */
	memcpy(&outer_blk[0], tstate, 4 * 8 * sizeof(uint32_t));
	memcpy(&outer_blk[4 * 8], outerpad_4way, 4 * 8 * sizeof(uint32_t));
	sha256_transform_4way(ostate, outer_blk, 0);

	for (w = 0; w < 4 * 8; w++)
		output[w] = swab32(ostate[w]);
}
#endif /* HAVE_SHA256_4WAY */
#if HAVE_SHA256_8WAY
/*
 * Final inner block used by PBKDF2_SHA256_128_32_8way: the scalar finalblk
 * with every word repeated eight times (one copy per lane). 32-byte aligned
 * because it is handed directly to sha256_transform_8way.
 */
static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = {
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620
};
/*
 * 8-lane counterpart of HMAC_SHA256_80_init. Buffers are word-interleaved:
 * word w of lane k lives at index 8 * w + k. The key padding is built in
 * place instead of being copied from a table.
 *
 * key    - 8 x 20 interleaved words; only words 16..19 of each lane are read.
 * tstate - in: per-lane midstate of the first 64 key bytes;
 *          out: inner HMAC state.
 * ostate - out: outer HMAC state.
 */
static inline void HMAC_SHA256_80_init_8way(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t key_digest[8 * 8] __attribute__((aligned(32)));
	uint32_t block[8 * 16] __attribute__((aligned(32)));
	int lane, w;

	/* key tail (words 16..19), terminator, zero fill, bit length 0x280 */
	memcpy(&block[0], &key[8 * 16], 8 * 4 * sizeof(uint32_t));
	memset(&block[8 * 5], 0x00, 8 * 10 * sizeof(uint32_t));
	for (lane = 0; lane < 8; lane++) {
		block[8 * 4 + lane] = 0x80000000;
		block[8 * 15 + lane] = 0x00000280;
	}
	sha256_transform_8way(tstate, block, 0);
	memcpy(key_digest, tstate, sizeof(key_digest));

	/* outer state: 0x5c mask */
	sha256_init_8way(ostate);
	for (w = 0; w < 8 * 16; w++)
		block[w] = (w < 8 * 8 ? key_digest[w] : 0u) ^ 0x5c5c5c5c;
	sha256_transform_8way(ostate, block, 0);

	/* inner state: 0x36 mask */
	sha256_init_8way(tstate);
	for (w = 0; w < 8 * 16; w++)
		block[w] = (w < 8 * 8 ? key_digest[w] : 0u) ^ 0x36363636;
	sha256_transform_8way(tstate, block, 0);
}
/*
 * 8-lane counterpart of PBKDF2_SHA256_80_128: expands eight interleaved
 * 80-byte salts into eight interleaved 128-byte outputs (4 blocks of
 * 8 * 8 words, one block per counter value 1..4).
 *
 * salt and output may be the same buffer (the 24-way caller does that):
 * everything needed from salt is read before the first store to output.
 */
static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t inner[8 * 8] __attribute__((aligned(32)));
	uint32_t outer[8 * 8] __attribute__((aligned(32)));
	uint32_t in_blk[8 * 16] __attribute__((aligned(32)));
	uint32_t out_blk[8 * 16] __attribute__((aligned(32)));
	uint32_t *dst = output;
	int blk, lane, w;

	/* absorb the first salt block once for all four output blocks */
	memcpy(inner, tstate, sizeof(inner));
	sha256_transform_8way(inner, salt, 0);

	/* second inner block: salt tail, [counter], terminator, zeros, 0x4a0 */
	memcpy(&in_blk[0], &salt[8 * 16], 8 * 4 * sizeof(uint32_t));
	memset(&in_blk[8 * 6], 0x00, 8 * 9 * sizeof(uint32_t));
	/* outer block: [digest], terminator, zeros, 0x300 */
	memset(&out_blk[8 * 9], 0x00, 8 * 6 * sizeof(uint32_t));
	for (lane = 0; lane < 8; lane++) {
		in_blk[8 * 5 + lane] = 0x80000000;
		in_blk[8 * 15 + lane] = 0x000004a0;
		out_blk[8 * 8 + lane] = 0x80000000;
		out_blk[8 * 15 + lane] = 0x00000300;
	}

	for (blk = 1; blk <= 4; blk++, dst += 8 * 8) {
		memcpy(out_blk, inner, sizeof(inner));
		for (lane = 0; lane < 8; lane++)
			in_blk[8 * 4 + lane] = blk;	/* block counter, every lane */
		sha256_transform_8way(out_blk, in_blk, 0);

		memcpy(outer, ostate, sizeof(outer));
		sha256_transform_8way(outer, out_blk, 0);

		for (w = 0; w < 8 * 8; w++)
			dst[w] = swab32(outer[w]);
	}
}
/*
 * 8-lane counterpart of PBKDF2_SHA256_128_32: reduces eight interleaved
 * 128-byte salts to eight interleaved 32-byte results.
 *
 * tstate and ostate are updated in place. salt and output may alias: both
 * salt blocks are consumed before output is written.
 */
static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
	uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t outer_blk[8 * 16] __attribute__((aligned(32)));
	int lane, w;

	/* inner hash: two salt blocks, then the fixed counter/padding block */
	sha256_transform_8way(tstate, &salt[0], 1);
	sha256_transform_8way(tstate, &salt[8 * 16], 1);
	sha256_transform_8way(tstate, finalblk_8way, 0);

	/* outer block: inner digests, terminator, zeros, bit length 0x300 */
	memcpy(&outer_blk[0], tstate, 8 * 8 * sizeof(uint32_t));
	memset(&outer_blk[8 * 9], 0x00, 8 * 6 * sizeof(uint32_t));
	for (lane = 0; lane < 8; lane++) {
		outer_blk[8 * 8 + lane] = 0x80000000;
		outer_blk[8 * 15 + lane] = 0x00000300;
	}
	sha256_transform_8way(ostate, outer_blk, 0);

	for (w = 0; w < 8 * 8; w++)
		output[w] = swab32(ostate[w]);
}
#endif /* HAVE_SHA256_8WAY */
/*
 * Per-architecture configuration.
 *
 * SCRYPT_MAX_WAYS        sizes the data/hash arrays in scanhash_scrypt and
 *                        the scratch buffer (SCRYPT_BUFFER_SIZE).
 * scrypt_best_throughput number of hashes computed per scan iteration
 *                        before the optional SHA-256 4-way multiplier.
 * HAVE_SCRYPT_3WAY/6WAY  enable the multi-way drivers further down; both are
 *                        set to 0 in every branch here, so only the prototypes
 *                        of the multi-way cores remain.
 */
#if defined(__x86_64__)
#define SCRYPT_MAX_WAYS 1
#define HAVE_SCRYPT_3WAY 0
#define scrypt_best_throughput() 1
static void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V);
#if defined(USE_AVX2)
#undef SCRYPT_MAX_WAYS
/* NOTE(review): 21 is not a multiple of the 6-way/24-way batch sizes used
 * below; harmless while HAVE_SCRYPT_6WAY is 0, but confirm before enabling. */
#define SCRYPT_MAX_WAYS 21
#define HAVE_SCRYPT_6WAY 0
void scrypt_core_6way(uint32_t *X, uint32_t *V);
#endif
#elif defined(__i386__)
#define SCRYPT_MAX_WAYS 1
#define scrypt_best_throughput() 1
static void scrypt_core(uint32_t *X, uint32_t *V);
#elif defined(__arm__) && defined(__APCS_32__)
static void scrypt_core(uint32_t *X, uint32_t *V);
#if defined(__ARM_NEON__)
/* the SHA-256 4-way paths are compiled out on NEON builds */
#undef HAVE_SHA256_4WAY
#define SCRYPT_MAX_WAYS 1
#define HAVE_SCRYPT_3WAY 0
#define scrypt_best_throughput() 1
void scrypt_core_3way(uint32_t *X, uint32_t *V);
#endif
#endif
/*
 * B ^= Bx, then B += mix(B), where mix() is four double-rounds (a column
 * pass followed by a row pass) of add-rotate-xor quarter rounds with
 * rotation amounts 7, 9, 13 and 18.
 *
 * B  - 16-word block, updated in place.
 * Bx - 16-word block XORed into B first; not modified.
 */
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
	uint32_t x[16];
	int round, w;

	/* fold Bx into B and keep a working copy of the result */
	for (w = 0; w < 16; w++)
		x[w] = (B[w] ^= Bx[w]);

#define ROTL32(v, c) (((v) << (c)) | ((v) >> (32 - (c))))
/* one quarter round on the four working words a, b, c, d */
#define QUARTER(a, b, c, d) do { \
		x[b] ^= ROTL32(x[a] + x[d], 7); \
		x[c] ^= ROTL32(x[b] + x[a], 9); \
		x[d] ^= ROTL32(x[c] + x[b], 13); \
		x[a] ^= ROTL32(x[d] + x[c], 18); \
	} while (0)

	for (round = 0; round < 8; round += 2) {
		/* Operate on columns. The four quarter rounds touch disjoint words. */
		QUARTER( 0,  4,  8, 12);
		QUARTER( 5,  9, 13,  1);
		QUARTER(10, 14,  2,  6);
		QUARTER(15,  3,  7, 11);

		/* Operate on rows. */
		QUARTER( 0,  1,  2,  3);
		QUARTER( 5,  6,  7,  4);
		QUARTER(10, 11,  8,  9);
		QUARTER(15, 12, 13, 14);
	}

#undef QUARTER
#undef ROTL32

	/* feed-forward: add the mixed words onto the XORed input */
	for (w = 0; w < 16; w++)
		B[w] += x[w];
}
/*
 * Memory-hard mixing of one 32-word block with a fixed 1024 iterations.
 *
 * X - 32-word block, updated in place.
 * V - scratch area of at least 1024 * 32 words.
 *
 * Pass 1 records X into consecutive V slots while mixing it; pass 2 XORs in
 * the slot selected by the low 10 bits of X[16] and mixes again.
 */
static inline void scrypt_core(uint32_t *X, uint32_t *V)
{
	uint32_t step, word;
	uint32_t *slot;

	/* pass 1: fill the scratchpad */
	for (step = 0, slot = V; step < 1024; step++, slot += 32) {
		memcpy(slot, X, 32 * sizeof(uint32_t));
		xor_salsa8(X, X + 16);
		xor_salsa8(X + 16, X);
	}

	/* pass 2: data-dependent reads back from the scratchpad */
	for (step = 0; step < 1024; step++) {
		/* slot is chosen before X is modified below */
		const uint32_t *entry = V + 32 * (X[16] & 1023);

		for (word = 0; word < 32; word++)
			X[word] ^= entry[word];
		xor_salsa8(X, X + 16);
		xor_salsa8(X + 16, X);
	}
}
/* Fallback for architectures that did not configure anything above:
 * one hash at a time. */
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#define scrypt_best_throughput() 1
#endif
/* 131072 bytes (1024 slots * 128 bytes) per way, plus 63 bytes of slack so
 * the users can round the pointer up to a 64-byte boundary. */
#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)
/*
 * Allocate an uninitialised scratchpad of SCRYPT_BUFFER_SIZE bytes.
 * Returns whatever malloc() returns (NULL on failure); no alignment is
 * applied here.
 */
unsigned char *scrypt_buffer_alloc()
{
	void *scratch = malloc(SCRYPT_BUFFER_SIZE);

	return (unsigned char *)scratch;
}
/*
 * Compute one scrypt hash of an 80-byte input.
 *
 * input      - 20 words of header data.
 * output     - receives 8 words of hash.
 * midstate   - SHA-256 state over the first 64 bytes of input (read only).
 * scratchpad - unaligned buffer; rounded up to a 64-byte boundary here.
 */
static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
	uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[8], ostate[8];
	uint32_t X[32];
	const uintptr_t align_mask = (uintptr_t)(63);
	uint32_t *V = (uint32_t *)(((uintptr_t)(scratchpad) + align_mask) & ~align_mask);

	memcpy(tstate, midstate, sizeof(tstate));

	/* key setup and expansion to a 128-byte block */
	HMAC_SHA256_80_init(input, tstate, ostate);
	PBKDF2_SHA256_80_128(tstate, ostate, input, X);

	/* mixing, then compression to the final 32 bytes */
	scrypt_core(X, V);
	PBKDF2_SHA256_128_32(tstate, ostate, X, output);
}
#if HAVE_SHA256_4WAY
/*
 * Compute four scrypt hashes, using the 4-way SHA-256 helpers for the
 * PBKDF2 stages and the scalar scrypt_core for the mixing stage.
 *
 * input    - 4 consecutive 20-word headers (lane-major).
 * output   - receives 4 consecutive 8-word hashes (lane-major).
 * midstate - shared 8-word SHA-256 midstate, replicated to every lane.
 *
 * W holds data word-interleaved (index 4 * word + lane) for the SHA code;
 * X holds it lane-major (index 32 * lane + word) for scrypt_core. The four
 * lanes are mixed one after another and share the same scratchpad V.
 */
static void scrypt_1024_1_1_256_4way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[4 * 8] __attribute__((aligned(128)));
	uint32_t ostate[4 * 8] __attribute__((aligned(128)));
	uint32_t W[4 * 32] __attribute__((aligned(128)));
	uint32_t X[4 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int lane, w;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	/* interleave the headers and replicate the midstate */
	for (lane = 0; lane < 4; lane++) {
		for (w = 0; w < 20; w++)
			W[4 * w + lane] = input[20 * lane + w];
		for (w = 0; w < 8; w++)
			tstate[4 * w + lane] = midstate[w];
	}

	HMAC_SHA256_80_init_4way(W, tstate, ostate);
	PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);

	/* de-interleave, mix each lane, re-interleave */
	for (lane = 0; lane < 4; lane++)
		for (w = 0; w < 32; w++)
			X[32 * lane + w] = W[4 * w + lane];
	for (lane = 0; lane < 4; lane++)
		scrypt_core(X + lane * 32, V);
	for (lane = 0; lane < 4; lane++)
		for (w = 0; w < 32; w++)
			W[4 * w + lane] = X[32 * lane + w];

	PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);

	/* hand the hashes back lane-major */
	for (lane = 0; lane < 4; lane++)
		for (w = 0; w < 8; w++)
			output[8 * lane + w] = W[4 * w + lane];
}
#endif /* HAVE_SHA256_4WAY */
#if HAVE_SCRYPT_3WAY
/*
 * Compute three scrypt hashes: scalar SHA-256 stages per lane, one call to
 * scrypt_core_3way for the mixing stage.
 *
 * input    - 3 consecutive 20-word headers.
 * output   - receives 3 consecutive 8-word hashes.
 * midstate - shared 8-word SHA-256 midstate, copied for every lane.
 */
static void scrypt_1024_1_1_256_3way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[3 * 8], ostate[3 * 8];
	uint32_t X[3 * 32] __attribute__((aligned(64)));
	uint32_t *V;
	int lane;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	/* each stage runs over all lanes before the next stage starts */
	for (lane = 0; lane < 3; lane++)
		memcpy(tstate + 8 * lane, midstate, 32);
	for (lane = 0; lane < 3; lane++)
		HMAC_SHA256_80_init(input + 20 * lane, tstate + 8 * lane, ostate + 8 * lane);
	for (lane = 0; lane < 3; lane++)
		PBKDF2_SHA256_80_128(tstate + 8 * lane, ostate + 8 * lane,
			input + 20 * lane, X + 32 * lane);

	scrypt_core_3way(X, V);

	for (lane = 0; lane < 3; lane++)
		PBKDF2_SHA256_128_32(tstate + 8 * lane, ostate + 8 * lane,
			X + 32 * lane, output + 8 * lane);
}
#if HAVE_SHA256_4WAY
/*
 * Compute twelve scrypt hashes as 3 groups of 4 lanes for the 4-way SHA-256
 * stages and 4 batches of 3 lanes for scrypt_core_3way.
 *
 * input    - 12 consecutive 20-word headers (lane-major).
 * output   - receives 12 consecutive 8-word hashes (lane-major).
 * midstate - shared 8-word SHA-256 midstate, replicated to every lane.
 *
 * Within group g, W is word-interleaved (128 * g + 4 * word + lane) and
 * X is lane-major (128 * g + 32 * lane + word).
 */
static void scrypt_1024_1_1_256_12way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[12 * 8] __attribute__((aligned(128)));
	uint32_t ostate[12 * 8] __attribute__((aligned(128)));
	uint32_t W[12 * 32] __attribute__((aligned(128)));
	uint32_t X[12 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int grp, lane, w;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	/* interleave the headers and replicate the midstate, group by group */
	for (grp = 0; grp < 3; grp++) {
		for (lane = 0; lane < 4; lane++) {
			for (w = 0; w < 20; w++)
				W[128 * grp + 4 * w + lane] = input[80 * grp + 20 * lane + w];
			for (w = 0; w < 8; w++)
				tstate[32 * grp + 4 * w + lane] = midstate[w];
		}
	}

	for (grp = 0; grp < 3; grp++)
		HMAC_SHA256_80_init_4way(W + 128 * grp, tstate + 32 * grp, ostate + 32 * grp);
	for (grp = 0; grp < 3; grp++)
		PBKDF2_SHA256_80_128_4way(tstate + 32 * grp, ostate + 32 * grp,
			W + 128 * grp, W + 128 * grp);

	/* de-interleave into lane-major order for the mixing cores */
	for (grp = 0; grp < 3; grp++)
		for (lane = 0; lane < 4; lane++)
			for (w = 0; w < 32; w++)
				X[128 * grp + 32 * lane + w] = W[128 * grp + 4 * w + lane];

	/* 4 batches of 3 lanes (96 words each) */
	for (grp = 0; grp < 4; grp++)
		scrypt_core_3way(X + grp * 96, V);

	for (grp = 0; grp < 3; grp++)
		for (lane = 0; lane < 4; lane++)
			for (w = 0; w < 32; w++)
				W[128 * grp + 4 * w + lane] = X[128 * grp + 32 * lane + w];

	for (grp = 0; grp < 3; grp++)
		PBKDF2_SHA256_128_32_4way(tstate + 32 * grp, ostate + 32 * grp,
			W + 128 * grp, W + 128 * grp);

	/* hand the hashes back lane-major */
	for (grp = 0; grp < 3; grp++)
		for (lane = 0; lane < 4; lane++)
			for (w = 0; w < 8; w++)
				output[32 * grp + 8 * lane + w] = W[128 * grp + 4 * w + lane];
}
#endif /* HAVE_SHA256_4WAY */
#endif /* HAVE_SCRYPT_3WAY */
#if HAVE_SCRYPT_6WAY
/*
 * Compute 24 scrypt hashes as 3 groups of 8 lanes for the 8-way SHA-256
 * stages and 4 batches of 6 lanes for scrypt_core_6way.
 *
 * input    - 24 consecutive 20-word headers (lane-major).
 * output   - receives 24 consecutive 8-word hashes (lane-major).
 * midstate - shared 8-word SHA-256 midstate, replicated to every lane.
 *
 * Within group g, W is word-interleaved (256 * g + 8 * word + lane) and
 * X is lane-major (256 * g + 32 * lane + word).
 */
static void scrypt_1024_1_1_256_24way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[24 * 8] __attribute__((aligned(128)));
	uint32_t ostate[24 * 8] __attribute__((aligned(128)));
	uint32_t W[24 * 32] __attribute__((aligned(128)));
	uint32_t X[24 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int grp, lane, w;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	/* interleave the headers and replicate the midstate, group by group */
	for (grp = 0; grp < 3; grp++) {
		for (lane = 0; lane < 8; lane++) {
			for (w = 0; w < 20; w++)
				W[8 * 32 * grp + 8 * w + lane] = input[8 * 20 * grp + 20 * lane + w];
			for (w = 0; w < 8; w++)
				tstate[8 * 8 * grp + 8 * w + lane] = midstate[w];
		}
	}

	for (grp = 0; grp < 3; grp++)
		HMAC_SHA256_80_init_8way(W + 256 * grp, tstate + 64 * grp, ostate + 64 * grp);
	for (grp = 0; grp < 3; grp++)
		PBKDF2_SHA256_80_128_8way(tstate + 64 * grp, ostate + 64 * grp,
			W + 256 * grp, W + 256 * grp);

	/* de-interleave into lane-major order for the mixing cores */
	for (grp = 0; grp < 3; grp++)
		for (lane = 0; lane < 8; lane++)
			for (w = 0; w < 32; w++)
				X[8 * 32 * grp + 32 * lane + w] = W[8 * 32 * grp + 8 * w + lane];

	/* 4 batches of 6 lanes (6 * 32 words each) */
	for (grp = 0; grp < 4; grp++)
		scrypt_core_6way(X + grp * 6 * 32, V);

	for (grp = 0; grp < 3; grp++)
		for (lane = 0; lane < 8; lane++)
			for (w = 0; w < 32; w++)
				W[8 * 32 * grp + 8 * w + lane] = X[8 * 32 * grp + 32 * lane + w];

	for (grp = 0; grp < 3; grp++)
		PBKDF2_SHA256_128_32_8way(tstate + 64 * grp, ostate + 64 * grp,
			W + 256 * grp, W + 256 * grp);

	/* hand the hashes back lane-major */
	for (grp = 0; grp < 3; grp++)
		for (lane = 0; lane < 8; lane++)
			for (w = 0; w < 8; w++)
				output[8 * 8 * grp + 8 * lane + w] = W[8 * 32 * grp + 8 * w + lane];
}
#endif /* HAVE_SCRYPT_6WAY */
/*
 * Scan nonces for a scrypt hash that meets the target.
 *
 * thr_id      - index into work_restart[]; scanning stops when its restart
 *               flag is set.
 * pdata       - 20-word block header; pdata[19] is the first nonce to try and
 *               is updated to the winning nonce (return 1) or the last nonce
 *               tried (return 0).
 * scratchbuf  - scratchpad from scrypt_buffer_alloc().
 * ptarget     - 8-word target; word 7 is used as a quick pre-filter before
 *               fulltest().
 * max_nonce   - scanning stops once the nonce counter reaches this value.
 * hashes_done - receives the number of nonces consumed.
 *
 * Returns 1 when a hash passing fulltest() was found, 0 otherwise.
 */
int scanhash_scrypt(int thr_id, uint32_t *pdata,
	unsigned char *scratchbuf, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
	uint32_t midstate[8];
	uint32_t n = pdata[19] - 1;
	const uint32_t Htarg = ptarget[7];
	uint32_t throughput = scrypt_best_throughput();
	uint32_t i;

#if HAVE_SHA256_4WAY
	/*
	 * data[] and hash[] are sized for SCRYPT_MAX_WAYS hashes. Only widen the
	 * batch when the buffers can actually hold it; otherwise the copies below
	 * would run past the end of both arrays.
	 */
	if (sha256_use_4way() && throughput * 4 <= SCRYPT_MAX_WAYS)
		throughput *= 4;
#endif

	/* one header copy per lane; only the nonce word differs between them */
	for (i = 0; i < throughput; i++)
		memcpy(data + i * 20, pdata, 80);

	/* the first 64 header bytes never change: hash them once */
	sha256_init(midstate);
	sha256_transform(midstate, data, 0);

	do {
		for (i = 0; i < throughput; i++)
			data[i * 20 + 19] = ++n;

		/* pick the widest driver matching the batch size */
#if HAVE_SHA256_4WAY
		if (throughput == 4)
			scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf);
		else
#endif
#if HAVE_SCRYPT_3WAY && HAVE_SHA256_4WAY
		if (throughput == 12)
			scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
		else
#endif
#if HAVE_SCRYPT_6WAY
		if (throughput == 24)
			scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
		else
#endif
#if HAVE_SCRYPT_3WAY
		if (throughput == 3)
			scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
		else
#endif
		scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);

		for (i = 0; i < throughput; i++) {
			/* cheap top-word check first, full comparison only on a hit */
			if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {
				*hashes_done = n - pdata[19] + 1;
				pdata[19] = data[i * 20 + 19];
				return 1;
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = n - pdata[19] + 1;
	pdata[19] = n;
	return 0;
}

1097
scrypt.cpp

File diff suppressed because it is too large Load Diff

454
scrypt/blake.cu

@ -0,0 +1,454 @@ @@ -0,0 +1,454 @@
//
// =============== BLAKE part on nVidia GPU ======================
//
// This is the generic "default" implementation when no architecture
// specific implementation is available in the kernel.
//
// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
//
// TODO: CUDA porting work remains to be done.
//
#include <map>
#include <stdint.h>
#include "cuda_runtime.h"
#include "salsa_kernel.h"
#include "miner.h"
// 32-bit word type and helper macros in the naming style of the sph_* hash code
typedef uint32_t sph_u32;
// cast a constant to sph_u32
#define SPH_C32(x) ((sph_u32)(x))
// truncate to the low 32 bits
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
// rotate left / right by n bits
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
// target in constant memory; the kernel compares its last 64-bit output word against ptarget64[3]
__constant__ uint64_t ptarget64[4];
// 20-word block header in constant memory; the kernel reads words 0..18 and substitutes its own nonce for word 19
__constant__ uint32_t pdata[20];
// define some error checking macros
#undef checkCudaErrors

// Separator looked for when stripping the directory part of __FILE__.
#define DELIMITER '/'

// Text following the last occurrence of separator c in path p (p itself when c is absent).
#define CUDA_ERR_PATH_TAIL(p, c) ( strrchr((p), (c)) != NULL ? strrchr((p), (c))+1 : (p) )

#if defined(WIN32) || defined(_WIN32)
// The original Windows branch was identical to the generic one, so paths written
// with backslashes were logged in full. Windows builds may see either separator
// in __FILE__, so strip past both.
#define __FILENAME__ CUDA_ERR_PATH_TAIL(CUDA_ERR_PATH_TAIL(__FILE__, DELIMITER), '\\')
#else
#define __FILENAME__ CUDA_ERR_PATH_TAIL(__FILE__, DELIMITER)
#endif

// Run statement x and log any CUDA error it leaves behind.
// A pending error is cleared first so that only errors raised by x are reported.
// Expects an int `thr_id` in scope at the call site (used to index device_map).
#define checkCudaErrors(x) \
{ \
cudaGetLastError(); \
x; \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) \
applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \
}
// from salsa_kernel.cu
extern std::map<int, uint32_t *> context_idata[2];
extern std::map<int, uint32_t *> context_odata[2];
extern std::map<int, cudaStream_t> context_streams[2];
extern std::map<int, uint32_t *> context_hash[2];
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
// Reverse the byte order of a 32-bit word (0xAABBCCDD -> 0xDDCCBBAA).
static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x)
{
    const sph_u32 to_top    = (x << 24) & 0xff000000u;  // byte 0 -> byte 3
    const sph_u32 to_upper  = (x <<  8) & 0x00ff0000u;  // byte 1 -> byte 2
    const sph_u32 to_lower  = (x >>  8) & 0x0000ff00u;  // byte 2 -> byte 1
    const sph_u32 to_bottom = (x >> 24) & 0x000000ffu;  // byte 3 -> byte 0

    return to_top | to_upper | to_lower | to_bottom;
}
/**
 * Write a 32-bit value to memory with its bytes reversed, which yields the
 * big endian encoding when the executing device is little endian.
 *
 * The store is a single 32-bit access, so dst has to be suitably aligned
 * for a sph_u32.
 *
 * @param dst   the destination buffer
 * @param val   the 32-bit value to encode
 */
static __device__ void
cuda_sph_enc32be(void *dst, sph_u32 val)
{
    sph_u32 *const slot = (sph_u32 *)dst;

    *slot = cuda_sph_bswap32(val);
}
// Word selection schedule: Z<r><i> is the hex digit of the message word (M<digit>)
// and constant (CS<digit>) used at position i of round r, consumed through the
// Mx()/CSx() token-pasting macros. This appears to be the BLAKE sigma permutation
// table. Rows 0..7 are the ones referenced by COMPRESS32 in the code visible
// here; rows 8 and 9 are defined but not used by it.
// round 0 (identity order)
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
// round 1
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
// round 2
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
// round 3
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
// round 4
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
// round 5
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
// round 6
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
// round 7
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
// round 8
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
// round 9
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
// Mx(r, i): the message word variable (M0..MF) selected for round r, position i.
// The three-level expansion lets Z<r><i> be macro-expanded to its hex digit
// before it is pasted onto the M prefix.
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
// CSx(r, i): the constant (CS0..CSF) selected for round r, position i; same expansion scheme.
#define CSx(r, i) CSx_(Z ## r ## i)
#define CSx_(n) CSx__(n)
#define CSx__(n) CS ## n
// The sixteen 32-bit constants that GS XORs with the message words and that
// COMPRESS32 uses (CS0..CS7) to initialise the lower half of the working state.
#define CS0 SPH_C32(0x243F6A88)
#define CS1 SPH_C32(0x85A308D3)
#define CS2 SPH_C32(0x13198A2E)
#define CS3 SPH_C32(0x03707344)
#define CS4 SPH_C32(0xA4093822)
#define CS5 SPH_C32(0x299F31D0)
#define CS6 SPH_C32(0x082EFA98)
#define CS7 SPH_C32(0xEC4E6C89)
#define CS8 SPH_C32(0x452821E6)
#define CS9 SPH_C32(0x38D01377)
#define CSA SPH_C32(0xBE5466CF)
#define CSB SPH_C32(0x34E90C6C)
#define CSC SPH_C32(0xC0AC29B7)
#define CSD SPH_C32(0xC97C50DD)
#define CSE SPH_C32(0x3F84D5B5)
#define CSF SPH_C32(0xB5470917)
// GS: one mixing step on the four state words a, b, c, d.
// Two message words are folded in, each XORed with the *other* position's
// constant (m0 ^ c1, then m1 ^ c0); right-rotation amounts are 16, 12, 8, 7.
// All four state arguments are modified in place.
#define GS(m0, m1, c0, c1, a, b, c, d) do { \
a = SPH_T32(a + b + (m0 ^ c1)); \
d = SPH_ROTR32(d ^ a, 16); \
c = SPH_T32(c + d); \
b = SPH_ROTR32(b ^ c, 12); \
a = SPH_T32(a + b + (m1 ^ c0)); \
d = SPH_ROTR32(d ^ a, 8); \
c = SPH_T32(c + d); \
b = SPH_ROTR32(b ^ c, 7); \
} while (0)
// ROUND_S(r): one full round r over the working state V0..VF.
// The first four GS steps work on the columns (V0,V4,V8,VC) ... (V3,V7,VB,VF),
// the last four on the diagonals (V0,V5,VA,VF) ... (V3,V4,V9,VE).
// Requires V0..VF and M0..MF to be in scope at the expansion site.
#define ROUND_S(r) do { \
GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
// COMPRESS32: compress one 16-word block into the chaining value.
// Expects these names in scope at the expansion site:
//   input[16]  - the message block
//   H0..H7     - chaining value, updated in place
//   S0..S3     - salt words
//   T0, T1     - counter words (T0 feeds VC and VD, T1 feeds VE and VF)
// The working state is initialised from H, S ^ CS0..CS3 and T ^ CS4..CS7,
// eight rounds (ROUND_S(0) .. ROUND_S(7)) are applied, and both halves of
// the working state plus the salt are XORed back into H.
#define COMPRESS32 do { \
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = S0 ^ CS0; \
V9 = S1 ^ CS1; \
VA = S2 ^ CS2; \
VB = S3 ^ CS3; \
VC = T0 ^ CS4; \
VD = T0 ^ CS5; \
VE = T1 ^ CS6; \
VF = T1 ^ CS7; \
M0 = input[0]; \
M1 = input[1]; \
M2 = input[2]; \
M3 = input[3]; \
M4 = input[4]; \
M5 = input[5]; \
M6 = input[6]; \
M7 = input[7]; \
M8 = input[8]; \
M9 = input[9]; \
MA = input[10]; \
MB = input[11]; \
MC = input[12]; \
MD = input[13]; \
ME = input[14]; \
MF = input[15]; \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
ROUND_S(4); \
ROUND_S(5); \
ROUND_S(6); \
ROUND_S(7); \
H0 ^= S0 ^ V0 ^ V8; \
H1 ^= S1 ^ V1 ^ V9; \
H2 ^= S2 ^ V2 ^ VA; \
H3 ^= S3 ^ V3 ^ VB; \
H4 ^= S0 ^ V4 ^ VC; \
H5 ^= S1 ^ V5 ^ VD; \
H6 ^= S2 ^ V6 ^ VE; \
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
// Hash the 80-byte header held in constant memory (pdata) once per thread,
// with word 19 replaced by `nonce + flat thread index`.
//
// Launch layout: 1D grid of 1D blocks; there is no bounds check, so every
// launched thread hashes one nonce.
//
//   g_out    - when validate is set, receives 4 x uint64 per launched thread
//   nonce    - nonce of thread 0
//   g_good   - 9 x uint32: words 0..7 (viewed as 4 x uint64) hold the best hash
//              seen so far, word 8 its nonce
//   validate - store every hash to g_out, not just candidates
//
// NOTE(review): the compare-and-update of g_good is a plain read/modify/write
// with no atomics, so concurrent candidates can interleave - confirm that the
// host side tolerates this.
__global__ void cuda_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate )
{
    // flat index of this thread and the nonce it is responsible for
    const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    const uint32_t my_nonce = nonce + tid;

    uint32_t input[16];
    uint64_t output[4];
    unsigned char *const out_bytes = (unsigned char*)output;

    // chaining value, salt and bit counter; these names are fixed by COMPRESS32
    sph_u32 H0 = 0x6A09E667;
    sph_u32 H1 = 0xBB67AE85;
    sph_u32 H2 = 0x3C6EF372;
    sph_u32 H3 = 0xA54FF53A;
    sph_u32 H4 = 0x510E527F;
    sph_u32 H5 = 0x9B05688C;
    sph_u32 H6 = 0x1F83D9AB;
    sph_u32 H7 = 0x5BE0CD19;
    sph_u32 S0 = 0, S1 = 0, S2 = 0, S3 = 0;
    sph_u32 T0 = 0, T1 = 0;

    // ---- block 1: header words 0..15 (512 bits) ----
#pragma unroll 16
    for (int w = 0; w < 16; ++w) input[w] = pdata[w];
    T0 = SPH_T32(T0 + 512);
    COMPRESS32;

    // ---- block 2: header words 16..18, the nonce, then padding ----
#pragma unroll 3
    for (int w = 0; w < 3; ++w) input[w] = pdata[16 + w];
    input[3] = my_nonce;
    input[4] = 0x80000000;
#pragma unroll 8
    for (int w = 5; w < 13; ++w) input[w] = 0;
    input[13] = 0x00000001;
    input[14] = T1;
    T0 = SPH_T32(T0 + 128);     // counter now covers all 640 message bits
    input[15] = T0;
    COMPRESS32;

    // the last two hash words are enough for the target pre-check
    cuda_sph_enc32be(out_bytes + 4*6, H6);
    cuda_sph_enc32be(out_bytes + 4*7, H7);
    const bool candidate = (output[3] <= ptarget64[3]);

    if (validate || candidate)
    {
        // the remaining words are only needed when the hash is actually stored
        cuda_sph_enc32be(out_bytes + 4*0, H0);
        cuda_sph_enc32be(out_bytes + 4*1, H1);
        cuda_sph_enc32be(out_bytes + 4*2, H2);
        cuda_sph_enc32be(out_bytes + 4*3, H3);
        cuda_sph_enc32be(out_bytes + 4*4, H4);
        cuda_sph_enc32be(out_bytes + 4*5, H5);
    }

    if (validate)
    {
        uint64_t *dst = g_out + 4 * tid;
#pragma unroll 4
        for (int q = 0; q < 4; ++q) dst[q] = output[q];
    }

    if (candidate) {
        uint64_t *g_good64 = (uint64_t*)g_good;
        if (output[3] < g_good64[3]) {
            // most significant word first, as in the comparison above
            for (int q = 3; q >= 0; --q) g_good64[q] = output[q];
            g_good[8] = my_nonce;
        }
    }
}
// init[thr_id] is set once the per-thread result buffers have been allocated.
static bool init[MAX_GPUS] = { 0 };
// context_good[stream][thr_id]: buffer of 9 uint32_t obtained from cudaMalloc,
// one per stream index (0 and 1), used as the kernel's g_good argument.
static std::map<int, uint32_t *> context_good[2];
// Prepares a hashing job for thread `thr_id`.
//   host_pdata    80 bytes of header data, copied to the device symbol `pdata`
//   host_ptarget  32 bytes of target, copied to the device symbol `ptarget64`
// On the first call for a given thr_id, two 9-word result buffers are allocated
// with cudaMalloc (device memory), one per stream index.
// Returns true when both result buffers are non-null.
bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
{
    if (!init[thr_id])
    {
        // one device-side "best hash + nonce" buffer per stream
        uint32_t *d_buf;
        for (int s = 0; s < 2; ++s) {
            checkCudaErrors(cudaMalloc((void **) &d_buf, 9*sizeof(uint32_t)));
            context_good[s][thr_id] = d_buf;
        }
        init[thr_id] = true;
    }

    // upload the job: header first, then target
    checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 80, 0, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 32, 0, cudaMemcpyHostToDevice));

    return context_good[0][thr_id] != NULL && context_good[1][thr_id] != NULL;
}
// Queues one hashing pass on stream slot `stream` of thread `thr_id`:
//   1. fills the 9-word result buffer with 0xff bytes,
//   2. launches cuda_blake256_hash with (grid, threads),
//   3. if `hash` is non-NULL, queues a device-to-host copy into it:
//        do_d2h == true  -> all hashes (throughput * 8 uint32_t)
//        do_d2h == false -> only the winning nonce (one uint32_t)
// Everything is asynchronous on the stream; the caller must synchronize before
// reading `hash`.
void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
{
    uint32_t *d_good = context_good[stream][thr_id];

    checkCudaErrors(cudaMemsetAsync(d_good, 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));

    cuda_blake256_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, d_good, do_d2h);

    if (hash == NULL)
        return;

    if (do_d2h) {
        // full dump of every hash (lots of data)
        size_t mem_size = throughput * sizeof(uint32_t) * 8;
        checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
                        cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
    } else {
        // just the winning nonce stored in word 8 of the result buffer (4 bytes)
        checkCudaErrors(cudaMemcpyAsync(hash, d_good + 8, sizeof(uint32_t),
                        cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
    }
}

28
scrypt/code/scrypt-conf.h

@ -0,0 +1,28 @@ @@ -0,0 +1,28 @@
/*
pick the best algo at runtime or compile time?
----------------------------------------------
SCRYPT_CHOOSE_COMPILETIME (gcc only!)
SCRYPT_CHOOSE_RUNTIME
*/
#define SCRYPT_CHOOSE_RUNTIME
/*
hash function to use
-------------------------------
SCRYPT_BLAKE256
SCRYPT_BLAKE512
SCRYPT_SHA256
SCRYPT_SHA512
SCRYPT_SKEIN512
*/
//#define SCRYPT_SHA256
/*
block mixer to use
-----------------------------
SCRYPT_CHACHA
SCRYPT_SALSA
*/
//#define SCRYPT_SALSA

58
scrypt/code/scrypt-jane-chacha.h

@ -0,0 +1,58 @@ @@ -0,0 +1,58 @@
/*
	Configuration of the ChaCha20/8 block mixer: 32-bit mix words and
	64-byte blocks (16 words per block). Only the portable "basic"
	implementation is wired up here.
*/
#define SCRYPT_MIX_BASE "ChaCha20/8"
typedef uint32_t scrypt_mix_word_t;
/* word-level helpers used by the generic ROMix code */
#define SCRYPT_WORDTO8_LE U32TO8_LE
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
#define SCRYPT_BLOCK_BYTES 64
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
/* must have these here in case block bytes is ever != 64 */
#include "scrypt-jane-romix-basic.h"
#include "scrypt-jane-mix_chacha.h"
/* cpu agnostic */
/* these four names parameterise scrypt-jane-romix-template.h, which #undefs them again */
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
#define SCRYPT_MIX_FN chacha_core_basic
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
#include "scrypt-jane-romix-template.h"
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
/*
	Returns the ROMix implementation to use at runtime. Only the portable
	implementation exists in this build, so the result is always
	scrypt_ROMix_basic.
*/
static scrypt_ROMixfn
scrypt_getROMix() {
	/* the probe is still invoked as before; its result selects nothing here,
	   so it is no longer stored in an unused local */
	(void)detect_cpu();
	return scrypt_ROMix_basic;
}
#endif
#if defined(SCRYPT_TEST_SPEED)
/*
	Returns a bit mask of the optimised implementations usable on this CPU.
	No optimised implementations are compiled in, so the mask is always 0.
*/
static size_t
available_implementations() {
	/* the probe is still invoked as before; its result is not stored because
	   no flag is ever derived from it */
	(void)detect_cpu();
	return 0;
}
#endif
/*
	Self-test of the chunk mixer: runs every compiled-in implementation
	through scrypt_test_mix_instance and compares the last 16 output bytes
	with the reference vector below.
	Returns 1 when all tested implementations match, 0 otherwise (and 1 when
	no implementation is compiled in).
*/
static int
scrypt_test_mix() {
	static const uint8_t expected[16] = {
		0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a,
	};
	int ret = 1;

	/* the probe is still invoked as before; its result is not stored because
	   no CPU-specific variant is selected from it */
	(void)detect_cpu();

#if defined(SCRYPT_CHACHA_BASIC)
	ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
#endif

	return ret;
}

69
scrypt/code/scrypt-jane-mix_chacha.h

@ -0,0 +1,69 @@ @@ -0,0 +1,69 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)
#undef SCRYPT_MIX
#define SCRYPT_MIX "ChaCha20/8 Ref"
#undef SCRYPT_CHACHA_INCLUDED
#define SCRYPT_CHACHA_INCLUDED
#define SCRYPT_CHACHA_BASIC
/*
	ChaCha core with 8 rounds (4 double rounds), applied in place:
	state[i] += permuted(state)[i] for all 16 words.
*/
static void
chacha_core_basic(uint32_t state[16]) {
	uint32_t w[16], tmp;
	size_t k, remaining;

	/* working copy of the input */
	for (k = 0; k < 16; k++)
		w[k] = state[k];

	/* quarter round on the working words with indices a, b, c, d */
	#define qround(a,b,c,d) \
		w[a] += w[b]; tmp = w[d]^w[a]; w[d] = ROTL32(tmp,16); \
		w[c] += w[d]; tmp = w[b]^w[c]; w[b] = ROTL32(tmp,12); \
		w[a] += w[b]; tmp = w[d]^w[a]; w[d] = ROTL32(tmp, 8); \
		w[c] += w[d]; tmp = w[b]^w[c]; w[b] = ROTL32(tmp, 7);

	/* each pass is one column round followed by one diagonal round */
	for (remaining = 8; remaining != 0; remaining -= 2) {
		qround( 0, 4, 8,12)
		qround( 1, 5, 9,13)
		qround( 2, 6,10,14)
		qround( 3, 7,11,15)

		qround( 0, 5,10,15)
		qround( 1, 6,11,12)
		qround( 2, 7, 8,13)
		qround( 3, 4, 9,14)
	}

	/* feed-forward: add the permuted words onto the original state */
	for (k = 0; k < 16; k++)
		state[k] += w[k];

	#undef qround
}
#endif

32
scrypt/code/scrypt-jane-portable-x86.h

@ -0,0 +1,32 @@ @@ -0,0 +1,32 @@
/* CPU feature flags: no enumerators are defined in this stripped-down port.
   NOTE(review): an empty enumerator list is accepted by C++ but not by ISO C -
   confirm this header is only ever compiled as C++. */
typedef enum cpu_flags_x86_t { }cpu_flags_x86;
/* CPU vendor identifiers */
typedef enum cpu_vendors_x86_t {
cpu_nobody,
cpu_intel,
cpu_amd
} cpu_vendors_x86;
/* the four 32-bit general purpose registers eax, ebx, ecx and edx */
typedef struct x86_regs_t {
uint32_t eax, ebx, ecx, edx;
} x86_regs;
#if defined(SCRYPT_TEST_SPEED)
size_t cpu_detect_mask = (size_t)-1;
#endif
/*
	Returns the detected CPU feature flags. This port performs no detection
	and always reports an empty flag set (0).
*/
static size_t
detect_cpu(void) {
	return (size_t)0;
}
#if defined(SCRYPT_TEST_SPEED)
/*
	Returns a human readable name for the best implementation selected by
	`flag`. Only the basic implementation exists, so `flag` is ignored.
*/
static const char *
get_top_cpuflag_desc(size_t flag) {
	static const char *const description = "Basic";
	(void)flag;
	return description;
}
#endif
#define asm_calling_convention

284
scrypt/code/scrypt-jane-portable.h

@ -0,0 +1,284 @@ @@ -0,0 +1,284 @@
/* determine os */
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
#include <windows.h>
#include <wincrypt.h>
#define OS_WINDOWS
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
#include <sys/mman.h>
#include <sys/time.h>
#include <fcntl.h>
#define OS_SOLARIS
#else
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/param.h> /* need this to define BSD */
#include <unistd.h>
#include <fcntl.h>
#define OS_NIX
#if defined(__linux__)
#include <endian.h>
#define OS_LINUX
#elif defined(BSD)
#define OS_BSD
#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
#define OS_OSX
#elif defined(macintosh) || defined(Macintosh)
#define OS_MAC
#elif defined(__OpenBSD__)
#define OS_OPENBSD
#endif
#endif
#endif
/* determine compiler */
#if defined(_MSC_VER)
#define COMPILER_MSVC _MSC_VER
#if ((COMPILER_MSVC > 1200) || defined(_mm_free))
#define COMPILER_MSVC6PP_AND_LATER
#endif
#if (COMPILER_MSVC >= 1500)
#define COMPILER_HAS_TMMINTRIN
#endif
#pragma warning(disable : 4127) /* conditional expression is constant */
#pragma warning(disable : 4100) /* unreferenced formal parameter */
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <float.h>
#include <stdlib.h> /* _rotl */
#include <intrin.h>
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef signed int int32_t;
typedef unsigned __int64 uint64_t;
typedef signed __int64 int64_t;
#define ROTL32(a,b) _rotl(a,b)
#define ROTR32(a,b) _rotr(a,b)
#define ROTL64(a,b) _rotl64(a,b)
#define ROTR64(a,b) _rotr64(a,b)
#undef NOINLINE
#define NOINLINE __declspec(noinline)
#undef INLINE
#define INLINE __forceinline
#undef FASTCALL
#define FASTCALL __fastcall
#undef CDECL
#define CDECL __cdecl
#undef STDCALL
#define STDCALL __stdcall
#undef NAKED
#define NAKED __declspec(naked)
#define MM16 __declspec(align(16))
#endif
#if defined(__ICC)
#define COMPILER_INTEL
#endif
/* GCC (and compatible compilers): version number, rotate helpers and
   function attribute macros */
#if defined(__GNUC__)
	#if (__GNUC__ >= 3)
		#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
	#else
		#define COMPILER_GCC_PATCHLEVEL 0
	#endif
	/* e.g. 4.8.2 -> 40802 */
	#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
	/* Rotates. Every use of an argument is parenthesized so that expression
	   arguments (e.g. ROTL32(x, n + 1)) expand with the intended precedence.
	   The shift count must be in 1..31 (1..63 for the 64-bit forms). */
	#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
	#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
	#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - (b))))
	#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - (b))))
	#undef NOINLINE
	#if (COMPILER_GCC >= 30000)
		#define NOINLINE __attribute__((noinline))
	#else
		#define NOINLINE
	#endif
	#undef INLINE
	#if (COMPILER_GCC >= 30000)
		#define INLINE __attribute__((always_inline))
	#else
		#define INLINE inline
	#endif
	#undef FASTCALL
	#if (COMPILER_GCC >= 30400)
		#define FASTCALL __attribute__((fastcall))
	#else
		#define FASTCALL
	#endif
	#undef CDECL
	#define CDECL __attribute__((cdecl))
	#undef STDCALL
	#define STDCALL __attribute__((stdcall))
	/* 16-byte alignment for SIMD-friendly buffers */
	#define MM16 __attribute__((aligned(16)))
	#include <stdint.h>
#endif
#if defined(__MINGW32__) || defined(__MINGW64__)
#define COMPILER_MINGW
#endif
#if defined(__PATHCC__)
#define COMPILER_PATHCC
#endif
#define OPTIONAL_INLINE
#if defined(OPTIONAL_INLINE)
#undef OPTIONAL_INLINE
#define OPTIONAL_INLINE INLINE
#else
#define OPTIONAL_INLINE
#endif
#define CRYPTO_FN NOINLINE STDCALL
/* determine cpu */
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
#define CPU_X86_64
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
#define CPU_X86 500
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
#define CPU_X86 400
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
#define CPU_X86 300
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
#define CPU_IA64
#endif
#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
#define CPU_SPARC
#if defined(__sparcv9)
#define CPU_SPARC64
#endif
#endif
#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
#define CPU_64BITS
#undef FASTCALL
#define FASTCALL
#undef CDECL
#define CDECL
#undef STDCALL
#define STDCALL
#endif
#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
#define CPU_PPC
#if defined(_ARCH_PWR7)
#define CPU_POWER7
#elif defined(__64BIT__)
#define CPU_PPC64
#else
#define CPU_PPC32
#endif
#endif
#if defined(__hppa__) || defined(__hppa)
#define CPU_HPPA
#endif
#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
#define CPU_ALPHA
#endif
/* endian */
#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
(defined(CPU_X86) || defined(CPU_X86_64)) || \
(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
#define CPU_LE
#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
#define CPU_BE
#else
/* unknown endian! */
#endif
/* Byte-order helpers. `p` is indexed as a byte array; `v` is an integer value.
   All of them evaluate their arguments more than once, so do not pass
   expressions with side effects. */

/* read a 32-bit value from 4 bytes, most significant byte first */
#define U8TO32_BE(p) \
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) ))
/* read a 32-bit value from 4 bytes, least significant byte first */
#define U8TO32_LE(p) \
(((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \
((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
/* The U32TO8 and U64TO8 store macros expand to several statements, each
   already terminated by ';' - use them only inside a braced block, never as
   the sole body of an unbraced if/for/while. */
/* store a 32-bit value into 4 bytes, most significant byte first */
#define U32TO8_BE(p, v) \
(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
(p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
/* store a 32-bit value into 4 bytes, least significant byte first */
#define U32TO8_LE(p, v) \
(p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \
(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
/* 64-bit reads built from two 32-bit reads */
#define U8TO64_BE(p) \
(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
#define U8TO64_LE(p) \
(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
/* 64-bit stores built from two 32-bit stores */
#define U64TO8_BE(p, v) \
U32TO8_BE((p), (uint32_t)((v) >> 32)); \
U32TO8_BE((p) + 4, (uint32_t)((v) ));
#define U64TO8_LE(p, v) \
U32TO8_LE((p), (uint32_t)((v) )); \
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
/* in-place byte reversal of a 32-bit lvalue: swap adjacent bytes, then halves */
#define U32_SWAP(v) { \
(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \
(v) = ((v) << 16) | ((v) >> 16); \
}
/* in-place byte reversal of a 64-bit lvalue: bytes, then 16-bit pairs, then halves */
#define U64_SWAP(v) { \
(v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \
(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \
(v) = ((v) << 32) | ((v) >> 32); \
}
/*
	Compares len bytes of x and y without an early exit: every byte pair is
	visited regardless of where the first difference is.
	Returns 1 when the buffers are equal (also for len == 0), 0 otherwise.
*/
static int
scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
	uint32_t acc = 0;
	size_t k;

	/* accumulate every differing bit; acc stays within 0..255 */
	for (k = 0; k < len; k++)
		acc |= (uint32_t)(x[k] ^ y[k]);

	/* acc == 0 wraps to 0xFFFFFFFF, so bit 8 is set; 1..255 leaves it clear */
	return (int)(((acc - 1) >> 8) & 1);
}
/*
	Overwrites len bytes starting at p with zero in a way the compiler is not
	allowed to optimise away (intrinsic, volatile asm or volatile stores,
	depending on the platform). len == 0 is a no-op.
*/
void
scrypt_ensure_zero(void *p, size_t len) {
#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
	__stosb((unsigned char *)p, 0, len);
#elif ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_GCC))
	/* "rep stosb" advances (e/r)di and decrements (e/r)cx, so both are
	   declared as read-write operands. This tells the compiler the truth
	   about the registers and avoids saving them with push/pop, which on
	   x86-64 would write into the red zone below the stack pointer. */
	__asm__ __volatile__(
		"rep stosb"
		: "+D"(p), "+c"(len)
		: "a"(0)
		: "cc", "memory"
	);
#else
	/* portable fallback: volatile stores cannot be elided */
	volatile uint8_t *b = (volatile uint8_t *)p;
	size_t i;
	for (i = 0; i < len; i++)
		b[i] = 0;
#endif
}
#include "scrypt-jane-portable-x86.h"

67
scrypt/code/scrypt-jane-romix-basic.h

@ -0,0 +1,67 @@ @@ -0,0 +1,67 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
/* function type returned by scrypt_getROMix, used with cpu detection */
typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
#endif
/* romix pre/post nop function */
/* romix pre/post hook that intentionally leaves the blocks untouched */
static void asm_calling_convention
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
	(void)blocks;
	(void)nblocks;
}
/* romix pre/post endian conversion function */
/*
	romix pre/post hook: byte-swaps every word of `nblocks` blocks in place
	when the host turns out to be big endian. Compiled to an empty function
	when CPU_LE is defined.
*/
static void asm_calling_convention
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
#if !defined(CPU_LE)
	/* runtime probe: on a big endian host the bytes {1,0} read back as 0x100 */
	static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};

	if (endian_test.w == 0x100) {
		scrypt_mix_word_t *word = blocks;
		scrypt_mix_word_t *const end = blocks + (nblocks * SCRYPT_BLOCK_WORDS);

		for (; word != end; word++) {
			SCRYPT_WORD_ENDIAN_SWAP(*word);
		}
	}
#endif
}
/* chunkmix test function */
typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
/*
	Runs one chunk-mix implementation on a fixed input pattern and checks the
	last 16 bytes of the result.
	  mixfn     chunk mixer under test (called with Bxor == NULL, r == 2)
	  prefn     hook applied to the input chunk before mixing
	  postfn    hook applied to the output chunk after mixing
	  expected  reference value for the final 16 output bytes
	Returns 1 on match, 0 on mismatch.
*/
static int
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
	/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
	const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
	const size_t wordBytes = sizeof(scrypt_mix_word_t);
	const size_t tailWords = 16 / wordBytes;
	scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS];
	scrypt_mix_word_t pattern;
	uint8_t digest[16];
	size_t k;

	/* input word k has its index replicated into its low four bytes */
	for (k = 0; k < words; k++) {
		pattern = (scrypt_mix_word_t)k;
		pattern |= pattern << 8;
		pattern |= pattern << 16;
		chunk[0][k] = pattern;
	}

	prefn(chunk[0], blocks);
	mixfn(chunk[1], chunk[0], NULL, r);
	postfn(chunk[1], blocks);

	/* serialise the last 16 bytes of the final block, little endian */
	for (k = 0; k < tailWords; k++) {
		SCRYPT_WORDTO8_LE(digest + (k * wordBytes), chunk[1][words - tailWords + k]);
	}

	return scrypt_verify(expected, digest, 16);
}
/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */
/*
	Returns a pointer to item i, where each item is len scrypt_mix_word_t's
	long. The offset is computed in size_t: multiplying in the (32-bit) word
	type would wrap once i * len reaches 2^32, i.e. for very large scratchpads.
*/
static scrypt_mix_word_t *
scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
	return base + ((size_t)i * (size_t)len);
}
/* returns a pointer to block i */
/* returns a pointer to the first word of block i (SCRYPT_BLOCK_WORDS words per block) */
static scrypt_mix_word_t *
scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
	return &base[i * SCRYPT_BLOCK_WORDS];
}

179
scrypt/code/scrypt-jane-romix-template.h

@ -0,0 +1,179 @@ @@ -0,0 +1,179 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
#if defined(SCRYPT_CHOOSE_COMPILETIME)
#undef SCRYPT_ROMIX_FN
#define SCRYPT_ROMIX_FN scrypt_ROMix
#endif
#undef SCRYPT_HAVE_ROMIX
#define SCRYPT_HAVE_ROMIX
#if !defined(SCRYPT_CHUNKMIX_FN)
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
/*
	Bout = ChunkMix(Bin ^ Bxor)

	Bin, Bout: chunks of 2*r blocks each
	Bxor:      optional chunk that is xored into Bin on the fly (may be NULL)
	2*r:       number of blocks in the chunk

	Blocks produced at even steps land in Bout[0..r-1], blocks produced at
	odd steps in Bout[r..2r-1].
*/
static void asm_calling_convention
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
	scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS];
	const scrypt_mix_word_t *src;
	scrypt_mix_word_t *dst;
	const uint32_t nblocks = r * 2;
	uint32_t b, w;

	/* 1: X = B_{2r - 1} */
	src = scrypt_block(Bin, nblocks - 1);
	for (w = 0; w < SCRYPT_BLOCK_WORDS; w++)
		X[w] = src[w];

	if (Bxor) {
		src = scrypt_block(Bxor, nblocks - 1);
		for (w = 0; w < SCRYPT_BLOCK_WORDS; w++)
			X[w] ^= src[w];
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (b = 0; b < nblocks; b++) {
		/* 3: X = H(X ^ B_i) */
		src = scrypt_block(Bin, b);
		for (w = 0; w < SCRYPT_BLOCK_WORDS; w++)
			X[w] ^= src[w];

		if (Bxor) {
			src = scrypt_block(Bxor, b);
			for (w = 0; w < SCRYPT_BLOCK_WORDS; w++)
				X[w] ^= src[w];
		}

		SCRYPT_MIX_FN(X);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		dst = scrypt_block(Bout, (b >> 1) + ((b & 1) ? r : 0));
		for (w = 0; w < SCRYPT_BLOCK_WORDS; w++)
			dst[w] = X[w];
	}
}
#endif
/*
	X = ROMix(X)

	X: chunk to mix (updated in place)
	Y: scratch chunk
	N: number of rounds; the lookups mask with (N - 1) and the second loop
	   advances in steps of two
	V[N]: array of chunks to randomly index in to
	2*r: number of blocks in a chunk
*/
static void NOINLINE FASTCALL
SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
	const uint32_t chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
	const uint32_t lastIndex = N - 1;
	/* first word of the last block of a chunk: the Integerify() source */
	const size_t tailWord = chunkWords - SCRYPT_BLOCK_WORDS;
	scrypt_mix_word_t *chunk = V;
	uint32_t step, j;

	SCRYPT_ROMIX_TANGLE_FN(X, r * 2);

	/* phase 1: V_0 = X, then V_{i+1} = ChunkMix(V_i), written straight into V */
	memcpy(chunk, X, chunkWords * sizeof(scrypt_mix_word_t));
	for (step = 0; step < lastIndex; step++) {
		SCRYPT_CHUNKMIX_FN(chunk + chunkWords, chunk, NULL, r);
		chunk += chunkWords;
	}
	/* X = ChunkMix(V_{N-1}) */
	SCRYPT_CHUNKMIX_FN(X, chunk, NULL, r);

	/* phase 2: N data-dependent lookups, two per iteration, ping-ponging X and Y */
	step = 0;
	while (step < N) {
		/* j = Integerify(X) % N;  Y = ChunkMix(X ^ V_j) */
		j = X[tailWord] & lastIndex;
		SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);

		/* j = Integerify(Y) % N;  X = ChunkMix(Y ^ V_j) */
		j = Y[tailWord] & lastIndex;
		SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);

		step += 2;
	}

	/* the result is left in X */
	SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
}
/*
* Special version with hard-coded r = 1
* - mikaelh
*
* Same algorithm as the generic ROMix above, with two blocks per chunk.
* When SCRYPT_CHUNKMIX_1_FN / SCRYPT_CHUNKMIX_1_XOR_FN are defined they are
* called instead of the generic chunk mixer (no Bxor / no r argument).
*
* X: chunk to mix (updated in place)
* Y: scratch chunk
* V: N chunks of scratch memory
* N: number of rounds; lookups mask with (N - 1), the second loop steps by two
*/
static void NOINLINE FASTCALL
scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) {
const uint32_t r = 1;
uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
scrypt_mix_word_t *block = V;
SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
/* 1: X = B */
/* implicit */
/* 2: fill V: V_0 = X, then V_{i+1} = ChunkMix(V_i) for i = 0 .. N - 2 */
memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
for (i = 0; i < N - 1; i++, block += chunkWords) {
/* 3: V_i is already in place at `block` */
/* 4: V_{i+1} = H(V_i), written directly into the next slot */
#ifdef SCRYPT_CHUNKMIX_1_FN
SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block);
#else
SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
#endif
}
/* X = H(V_{N-1}) */
#ifdef SCRYPT_CHUNKMIX_1_FN
SCRYPT_CHUNKMIX_1_FN(X, block);
#else
SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
#endif
/* 6: N lookups, two per iteration (X -> Y, then Y -> X) */
for (i = 0; i < N; i += 2) {
/* 7: j = Integerify(X) % N */
j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
/* 8: Y = H(X ^ V_j) */
#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords));
#else
SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
#endif
/* 7: j = Integerify(Y) % N */
j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
/* 8: X = H(Y ^ V_j) */
#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords));
#else
SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
#endif
}
/* 10: B' = X */
/* implicit */
SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
}
#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
#undef SCRYPT_CHUNKMIX_FN
#undef SCRYPT_ROMIX_FN
#undef SCRYPT_MIX_FN
#undef SCRYPT_ROMIX_TANGLE_FN
#undef SCRYPT_ROMIX_UNTANGLE_FN

1
scrypt/code/scrypt-jane-romix.h

@ -0,0 +1 @@ @@ -0,0 +1 @@
#include "scrypt-jane-chacha.h"

907
scrypt/fermi_kernel.cu

@ -0,0 +1,907 @@ @@ -0,0 +1,907 @@
//
// Kernel that runs best on Fermi devices
//
// - shared memory use reduced by nearly factor 2 over legacy kernel
// by transferring only half work units (16 x uint32_t) at once.
// - uses ulong2/uint4 based memory transfers (each thread moves 16 bytes),
// allowing for shorter unrolled loops. This relies on Fermi's better
// memory controllers to get high memory troughput.
//
// NOTE: compile this .cu module for compute_20,sm_20 with --maxrregcount=63
//
// TODO: batch-size support for this kernel
//
#include <map>
#include "cuda_runtime.h"
#include "miner.h"
#include "salsa_kernel.h"
#include "fermi_kernel.h"
#define THREADS_PER_WU 1 // single thread per hash
#define TEXWIDTH 32768
// forward references
template <int ALGO> __global__ void fermi_scrypt_core_kernelA(uint32_t *g_idata, unsigned int N);
template <int ALGO> __global__ void fermi_scrypt_core_kernelB(uint32_t *g_odata, unsigned int N);
template <int ALGO, int TEX_DIM> __global__ void fermi_scrypt_core_kernelB_tex(uint32_t *g_odata, unsigned int N);
template <int ALGO> __global__ void fermi_scrypt_core_kernelA_LG(uint32_t *g_idata, unsigned int N, unsigned int LOOKUP_GAP);
template <int ALGO> __global__ void fermi_scrypt_core_kernelB_LG(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP);
template <int ALGO, int TEX_DIM> __global__ void fermi_scrypt_core_kernelB_LG_tex(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP);
// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes)
__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT];
// using texture references for the "tex" variants of the B kernels
texture<uint4, 1, cudaReadModeElementType> texRef1D_4_V;
texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V;
// Default constructor: only runs the KernelInterface base constructor.
FermiKernel::FermiKernel()
    : KernelInterface()
{
    // no per-instance state to set up
}
// Binds `size` bytes of the scratch buffer d_V to the 1D uint4 texture
// reference (unnormalized coordinates, point filtering, clamped addressing).
// Always returns true; CUDA errors are handled by checkCudaErrors.
bool FermiKernel::bindtexture_1D(uint32_t *d_V, size_t size)
{
    texRef1D_4_V.addressMode[0] = cudaAddressModeClamp;
    texRef1D_4_V.filterMode = cudaFilterModePoint;
    texRef1D_4_V.normalized = 0;

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint4>();
    checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &desc, size));
    return true;
}
// Binds the scratch buffer d_V to the 2D uint4 texture reference.
//   width, height  requested extent; rescaled below so that the bound texture
//                  is TEXWIDTH wide when width is a power-of-two multiple or
//                  fraction of TEXWIDTH (height and pitch are adjusted to match)
//   pitch          row pitch, scaled together with the width
// Returns false without binding anything when width is not positive (the
// doubling loop below could never terminate for such a value); otherwise
// returns true, with CUDA errors handled by checkCudaErrors.
bool FermiKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch)
{
    // width <= 0 would make "width *= 2" spin forever
    if (width <= 0)
        return false;

    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
    texRef2D_4_V.normalized = 0;
    texRef2D_4_V.filterMode = cudaFilterModePoint;
    texRef2D_4_V.addressMode[0] = cudaAddressModeClamp;
    texRef2D_4_V.addressMode[1] = cudaAddressModeClamp;

    // maintain texture width of TEXWIDTH (max. limit is 65000)
    while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; }
    while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; }

    // fprintf(stderr, "total size: %u, %u bytes\n", pitch * height, width * sizeof(uint32_t) * 4 * height);
    // fprintf(stderr, "binding width width=%d, height=%d, pitch=%d\n", width, height,pitch);
    checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch));
    return true;
}
// Releases the binding of the 1D texture reference. Always returns true;
// CUDA errors are handled by checkCudaErrors.
bool FermiKernel::unbindtexture_1D()
{
    const bool done = true;
    checkCudaErrors(cudaUnbindTexture(texRef1D_4_V));
    return done;
}
// Releases the binding of the 2D texture reference. Always returns true;
// CUDA errors are handled by checkCudaErrors.
bool FermiKernel::unbindtexture_2D()
{
    const bool done = true;
    checkCudaErrors(cudaUnbindTexture(texRef2D_4_V));
    return done;
}
// Uploads the first MAXWARPS per-warp scratch buffer pointers from h_V into
// the __constant__ table c_V (which has TOTAL_WARP_LIMIT entries).
void FermiKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
{
    const size_t table_bytes = MAXWARPS*sizeof(uint32_t*);
    checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, table_bytes, 0, cudaMemcpyHostToDevice));
}
// Queues the two scrypt core phases on `stream`:
//   phase A writes the scratchpad sequentially (input d_idata),
//   phase B reads it back at data-dependent positions (output d_odata).
// The kernel variant is chosen from
//   - the algorithm (IS_SCRYPT() / IS_SCRYPT_JANE()),
//   - LOOKUP_GAP (== 1 selects the plain kernels, anything else the _LG ones),
//   - texture_cache (0: global memory reads, 1: 1D texture, 2: 2D texture).
// Dynamic shared memory per block: WARPS_PER_BLOCK * WU_PER_WARP * (16+4) words.
// Returns false when texture_cache is not 0, 1 or 2; phase A has already been
// queued by then. thr_id, interactive and benchmark are not used here.
// Launch errors are not queried in this function.
bool FermiKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
{
    const int shared = WARPS_PER_BLOCK * WU_PER_WARP * (16+4) * sizeof(uint32_t);
    const bool no_gap = (LOOKUP_GAP == 1);
    bool success = true;

    // First phase: Sequential writes to scratchpad.
    if (no_gap) {
        if (IS_SCRYPT())      fermi_scrypt_core_kernelA<A_SCRYPT><<< grid, threads, shared, stream >>>(d_idata, N);
        if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelA<A_SCRYPT_JANE><<< grid, threads, shared, stream >>>(d_idata, N);
    } else {
        if (IS_SCRYPT())      fermi_scrypt_core_kernelA_LG<A_SCRYPT><<< grid, threads, shared, stream >>>(d_idata, N, LOOKUP_GAP);
        if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelA_LG<A_SCRYPT_JANE><<< grid, threads, shared, stream >>>(d_idata, N, LOOKUP_GAP);
    }

    // Second phase: Random read access from scratchpad.
    switch (texture_cache) {
    case 0: // no texture cache: read the scratchpad through global memory
        if (no_gap) {
            if (IS_SCRYPT())      fermi_scrypt_core_kernelB<A_SCRYPT><<< grid, threads, shared, stream >>>(d_odata, N);
            if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB<A_SCRYPT_JANE><<< grid, threads, shared, stream >>>(d_odata, N);
        } else {
            if (IS_SCRYPT())      fermi_scrypt_core_kernelB_LG<A_SCRYPT><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
            if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG<A_SCRYPT_JANE><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
        }
        break;

    case 1: // 1D texture
        if (no_gap) {
            if (IS_SCRYPT())      fermi_scrypt_core_kernelB_tex<A_SCRYPT,1><<< grid, threads, shared, stream >>>(d_odata, N);
            if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_tex<A_SCRYPT_JANE,1><<< grid, threads, shared, stream >>>(d_odata, N);
        } else {
            if (IS_SCRYPT())      fermi_scrypt_core_kernelB_LG_tex<A_SCRYPT,1><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
            if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG_tex<A_SCRYPT_JANE,1><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
        }
        break;

    case 2: // 2D texture
        if (no_gap) {
            if (IS_SCRYPT())      fermi_scrypt_core_kernelB_tex<A_SCRYPT,2><<< grid, threads, shared, stream >>>(d_odata, N);
            if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_tex<A_SCRYPT_JANE,2><<< grid, threads, shared, stream >>>(d_odata, N);
        } else {
            if (IS_SCRYPT())      fermi_scrypt_core_kernelB_LG_tex<A_SCRYPT,2><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
            if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG_tex<A_SCRYPT_JANE,2><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
        }
        break;

    default: // unknown texture mode: nothing is launched for phase B
        success = false;
        break;
    }

    return success;
}
#if 0
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
#define QUARTER(a,b,c,d) \
a += b; d ^= a; d = ROTL(d,16); \
c += d; b ^= c; b = ROTL(b,12); \
a += b; d ^= a; d = ROTL(d,8); \
c += d; b ^= c; b = ROTL(b,7);
// Reference variant (compiled out by the surrounding #if 0):
//   B ^= C;  B += chacha8_permutation(B)
// B and C each point at four uint4, i.e. one 16-word block.
static __device__ void xor_chacha8(uint4 *B, uint4 *C)
{
    uint32_t x[16];

    // xor C into B and keep a working copy of the result in x[]
    for (int q = 0; q < 4; ++q) {
        x[4*q+0] = (B[q].x ^= C[q].x);
        x[4*q+1] = (B[q].y ^= C[q].y);
        x[4*q+2] = (B[q].z ^= C[q].z);
        x[4*q+3] = (B[q].w ^= C[q].w);
    }

    // 8 rounds = 4 x (column round + diagonal round)
    for (int dr = 0; dr < 4; ++dr) {
        /* Operate on columns. */
        QUARTER( x[0], x[4], x[ 8], x[12] )
        QUARTER( x[1], x[5], x[ 9], x[13] )
        QUARTER( x[2], x[6], x[10], x[14] )
        QUARTER( x[3], x[7], x[11], x[15] )

        /* Operate on diagonals */
        QUARTER( x[0], x[5], x[10], x[15] )
        QUARTER( x[1], x[6], x[11], x[12] )
        QUARTER( x[2], x[7], x[ 8], x[13] )
        QUARTER( x[3], x[4], x[ 9], x[14] )
    }

    // feed-forward: add the permuted words onto B
    for (int q = 0; q < 4; ++q) {
        B[q].x += x[4*q+0];
        B[q].y += x[4*q+1];
        B[q].z += x[4*q+2];
        B[q].w += x[4*q+3];
    }
}
#else
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
#define ADD4(d1,d2,d3,d4,s1,s2,s3,s4) \
d1 += s1; d2 += s2; d3 += s3; d4 += s4;
#define XOR4(d1,d2,d3,d4,s1,s2,s3,s4) \
d1 ^= s1; d2 ^= s2; d3 ^= s3; d4 ^= s4;
#define ROTL4(d1,d2,d3,d4,amt) \
d1 = ROTL(d1, amt); d2 = ROTL(d2, amt); d3 = ROTL(d3, amt); d4 = ROTL(d4, amt);
#define QROUND(a1,a2,a3,a4, b1,b2,b3,b4, c1,c2,c3,c4, amt) \
ADD4 (a1,a2,a3,a4, c1,c2,c3,c4) \
XOR4 (b1,b2,b3,b4, a1,a2,a3,a4) \
ROTL4(b1,b2,b3,b4, amt)
// B ^= C, then B += P(B) where P is four column/diagonal double rounds built
// from QROUND, applied to a 16-word working copy of the XORed block.
// B and C each point to four uint4 (16 32-bit words). C is only read.
static __device__ void xor_chacha8(uint4 *B, uint4 *C)
{
    uint32_t x[16];

    // XOR C into B and keep a working copy of the result in x[].
    #pragma unroll 4
    for (int q = 0; q < 4; ++q) {
        B[q].x ^= C[q].x;  x[4*q + 0] = B[q].x;
        B[q].y ^= C[q].y;  x[4*q + 1] = B[q].y;
        B[q].z ^= C[q].z;  x[4*q + 2] = B[q].z;
        B[q].w ^= C[q].w;  x[4*q + 3] = B[q].w;
    }

    // Four double rounds; each is one pass over the columns followed by one
    // pass over the diagonals (rotation amounts 16, 12, 8, 7).
    #pragma unroll 4
    for (int dround = 0; dround < 4; ++dround) {
        /* Operate on columns. */
        QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16);
        QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12);
        QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7],  8);
        QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15],  7);
        /* Operate on diagonals */
        QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16);
        QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12);
        QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4],  8);
        QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14],  7);
    }

    // Feed-forward: add the permuted words back onto the XORed block.
    #pragma unroll 4
    for (int q = 0; q < 4; ++q) {
        B[q].x += x[4*q + 0];
        B[q].y += x[4*q + 1];
        B[q].z += x[4*q + 2];
        B[q].w += x[4*q + 3];
    }
}
#endif
// Salsa20-style update steps, four lanes at a time:
//   aN ^= rotl32(aN0, R)   for R = 7, 9, 13, 18
// The second group of arguments (a00..a30) are the values to rotate; callers
// pass sums such as x[0]+x[12], so every use is parenthesized. Operands are
// expected to be 32-bit unsigned values.
//
// Each macro expands to a single braced block and ends on its closing brace.
// There is deliberately no trailing line continuation after the brace: a
// trailing backslash would splice whatever line follows (the next #define, or
// a function signature) into the macro body unless a blank line intervened.
#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30){\
a0^=(((a00)<<7) | ((a00)>>25) );\
a1^=(((a10)<<7) | ((a10)>>25) );\
a2^=(((a20)<<7) | ((a20)>>25) );\
a3^=(((a30)<<7) | ((a30)>>25) );\
}
#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30){\
a0^=(((a00)<<9) | ((a00)>>23) );\
a1^=(((a10)<<9) | ((a10)>>23) );\
a2^=(((a20)<<9) | ((a20)>>23) );\
a3^=(((a30)<<9) | ((a30)>>23) );\
}
#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30){\
a0^=(((a00)<<13) | ((a00)>>19) );\
a1^=(((a10)<<13) | ((a10)>>19) );\
a2^=(((a20)<<13) | ((a20)>>19) );\
a3^=(((a30)<<13) | ((a30)>>19) );\
}
#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30){\
a0^=(((a00)<<18) | ((a00)>>14) );\
a1^=(((a10)<<18) | ((a10)>>14) );\
a2^=(((a20)<<18) | ((a20)>>14) );\
a3^=(((a30)<<18) | ((a30)>>14) );\
}
// B ^= C, then B += P(B) where P is four column/row double rounds built from
// the ROTL7/ROTL9/ROTL13/ROTL18 steps, applied to a 16-word working copy of
// the XORed block. B and C each point to four uint4 (16 32-bit words).
// C is only read.
static __device__ void xor_salsa8(uint4 *B, uint4 *C)
{
    uint32_t x[16];

    // XOR C into B and keep a working copy of the result in x[].
    #pragma unroll 4
    for (int q = 0; q < 4; ++q) {
        B[q].x ^= C[q].x;  x[4*q + 0] = B[q].x;
        B[q].y ^= C[q].y;  x[4*q + 1] = B[q].y;
        B[q].z ^= C[q].z;  x[4*q + 2] = B[q].z;
        B[q].w ^= C[q].w;  x[4*q + 3] = B[q].w;
    }

    // Four double rounds: one pass over the columns, then one over the rows.
    #pragma unroll 4
    for (int dround = 0; dround < 4; ++dround) {
        /* Operate on columns. */
        ROTL7 (x[ 4],x[ 9],x[14],x[ 3], x[0]+x[12], x[1]+x[ 5], x[ 6]+x[10], x[11]+x[15]);
        ROTL9 (x[ 8],x[13],x[ 2],x[ 7], x[0]+x[ 4], x[5]+x[ 9], x[10]+x[14], x[ 3]+x[15]);
        ROTL13(x[12],x[ 1],x[ 6],x[11], x[4]+x[ 8], x[9]+x[13], x[ 2]+x[14], x[ 3]+x[ 7]);
        ROTL18(x[ 0],x[ 5],x[10],x[15], x[8]+x[12], x[1]+x[13], x[ 2]+x[ 6], x[ 7]+x[11]);
        /* Operate on rows. */
        ROTL7 (x[ 1],x[ 6],x[11],x[12], x[0]+x[ 3], x[4]+x[ 5], x[ 9]+x[10], x[14]+x[15]);
        ROTL9 (x[ 2],x[ 7],x[ 8],x[13], x[0]+x[ 1], x[5]+x[ 6], x[10]+x[11], x[12]+x[15]);
        ROTL13(x[ 3],x[ 4],x[ 9],x[14], x[1]+x[ 2], x[6]+x[ 7], x[ 8]+x[11], x[12]+x[13]);
        ROTL18(x[ 0],x[ 5],x[10],x[15], x[2]+x[ 3], x[4]+x[ 7], x[ 8]+x[ 9], x[13]+x[14]);
    }

    // Feed-forward: add the permuted words back onto the XORed block.
    #pragma unroll 4
    for (int q = 0; q < 4; ++q) {
        B[q].x += x[4*q + 0];
        B[q].y += x[4*q + 1];
        B[q].z += x[4*q + 2];
        B[q].w += x[4*q + 3];
    }
}
// Component-wise XOR-assign for uint4: left.{x,y,z,w} ^= right.{x,y,z,w}.
// Returns a reference to the updated left operand.
static __device__ __forceinline__ uint4& operator^=(uint4& left, const uint4& right)
{
    left.x = left.x ^ right.x;
    left.y = left.y ^ right.y;
    left.z = left.z ^ right.z;
    left.w = left.w ^ right.w;
    return left;
}
////////////////////////////////////////////////////////////////////////////////
//! Scrypt core kernel for Fermi class devices, phase A: fills the scratchpad.
//!
//! Each thread owns one work unit of 32 words, held as B (words 0..15) and
//! C (words 16..31). Entry 0 of the scratchpad is the input itself; entries
//! 1..N-1 are produced by repeatedly applying the mixing function selected by
//! ALGO. Nothing is written back to g_idata.
//!
//! @tparam ALGO    A_SCRYPT (xor_salsa8) or A_SCRYPT_JANE (xor_chacha8); any
//!                 other value leaves the state unmixed.
//! @param g_idata  input data in global memory, 32 words per work unit
//! @param N        number of scratchpad entries to generate
//!
//! Requires dynamic shared memory holding one [WU_PER_WARP][16+4] uint32_t
//! array per warp in the block. The scratch buffer base pointers come from
//! c_V, which is declared outside this function.
//!
//! NOTE(review): shared memory is written by one thread and read by another
//! thread of the same warp with no explicit barrier in between; this
//! presumably relies on warp-synchronous execution - confirm for the target
//! architecture.
////////////////////////////////////////////////////////////////////////////////
template <int ALGO> __global__
void fermi_scrypt_core_kernelA(uint32_t *g_idata, unsigned int N)
{
extern __shared__ unsigned char x[];
// View the dynamic shared memory as X[warp][work unit][16 data words + 4 extra words].
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;
int warpIdx = threadIdx.x / warpSize;
int warpThread = threadIdx.x % warpSize;
// Not referenced by name below; presumably consumed by the SCRATCH macro
// (defined elsewhere) - verify before removing or renaming.
const unsigned int LOOKUP_GAP = 1;
// variables supporting the large memory transaction magic
// Each thread moves 4 words (16 bytes) per transfer: column Z of row Y.
// Four consecutive threads cover one 16-word row, so a warp covers 8 rows
// per pass and the wu = 0, 8, 16, 24 loops below cover all 32 rows.
unsigned int Y = warpThread/4;
unsigned int Z = 4*(warpThread%4);
// add block specific offsets
// Not referenced by name below; presumably consumed by the WU_PER_BLOCK macro
// (defined elsewhere) - verify before removing or renaming.
int WARPS_PER_BLOCK = blockDim.x / 32;
// Index of the first work unit handled by this warp.
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
g_idata += 32 * offset;
// This warp's scratch buffer, pre-offset to row Y / column Z.
uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z;
// registers to store an entire work unit
uint4 B[4], C[4];
// XB[wu] addresses X[warpIdx][Y+wu][Z] (this thread's transfer slot);
// XX is the row belonging to this thread's own work unit.
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
uint32_t *XX = X[warpIdx][warpThread];
// Copy words 0..15 of every work unit from global input to both scratchpad
// entry 0 and shared memory, then pick up this thread's own row as B.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&V[SCRATCH*wu])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+Z]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);
// Same for words 16..31, which become C.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&V[SCRATCH*wu+16])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+16+Z]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);
// Generate scratchpad entries 1..N-1; entry i occupies words i*32 .. i*32+31
// of each work unit's scratch area.
for (int i = 1; i < N; i++) {
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
// Stage B in this thread's shared row, then let the warp write it out in
// 16-byte pieces.
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&V[SCRATCH*wu + i*32])) = *((ulonglong2*)XB[wu]);
// Same for C (second half of the entry).
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&V[SCRATCH*wu + i*32 + 16])) = *((ulonglong2*)XB[wu]);
}
}
// Scrypt core kernel, phase B: the data-dependent scratchpad lookups.
//
// Each thread owns one work unit (B = words 0..15, C = words 16..31). The
// state is reloaded from scratchpad entry N-1 and mixed once; then, N times,
// the entry selected by C[0].x & (N-1) is XORed into the state and the state
// is mixed again. The final state is written to g_odata.
//
// ALGO selects the mixing function (A_SCRYPT: xor_salsa8, A_SCRYPT_JANE:
// xor_chacha8). g_odata receives 32 words per work unit. The index mask
// C[0].x & (N-1) only selects uniformly if N is a power of two - presumably
// guaranteed by the caller; confirm.
//
// Requires dynamic shared memory holding one [WU_PER_WARP][16+4] uint32_t
// array per warp. NOTE(review): threads of a warp exchange data through
// shared memory with no explicit barrier; this presumably relies on
// warp-synchronous execution - confirm for the target architecture.
template <int ALGO> __global__
void fermi_scrypt_core_kernelB(uint32_t *g_odata, unsigned int N)
{
extern __shared__ unsigned char x[];
// X[warp][work unit][16 data words + 4 extra words]; word 16 of each row is
// used below to publish that work unit's lookup offset.
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;
int warpIdx = threadIdx.x / warpSize;
int warpThread = threadIdx.x % warpSize;
// Not referenced by name below; presumably consumed by the SCRATCH macro
// (defined elsewhere) - verify before removing or renaming.
const unsigned int LOOKUP_GAP = 1;
// variables supporting the large memory transaction magic
// Each thread moves 4 words (16 bytes) per transfer: column Z of row Y.
unsigned int Y = warpThread/4;
unsigned int Z = 4*(warpThread%4);
// add block specific offsets
// Not referenced by name below; presumably consumed by the WU_PER_BLOCK macro
// (defined elsewhere) - verify before removing or renaming.
int WARPS_PER_BLOCK = blockDim.x / 32;
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
g_odata += 32 * offset;
// This warp's scratch buffer, pre-offset to row Y / column Z.
uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z;
// registers to store an entire work unit
uint4 B[4], C[4];
// XB[wu] addresses X[warpIdx][Y+wu][Z]; XX is this thread's own row.
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
uint32_t *XX = X[warpIdx][warpThread];
// Reload the last scratchpad entry (index N-1) into B and C via shared memory.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + (N-1)*32]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + (N-1)*32 + 16]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
for (int i = 0; i < N; i++) {
// Publish this work unit's lookup offset (in words) in word 16 of its row.
XX[16] = 32 * (C[0].x & (N-1));
// XB[wu][16-Z] is X[warpIdx][Y+wu][16], i.e. the offset published by the
// owner of row Y+wu; fetch that row's entry in 16-byte pieces.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z]]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] ^= *((uint4*)&XX[4*idx]);
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z] + 16]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] ^= *((uint4*)&XX[4*idx]);
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
}
// Write the final state out: stage B, then C, in the thread's shared row and
// let the warp store it to g_odata in 16-byte pieces.
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]);
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]);
}
// Scrypt core kernel, phase B, reading the scratchpad through a texture.
//
// Same computation as the non-texture phase-B kernel: reload scratchpad entry
// N-1, mix once, then N times XOR in the entry selected by C[0].x & (N-1) and
// mix again; the final state goes to g_odata (32 words per work unit).
//
// Scratchpad reads go through texRef1D_4_V (TEX_DIM == 1, tex1Dfetch) or
// texRef2D_4_V (otherwise, tex2D with TEXWIDTH texels per row); both are
// declared outside this function. Here the scratchpad is addressed as one
// array indexed by the global work-unit number (offset+wu+Y), and word
// addresses are divided by 4 to obtain the index of a 4-word texel.
//
// Requires dynamic shared memory holding one [WU_PER_WARP][16+4] uint32_t
// array per warp. NOTE(review): threads of a warp exchange data through
// shared memory with no explicit barrier; this presumably relies on
// warp-synchronous execution - confirm for the target architecture.
template <int ALGO, int TEX_DIM> __global__ void
fermi_scrypt_core_kernelB_tex(uint32_t *g_odata, unsigned int N)
{
extern __shared__ unsigned char x[];
// X[warp][work unit][16 data words + 4 extra words]; word 16 of each row is
// used below to publish that work unit's lookup offset.
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;
int warpIdx = threadIdx.x / warpSize;
int warpThread = threadIdx.x % warpSize;
// Not referenced by name below; presumably consumed by the SCRATCH macro
// (defined elsewhere) - verify before removing or renaming.
const unsigned int LOOKUP_GAP = 1;
// variables supporting the large memory transaction magic
// Each thread moves 4 words (16 bytes) per transfer: column Z of row Y.
unsigned int Y = warpThread/4;
unsigned int Z = 4*(warpThread%4);
// add block specific offsets
// Not referenced by name below; presumably consumed by the WU_PER_BLOCK macro
// (defined elsewhere) - verify before removing or renaming.
int WARPS_PER_BLOCK = blockDim.x / 32;
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
g_odata += 32 * offset;
// registers to store an entire work unit
uint4 B[4], C[4];
// XB[wu] addresses X[warpIdx][Y+wu][Z]; XX is this thread's own row.
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
uint32_t *XX = X[warpIdx][warpThread];
// Reload the last scratchpad entry (index N-1): first half into B ...
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + (N-1)*32 + Z)/4;
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
tex1Dfetch(texRef1D_4_V, loc) :
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);
// ... second half into C.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + (N-1)*32 + 16+Z)/4;
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
tex1Dfetch(texRef1D_4_V, loc) :
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
for (int i = 0; i < N; i++) {
// Publish this work unit's lookup offset (in words) in word 16 of its row.
XX[16] = 32 * (C[0].x & (N-1));
// XB[wu][16-Z] is X[warpIdx][Y+wu][16], the offset published by the owner
// of row Y+wu.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + Z)/4;
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
tex1Dfetch(texRef1D_4_V, loc) :
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] ^= *((uint4*)&XX[4*idx]);
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + 16+Z)/4;
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
tex1Dfetch(texRef1D_4_V, loc) :
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] ^= *((uint4*)&XX[4*idx]);
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
}
// Write the final state out: stage B, then C, in the thread's shared row and
// let the warp store it to g_odata in 16-byte pieces.
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]);
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]);
}
//
// Lookup-Gap variations of the above functions
//
// Phase A with a lookup gap: the state is advanced N-1 times exactly as in
// the plain phase-A kernel, but only every LOOKUP_GAP-th state is stored, at
// scratchpad index i/LOOKUP_GAP. Entry 0 is the input itself.
//
// ALGO selects the mixing function (A_SCRYPT: xor_salsa8, A_SCRYPT_JANE:
// xor_chacha8). g_idata holds 32 words per work unit and is only read.
// LOOKUP_GAP is used as a divisor and must be non-zero.
//
// Requires dynamic shared memory holding one [WU_PER_WARP][16+4] uint32_t
// array per warp. NOTE(review): threads of a warp exchange data through
// shared memory with no explicit barrier; this presumably relies on
// warp-synchronous execution - confirm for the target architecture.
template <int ALGO> __global__ void
fermi_scrypt_core_kernelA_LG(uint32_t *g_idata, unsigned int N, unsigned int LOOKUP_GAP)
{
extern __shared__ unsigned char x[];
// X[warp][work unit][16 data words + 4 extra words].
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;
int warpIdx = threadIdx.x / warpSize;
int warpThread = threadIdx.x % warpSize;
// variables supporting the large memory transaction magic
// Each thread moves 4 words (16 bytes) per transfer: column Z of row Y.
unsigned int Y = warpThread/4;
unsigned int Z = 4*(warpThread%4);
// add block specific offsets
// Not referenced by name below; presumably consumed by the WU_PER_BLOCK macro
// (defined elsewhere) - verify before removing or renaming.
int WARPS_PER_BLOCK = blockDim.x / 32;
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
g_idata += 32 * offset;
// This warp's scratch buffer, pre-offset to row Y / column Z.
uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z;
// registers to store an entire work unit
uint4 B[4], C[4];
// XB[wu] addresses X[warpIdx][Y+wu][Z]; XX is this thread's own row.
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
uint32_t *XX = X[warpIdx][warpThread];
// Copy words 0..15 of every work unit from global input to both scratchpad
// entry 0 and shared memory, then pick up this thread's own row as B.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&V[SCRATCH*wu])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+Z]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);
// Same for words 16..31, which become C.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&V[SCRATCH*wu+16])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+16+Z]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);
for (int i = 1; i < N; i++) {
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
// Only states whose index is a multiple of LOOKUP_GAP are kept; the others
// are recomputed on demand in phase B.
if (i % LOOKUP_GAP == 0) {
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&V[SCRATCH*wu + (i/LOOKUP_GAP)*32])) = *((ulonglong2*)XB[wu]);
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&V[SCRATCH*wu + (i/LOOKUP_GAP)*32 + 16])) = *((ulonglong2*)XB[wu]);
}
}
}
// Phase B with a lookup gap: only every LOOKUP_GAP-th scratchpad entry was
// stored, so a wanted entry j is rebuilt by loading stored entry
// j/LOOKUP_GAP and re-applying the mixing function j%LOOKUP_GAP times.
//
// The state is first rebuilt from the last stored entry, then N times the
// entry selected by C[0].x & (N-1) is XORed into the state and the state is
// mixed. The final state is written to g_odata (32 words per work unit).
//
// ALGO selects the mixing function (A_SCRYPT: xor_salsa8, A_SCRYPT_JANE:
// xor_chacha8). LOOKUP_GAP is used as a divisor and must be non-zero.
//
// Requires dynamic shared memory holding one [WU_PER_WARP][16+4] uint32_t
// array per warp. NOTE(review): threads of a warp exchange data through
// shared memory with no explicit barrier; this presumably relies on
// warp-synchronous execution - confirm for the target architecture.
template <int ALGO> __global__ void
fermi_scrypt_core_kernelB_LG(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP)
{
extern __shared__ unsigned char x[];
// X[warp][work unit][16 data words + 4 extra words]; word 16 of each row is
// used below to publish that work unit's lookup offset.
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;
int warpIdx = threadIdx.x / warpSize;
int warpThread = threadIdx.x % warpSize;
// variables supporting the large memory transaction magic
// Each thread moves 4 words (16 bytes) per transfer: column Z of row Y.
unsigned int Y = warpThread/4;
unsigned int Z = 4*(warpThread%4);
// add block specific offsets
// Not referenced by name below; presumably consumed by the WU_PER_BLOCK macro
// (defined elsewhere) - verify before removing or renaming.
int WARPS_PER_BLOCK = blockDim.x / 32;
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
g_odata += 32 * offset;
// This warp's scratch buffer, pre-offset to row Y / column Z.
uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z;
// registers to store an entire work unit
uint4 B[4], C[4];
// XB[wu] addresses X[warpIdx][Y+wu][Z]; XX is this thread's own row.
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
uint32_t *XX = X[warpIdx][warpThread];
// pos: index of the last stored entry; loop: number of mixing steps needed to
// get from that entry to the state that follows entry N-1.
uint32_t pos = (N-1)/LOOKUP_GAP; uint32_t loop = 1 + (N-1)-pos*LOOKUP_GAP;
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + pos*32]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + pos*32 + 16]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);
// The switch statement is the body of the while loop.
while (loop--)
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
for (int i = 0; i < N; i++) {
// j: wanted entry; pos/loop (shadowing the outer variables): stored entry to
// load and number of mixing steps to reach entry j from it.
uint32_t j = C[0].x & (N-1);
uint32_t pos = j / LOOKUP_GAP; uint32_t loop = j - pos*LOOKUP_GAP;
// Publish this work unit's lookup offset (in words) in word 16 of its row.
XX[16] = 32 * pos;
// Temporary copy of the looked-up entry, kept separate from the state B/C.
uint4 b[4], c[4];
// XB[wu][16-Z] is X[warpIdx][Y+wu][16], the offset published by the owner
// of row Y+wu.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z]]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) b[idx] = *((uint4*)&XX[4*idx]);
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z] + 16]));
#pragma unroll 4
for (int idx=0; idx < 4; idx++) c[idx] = *((uint4*)&XX[4*idx]);
// Recompute the skipped entries; the trip count differs per thread.
while (loop--)
switch(ALGO) {
case A_SCRYPT: xor_salsa8(b, c); xor_salsa8(c, b); break;
case A_SCRYPT_JANE: xor_chacha8(b, c); xor_chacha8(c, b); break;
}
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] ^= b[idx];
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] ^= c[idx];
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
}
// Write the final state out: stage B, then C, in the thread's shared row and
// let the warp store it to g_odata in 16-byte pieces.
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]);
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]);
}
// Phase B with a lookup gap, reading the scratchpad through a texture.
//
// Same computation as the non-texture lookup-gap phase-B kernel: a wanted
// entry j is rebuilt by loading stored entry j/LOOKUP_GAP and re-applying the
// mixing function j%LOOKUP_GAP times. The final state is written to g_odata
// (32 words per work unit).
//
// Scratchpad reads go through texRef1D_4_V (TEX_DIM == 1, tex1Dfetch) or
// texRef2D_4_V (otherwise, tex2D with TEXWIDTH texels per row); both are
// declared outside this function. The scratchpad is addressed as one array
// indexed by the global work-unit number (offset+wu+Y), and word addresses
// are divided by 4 to obtain the index of a 4-word texel.
//
// ALGO selects the mixing function (A_SCRYPT: xor_salsa8, A_SCRYPT_JANE:
// xor_chacha8). LOOKUP_GAP is used as a divisor and must be non-zero.
//
// Requires dynamic shared memory holding one [WU_PER_WARP][16+4] uint32_t
// array per warp. NOTE(review): threads of a warp exchange data through
// shared memory with no explicit barrier; this presumably relies on
// warp-synchronous execution - confirm for the target architecture.
template <int ALGO, int TEX_DIM> __global__ void
fermi_scrypt_core_kernelB_LG_tex(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP)
{
extern __shared__ unsigned char x[];
// X[warp][work unit][16 data words + 4 extra words]; word 16 of each row is
// used below to publish that work unit's lookup offset.
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;
int warpIdx = threadIdx.x / warpSize;
int warpThread = threadIdx.x % warpSize;
// variables supporting the large memory transaction magic
// Each thread moves 4 words (16 bytes) per transfer: column Z of row Y.
unsigned int Y = warpThread/4;
unsigned int Z = 4*(warpThread%4);
// add block specific offsets
// Not referenced by name below; presumably consumed by the WU_PER_BLOCK macro
// (defined elsewhere) - verify before removing or renaming.
int WARPS_PER_BLOCK = blockDim.x / 32;
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
g_odata += 32 * offset;
// registers to store an entire work unit
uint4 B[4], C[4];
// XB[wu] addresses X[warpIdx][Y+wu][Z]; XX is this thread's own row.
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
uint32_t *XX = X[warpIdx][warpThread];
// pos: index of the last stored entry; loop: number of mixing steps needed to
// get from that entry to the state that follows entry N-1.
uint32_t pos = (N-1)/LOOKUP_GAP; uint32_t loop = 1 + (N-1)-pos*LOOKUP_GAP;
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + pos*32 + Z)/4;
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
tex1Dfetch(texRef1D_4_V, loc) :
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + pos*32 + 16+Z)/4;
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
tex1Dfetch(texRef1D_4_V, loc) :
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);
// The switch statement is the body of the while loop.
while (loop--)
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
for (int i = 0; i < N; i++) {
// j: wanted entry; pos/loop (shadowing the outer variables): stored entry to
// load and number of mixing steps to reach entry j from it.
uint32_t j = C[0].x & (N-1);
uint32_t pos = j / LOOKUP_GAP; uint32_t loop = j - pos*LOOKUP_GAP;
// Publish this work unit's lookup offset (in words) in word 16 of its row.
XX[16] = 32 * pos;
// Temporary copy of the looked-up entry, kept separate from the state B/C.
uint4 b[4], c[4];
// XB[wu][16-Z] is X[warpIdx][Y+wu][16], the offset published by the owner
// of row Y+wu.
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + Z)/4;
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
tex1Dfetch(texRef1D_4_V, loc) :
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
for (int idx=0; idx < 4; idx++) b[idx] = *((uint4*)&XX[4*idx]);
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + 16+Z)/4;
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
tex1Dfetch(texRef1D_4_V, loc) :
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
for (int idx=0; idx < 4; idx++) c[idx] = *((uint4*)&XX[4*idx]);
// Recompute the skipped entries; the trip count differs per thread.
while (loop--)
switch(ALGO) {
case A_SCRYPT: xor_salsa8(b, c); xor_salsa8(c, b); break;
case A_SCRYPT_JANE: xor_chacha8(b, c); xor_chacha8(c, b); break;
}
#pragma unroll 4
for (int idx=0; idx < 4; idx++) B[idx] ^= b[idx];
#pragma unroll 4
for (int idx=0; idx < 4; idx++) C[idx] ^= c[idx];
switch(ALGO) {
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break;
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
}
}
// Write the final state out: stage B, then C, in the thread's shared row and
// let the warp store it to g_odata in 16-byte pieces.
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]);
#pragma unroll 4
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx];
#pragma unroll 4
for (int wu=0; wu < 32; wu+=8)
*((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]);
}

28
scrypt/fermi_kernel.h

@ -0,0 +1,28 @@ @@ -0,0 +1,28 @@
#ifndef FERMI_KERNEL_H
#define FERMI_KERNEL_H
#include "salsa_kernel.h"
// KernelInterface implementation for the Fermi kernel set.
// The non-inline members are defined out of line; the inline members report
// fixed properties of this kernel.
class FermiKernel : public KernelInterface
{
public:
    FermiKernel();

    // Scratch buffer setup and kernel launch.
    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id,
                            cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata,
                            unsigned int N, unsigned int LOOKUP_GAP,
                            bool interactive, bool benchmark, int texture_cache);

    // Texture binding for scratchpad reads (1D linear or 2D pitched).
    virtual bool bindtexture_1D(uint32_t *d_V, size_t size);
    virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch);
    virtual bool unbindtexture_1D();
    virtual bool unbindtexture_2D();

    // One-character tag identifying this kernel.
    virtual char get_identifier()
    {
        return 'F';
    }

    // Version numbers reported for this kernel.
    virtual int get_major_version()
    {
        return 1;
    }
    virtual int get_minor_version()
    {
        return 0;
    }

    // Launch limits and capabilities.
    virtual int max_warps_per_block()
    {
        return 16;
    }
    virtual int get_texel_width()
    {
        return 4;
    }
    virtual bool support_lookup_gap()
    {
        return true;
    }

    // Preferred shared-memory bank size and cache split.
    virtual cudaSharedMemConfig shared_mem_config()
    {
        return cudaSharedMemBankSizeFourByte;
    }
    virtual cudaFuncCache cache_config()
    {
        return cudaFuncCachePreferShared;
    }
};
#endif // #ifndef FERMI_KERNEL_H

837
scrypt/keccak.cu

@ -0,0 +1,837 @@ @@ -0,0 +1,837 @@
//
// =============== KECCAK part on nVidia GPU ======================
//
// The keccak512 (SHA-3) is used in the PBKDF2 for scrypt-jane coins
// in place of the SHA2 based PBKDF2 used in scrypt coins.
//
// The keccak256 is used exclusively in Maxcoin and clones. This module
// holds the generic "default" implementation when no architecture
// specific implementation is available in the kernel.
//
// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
//
#include <map>
#include <stdint.h>
#include "salsa_kernel.h"
#include "cuda_runtime.h"
#include "miner.h"
#include "keccak.h"
// define some error checking macros
// Any earlier definition of checkCudaErrors is replaced by the one below.
#undef checkCudaErrors
// Path separator used to strip the directory part of __FILE__.
// NOTE(review): both branches define the same '/' character, so the WIN32
// test currently has no effect - confirm whether '\\' was intended.
#if WIN32
#define DELIMITER '/'
#else
#define DELIMITER '/'
#endif
// __FILE__ without its leading directories (text after the last DELIMITER),
// or __FILE__ unchanged when it contains no DELIMITER.
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
// Evaluates x and logs the CUDA error state afterwards.
//  - the leading cudaGetLastError() discards any error left over from earlier
//    calls, so only an error raised by x is reported;
//  - on failure the error is logged via applog(LOG_ERR, ...); execution then
//    simply continues (no abort, no return value);
//  - the expansion reads `thr_id` and `device_map`, so both must be in scope
//    at every call site.
#define checkCudaErrors(x) \
{ \
cudaGetLastError(); \
x; \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) \
applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \
}
// from salsa_kernel.cu
// Two maps of each kind (index 0 or 1). NOTE(review): the map key and the
// meaning of the two slots are not visible here - presumably thread id and
// stream index; confirm against salsa_kernel.cu.
extern std::map<int, uint32_t *> context_idata[2];
extern std::map<int, uint32_t *> context_odata[2];
extern std::map<int, cudaStream_t> context_streams[2];
extern std::map<int, uint32_t *> context_hash[2];
// Rotate the 64-bit value a left by b bits (b must be in 1..63).
// Every macro argument is parenthesized so that expression arguments such as
// `k + 1` or `ptr + 2` expand with the intended precedence.
#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - (b))))
// CB
// Assemble a 64-bit value from two consecutive 32-bit words at p:
// p[0] supplies the low half, p[1] the high half.
#define U32TO64_LE(p) \
(((uint64_t)(*(p))) | (((uint64_t)(*((p) + 1))) << 32))
// Split the 64-bit value v into two consecutive 32-bit words at p:
// p[0] receives the low half, p[1] the high half.
// Expands to two ';'-terminated statements, so it must only be used where a
// run of statements is valid (e.g. inside a braced block).
#define U64TO32_LE(p, v) \
*(p) = (uint32_t)((v)); *((p)+1) = (uint32_t)((v) >> 32);
// Copy 64 bytes (sixteen 32-bit words) from s to d, word by word in
// ascending order.
static __device__ void mycpy64(uint32_t *d, const uint32_t *s) {
    #pragma unroll 16
    for (int w = 0; w != 64 / 4; w++) {
        *(d + w) = *(s + w);
    }
}
// Copy 56 bytes (fourteen 32-bit words) from s to d, word by word in
// ascending order.
static __device__ void mycpy56(uint32_t *d, const uint32_t *s) {
    #pragma unroll 14
    for (int w = 0; w != 56 / 4; w++) {
        *(d + w) = *(s + w);
    }
}
// Copy 32 bytes (eight 32-bit words) from s to d, word by word in ascending
// order.
static __device__ void mycpy32(uint32_t *d, const uint32_t *s) {
    #pragma unroll 8
    for (int w = 0; w != 32 / 4; w++) {
        *(d + w) = *(s + w);
    }
}
// Copy 8 bytes (two 32-bit words) from s to d, low word first.
static __device__ void mycpy8(uint32_t *d, const uint32_t *s) {
    #pragma unroll 2
    for (int w = 0; w != 8 / 4; w++) {
        *(d + w) = *(s + w);
    }
}
// Copy 4 bytes (a single 32-bit word) from s to d.
static __device__ void mycpy4(uint32_t *d, const uint32_t *s) {
    d[0] = s[0];
}
// ---------------------------- BEGIN keccak functions ------------------------------------
// Name of the hash variant implemented by the functions in this section.
#define KECCAK_HASH "Keccak-512"
// Per-thread hashing context: the sponge state plus one block of pending input.
typedef struct keccak_hash_state_t {
uint64_t state[25]; // sponge state: 25 64-bit lanes (equivalent to 25*2 32-bit words)
uint32_t buffer[72/4]; // pending input: one 72-byte block, held as 18 32-bit words
} keccak_hash_state;
// Copy the 25-lane sponge state from s to d. The input buffer is not copied.
__device__ void statecopy0(keccak_hash_state *d, keccak_hash_state *s)
{
    const uint64_t *from = s->state;
    uint64_t *to = d->state;
    #pragma unroll 25
    for (int lane = 0; lane != 25; lane++) {
        to[lane] = from[lane];
    }
}
// Copy the 25-lane sponge state and the first 8 bytes (two words) of the
// input buffer from s to d. The rest of the buffer is not copied.
__device__ void statecopy8(keccak_hash_state *d, keccak_hash_state *s)
{
    const uint64_t *from_lanes = s->state;
    uint64_t *to_lanes = d->state;
    #pragma unroll 25
    for (int lane = 0; lane != 25; lane++) {
        to_lanes[lane] = from_lanes[lane];
    }

    const uint32_t *from_buf = s->buffer;
    uint32_t *to_buf = d->buffer;
    #pragma unroll 2
    for (int w = 0; w != 2; w++) {
        to_buf[w] = from_buf[w];
    }
}
// Host-side table of the 24 round constants, one per permutation round
// (XORed into lane 0 in the iota step of keccak_block).
static const uint64_t host_keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
0x8000000080008081ull, 0x8000000000008009ull,
0x000000000000008aull, 0x0000000000000088ull,
0x0000000080008009ull, 0x000000008000000aull,
0x000000008000808bull, 0x800000000000008bull,
0x8000000000008089ull, 0x8000000000008003ull,
0x8000000000008002ull, 0x8000000000000080ull,
0x000000000000800aull, 0x800000008000000aull,
0x8000000080008081ull, 0x8000000000008080ull,
0x0000000080000001ull, 0x8000000080008008ull
};
// Device copy of the round constants, read by keccak_block. The upload is
// not visible in this part of the file - presumably filled from
// host_keccak_round_constants before the first kernel launch; confirm.
__constant__ uint64_t c_keccak_round_constants[24];
// 20 words (80 bytes) of constant input data. NOTE(review): presumably the
// block header being hashed - confirm against the code that uploads it.
__constant__ uint32_t pdata[20];
// Absorb one 72-byte input block into the sponge state and run the 24-round
// permutation on it.
//   S   hashing context; S->state is updated in place
//   in  18 32-bit words of input; consecutive word pairs are combined
//       low-word-first into 64-bit lanes
// Reads the per-round constants from c_keccak_round_constants.
__device__
void keccak_block(keccak_hash_state *S, const uint32_t *in) {
size_t i;
// t: column parities, u: theta correction per column, v/w: temporaries.
uint64_t *s = S->state, t[5], u[5], v, w;
/* absorb input */
// XOR the 72 / 8 = 9 input lanes into the first nine state lanes.
#pragma unroll 9
for (i = 0; i < 72 / 8; i++, in += 2)
s[i] ^= U32TO64_LE(in);
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROTL64(t[1], 1);
u[1] = t[0] ^ ROTL64(t[2], 1);
u[2] = t[1] ^ ROTL64(t[3], 1);
u[3] = t[2] ^ ROTL64(t[4], 1);
u[4] = t[3] ^ ROTL64(t[0], 1);
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
/* rho pi: b[..] = rotl(a[..], ..) */
// Done in place as one chain: each assignment overwrites a lane whose old
// value was already consumed by the previous line, so the statement order
// must not change. s[1] is saved in v first and closes the chain at s[10];
// s[0] is not touched by this step.
v = s[ 1];
s[ 1] = ROTL64(s[ 6], 44);
s[ 6] = ROTL64(s[ 9], 20);
s[ 9] = ROTL64(s[22], 61);
s[22] = ROTL64(s[14], 39);
s[14] = ROTL64(s[20], 18);
s[20] = ROTL64(s[ 2], 62);
s[ 2] = ROTL64(s[12], 43);
s[12] = ROTL64(s[13], 25);
s[13] = ROTL64(s[19], 8);
s[19] = ROTL64(s[23], 56);
s[23] = ROTL64(s[15], 41);
s[15] = ROTL64(s[ 4], 27);
s[ 4] = ROTL64(s[24], 14);
s[24] = ROTL64(s[21], 2);
s[21] = ROTL64(s[ 8], 55);
s[ 8] = ROTL64(s[16], 45);
s[16] = ROTL64(s[ 5], 36);
s[ 5] = ROTL64(s[ 3], 28);
s[ 3] = ROTL64(s[18], 21);
s[18] = ROTL64(s[17], 15);
s[17] = ROTL64(s[11], 10);
s[11] = ROTL64(s[ 7], 6);
s[ 7] = ROTL64(s[10], 3);
s[10] = ROTL64( v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
// One row of five lanes per line. v and w keep the original first two lanes
// of the row, which the last two updates need after they have been
// overwritten.
v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
/* iota: a[0,0] ^= round constant */
s[0] ^= c_keccak_round_constants[i];
}
}
// Reset the sponge state of S to all zeroes. The input buffer is left as is.
__device__
void keccak_hash_init(keccak_hash_state *S) {
    uint64_t *lanes = S->state;
    #pragma unroll 25
    for (int lane = 0; lane != 25; lane++) {
        lanes[lane] = 0ULL;
    }
}
// assuming there is no leftover data and exactly 72 bytes are incoming
// we can directly call into the block hashing function
// `in` must point to 18 32-bit words; S->buffer is not touched.
__device__ void keccak_hash_update72(keccak_hash_state *S, const uint32_t *in) {
keccak_block(S, in);
}
// The remaining update helpers only place data in S->buffer; nothing is
// hashed until one of the keccak_hash_finish* functions processes the buffer.
// Store 8 bytes (2 words) at the start of the buffer.
__device__ void keccak_hash_update8(keccak_hash_state *S, const uint32_t *in) {
mycpy8(S->buffer, in);
}
// Store 4 bytes (1 word) at byte offset 8 of the buffer.
__device__ void keccak_hash_update4_8(keccak_hash_state *S, const uint32_t *in) {
mycpy4(S->buffer+8/4, in);
}
// Store 4 bytes (1 word) at byte offset 56 of the buffer.
__device__ void keccak_hash_update4_56(keccak_hash_state *S, const uint32_t *in) {
mycpy4(S->buffer+56/4, in);
}
// Store 56 bytes (14 words) at the start of the buffer.
__device__ void keccak_hash_update56(keccak_hash_state *S, const uint32_t *in) {
mycpy56(S->buffer, in);
}
// Store 64 bytes (16 words) at the start of the buffer.
__device__ void keccak_hash_update64(keccak_hash_state *S, const uint32_t *in) {
mycpy64(S->buffer, in);
}
// Finish a hash whose buffer holds 8 bytes of pending input: pad the buffer,
// absorb it as the final block and write the 64-byte digest to hash
// (16 32-bit words, low word of each state lane first).
__device__ void keccak_hash_finish8(keccak_hash_state *S, uint32_t *hash) {
    uint32_t *buf = S->buffer;
    const int first_pad = 8/4;   // first word after the pending input
    const int words = 72/4;      // words in one block

    // Padding: 0x01 right after the data, zero fill, top bit of the last word.
    buf[first_pad] = 0x01;
    #pragma unroll 15
    for (int w = first_pad + 1; w < words; ++w) buf[w] = 0;
    buf[words - 1] |= 0x80000000;

    keccak_block(S, (const uint32_t*)buf);

    // Emit the first eight state lanes as the digest.
    #pragma unroll 8
    for (size_t lane = 0; lane < 8; ++lane) {
        U64TO32_LE((&hash[2*lane]), S->state[lane]);
    }
}
// Finish a hash whose final block holds 12 bytes of message in S->buffer:
// write the padding behind the message, run the block function once and
// emit the first eight 64-bit state words to hash[0..15] via U64TO32_LE.
__device__ void keccak_hash_finish12(keccak_hash_state *S, uint32_t *hash)
{
    const int used  = 12 / 4;   // 32-bit words already occupied by message
    const int total = 72 / 4;   // 32-bit words in one 72-byte block

    S->buffer[used] = 0x01;     // padding starts right after the message
#pragma unroll 14
    for (int w = used + 1; w < total; ++w) {
        S->buffer[w] = 0;
    }
    S->buffer[total - 1] |= 0x80000000;   // top bit of the last word closes the padding
    keccak_block(S, (const uint32_t*)S->buffer);

#pragma unroll 8
    for (size_t lane = 0; lane < 8; ++lane) {
        U64TO32_LE((&hash[2 * lane]), S->state[lane]);
    }
}
// Finish a hash whose final block holds 60 bytes of message in S->buffer:
// write the padding behind the message, run the block function once and
// emit the first eight 64-bit state words to hash[0..15] via U64TO32_LE.
__device__ void keccak_hash_finish60(keccak_hash_state *S, uint32_t *hash)
{
    const int used  = 60 / 4;   // 32-bit words already occupied by message
    const int total = 72 / 4;   // 32-bit words in one 72-byte block

    S->buffer[used] = 0x01;     // padding starts right after the message
#pragma unroll 2
    for (int w = used + 1; w < total; ++w) {
        S->buffer[w] = 0;
    }
    S->buffer[total - 1] |= 0x80000000;   // top bit of the last word closes the padding
    keccak_block(S, (const uint32_t*)S->buffer);

#pragma unroll 8
    for (size_t lane = 0; lane < 8; ++lane) {
        U64TO32_LE((&hash[2 * lane]), S->state[lane]);
    }
}
// Finish a hash whose final block holds 64 bytes of message in S->buffer:
// write the padding behind the message, run the block function once and
// emit the first eight 64-bit state words to hash[0..15] via U64TO32_LE.
__device__ void keccak_hash_finish64(keccak_hash_state *S, uint32_t *hash)
{
    const int used  = 64 / 4;   // 32-bit words already occupied by message
    const int total = 72 / 4;   // 32-bit words in one 72-byte block

    S->buffer[used] = 0x01;     // padding starts right after the message
#pragma unroll 1
    for (int w = used + 1; w < total; ++w) {
        S->buffer[w] = 0;
    }
    S->buffer[total - 1] |= 0x80000000;   // top bit of the last word closes the padding
    keccak_block(S, (const uint32_t*)S->buffer);

#pragma unroll 8
    for (size_t lane = 0; lane < 8; ++lane) {
        U64TO32_LE((&hash[2 * lane]), S->state[lane]);
    }
}
// ---------------------------- END keccak functions ------------------------------------
// ---------------------------- BEGIN PBKDF2 functions ------------------------------------
// HMAC working state used by the PBKDF2 helpers below: two independent
// Keccak hash states. pbkdf2_hmac_init80 seeds `inner` with the 0x36-padded
// key block and `outer` with the 0x5c-padded key block.
typedef struct pbkdf2_hmac_state_t {
keccak_hash_state inner, outer;
} pbkdf2_hmac_state;
// Hash the 80-byte message m (20 words) in one go: a full 72-byte block
// followed by the 8 remaining bytes. The digest words go to `hash`.
__device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m)
{
    keccak_hash_state ctx;

    keccak_hash_init(&ctx);
    keccak_hash_update72(&ctx, m);           // words 0..17
    keccak_hash_update8(&ctx, &m[72/4]);     // words 18..19
    keccak_hash_finish8(&ctx, hash);
}
/*
 * HMAC initialisation for an 80-byte key.
 *
 * The key is longer than the 72-byte block, so it is first hashed with
 * pbkdf2_hash into a zero-filled 72-byte pad. The pad XOR 0x36 is absorbed
 * into st->inner and the pad XOR 0x5c into st->outer.
 */
__device__ void pbkdf2_hmac_init80(pbkdf2_hmac_state *st, const uint32_t *key)
{
    const int words = 72 / 4;
    uint32_t pad[72/4];

    keccak_hash_init(&st->inner);
    keccak_hash_init(&st->outer);

    /* start from an all-zero block, then overwrite its head with h(key) */
#pragma unroll 18
    for (int w = 0; w < words; ++w) {
        pad[w] = 0;
    }
    pbkdf2_hash(pad, key);

    /* inner = h((key ^ 0x36) || ...) */
#pragma unroll 18
    for (int w = 0; w < words; ++w) {
        pad[w] = pad[w] ^ 0x36363636;
    }
    keccak_hash_update72(&st->inner, pad);

    /* outer = h((key ^ 0x5c) || ...); 0x6a == 0x36 ^ 0x5c undoes the inner mask */
#pragma unroll 18
    for (int w = 0; w < words; ++w) {
        pad[w] = pad[w] ^ 0x6a6a6a6a;
    }
    keccak_hash_update72(&st->outer, pad);
}
// Precondition: no partial block is pending and exactly 72 bytes arrive, so
// the data goes directly through the block function of the inner hash.
__device__ void pbkdf2_hmac_update72(pbkdf2_hmac_state *st, const uint32_t *m)
{
    keccak_hash_state *inner = &st->inner;   /* h(inner || m...) */
    keccak_hash_update72(inner, m);
}
// Feed 8 bytes of message into the inner hash (staged at buffer offset 0).
__device__ void pbkdf2_hmac_update8(pbkdf2_hmac_state *st, const uint32_t *m)
{
    keccak_hash_state *inner = &st->inner;   /* h(inner || m...) */
    keccak_hash_update8(inner, m);
}
// Feed 4 bytes of message into the inner hash (staged at buffer offset 8).
__device__ void pbkdf2_hmac_update4_8(pbkdf2_hmac_state *st, const uint32_t *m)
{
    keccak_hash_state *inner = &st->inner;   /* h(inner || m...) */
    keccak_hash_update4_8(inner, m);
}
// Feed 4 bytes of message into the inner hash (staged at buffer offset 56).
__device__ void pbkdf2_hmac_update4_56(pbkdf2_hmac_state *st, const uint32_t *m)
{
    keccak_hash_state *inner = &st->inner;   /* h(inner || m...) */
    keccak_hash_update4_56(inner, m);
}
// Feed 56 bytes of message into the inner hash (staged at buffer offset 0).
__device__ void pbkdf2_hmac_update56(pbkdf2_hmac_state *st, const uint32_t *m)
{
    keccak_hash_state *inner = &st->inner;   /* h(inner || m...) */
    keccak_hash_update56(inner, m);
}
// Complete the HMAC when 12 bytes are pending in the inner hash buffer:
// mac = h(outer || h(inner || m)).
__device__ void pbkdf2_hmac_finish12(pbkdf2_hmac_state *st, uint32_t *mac)
{
    uint32_t inner_digest[16];

    /* close the inner hash first ... */
    keccak_hash_finish12(&st->inner, inner_digest);

    /* ... then run its 64-byte digest through the outer hash */
    keccak_hash_update64(&st->outer, inner_digest);
    keccak_hash_finish64(&st->outer, mac);
}
// Complete the HMAC when 60 bytes are pending in the inner hash buffer:
// mac = h(outer || h(inner || m)).
__device__ void pbkdf2_hmac_finish60(pbkdf2_hmac_state *st, uint32_t *mac)
{
    uint32_t inner_digest[16];

    /* close the inner hash first ... */
    keccak_hash_finish60(&st->inner, inner_digest);

    /* ... then run its 64-byte digest through the outer hash */
    keccak_hash_update64(&st->outer, inner_digest);
    keccak_hash_finish64(&st->outer, mac);
}
// Duplicate an HMAC state from s into d: the inner hash is copied with
// statecopy8, the outer hash with statecopy0.
__device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s)
{
    keccak_hash_state *dst_inner = &d->inner;
    keccak_hash_state *dst_outer = &d->outer;

    statecopy8(dst_inner, &s->inner);
    statecopy0(dst_outer, &s->outer);
}
// ---------------------------- END PBKDF2 functions ------------------------------------
// Reverse the byte order of a 32-bit word.
static __device__ uint32_t cuda_swab32(uint32_t x)
{
    const uint32_t hi    = (x << 24) & 0xff000000u;
    const uint32_t midhi = (x <<  8) & 0x00ff0000u;
    const uint32_t midlo = (x >>  8) & 0x0000ff00u;
    const uint32_t lo    = (x >> 24) & 0x000000ffu;
    return hi | midhi | midlo | lo;
}
// First PBKDF2 stage, one thread per nonce.
// Each thread builds the 80-byte header (19 byte-swapped words from pdata
// plus its own byte-swapped nonce), uses it as both password and salt, and
// writes two 64-byte HMAC blocks (32 words in total) to its slot in g_idata.
// There is no bounds check: every launched thread writes its slot.
__global__ __launch_bounds__(128)
void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
{
    const uint32_t gid = (blockIdx.x * blockDim.x) + threadIdx.x;
    nonce   += gid;
    g_idata += 32 * gid;

    uint32_t data[20];
#pragma unroll
    for (int w = 0; w < 19; ++w) {
        data[w] = cuda_swab32(pdata[w]);
    }
    data[19] = cuda_swab32(nonce);

    // scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)data, 80, (uint8_t*)g_idata, 128);
    pbkdf2_hmac_state hmac_pw, work;
    uint32_t ti[16];

    /* hmac(password, ...) */
    pbkdf2_hmac_init80(&hmac_pw, data);

    /* hmac(password, salt...) */
    pbkdf2_hmac_update72(&hmac_pw, data);
    pbkdf2_hmac_update8(&hmac_pw, &data[72/4]);

    /* U_i = hmac(password, salt || be(i)) for i = 1, 2 */
#pragma unroll 2
    for (uint32_t blk = 1; blk <= 2; ++blk) {
        uint32_t be = cuda_swab32(blk);
        pbkdf2_statecopy8(&work, &hmac_pw);
        pbkdf2_hmac_update4_8(&work, &be);
        pbkdf2_hmac_finish12(&work, ti);
        mycpy64(g_idata + 16 * (blk - 1), ti);
    }
}
// Final PBKDF2 stage, one thread per nonce.
// The password is the 80-byte header (19 byte-swapped words from pdata plus
// the thread's byte-swapped nonce); the salt is the thread's 128-byte slot in
// g_odata. The first 32 bytes of the resulting HMAC go to its slot in g_hash.
// There is no bounds check: every launched thread writes its slot.
__global__ __launch_bounds__(128)
void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce)
{
    const uint32_t gid = (blockIdx.x * blockDim.x) + threadIdx.x;
    nonce   += gid;
    g_odata += 32 * gid;
    g_hash  +=  8 * gid;

    uint32_t data[20];
#pragma unroll 19
    for (int w = 0; w < 19; ++w) {
        data[w] = cuda_swab32(pdata[w]);
    }
    data[19] = cuda_swab32(nonce);

    // scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)g_odata, 128, (uint8_t*)g_hash, 32);
    pbkdf2_hmac_state hmac_pw;
    uint32_t ti[16];

    /* hmac(password, ...) */
    pbkdf2_hmac_init80(&hmac_pw, data);

    /* hmac(password, salt...): 72 + 56 = 128 bytes of salt */
    pbkdf2_hmac_update72(&hmac_pw, g_odata);
    pbkdf2_hmac_update56(&hmac_pw, &g_odata[72/4]);

    /* U1 = hmac(password, salt || be(1)) */
    uint32_t be = cuda_swab32(1);
    pbkdf2_hmac_update4_56(&hmac_pw, &be);
    pbkdf2_hmac_finish60(&hmac_pw, ti);

    mycpy32(g_hash, ti);
}
//
// callable host code to initialize constants and to call kernels
//
// Remembers, per thread id, whether the round constants were already uploaded.
static bool init[MAX_GPUS] = { 0 };

// Upload the per-job constants for the Keccak512 kernels:
// the round constants once per thr_id, the 20-word header on every call.
extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20])
{
    const bool first_call = !init[thr_id];
    if (first_call)
    {
        checkCudaErrors(cudaMemcpyToSymbol(c_keccak_round_constants, host_keccak_round_constants, sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
        init[thr_id] = true;
    }

    checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
}
// Launch cuda_pre_keccak512 on the given stream slot: 128 threads per block,
// enough blocks to cover `throughput` nonces (rounded up).
extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput)
{
    const int num_blocks = (throughput + 127) / 128;
    dim3 grid(num_blocks);
    dim3 block(128);

    cuda_pre_keccak512<<<grid, block, 0, context_streams[stream][thr_id]>>>(
        context_idata[stream][thr_id], nonce);
}
// Launch cuda_post_keccak512 on the given stream slot: 128 threads per block,
// enough blocks to cover `throughput` nonces (rounded up).
extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throughput)
{
    const int num_blocks = (throughput + 127) / 128;
    dim3 grid(num_blocks);
    dim3 block(128);

    cuda_post_keccak512<<<grid, block, 0, context_streams[stream][thr_id]>>>(
        context_odata[stream][thr_id], context_hash[stream][thr_id], nonce);
}
//
// Maxcoin related Keccak implementation (Keccak256)
//
#include <stdint.h>
#include <map>
extern std::map<int, int> context_blocks;
extern std::map<int, int> context_wpb;
extern std::map<int, KernelInterface *> context_kernel;
__constant__ uint64_t ptarget64[4];
// 64-bit rotate left of `a` by `offset` bits.
// The argument is parenthesised before the cast so that compound expressions
// are cast as a whole. Both shift counts are masked to 0..63, so a rotation
// by 0 (or any multiple of 64) returns the value unchanged instead of
// shifting by 64, which is undefined. The halves are combined with OR
// because with a zero count both halves equal `a`.
#define ROL(a, offset) ((((uint64_t)(a)) << ((offset) & 63)) | (((uint64_t)(a)) >> ((64 - ((offset) & 63)) & 63)))
// Rotation by a multiple of 8 bits; currently just an alias for ROL.
#define ROL_mult8(a, offset) ROL(a, offset)
// Device-side copy of the 24 round constants; filled from
// host_KeccakF_RoundConstants by default_prepare_keccak256 and XORed into
// lane Aba/Eba once per round in crypto_hash.
__constant__ uint64_t KeccakF_RoundConstants[24];
// Host-side table of the 24 iota round constants, one per permutation round.
static uint64_t host_KeccakF_RoundConstants[24] = {
(uint64_t)0x0000000000000001ULL,
(uint64_t)0x0000000000008082ULL,
(uint64_t)0x800000000000808aULL,
(uint64_t)0x8000000080008000ULL,
(uint64_t)0x000000000000808bULL,
(uint64_t)0x0000000080000001ULL,
(uint64_t)0x8000000080008081ULL,
(uint64_t)0x8000000000008009ULL,
(uint64_t)0x000000000000008aULL,
(uint64_t)0x0000000000000088ULL,
(uint64_t)0x0000000080008009ULL,
(uint64_t)0x000000008000000aULL,
(uint64_t)0x000000008000808bULL,
(uint64_t)0x800000000000008bULL,
(uint64_t)0x8000000000008089ULL,
(uint64_t)0x8000000000008003ULL,
(uint64_t)0x8000000000008002ULL,
(uint64_t)0x8000000000000080ULL,
(uint64_t)0x000000000000800aULL,
(uint64_t)0x800000008000000aULL,
(uint64_t)0x8000000080008081ULL,
(uint64_t)0x8000000000008080ULL,
(uint64_t)0x0000000080000001ULL,
(uint64_t)0x8000000080008008ULL
};
// The 80-byte block header viewed as ten 64-bit words; uploaded by
// default_prepare_keccak256 and read by crypto_hash.
__constant__ uint64_t pdata64[10];
// Keccak hashing kernel for the Maxcoin (Keccak256) path, one thread per nonce.
//
// g_out    - per-thread output, 4 x uint64_t per thread; written only when
//            `validate` is true.
// nonce    - base nonce; each thread adds its global thread index.
// g_good   - 9-word buffer shared by all threads: words 0..7 hold the best
//            hash seen so far (as 4 x uint64_t), word 8 the matching nonce.
// validate - when true, every thread also stores its hash to g_out.
//
// The 25 state lanes are kept in individual variables (Aba..Asu); the loop
// runs 24 rounds, two per iteration, ping-ponging between the A* and E*
// lane sets. There is no bounds check on the thread index.
__global__
void crypto_hash(uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate)
{
uint64_t Aba, Abe, Abi, Abo, Abu;
uint64_t Aga, Age, Agi, Ago, Agu;
uint64_t Aka, Ake, Aki, Ako, Aku;
uint64_t Ama, Ame, Ami, Amo, Amu;
uint64_t Asa, Ase, Asi, Aso, Asu;
uint64_t BCa, BCe, BCi, BCo, BCu;
uint64_t Da, De, Di, Do, Du;
uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
uint64_t Ega, Ege, Egi, Ego, Egu;
uint64_t Eka, Eke, Eki, Eko, Eku;
uint64_t Ema, Eme, Emi, Emo, Emu;
uint64_t Esa, Ese, Esi, Eso, Esu;
//copyFromState(A, state)
// lanes 0..9 come from the header in constant memory
Aba = pdata64[0];
Abe = pdata64[1];
Abi = pdata64[2];
Abo = pdata64[3];
Abu = pdata64[4];
Aga = pdata64[5];
Age = pdata64[6];
Agi = pdata64[7];
Ago = pdata64[8];
// the upper 32 bits of lane 9 are replaced by this thread's byte-swapped nonce
Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32);
// NOTE(review): lanes 10 and 16 presumably carry the sponge padding for an
// 80-byte message (0x01 right after the data, top bit set in the last rate
// lane) - confirm against the reference implementation.
Aka = 0x0000000000000001ULL;
Ake = 0;
Aki = 0;
Ako = 0;
Aku = 0;
Ama = 0;
Ame = 0x8000000000000000ULL;
Ami = 0;
Amo = 0;
Amu = 0;
Asa = 0;
Ase = 0;
Asi = 0;
Aso = 0;
Asu = 0;
// 12 iterations x 2 rounds each = 24 rounds
#pragma unroll 12
for( int laneCount = 0; laneCount < 24; laneCount += 2 )
{
// prepareTheta
BCa = Aba^Aga^Aka^Ama^Asa;
BCe = Abe^Age^Ake^Ame^Ase;
BCi = Abi^Agi^Aki^Ami^Asi;
BCo = Abo^Ago^Ako^Amo^Aso;
BCu = Abu^Agu^Aku^Amu^Asu;
//thetaRhoPiChiIotaPrepareTheta(round , A, E)
Da = BCu^ROL(BCe, 1);
De = BCa^ROL(BCi, 1);
Di = BCe^ROL(BCo, 1);
Do = BCi^ROL(BCu, 1);
Du = BCo^ROL(BCa, 1);
Aba ^= Da;
BCa = Aba;
Age ^= De;
BCe = ROL(Age, 44);
Aki ^= Di;
BCi = ROL(Aki, 43);
Amo ^= Do;
BCo = ROL(Amo, 21);
Asu ^= Du;
BCu = ROL(Asu, 14);
Eba = BCa ^((~BCe)& BCi );
Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount];
Ebe = BCe ^((~BCi)& BCo );
Ebi = BCi ^((~BCo)& BCu );
Ebo = BCo ^((~BCu)& BCa );
Ebu = BCu ^((~BCa)& BCe );
Abo ^= Do;
BCa = ROL(Abo, 28);
Agu ^= Du;
BCe = ROL(Agu, 20);
Aka ^= Da;
BCi = ROL(Aka, 3);
Ame ^= De;
BCo = ROL(Ame, 45);
Asi ^= Di;
BCu = ROL(Asi, 61);
Ega = BCa ^((~BCe)& BCi );
Ege = BCe ^((~BCi)& BCo );
Egi = BCi ^((~BCo)& BCu );
Ego = BCo ^((~BCu)& BCa );
Egu = BCu ^((~BCa)& BCe );
Abe ^= De;
BCa = ROL(Abe, 1);
Agi ^= Di;
BCe = ROL(Agi, 6);
Ako ^= Do;
BCi = ROL(Ako, 25);
Amu ^= Du;
BCo = ROL_mult8(Amu, 8);
Asa ^= Da;
BCu = ROL(Asa, 18);
Eka = BCa ^((~BCe)& BCi );
Eke = BCe ^((~BCi)& BCo );
Eki = BCi ^((~BCo)& BCu );
Eko = BCo ^((~BCu)& BCa );
Eku = BCu ^((~BCa)& BCe );
Abu ^= Du;
BCa = ROL(Abu, 27);
Aga ^= Da;
BCe = ROL(Aga, 36);
Ake ^= De;
BCi = ROL(Ake, 10);
Ami ^= Di;
BCo = ROL(Ami, 15);
Aso ^= Do;
BCu = ROL_mult8(Aso, 56);
Ema = BCa ^((~BCe)& BCi );
Eme = BCe ^((~BCi)& BCo );
Emi = BCi ^((~BCo)& BCu );
Emo = BCo ^((~BCu)& BCa );
Emu = BCu ^((~BCa)& BCe );
Abi ^= Di;
BCa = ROL(Abi, 62);
Ago ^= Do;
BCe = ROL(Ago, 55);
Aku ^= Du;
BCi = ROL(Aku, 39);
Ama ^= Da;
BCo = ROL(Ama, 41);
Ase ^= De;
BCu = ROL(Ase, 2);
Esa = BCa ^((~BCe)& BCi );
Ese = BCe ^((~BCi)& BCo );
Esi = BCi ^((~BCo)& BCu );
Eso = BCo ^((~BCu)& BCa );
Esu = BCu ^((~BCa)& BCe );
// prepareTheta
BCa = Eba^Ega^Eka^Ema^Esa;
BCe = Ebe^Ege^Eke^Eme^Ese;
BCi = Ebi^Egi^Eki^Emi^Esi;
BCo = Ebo^Ego^Eko^Emo^Eso;
BCu = Ebu^Egu^Eku^Emu^Esu;
//thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
Da = BCu^ROL(BCe, 1);
De = BCa^ROL(BCi, 1);
Di = BCe^ROL(BCo, 1);
Do = BCi^ROL(BCu, 1);
Du = BCo^ROL(BCa, 1);
Eba ^= Da;
BCa = Eba;
Ege ^= De;
BCe = ROL(Ege, 44);
Eki ^= Di;
BCi = ROL(Eki, 43);
Emo ^= Do;
BCo = ROL(Emo, 21);
Esu ^= Du;
BCu = ROL(Esu, 14);
Aba = BCa ^((~BCe)& BCi );
Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1];
Abe = BCe ^((~BCi)& BCo );
Abi = BCi ^((~BCo)& BCu );
Abo = BCo ^((~BCu)& BCa );
Abu = BCu ^((~BCa)& BCe );
Ebo ^= Do;
BCa = ROL(Ebo, 28);
Egu ^= Du;
BCe = ROL(Egu, 20);
Eka ^= Da;
BCi = ROL(Eka, 3);
Eme ^= De;
BCo = ROL(Eme, 45);
Esi ^= Di;
BCu = ROL(Esi, 61);
Aga = BCa ^((~BCe)& BCi );
Age = BCe ^((~BCi)& BCo );
Agi = BCi ^((~BCo)& BCu );
Ago = BCo ^((~BCu)& BCa );
Agu = BCu ^((~BCa)& BCe );
Ebe ^= De;
BCa = ROL(Ebe, 1);
Egi ^= Di;
BCe = ROL(Egi, 6);
Eko ^= Do;
BCi = ROL(Eko, 25);
Emu ^= Du;
BCo = ROL_mult8(Emu, 8);
Esa ^= Da;
BCu = ROL(Esa, 18);
Aka = BCa ^((~BCe)& BCi );
Ake = BCe ^((~BCi)& BCo );
Aki = BCi ^((~BCo)& BCu );
Ako = BCo ^((~BCu)& BCa );
Aku = BCu ^((~BCa)& BCe );
Ebu ^= Du;
BCa = ROL(Ebu, 27);
Ega ^= Da;
BCe = ROL(Ega, 36);
Eke ^= De;
BCi = ROL(Eke, 10);
Emi ^= Di;
BCo = ROL(Emi, 15);
Eso ^= Do;
BCu = ROL_mult8(Eso, 56);
Ama = BCa ^((~BCe)& BCi );
Ame = BCe ^((~BCi)& BCo );
Ami = BCi ^((~BCo)& BCu );
Amo = BCo ^((~BCu)& BCa );
Amu = BCu ^((~BCa)& BCe );
Ebi ^= Di;
BCa = ROL(Ebi, 62);
Ego ^= Do;
BCe = ROL(Ego, 55);
Eku ^= Du;
BCi = ROL(Eku, 39);
Ema ^= Da;
BCo = ROL(Ema, 41);
Ese ^= De;
BCu = ROL(Ese, 2);
Asa = BCa ^((~BCe)& BCi );
Ase = BCe ^((~BCi)& BCo );
Asi = BCi ^((~BCo)& BCu );
Aso = BCo ^((~BCu)& BCa );
Asu = BCu ^((~BCa)& BCe );
}
// optional full dump: the first four lanes form this thread's 32-byte hash
if (validate) {
g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
g_out[3] = Abo;
g_out[2] = Abi;
g_out[1] = Abe;
g_out[0] = Aba;
}
// the likelihood of meeting the hashing target is so low that we're not guarding this
// with atomic writes, locks or similar...
// Only the most significant 64-bit word (index 3) is compared, first against
// the target and then against the best hash recorded so far in g_good.
uint64_t *g_good64 = (uint64_t*)g_good;
if (Abo <= ptarget64[3]) {
if (Abo < g_good64[3]) {
g_good64[3] = Abo;
g_good64[2] = Abi;
g_good64[1] = Abe;
g_good64[0] = Aba;
g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
}
}
}
// Per stream slot (0/1) and thread id: a 9-word device buffer in which
// crypto_hash records the best hash found (words 0..7) and its nonce (word 8).
static std::map<int, uint32_t *> context_good[2];

/*
 * Prepare the Keccak256 kernel for a new job.
 *
 * On the first call for a given thr_id the round constants are uploaded and
 * one 9-word device buffer per stream slot is allocated with cudaMalloc
 * (device memory, not pinned host memory). On every call the 80-byte header
 * and the 32-byte target are copied to constant memory.
 *
 * Returns true when both result buffers for thr_id are non-null.
 *
 * The scratch pointer is reset to NULL before each allocation. Otherwise a
 * failed cudaMalloc would leave it uninitialised (first allocation) or still
 * pointing at the first buffer (second allocation), and the null check in
 * the return value could never report the failure.
 */
bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
{
    static bool init[MAX_DEVICES] = {false};
    if (!init[thr_id])
    {
        checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));

        // allocate device memory for the "good hash" record of each stream slot
        for (int slot = 0; slot < 2; ++slot) {
            uint32_t *tmp = NULL;
            checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t)));
            context_good[slot][thr_id] = tmp;
        }

        init[thr_id] = true;
    }

    checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));

    return context_good[0][thr_id] && context_good[1][thr_id];
}
/*
 * Queue one Keccak256 pass on the given stream slot:
 *   1. reset the 9-word "good hash" record to all 0xff bytes,
 *   2. launch crypto_hash with the caller's grid/block dimensions,
 *   3. if `hash` is non-NULL, queue an asynchronous device-to-host copy of
 *      either all hashes (do_d2h, 32 bytes per nonce) or just the winning
 *      nonce (4 bytes).
 * Nothing is synchronised here; all work is issued on the slot's stream.
 */
void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
{
    checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));

    crypto_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>(
        (uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);

    if (hash == NULL)
        return;

    if (do_d2h) {
        // copy hashes from device memory to host (ALL hashes, lots of data...)
        size_t mem_size = throughput * sizeof(uint32_t) * 8;
        checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
            cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
    } else {
        // asynchronous copy of winning nonce (just 4 bytes...)
        checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
            cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
    }
}

8
scrypt/keccak.h

@ -0,0 +1,8 @@ @@ -0,0 +1,8 @@
#ifndef KECCAK_H
#define KECCAK_H

#include <stdint.h>

// Host-callable entry points of the Keccak512 PBKDF2 stages.

// Upload the 20-word block header (and, on first use, the round constants).
extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20]);
// Launch the first PBKDF2 stage for `throughput` nonces starting at `nonce`.
extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput);
// Launch the final PBKDF2 stage for `throughput` nonces starting at `nonce`.
extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throughput);

#endif // #ifndef KECCAK_H

781
scrypt/kepler_kernel.cu

@ -0,0 +1,781 @@ @@ -0,0 +1,781 @@
/* Copyright (C) 2013 David G. Andersen. All rights reserved.
* with modifications by Christian Buchner
*
* Use of this code is covered under the Apache 2.0 license, which
* can be found in the file "LICENSE"
*/
// TODO: attempt V.Volkov style ILP (factor 4)
#include <map>
#include "cuda_runtime.h"
#include "miner.h"
#include "salsa_kernel.h"
#include "kepler_kernel.h"
// Row width (in uint4 texels) used to fold a linear index into 2D texture
// coordinates in read_keys_direct.
#define TEXWIDTH 32768
#define THREADS_PER_WU 4 // four threads per hash
// Scratchpad access strategy, selected at compile time via template parameter:
// ANDERSEN pairs up neighbouring 4-thread groups through warp shuffles,
// SIMPLE has each thread access its own b / bx locations directly.
typedef enum
{
ANDERSEN,
SIMPLE
} MemoryAccess;
// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes)
__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT];
// iteration count N
__constant__ uint32_t c_N;
__constant__ uint32_t c_N_1; // N-1
// scratch buffer size SCRATCH
__constant__ uint32_t c_SCRATCH;
__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP)
__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP) - 1
// using texture references for the "tex" variants of the B kernels
texture<uint4, 1, cudaReadModeElementType> texRef1D_4_V;
texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V;
// Forward declaration; the definition further below dispatches on ALGO to
// salsa_xor_core or chacha_xor_core.
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3);
// Component-wise XOR of two uint4 values, stored into (and returning) `left`.
static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right)
{
    left.x = left.x ^ right.x;
    left.y = left.y ^ right.y;
    left.z = left.z ^ right.z;
    left.w = left.w ^ right.w;
    return left;
}
// Component-wise (wrapping) addition of two uint4 values, stored into
// (and returning) `left`.
static __host__ __device__ uint4& operator += (uint4& left, const uint4& right)
{
    left.x = left.x + right.x;
    left.y = left.y + right.y;
    left.z = left.z + right.z;
    left.w = left.w + right.w;
    return left;
}
// uint4 overload of the warp shuffle: fetches all four components of bx
// from lane `target_thread`, one 32-bit shuffle per component.
static __device__ uint4 __shfl(const uint4 bx, int target_thread)
{
    uint4 fetched;
    fetched.x = __shfl((int)bx.x, target_thread);
    fetched.y = __shfl((int)bx.y, target_thread);
    fetched.z = __shfl((int)bx.z, target_thread);
    fetched.w = __shfl((int)bx.w, target_thread);
    return fetched;
}
/* write_keys writes the 8 keys being processed by a warp to the global
* scratchpad. To effectively use memory bandwidth, it performs the writes
* (and reads, for read_keys) 128 bytes at a time per memory location
* by __shfl'ing the 4 entries in bx to the threads in the next-up
* thread group. It then has eight threads together perform uint4
* (128 bit) writes to the destination region. This seems to make
* quite effective use of memory bandwidth. An approach that spread
* uint32s across more threads was slower because of the increased
* computation it required.
*
* "start" is the loop iteration producing the write - the offset within
* the block's memory.
*
* Internally, this algorithm first __shfl's the 4 bx entries to
* the next up thread group, and then uses a conditional move to
* ensure that odd-numbered thread groups exchange the b/bx ordering
* so that the right parts are written together.
*
* Thanks to Babu for helping design the 128-bit-per-write version.
*
* _direct lets the caller specify the absolute start location instead of
* the relative start location, as an attempt to reduce some recomputation.
*/
// Store this thread's b / bx at word offset `start` of the warp's scratch
// buffer (c_V entry of the current warp).
//   SIMPLE   : b goes to start, bx to start+16.
//   ANDERSEN : each thread fetches bx and the start offset of the thread four
//              lanes up via shuffle; depending on bit 2 of threadIdx.x the
//              own/peer roles are swapped before the two 128-bit stores.
template <MemoryAccess SCHEME> __device__ __forceinline__
void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
{
    uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];

    if (SCHEME == SIMPLE) {
        *((uint4 *)(&scratch[start     ])) = b;
        *((uint4 *)(&scratch[start + 16])) = bx;
    } else if (SCHEME == ANDERSEN) {
        const int partner = (threadIdx.x + 4) % 32;
        const uint4 peer_bx = __shfl(bx, partner);
        const uint32_t peer_start = __shfl((int)start, partner) + 4;
        const bool swapped = (threadIdx.x & 0x4) != 0;

        uint4 *first  = (uint4 *)(&scratch[swapped ? peer_start : start]);
        uint4 *second = (uint4 *)(&scratch[swapped ? start : peer_start]);
        *first  = swapped ? peer_bx : b;
        *second = swapped ? b : peer_bx;
    }
}
// Counterpart of write_keys_direct: load b / bx for this thread from word
// offset `start`.
//   TEX_DIM == 0 : read from the warp's scratch buffer in global memory (c_V).
//   TEX_DIM == 1 : read through the 1D texture reference (index in uint4 units).
//   TEX_DIM == 2 : read through the 2D texture reference; the linear uint4
//                  index is split into (index % TEXWIDTH, index / TEXWIDTH).
// With SCHEME == ANDERSEN the partner-thread exchange done by the writer is
// reversed after loading; otherwise b is read at start and bx at start+16.
template <MemoryAccess SCHEME, int TEX_DIM> __device__ __forceinline__
void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
{
// only assigned (and only used) when TEX_DIM == 0
uint32_t *scratch;
if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
if (SCHEME == ANDERSEN) {
// start offset of the thread four lanes up, plus 4 words
int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
// texture fetches address uint4 elements, so convert word offsets
if (TEX_DIM > 0) { start /= 4; t2_start /= 4; }
bool c = (threadIdx.x & 0x4);
if (TEX_DIM == 0) {
b = *((uint4 *)(&scratch[c ? t2_start : start]));
bx = *((uint4 *)(&scratch[c ? start : t2_start]));
} else if (TEX_DIM == 1) {
b = tex1Dfetch(texRef1D_4_V, c ? t2_start : start);
bx = tex1Dfetch(texRef1D_4_V, c ? start : t2_start);
} else if (TEX_DIM == 2) {
b = tex2D(texRef2D_4_V, 0.5f + ((c ? t2_start : start)%TEXWIDTH), 0.5f + ((c ? t2_start : start)/TEXWIDTH));
bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH));
}
// swap b and bx when bit 2 of threadIdx.x is set, then pull bx back from the
// thread four lanes down (28 == -4 mod 32)
uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
bx = __shfl(bx, (threadIdx.x + 28)%32);
} else {
if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start]));
else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4);
else if (TEX_DIM == 2) b = tex2D(texRef2D_4_V, 0.5f + ((start/4)%TEXWIDTH), 0.5f + ((start/4)/TEXWIDTH));
if (TEX_DIM == 0) bx = *((uint4 *)(&scratch[start+16]));
else if (TEX_DIM == 1) bx = tex1Dfetch(texRef1D_4_V, (start+16)/4);
else if (TEX_DIM == 2) bx = tex2D(texRef2D_4_V, 0.5f + (((start+16)/4)%TEXWIDTH), 0.5f + (((start+16)/4)/TEXWIDTH));
}
}
// Permute b and bx among the four threads of a work unit.
// For each vector: the new y is the w component of the thread one step ahead
// in the 4-thread group, z comes from the thread two steps ahead, and the new
// w is the y component of the thread three steps ahead; x stays in place.
__device__ __forceinline__
void primary_order_shuffle(uint4 &b, uint4 &bx)
{
    /* Inner loop shuffle targets */
    const int group = threadIdx.x & 0x1c;   // first lane of this 4-thread group
    const int lane  = threadIdx.x & 0x03;   // position inside the group
    const int x1 = group + ((lane + 1) & 0x3);
    const int x2 = group + ((lane + 2) & 0x3);
    const int x3 = group + ((lane + 3) & 0x3);

    const uint32_t b_from_w = __shfl((int)b.w, x1);
    b.z = __shfl((int)b.z, x2);
    const uint32_t b_from_y = __shfl((int)b.y, x3);
    b.y = b_from_w;
    b.w = b_from_y;

    const uint32_t bx_from_w = __shfl((int)bx.w, x1);
    bx.z = __shfl((int)bx.z, x2);
    const uint32_t bx_from_y = __shfl((int)bx.y, x3);
    bx.y = bx_from_w;
    bx.w = bx_from_y;
}
/*
 * load_key_salsa reads one 32-word key from B, where the keys lie
 * contiguously in external order (0, 1, 2, 3, ...).
 * Each of the four threads of a work unit reads its own 4-word row, rotated
 * by its position in the group, from both halves of the key (b from words
 * 0..15, bx from words 16..31). primary_order_shuffle then brings the values
 * into the internal processing order.
 */
__device__ __forceinline__
void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx)
{
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int key_offset = scrypt_block * 32;
    const uint32_t t = threadIdx.x % 4;
    const uint32_t row = key_offset + 4*t;   // first word of this thread's row

    // Read in permuted order. Key loads are not our bottleneck right now.
    b.x  = B[row + (t+0)%4];
    b.y  = B[row + (t+1)%4];
    b.z  = B[row + (t+2)%4];
    b.w  = B[row + (t+3)%4];
    bx.x = B[row + (t+0)%4 + 16];
    bx.y = B[row + (t+1)%4 + 16];
    bx.z = B[row + (t+2)%4 + 16];
    bx.w = B[row + (t+3)%4 + 16];

    primary_order_shuffle(b, bx);
}
/*
 * store_key_salsa is the inverse of load_key_salsa: b and bx are first
 * run through primary_order_shuffle (which modifies them in place) and then
 * written to the work unit's 32-word slot in B in external order.
 */
__device__ __forceinline__
void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx)
{
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int key_offset = scrypt_block * 32;
    const uint32_t t = threadIdx.x % 4;
    const uint32_t row = key_offset + 4*t;   // first word of this thread's row

    primary_order_shuffle(b, bx);

    B[row + (t+0)%4] = b.x;
    B[row + (t+1)%4] = b.y;
    B[row + (t+2)%4] = b.z;
    B[row + (t+3)%4] = b.w;
    B[row + (t+0)%4 + 16] = bx.x;
    B[row + (t+1)%4 + 16] = bx.y;
    B[row + (t+2)%4 + 16] = bx.z;
    B[row + (t+3)%4 + 16] = bx.w;
}
/*
 * load_key_chacha reads one 32-word key from B, where the keys lie
 * contiguously in external order (0, 1, 2, 3, ...).
 * Thread t of a work unit takes column t of each 4x4 word matrix, i.e. words
 * t, t+4, t+8, t+12 of the first half go to b and the same words of the
 * second half (offset 16) go to bx. No further shuffle is applied.
 */
__device__ __forceinline__
void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx)
{
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int key_offset = scrypt_block * 32;
    const uint32_t t = threadIdx.x % 4;
    const uint32_t col = key_offset + t;   // word 0 of this thread's column

    // Read in permuted order. Key loads are not our bottleneck right now.
    b.x  = B[col + 4*0];
    b.y  = B[col + 4*1];
    b.z  = B[col + 4*2];
    b.w  = B[col + 4*3];
    bx.x = B[col + 4*0 + 16];
    bx.y = B[col + 4*1 + 16];
    bx.z = B[col + 4*2 + 16];
    bx.w = B[col + 4*3 + 16];
}
/*
 * store_key_chacha is the inverse of load_key_chacha: thread t of a work
 * unit writes its b components to words t, t+4, t+8, t+12 of the key's first
 * half and its bx components to the same words of the second half.
 */
__device__ __forceinline__
void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx)
{
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int key_offset = scrypt_block * 32;
    const uint32_t t = threadIdx.x % 4;
    const uint32_t col = key_offset + t;   // word 0 of this thread's column

    B[col + 4*0] = b.x;
    B[col + 4*1] = b.y;
    B[col + 4*2] = b.z;
    B[col + 4*3] = b.w;
    B[col + 4*0 + 16] = bx.x;
    B[col + 4*1 + 16] = bx.y;
    B[col + 4*2 + 16] = bx.z;
    B[col + 4*3 + 16] = bx.w;
}
// Compile-time dispatch of the key load on ALGO; any other ALGO value loads
// nothing.
template <int ALGO> __device__ __forceinline__
void load_key(const uint32_t *B, uint4 &b, uint4 &bx)
{
    if (ALGO == A_SCRYPT) {
        load_key_salsa(B, b, bx);
    } else if (ALGO == A_SCRYPT_JANE) {
        load_key_chacha(B, b, bx);
    }
}
// Compile-time dispatch of the key store on ALGO; any other ALGO value stores
// nothing. Note that the salsa variant modifies b and bx in place.
template <int ALGO> __device__ __forceinline__
void store_key(uint32_t *B, uint4 &b, uint4 &bx)
{
    if (ALGO == A_SCRYPT) {
        store_key_salsa(B, b, bx);
    } else if (ALGO == A_SCRYPT_JANE) {
        store_key_chacha(B, b, bx);
    }
}
/*
* salsa_xor_core (Salsa20/8 cypher)
* The original scrypt called:
* xor_salsa8(&X[0], &X[16]); <-- the "b" loop
* xor_salsa8(&X[16], &X[0]); <-- the "bx" loop
* This version is unrolled to handle both of these loops in a single
* call to avoid unnecessary data movement.
*/
// dst ^= rotl32(s1 + s2, amt)
#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<<amt)|(tmp>>(32-amt))); }
// Mix b and bx in place: b ^= bx, b += rounds(b), then bx ^= b,
// bx += rounds(bx), where rounds() is four iterations of the double round
// below. x1, x2, x3 are the lanes one, two and three positions ahead inside
// the caller's 4-thread group; they are the shuffle sources used to move
// between the two halves of a double round. The macro is #undef'ed before
// the function returns.
__device__ __forceinline__
void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
uint4 x;
b ^= bx;
x = b;
// Enter in "primary order" (t0 has 0, 4, 8, 12)
// (t1 has 5, 9, 13, 1)
// (t2 has 10, 14, 2, 6)
// (t3 has 15, 3, 7, 11)
#pragma unroll
for (int j = 0; j < 4; j++) {
// Mixing phase of salsa
XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
/* Transpose rows and columns. */
/* Unclear if this optimization is needed: These are ordered based
* upon the dependencies needed in the later xors. Compiler should be
* able to figure this out, but might as well give it a hand. */
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
* but the register targets are rewritten here to swap x[1] and x[3] so that
* they can be directly shuffled to and from our peer threads without
* reassignment. The reverse shuffle then puts them back in the right place.
*/
XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
}
b += x;
// The next two lines are the beginning of the BX-centric loop iteration
bx ^= b;
x = bx;
// This is a copy of the same loop above, identical but stripped of comments.
// Duplicated so that we can complete a bx-based loop with fewer register moves.
#pragma unroll
for (int j = 0; j < 4; j++) {
XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
}
// At the end of these iterations, the data is in primary order again.
#undef XOR_ROTATE_ADD
bx += x;
}
/*
* chacha_xor_core (ChaCha20/8 cypher)
* This version is unrolled to handle both of these loops in a single
* call to avoid unnecessary data movement.
*
* load_key and store_key must not use primary order when
* using ChaCha20/8, but rather the basic transposed order
* (referred to as "column mode" below)
*/
// pt += ps; rt = rotl32(rt ^ pt, amt)
#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<<amt)|(tmp>>(32-amt))); }
// Mix b and bx in place: b ^= bx, b += rounds(b), then bx ^= b,
// bx += rounds(bx), where rounds() is four iterations of a column step
// followed by a diagonal step. x1, x2, x3 are the lanes one, two and three
// positions ahead inside the caller's 4-thread group; shuffling y/z/w by
// x1/x2/x3 moves to the diagonal layout and shuffling by x3/x2/x1 moves back.
// The macro is #undef'ed before the function returns.
__device__ __forceinline__
void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
uint4 x;
b ^= bx;
x = b;
// Enter in "column" mode (t0 has 0, 4, 8, 12)
// (t1 has 1, 5, 9, 13)
// (t2 has 2, 6, 10, 14)
// (t3 has 3, 7, 11, 15)
#pragma unroll
for (int j = 0; j < 4; j++) {
// Column Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
}
b += x;
// The next two lines are the beginning of the BX-centric loop iteration
bx ^= b;
x = bx;
#pragma unroll
for (int j = 0; j < 4; j++) {
// Column Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
}
#undef CHACHA_PRIMITIVE
bx += x;
}
// Compile-time dispatch of the block mixing step on ALGO: Salsa core for
// A_SCRYPT, ChaCha core for A_SCRYPT_JANE, nothing for any other value.
template <int ALGO> __device__ __forceinline__
void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
    if (ALGO == A_SCRYPT) {
        salsa_xor_core(b, bx, x1, x2, x3);
    } else if (ALGO == A_SCRYPT_JANE) {
        chacha_xor_core(b, bx, x1, x2, x3);
    }
}
/*
* The hasher_gen_kernel operates on a group of 1024-bit input keys
* in B, stored as:
* B = { k1B k1Bx k2B k2Bx ... }
* and fills up the scratchpad with the iterative hashes derived from
* those keys:
* scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... }
* scratch is 1024 times larger than the input keys B.
* It is extremely important to stream writes effectively into scratch;
* less important to coalesce the reads from B.
*
* Key ordering note: Keys are input from B in "original" order:
* K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 }
* After inputting into kernel_gen, each component k and kx of the
* key is transmuted into a permuted internal order to make processing faster:
* K = k, kx with:
* k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11
* and similarly for kx.
*/
// Scratchpad fill kernel, covering iterations [begin, end).
// With begin == 0 the key is loaded from d_idata and stored as entry 0;
// otherwise the state of iteration begin-1 is reloaded from the scratchpad.
// Every further iteration mixes the block once and stores it at entry i
// (32 words per entry).
template <int ALGO, MemoryAccess SCHEME> __global__
void kepler_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end)
{
    uint4 b, bx;

    // shuffle partners inside this thread's 4-thread group
    const int group = threadIdx.x & 0x1c;
    const int lane  = threadIdx.x & 0x03;
    int x1 = group + ((lane + 1) & 0x3);
    int x2 = group + ((lane + 2) & 0x3);
    int x3 = group + ((lane + 3) & 0x3);

    int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

    int i;
    if (begin == 0) {
        load_key<ALGO>(d_idata, b, bx);
        write_keys_direct<SCHEME>(b, bx, start);
        i = 1;
    } else {
        read_keys_direct<SCHEME,0>(b, bx, start+32*(begin-1));
        i = begin;
    }

    for (; i < end; ++i) {
        block_mixer<ALGO>(b, bx, x1, x2, x3);
        write_keys_direct<SCHEME>(b, bx, start+32*i);
    }
}
/*
 * Lookup-gap variant of kepler_scrypt_core_kernelA: only every
 * LOOKUP_GAP-th intermediate state is written, into slot i/LOOKUP_GAP.
 * States in between are recomputed on demand by re-running block_mixer
 * from the nearest stored slot.
 *
 *   d_idata    - input keys, read through load_key<ALGO> when begin == 0
 *   begin, end - half-open range of iteration indices handled by this launch
 *   LOOKUP_GAP - spacing between stored states; used as a divisor below,
 *                so it must be non-zero
 */
template <int ALGO, MemoryAccess SCHEME> __global__
void kepler_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP)
{
uint4 b, bx;
// lane numbers of the other three threads in this thread's group of four
int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;
int i=begin;
if (i == 0) {
// first launch: fetch the key and store it in slot 0
load_key<ALGO>(d_idata, b, bx);
write_keys_direct<SCHEME>(b, bx, start);
++i;
} else {
// continuation launch: state begin-1 may not have been stored, so load the
// closest stored slot at or before it and mix forward the remaining steps
int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP;
read_keys_direct<SCHEME,0>(b, bx, start+32*pos);
while(loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
}
while (i < end) {
block_mixer<ALGO>(b, bx, x1, x2, x3);
// keep only the states whose index is a multiple of LOOKUP_GAP
if (i % LOOKUP_GAP == 0)
write_keys_direct<SCHEME>(b, bx, start+32*(i/LOOKUP_GAP));
++i;
}
}
/*
* hasher_hash_kernel runs the second phase of scrypt after the scratch
* buffer is filled with the iterative hashes: It bounces through
* the scratch buffer in pseudorandom order, mixing the key as it goes.
*
* NOTE(review): "hasher_hash_kernel" appears to be an older name for the
* kernel defined directly below (kepler_scrypt_core_kernelB).
*
* Parameters of kepler_scrypt_core_kernelB:
*   d_odata    - working state; read through load_key<ALGO> when begin != 0
*                and always written through store_key<ALGO> on exit, so a
*                run can be split over several launches
*   begin, end - half-open range of iteration indices handled by this launch
* TEX_DIM is forwarded to read_keys_direct; for TEX_DIM == 0 the start
* offset is additionally wrapped to c_SCRATCH_WU_PER_WARP.
*/
template <int ALGO, MemoryAccess SCHEME, int TEX_DIM> __global__
void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
{
uint4 b, bx;
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4);
if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP;
// lane numbers of the other three threads in this thread's group of four
int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
if (begin == 0) {
// first launch: start from the last scratch entry (index c_N_1) mixed once more
read_keys_direct<SCHEME, TEX_DIM>(b, bx, start+32*c_N_1);
block_mixer<ALGO>(b, bx, x1, x2, x3);
// continuation launch: resume from the state saved by the previous launch
} else load_key<ALGO>(d_odata, b, bx);
for (int i = begin; i < end; i++) {
// scratch index: bx.x of the first lane of this four-lane group, masked with c_N_1
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
uint4 t, tx; read_keys_direct<SCHEME, TEX_DIM>(t, tx, start+32*j);
b ^= t; bx ^= tx;
block_mixer<ALGO>(b, bx, x1, x2, x3);
}
store_key<ALGO>(d_odata, b, bx);
}
/*
 * Lookup-gap variant of kepler_scrypt_core_kernelB. The scratch buffer only
 * holds every LOOKUP_GAP-th state, so a lookup of index j loads slot
 * j/LOOKUP_GAP and re-runs block_mixer (j % LOOKUP_GAP) times to rebuild
 * the wanted state before it is xor-ed into the working state.
 *
 *   d_odata    - working state; read through load_key<ALGO> when begin != 0,
 *                always written through store_key<ALGO> on exit
 *   begin, end - half-open range of iteration indices handled by this launch
 *   LOOKUP_GAP - spacing between stored states; used as a divisor, must be non-zero
 *
 * Two implementations of the main loop exist, selected at compile time by SCHEME.
 */
template <int ALGO, MemoryAccess SCHEME, int TEX_DIM> __global__
void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP)
{
uint4 b, bx;
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4);
if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP;
// lane numbers of the other three threads in this thread's group of four
int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
if (begin == 0) {
// first launch: load the last stored slot and mix forward; the extra "1 +"
// is the additional mix the non-gap kernel applies after reading entry c_N_1
int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP);
read_keys_direct<SCHEME,TEX_DIM>(b, bx, start+32*pos);
while(loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
} else load_key<ALGO>(d_odata, b, bx);
if (SCHEME == SIMPLE)
{
// better divergent thread handling submitted by nVidia engineers, but
// supposedly this does not run with the ANDERSEN memory access scheme
//
// Every pass through the while loop executes exactly one block_mixer call.
// 'loop' tracks what that call means for this thread:
//   loop == -1 : start a new lookup (compute j, load the stored slot into t/tx)
//   loop  >  0 : the call is a recompute step on t/tx towards state j
//   loop ==  0 : t/tx now equals state j; xor it into b/bx and let the call
//                perform the real mix, then advance i
// j comes from the first lane of the four-lane group, so all four lanes of a
// group always hold the same 'loop' value.
// The j/pos values computed just below are overwritten on the first pass
// because loop starts at -1.
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int pos = j/LOOKUP_GAP;
int loop = -1;
uint4 t, tx;
int i = begin;
while(i < end) {
if (loop==-1) {
j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
pos = j/LOOKUP_GAP;
loop = j-pos*LOOKUP_GAP;
read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
}
if (loop==0) {
b ^= t; bx ^= tx;
t=b;tx=bx;
}
block_mixer<ALGO>(t, tx, x1, x2, x3);
if (loop==0) {
b=t;bx=tx;
i++;
}
// after a completed iteration this drops to -1 and triggers the next lookup
loop--;
}
}
else
{
// this is my original implementation, now used with the ANDERSEN
// memory access scheme only.
for (int i = begin; i < end; i++) {
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
// load the stored slot at or before j and recompute the missing steps
int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
b ^= t; bx ^= tx;
block_mixer<ALGO>(b, bx, x1, x2, x3);
}
}
// (disabled copy of the straightforward loop, kept by the original author)
//for (int i = begin; i < end; i++) {
// int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
// int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
// uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
// while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
// b ^= t; bx ^= tx;
// block_mixer<ALGO>(b, bx, x1, x2, x3);
//}
store_key<ALGO>(d_odata, b, bx);
}
// Default constructor: delegates to the KernelInterface base constructor and
// performs no further initialisation.
KeplerKernel::KeplerKernel() : KernelInterface()
{
}
/*
 * Bind `size` bytes of linear device memory at d_V to the 1D texture
 * reference texRef1D_4_V, using a uint4 channel format.
 * The reference is configured for unnormalised coordinates, point
 * filtering and clamped addressing before binding.
 * Always returns true; a binding failure is only handled by checkCudaErrors.
 */
bool KeplerKernel::bindtexture_1D(uint32_t *d_V, size_t size)
{
    // sampling setup: clamp out-of-range fetches, no interpolation, raw texel indices
    texRef1D_4_V.addressMode[0] = cudaAddressModeClamp;
    texRef1D_4_V.filterMode = cudaFilterModePoint;
    texRef1D_4_V.normalized = 0;

    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
    checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size));

    return true;
}
/*
 * Bind pitch-linear device memory at d_V to the 2D texture reference
 * texRef2D_4_V, using a uint4 channel format.
 *
 * width, height and pitch describe the caller's layout; they are reshaped
 * here so that the texture width becomes TEXWIDTH: the width is halved or
 * doubled repeatedly while height and pitch are scaled the opposite /
 * same way respectively.
 * Always returns true; a binding failure is only handled by checkCudaErrors.
 */
bool KeplerKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch)
{
    texRef2D_4_V.normalized = 0;
    texRef2D_4_V.filterMode = cudaFilterModePoint;
    for (int dim = 0; dim < 2; ++dim)
        texRef2D_4_V.addressMode[dim] = cudaAddressModeClamp;

    // maintain texture width of TEXWIDTH (max. limit is 65000)
    for (; width > TEXWIDTH; width /= 2) {
        // too wide: fold each row in two
        height *= 2;
        pitch /= 2;
    }
    for (; width < TEXWIDTH; width *= 2) {
        // too narrow: merge pairs of rows (rounding the row count up)
        height = (height+1)/2;
        pitch *= 2;
    }

    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
    checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch));

    return true;
}
// Release the binding of the 1D texture reference texRef1D_4_V.
// Always returns true; a failure is only handled by checkCudaErrors.
bool KeplerKernel::unbindtexture_1D()
{
checkCudaErrors(cudaUnbindTexture(texRef1D_4_V));
return true;
}
// Release the binding of the 2D texture reference texRef2D_4_V.
// Always returns true; a failure is only handled by checkCudaErrors.
bool KeplerKernel::unbindtexture_2D()
{
checkCudaErrors(cudaUnbindTexture(texRef2D_4_V));
return true;
}
// Copy the first MAXWARPS scratch-buffer pointers from the host array h_V
// into the device symbol c_V (MAXWARPS * sizeof(uint32_t*) bytes, offset 0).
// NOTE(review): c_V must be large enough to hold MAXWARPS pointers; its
// declaration is not visible here.
void KeplerKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
{
checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
}
/*
 * Enqueue the complete scrypt core (both phases) on `stream`.
 *
 *   grid, threads - launch configuration used for every kernel launch
 *   thr_id        - mining thread index; selects the cached N and the batch size
 *   stream        - CUDA stream that receives all copies and launches
 *   d_idata       - device input keys (phase 1)
 *   d_odata       - device output / intermediate state (phase 2)
 *   N             - number of scrypt iterations per phase
 *   LOOKUP_GAP    - 1 selects the plain kernels, anything else the *_LG kernels
 *   texture_cache - 0, 1 or 2; forwarded as TEX_DIM to the phase 2 kernels.
 *                   Any other value launches no phase 2 kernel at all.
 *   WARPS_PER_BLOCK, interactive, benchmark - not referenced directly in this body
 *
 * scrypt is always run with the ANDERSEN memory scheme, scrypt-jane with SIMPLE.
 * Returns `success`, which is never changed from true: neither the symbol
 * copies nor the kernel launches are error-checked here.
 *
 * NOTE(review): SCRATCH and WU_PER_WARP are macros defined elsewhere; they
 * presumably expand in terms of N / LOOKUP_GAP / THREADS_PER_WU - confirm
 * before renaming any local or parameter.
 */
bool KeplerKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream,
uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
{
bool success = true;
// make some constants available to kernel, update only initially and when changing
// (the cache is keyed on N only, per thr_id)
static int prev_N[MAX_DEVICES] = {0};
if (N != prev_N[thr_id]) {
uint32_t h_N = N;
uint32_t h_N_1 = N-1;
uint32_t h_SCRATCH = SCRATCH;
uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;
cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
prev_N[thr_id] = N;
}
// First phase: Sequential writes to scratchpad.
// The N iterations are split into launches of at most `batch` iterations each.
// NOTE(review): if device_batchsize[thr_id] were 0, pos would never advance
// and this loop would not terminate - confirm it is validated by the caller.
int batch = device_batchsize[thr_id];
//int num_sleeps = 2* ((N + (batch-1)) / batch);
//int sleeptime = 100;
unsigned int pos = 0;
do
{
if (LOOKUP_GAP == 1) {
if (IS_SCRYPT()) kepler_scrypt_core_kernelA<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
} else {
if (IS_SCRYPT()) kepler_scrypt_core_kernelA_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
}
pos += batch;
} while (pos < N);
// Second phase: Random read access from scratchpad.
// Same batching; TEX_DIM is a template argument, hence one branch per
// texture_cache value.
pos = 0;
do
{
if (LOOKUP_GAP == 1) {
if (texture_cache == 0) {
if (IS_SCRYPT()) kepler_scrypt_core_kernelB<A_SCRYPT ,ANDERSEN, 0><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<A_SCRYPT_JANE,SIMPLE, 0><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
} else if (texture_cache == 1) {
if (IS_SCRYPT()) kepler_scrypt_core_kernelB<A_SCRYPT ,ANDERSEN,1><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<A_SCRYPT_JANE,SIMPLE, 1><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
} else if (texture_cache == 2) {
if (IS_SCRYPT()) kepler_scrypt_core_kernelB<A_SCRYPT ,ANDERSEN,2><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<A_SCRYPT_JANE,SIMPLE, 2><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
}
} else {
if (texture_cache == 0) {
if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<A_SCRYPT ,ANDERSEN,0><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<A_SCRYPT_JANE,SIMPLE, 0><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
} else if (texture_cache == 1) {
if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<A_SCRYPT ,ANDERSEN,1><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<A_SCRYPT_JANE,SIMPLE, 1><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
} else if (texture_cache == 2) {
if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<A_SCRYPT ,ANDERSEN,2><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<A_SCRYPT_JANE,SIMPLE, 2><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
}
}
pos += batch;
} while (pos < N);
return success;
}

29
scrypt/kepler_kernel.h

@ -0,0 +1,29 @@ @@ -0,0 +1,29 @@
#ifndef KEPLER_KERNEL_H
#define KEPLER_KERNEL_H
#include "salsa_kernel.h"
// Kernel implementation that identifies itself as 'k' and reports compute
// capability 3.0 through get_major_version() / get_minor_version().
class KeplerKernel : public KernelInterface
{
public:
    KeplerKernel();

    // --- implemented out of line ---------------------------------------
    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);
    virtual bool bindtexture_1D(uint32_t *d_V, size_t size);
    virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch);
    virtual bool unbindtexture_1D();
    virtual bool unbindtexture_2D();

    // --- fixed properties of this kernel -------------------------------
    // single-character kernel id
    virtual char get_identifier()
    {
        return 'k';
    }
    // reported compute capability, major part
    virtual int get_major_version()
    {
        return 3;
    }
    // reported compute capability, minor part
    virtual int get_minor_version()
    {
        return 0;
    }
    // upper bound on warps per thread block
    virtual int max_warps_per_block()
    {
        return 32;
    }
    // texel width in 32-bit words
    virtual int get_texel_width()
    {
        return 4;
    }
    // number of threads that cooperate on one work unit
    virtual int threads_per_wu()
    {
        return 4;
    }
    // this kernel provides lookup-gap variants
    virtual bool support_lookup_gap()
    {
        return true;
    }
    // preferred L1 / shared memory split
    virtual cudaFuncCache cache_config()
    {
        return cudaFuncCachePreferL1;
    }
};
#endif // #ifndef KEPLER_KERNEL_H

1488
scrypt/nv_kernel.cu

File diff suppressed because it is too large Load Diff

36
scrypt/nv_kernel.h

@ -0,0 +1,36 @@ @@ -0,0 +1,36 @@
#ifndef NV_KERNEL_H
#define NV_KERNEL_H
#include "salsa_kernel.h"
// Kernel implementation that identifies itself as 'K' and reports compute
// capability 3.0 through get_major_version() / get_minor_version().
// Besides the scrypt core it declares keccak256 and blake256 helpers.
class NVKernel : public KernelInterface
{
public:
    NVKernel();

    // --- implemented out of line ---------------------------------------
    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);
    virtual bool bindtexture_1D(uint32_t *d_V, size_t size);
    virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch);
    virtual bool unbindtexture_1D();
    virtual bool unbindtexture_2D();

    // --- fixed properties of this kernel -------------------------------
    // single-character kernel id
    virtual char get_identifier()
    {
        return 'K';
    }
    // reported compute capability, major part
    virtual int get_major_version()
    {
        return 3;
    }
    // reported compute capability, minor part
    virtual int get_minor_version()
    {
        return 0;
    }
    // upper bound on warps per thread block
    virtual int max_warps_per_block()
    {
        return 32;
    }
    // texel width in 32-bit words
    virtual int get_texel_width()
    {
        return 4;
    }
    // this kernel provides lookup-gap variants
    virtual bool support_lookup_gap()
    {
        return true;
    }
    // preferred shared memory bank size
    virtual cudaSharedMemConfig shared_mem_config()
    {
        return cudaSharedMemBankSizeFourByte;
    }
    // preferred L1 / shared memory split
    virtual cudaFuncCache cache_config()
    {
        return cudaFuncCachePreferL1;
    }

    // --- additional hash stages, implemented out of line ---------------
    virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
    virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
    virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]);
    virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
};
#endif // #ifndef NV_KERNEL_H

1723
scrypt/nv_kernel2.cu

File diff suppressed because it is too large Load Diff

36
scrypt/nv_kernel2.h

@ -0,0 +1,36 @@ @@ -0,0 +1,36 @@
#ifndef NV2_KERNEL_H
#define NV2_KERNEL_H
#include "miner.h"
#include <cuda_runtime.h>
#include "salsa_kernel.h"
// Kernel implementation that identifies itself as 'T' and reports compute
// capability 3.5 through get_major_version() / get_minor_version().
// It declares no texture binding overrides and reports no_textures() == true.
class NV2Kernel : public KernelInterface
{
public:
    NV2Kernel();

    // --- implemented out of line ---------------------------------------
    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);

    // --- fixed properties of this kernel -------------------------------
    // single-character kernel id
    virtual char get_identifier()
    {
        return 'T';
    }
    // reported compute capability, major part
    virtual int get_major_version()
    {
        return 3;
    }
    // reported compute capability, minor part
    virtual int get_minor_version()
    {
        return 5;
    }
    // upper bound on warps per thread block
    virtual int max_warps_per_block()
    {
        return 24;
    }
    // texel width in 32-bit words
    virtual int get_texel_width()
    {
        return 4;
    }
    // the texture cache option is not used by this kernel
    virtual bool no_textures()
    {
        return true;
    }
    // this kernel provides lookup-gap variants
    virtual bool support_lookup_gap()
    {
        return true;
    }
    // preferred shared memory bank size
    virtual cudaSharedMemConfig shared_mem_config()
    {
        return cudaSharedMemBankSizeFourByte;
    }
    // preferred L1 / shared memory split
    virtual cudaFuncCache cache_config()
    {
        return cudaFuncCachePreferL1;
    }

    // --- additional hash stages, implemented out of line ---------------
    virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
    virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
    virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]);
    virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
};
#endif // #ifndef NV2_KERNEL_H

939
scrypt/salsa_kernel.cu

@ -0,0 +1,939 @@ @@ -0,0 +1,939 @@
//
// Contains the autotuning logic and some utility functions.
// Note that all CUDA kernels have been moved to other .cu files
//
// NOTE: compile this .cu module for compute_20,sm_21 with --maxrregcount=64
//
#include <stdio.h>
#include <map>
#include <algorithm>
#include <unistd.h> // usleep
#include <ctype.h> // tolower
#include "cuda_helper.h"
#include "salsa_kernel.h"
#include "titan_kernel.h"
#include "fermi_kernel.h"
#include "test_kernel.h"
#include "nv_kernel.h"
#include "nv_kernel2.h"
#include "kepler_kernel.h"
#include "miner.h"
// Detect a 64-bit build: on WIN32 builds via _WIN64, otherwise via __x86_64__.
// _64BIT stays undefined (and evaluates to 0 in #if) on 32-bit builds.
#if WIN32
#ifdef _WIN64
#define _64BIT 1
#endif
#else
#if __x86_64__
#define _64BIT 1
#endif
#endif
// MAXMEM: upper bound, in bytes, on the device memory considered for the
// scratch buffer (combined with props.totalGlobalMem in find_optimal_blockcount).
#if _64BIT
#define MAXMEM 0x300000000ULL // 12 GB (the largest Kepler)
#else
#define MAXMEM 0xFFFFFFFFULL // nearly 4 GB (32 bit limitations)
#endif
// require CUDA 5.5 driver API
// (DMAJ.DMIN = required major.minor version)
#define DMAJ 5
#define DMIN 5
#undef checkCudaErrors
// Path separator used to shorten __FILE__ in log messages. Windows tool
// chains normally report __FILE__ with backslashes, so the WIN32 branch has
// to look for '\\'. If the separator is not found, the full __FILE__ is used.
#if WIN32
#define DELIMITER '\\'
#else
#define DELIMITER '/'
#endif
// __FILENAME__: __FILE__ without its directory part.
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
// checkCudaErrors(x): clear any pending CUDA error, evaluate x, then log
// (but do not abort on) a CUDA error left behind by x.
// The expansion reads `thr_id` and `device_map` from the enclosing scope and
// declares a local `err` inside its own braces.
#define checkCudaErrors(x) \
{ \
cudaGetLastError(); \
x; \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) \
applog(LOG_ERR, "GPU #%d: Err %d: %s (%s:%d)", device_map[thr_id], err, cudaGetErrorString(err), __FILENAME__, __LINE__); \
}
// some globals containing pointers to device memory (for chunked allocation)
// The first index is the mining thread id (0...MAX_GPUS-1).
// MAXWARPS[thr_id]  : number of warps for which scratch memory was set up
// h_V[thr_id][w]    : device pointer to the scratch area of warp w
// h_V_extra[thr_id][w] : offset (in uint32_t elements) that was added to the
//                     allocation start of warp w; subtracted again before cudaFree
int MAXWARPS[MAX_GPUS];
uint32_t* h_V[MAX_GPUS][TOTAL_WARP_LIMIT*64]; // NOTE: the *64 prevents buffer overflow for --keccak
uint32_t h_V_extra[MAX_GPUS][TOTAL_WARP_LIMIT*64]; // with really large kernel launch configurations
/*
 * Pick a kernel implementation from the device's compute capability and the
 * selected algorithm.
 *
 * scrypt, and scrypt-jane with N <= 8192, get the high register count
 * kernels; everything else gets the low register count kernels.
 *
 *   capability        high register    low register
 *   >= 3.5            NV2Kernel        TitanKernel
 *   3.0 .. 3.4        NVKernel         KeplerKernel
 *   1.x / 2.x         FermiKernel      TestKernel
 *
 * The original only matched minor == 0 for the 3.x row, so a device
 * reporting e.g. SM 3.2 received NULL, which callers dereference. Those
 * devices now get the SM 3.0 kernels.
 *
 * props must not be NULL. Returns a newly allocated kernel owned by the
 * caller, or NULL when the major version is outside the table.
 */
KernelInterface *Best_Kernel_Heuristics(cudaDeviceProp *props)
{
    const uint32_t N = (1UL << (opt_nfactor + 1)); // not sure

    const bool high_register_kernels = IS_SCRYPT() || (IS_SCRYPT_JANE() && N <= 8192);
    const bool sm35_or_newer = props->major > 3 || (props->major == 3 && props->minor >= 5);
    const bool sm30_family   = (props->major == 3) && !sm35_or_newer;
    const bool sm1x_or_2x    = (props->major == 2 || props->major == 1);

    if (high_register_kernels)
    {
        // high register count kernels (scrypt, low N-factor scrypt-jane)
        if (sm35_or_newer)
            return new NV2Kernel(); // we don't want this for Keccak though
        if (sm30_family)
            return new NVKernel();
        if (sm1x_or_2x)
            return new FermiKernel();
    }
    else
    {
        // low register count kernels (high N-factor scrypt-jane)
        if (sm35_or_newer)
            return new TitanKernel();
        if (sm30_family)
            return new KeplerKernel();
        if (sm1x_or_2x)
            return new TestKernel();
    }

    return NULL;
}
/*
 * Parse a launch configuration string of the form "[id]<blocks>x<warps>",
 * e.g. "K28x8" or "14x16".
 *
 *   config - string to parse; may be NULL (returns false)
 *   b, w   - receive <blocks> and <warps>. They are written only when the
 *            whole string validates; a partial match such as "K28" leaves
 *            both untouched (the original overwrote b in that case).
 *   kernel - optional; on success receives a newly allocated kernel object
 *            chosen by the id letter, or by Best_Kernel_Heuristics(props)
 *            when no id letter is given
 *   props  - device properties for the heuristic; when NULL and no id
 *            letter is given, *kernel is set to NULL instead of passing a
 *            NULL pointer on
 *
 * Returns true when the string contained two integers separated by 'x'.
 */
bool validate_config(char *config, int &b, int &w, KernelInterface **kernel = NULL, cudaDeviceProp *props = NULL)
{
    if (config == NULL)
        return false;

    // optional leading kernel id letter
    char kernelid = ' ';
    switch (config[0])
    {
        case 'T': case 'K': case 'F': case 'L':
        case 't': case 'k': case 'f':
        case 'Z': case 'Y': case 'X':
            kernelid = config[0];
            config++;
            break;
        default:
            break;
    }

    // parse into temporaries so that a failed parse cannot modify b / w
    int blocks = 0, warps = 0;
    if (!(config[0] >= '0' && config[0] <= '9'))
        return false;
    if (sscanf(config, "%dx%d", &blocks, &warps) != 2)
        return false;

    b = blocks;
    w = warps;

    if (kernel != NULL)
    {
        switch (kernelid)
        {
            case 'T': case 'Z': *kernel = new NV2Kernel(); break;
            case 't': *kernel = new TitanKernel(); break;
            case 'K': case 'Y': *kernel = new NVKernel(); break;
            case 'k': *kernel = new KeplerKernel(); break;
            case 'F': case 'L': *kernel = new FermiKernel(); break;
            case 'f': case 'X': *kernel = new TestKernel(); break;
            case ' ': // choose based on device architecture
                *kernel = (props != NULL) ? Best_Kernel_Heuristics(props) : NULL;
                break;
        }
    }
    return true;
}
// Per-thread state created by cuda_throughput(); every map is keyed by thr_id.
// The [2] arrays hold two independent sets of resources (slot 0 and slot 1);
// presumably one per CUDA stream - confirm against the users of these maps.
std::map<int, int> context_blocks; // grid blocks per launch
std::map<int, int> context_wpb; // warps per block
std::map<int, bool> context_concurrent; // device reports concurrentKernels
std::map<int, KernelInterface *> context_kernel; // selected kernel implementation
std::map<int, uint32_t *> context_idata[2]; // device memory: scrypt_core input
std::map<int, uint32_t *> context_odata[2]; // device memory: scrypt_core output
std::map<int, cudaStream_t> context_streams[2]; // the two CUDA streams
std::map<int, uint32_t *> context_X[2]; // pinned host memory: scrypt_core input/output
std::map<int, uint32_t *> context_H[2]; // pinned host memory: scrypt hashes
std::map<int, cudaEvent_t> context_serialize[2]; // events used to serialize kernel launches
// for SHA256 hashing on GPU
std::map<int, uint32_t *> context_tstate[2];
std::map<int, uint32_t *> context_ostate[2];
std::map<int, uint32_t *> context_hash[2];
// defined further below; selects the kernel and launch configuration for thr_id
int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &wpb);
/*
 * Wait for outstanding work and destroy the CUDA context of the device that
 * is current for the calling host thread.
 *
 * thr_id is kept for interface compatibility; it is not used to select a
 * device, so this must be called from the thread that owns the context.
 *
 * The former trailing cudaThreadExit() call was dropped: it is the
 * deprecated alias of cudaDeviceReset() and therefore redundant here.
 */
void cuda_shutdown(int thr_id)
{
    (void) thr_id;
    cudaDeviceSynchronize();
    cudaDeviceReset();
}
/*
 * Return the number of work units processed per kernel launch for thr_id.
 *
 * On the first call for a given thr_id this also performs the one-time
 * setup: selects the device, determines the launch configuration through
 * find_optimal_blockcount(), allocates device and pinned host buffers,
 * creates two streams and two events, and caches everything in the
 * context_* maps. Later calls only read the cached configuration.
 *
 * Returns 0 when find_optimal_blockcount() reports failure (returns 0).
 *
 * NOTE(review): WU_PER_LAUNCH is a macro defined elsewhere; the locals
 * GRID_BLOCKS, WARPS_PER_BLOCK and THREADS_PER_WU look unused but are
 * presumably consumed by that macro - do not rename or remove them
 * without checking.
 * NOTE(review): allocation failures are only logged by checkCudaErrors;
 * the possibly stale `tmp` pointer is stored regardless.
 */
int cuda_throughput(int thr_id)
{
int GRID_BLOCKS, WARPS_PER_BLOCK;
// one-time initialisation for this thread
if (context_blocks.find(thr_id) == context_blocks.end())
{
#if 0
CUcontext ctx;
cuCtxCreate( &ctx, CU_CTX_SCHED_YIELD, device_map[thr_id] );
cuCtxSetCurrent(ctx);
#else
checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield));
checkCudaErrors(cudaSetDevice(device_map[thr_id]));
checkCudaErrors(cudaFree(0));
#endif
KernelInterface *kernel;
bool concurrent;
GRID_BLOCKS = find_optimal_blockcount(thr_id, kernel, concurrent, WARPS_PER_BLOCK);
if(GRID_BLOCKS == 0)
return 0;
unsigned int THREADS_PER_WU = kernel->threads_per_wu();
// per launch: 32 words of scrypt_core data and 8 words of state per work unit
unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * 32;
unsigned int state_size = WU_PER_LAUNCH * sizeof(uint32_t) * 8;
// allocate device memory for scrypt_core inputs and outputs
uint32_t *tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_idata[0][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_idata[1][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_odata[0][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_odata[1][thr_id] = tmp;
// allocate pinned host memory for scrypt hashes
checkCudaErrors(cudaHostAlloc((void **) &tmp, state_size, cudaHostAllocDefault)); context_H[0][thr_id] = tmp;
checkCudaErrors(cudaHostAlloc((void **) &tmp, state_size, cudaHostAllocDefault)); context_H[1][thr_id] = tmp;
if (IS_SCRYPT())
{
// `parallel` is defined elsewhere; values below 2 use host-side buffers,
// otherwise the SHA256 state buffers are placed in device memory
if (parallel < 2)
{
// allocate pinned host memory for scrypt_core input/output
checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp;
checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[1][thr_id] = tmp;
}
else
{
// allocate tstate, ostate, scrypt hash device memory
checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_tstate[0][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_tstate[1][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_ostate[0][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_ostate[1][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[0][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp;
}
}
else if (IS_SCRYPT_JANE())
{
// allocate pinned host memory for scrypt_core input/output
checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp;
checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[1][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[0][thr_id] = tmp;
checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp;
}
// create two CUDA streams
cudaStream_t tmp2;
checkCudaErrors( cudaStreamCreate(&tmp2) ); context_streams[0][thr_id] = tmp2;
checkCudaErrors( cudaStreamCreate(&tmp2) ); context_streams[1][thr_id] = tmp2;
// events used to serialize the kernel launches (we don't want any overlapping of kernels)
cudaEvent_t tmp4;
checkCudaErrors(cudaEventCreateWithFlags(&tmp4, cudaEventDisableTiming)); context_serialize[0][thr_id] = tmp4;
checkCudaErrors(cudaEventCreateWithFlags(&tmp4, cudaEventDisableTiming)); context_serialize[1][thr_id] = tmp4;
// record event 1 once up front (no stream argument given)
checkCudaErrors(cudaEventRecord(context_serialize[1][thr_id]));
// publish the configuration; context_blocks doubles as the "initialised" marker
context_kernel[thr_id] = kernel;
context_concurrent[thr_id] = concurrent;
context_blocks[thr_id] = GRID_BLOCKS;
context_wpb[thr_id] = WARPS_PER_BLOCK;
}
// reload the cached configuration for the WU_PER_LAUNCH expression below
GRID_BLOCKS = context_blocks[thr_id];
WARPS_PER_BLOCK = context_wpb[thr_id];
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
return WU_PER_LAUNCH;
}
// Beginning of GPU Architecture definitions
/*
 * Map a compute capability (major.minor) to the number of cores per SM.
 * Capabilities that are not in the table log a warning and yield 128.
 */
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // One row per known SM version. `sm` is encoded as 0xMm
    // (M = major version, m = minor version).
    struct SmEntry {
        int sm;
        int cores;
    };
    const SmEntry knownArchs[] = {
        { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        { 0x50, 128 }, // Maxwell Generation (SM 5.0) GTX750/750Ti
        { 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs
    };
    const int entryCount = (int)(sizeof(knownArchs) / sizeof(knownArchs[0]));
    const int wanted = (major << 4) + minor;

    for (int k = 0; k < entryCount; ++k) {
        if (knownArchs[k].sm == wanted)
            return knownArchs[k].cores;
    }

    // unknown architecture: warn and fall back to a fixed default
    applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128);
    return 128;
}
#ifdef WIN32
#include <windows.h>
// Width of the console window in character cells.
// When stdout is not attached to a console (redirected to a file or pipe, or
// no valid handle), GetConsoleScreenBufferInfo fails and leaves csbi
// unset; in that case return 999, the same value the non-Windows build uses.
static int console_width() {
    CONSOLE_SCREEN_BUFFER_INFO csbi;
    HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE);
    if (hOut == NULL || hOut == INVALID_HANDLE_VALUE)
        return 999;
    if (!GetConsoleScreenBufferInfo(hOut, &csbi))
        return 999;
    return csbi.srWindow.Right - csbi.srWindow.Left + 1;
}
#else
// No console size query on this platform: report a fixed width of 999.
static inline int console_width() {
    return 999;
}
#endif
int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &WARPS_PER_BLOCK)
{
int cw = console_width();
int optimal_blocks = 0;
cudaDeviceProp props;
checkCudaErrors(cudaGetDeviceProperties(&props, device_map[thr_id]));
concurrent = (props.concurrentKernels > 0);
device_name[thr_id] = strdup(props.name);
applog(LOG_INFO, "GPU #%d: %s with SM %d.%d", device_map[thr_id], props.name, props.major, props.minor);
WARPS_PER_BLOCK = -1;
// if not specified, use interactive mode for devices that have the watchdog timer enabled
if (device_interactive[thr_id] == -1)
device_interactive[thr_id] = props.kernelExecTimeoutEnabled;
// turn off texture cache if not otherwise specified
if (device_texturecache[thr_id] == -1)
device_texturecache[thr_id] = 0;
// if not otherwise specified or required, turn single memory allocations off as they reduce
// the amount of memory that we can allocate on Windows Vista, 7 and 8 (WDDM driver model issue)
if (device_singlememory[thr_id] == -1) device_singlememory[thr_id] = 0;
// figure out which kernel implementation to use
if (!validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK, &kernel, &props)) {
kernel = NULL;
if (device_config[thr_id] != NULL) {
if (device_config[thr_id][0] == 'T' || device_config[thr_id][0] == 'Z')
kernel = new NV2Kernel();
else if (device_config[thr_id][0] == 't')
kernel = new TitanKernel();
else if (device_config[thr_id][0] == 'K' || device_config[thr_id][0] == 'Y')
kernel = new NVKernel();
else if (device_config[thr_id][0] == 'k')
kernel = new KeplerKernel();
else if (device_config[thr_id][0] == 'F' || device_config[thr_id][0] == 'L')
kernel = new FermiKernel();
else if (device_config[thr_id][0] == 'f' || device_config[thr_id][0] == 'X')
kernel = new TestKernel();
}
if (kernel == NULL) kernel = Best_Kernel_Heuristics(&props);
}
if (kernel->get_major_version() > props.major || kernel->get_major_version() == props.major && kernel->get_minor_version() > props.minor)
{
applog(LOG_ERR, "GPU #%d: FATAL: the '%c' kernel requires %d.%d capability!", device_map[thr_id], kernel->get_identifier(), kernel->get_major_version(), kernel->get_minor_version());
return 0;
}
// set whatever cache configuration and shared memory bank mode the kernel prefers
checkCudaErrors(cudaDeviceSetCacheConfig(kernel->cache_config()));
checkCudaErrors(cudaDeviceSetSharedMemConfig(kernel->shared_mem_config()));
// some kernels (e.g. Titan) do not support the texture cache
if (kernel->no_textures() && device_texturecache[thr_id]) {
applog(LOG_WARNING, "GPU #%d: the '%c' kernel ignores the texture cache argument", device_map[thr_id], kernel->get_identifier());
device_texturecache[thr_id] = 0;
}
// Texture caching only works with single memory allocation
if (device_texturecache[thr_id]) device_singlememory[thr_id] = 1;
if (kernel->single_memory() && !device_singlememory[thr_id]) {
applog(LOG_WARNING, "GPU #%d: the '%c' kernel requires single memory allocation", device_map[thr_id], kernel->get_identifier());
device_singlememory[thr_id] = 1;
}
if (device_lookup_gap[thr_id] == 0) device_lookup_gap[thr_id] = 1;
if (!kernel->support_lookup_gap() && device_lookup_gap[thr_id] > 1)
{
applog(LOG_WARNING, "GPU #%d: the '%c' kernel does not support a lookup gap", device_map[thr_id], kernel->get_identifier());
device_lookup_gap[thr_id] = 1;
}
applog(LOG_INFO, "GPU #%d: interactive: %d, tex-cache: %d%s, single-alloc: %d", device_map[thr_id],
(device_interactive[thr_id] != 0) ? 1 : 0,
(device_texturecache[thr_id] != 0) ? device_texturecache[thr_id] : 0, (device_texturecache[thr_id] != 0) ? "D" : "",
(device_singlememory[thr_id] != 0) ? 1 : 0 );
// number of threads collaborating on one work unit (hash)
unsigned int THREADS_PER_WU = kernel->threads_per_wu();
unsigned int LOOKUP_GAP = device_lookup_gap[thr_id];
unsigned int BACKOFF = device_backoff[thr_id];
unsigned int N = (1 << (opt_nfactor+1));
double szPerWarp = (double)(SCRATCH * WU_PER_WARP * sizeof(uint32_t));
//applog(LOG_INFO, "WU_PER_WARP=%u, THREADS_PER_WU=%u, LOOKUP_GAP=%u, BACKOFF=%u, SCRATCH=%u", WU_PER_WARP, THREADS_PER_WU, LOOKUP_GAP, BACKOFF, SCRATCH);
applog(LOG_INFO, "GPU #%d: %d hashes / %.1f MB per warp.", device_map[thr_id], WU_PER_WARP, szPerWarp / (1024.0 * 1024.0));
// compute highest MAXWARPS numbers for kernels allowing cudaBindTexture to succeed
int MW_1D_4 = 134217728 / (SCRATCH * WU_PER_WARP / 4); // for uint4_t textures
int MW_1D_2 = 134217728 / (SCRATCH * WU_PER_WARP / 2); // for uint2_t textures
int MW_1D = kernel->get_texel_width() == 2 ? MW_1D_2 : MW_1D_4;
uint32_t *d_V = NULL;
if (device_singlememory[thr_id])
{
// if no launch config was specified, we simply
// allocate the single largest memory chunk on the device that we can get
if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) {
MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK;
}
else {
// compute no. of warps to allocate the largest number producing a single memory block
// PROBLEM: one some devices, ALL allocations will fail if the first one failed. This sucks.
size_t MEM_LIMIT = (size_t)min((unsigned long long)MAXMEM, (unsigned long long)props.totalGlobalMem);
int warpmax = (int)min((unsigned long long)TOTAL_WARP_LIMIT, (unsigned long long)(MEM_LIMIT / szPerWarp));
// run a bisection algorithm for memory allocation (way more reliable than the previous approach)
int best = 0;
int warp = (warpmax+1)/2;
int interval = (warpmax+1)/2;
while (interval > 0)
{
cudaGetLastError(); // clear the error state
cudaMalloc((void **)&d_V, (size_t)(szPerWarp * warp));
if (cudaGetLastError() == cudaSuccess) {
checkCudaErrors(cudaFree(d_V)); d_V = NULL;
if (warp > best) best = warp;
if (warp == warpmax) break;
interval = (interval+1)/2;
warp += interval;
if (warp > warpmax) warp = warpmax;
}
else
{
interval = interval/2;
warp -= interval;
if (warp < 1) warp = 1;
}
}
// back off a bit from the largest possible allocation size
MAXWARPS[thr_id] = ((100-BACKOFF)*best+50)/100;
}
// now allocate a buffer for determined MAXWARPS setting
cudaGetLastError(); // clear the error state
cudaMalloc((void **)&d_V, (size_t)SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t));
if (cudaGetLastError() == cudaSuccess) {
for (int i=0; i < MAXWARPS[thr_id]; ++i)
h_V[thr_id][i] = d_V + SCRATCH * WU_PER_WARP * i;
if (device_texturecache[thr_id] == 1)
{
if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK))
{
if ( optimal_blocks * WARPS_PER_BLOCK > MW_1D ) {
applog(LOG_ERR, "GPU #%d: '%s' exceeds limits for 1D cache. Using 2D cache instead.", device_map[thr_id], device_config[thr_id]);
device_texturecache[thr_id] = 2;
}
}
// bind linear memory to a 1D texture reference
if (kernel->get_texel_width() == 2)
kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * min(MAXWARPS[thr_id],MW_1D_2) * sizeof(uint32_t));
else
kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * min(MAXWARPS[thr_id],MW_1D_4) * sizeof(uint32_t));
}
else if (device_texturecache[thr_id] == 2)
{
// bind pitch linear memory to a 2D texture reference
if (kernel->get_texel_width() == 2)
kernel->bindtexture_2D(d_V, SCRATCH/2, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t));
else
kernel->bindtexture_2D(d_V, SCRATCH/4, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t));
}
}
else
{
applog(LOG_ERR, "GPU #%d: FATAL: Launch config '%s' requires too much memory!", device_map[thr_id], device_config[thr_id]);
return 0;
}
}
else
{
if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK))
MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK;
else
MAXWARPS[thr_id] = TOTAL_WARP_LIMIT;
// chunked memory allocation up to device limits
int warp;
for (warp = 0; warp < MAXWARPS[thr_id]; ++warp) {
// work around partition camping problems by adding a random start address offset to each allocation
h_V_extra[thr_id][warp] = (props.major == 1) ? (16 * (rand()%(16384/16))) : 0;
cudaGetLastError(); // clear the error state
cudaMalloc((void **) &h_V[thr_id][warp], (SCRATCH * WU_PER_WARP + h_V_extra[thr_id][warp])*sizeof(uint32_t));
if (cudaGetLastError() == cudaSuccess) h_V[thr_id][warp] += h_V_extra[thr_id][warp];
else {
h_V_extra[thr_id][warp] = 0;
// back off by several warp allocations to have some breathing room
int remove = (BACKOFF*warp+50)/100;
for (int i=0; warp > 0 && i < remove; ++i) {
warp--;
checkCudaErrors(cudaFree(h_V[thr_id][warp]-h_V_extra[thr_id][warp]));
h_V[thr_id][warp] = NULL; h_V_extra[thr_id][warp] = 0;
}
break;
}
}
MAXWARPS[thr_id] = warp;
}
if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]);
}
if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK))
{
if (optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id])
{
applog(LOG_ERR, "GPU #%d: FATAL: Given launch config '%s' requires too much memory.", device_map[thr_id], device_config[thr_id]);
return 0;
}
if (WARPS_PER_BLOCK > kernel->max_warps_per_block())
{
applog(LOG_ERR, "GPU #%d: FATAL: Given launch config '%s' exceeds warp limit for '%c' kernel.", device_map[thr_id], device_config[thr_id], kernel->get_identifier());
return 0;
}
}
else
{
if (device_config[thr_id] != NULL && strcasecmp("auto", device_config[thr_id]))
applog(LOG_WARNING, "GPU #%d: Given launch config '%s' does not validate.", device_map[thr_id], device_config[thr_id]);
if (autotune)
{
applog(LOG_INFO, "GPU #%d: Performing auto-tuning, please wait 2 minutes...", device_map[thr_id]);
// allocate device memory
uint32_t *d_idata = NULL, *d_odata = NULL;
if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
unsigned int mem_size = MAXWARPS[thr_id] * WU_PER_WARP * sizeof(uint32_t) * 32;
checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));
// pre-initialize some device memory
uint32_t *h_idata = (uint32_t*)malloc(mem_size);
for (unsigned int i=0; i < mem_size/sizeof(uint32_t); ++i) h_idata[i] = i*2654435761UL; // knuth's method
checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
free(h_idata);
}
#if 0
else if (opt_algo == ALGO_KECCAK) {
uint32_t pdata[20] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
uint32_t ptarget[8] = {0,0,0,0,0,0,0,0};
kernel->prepare_keccak256(thr_id, pdata, ptarget);
} else if (opt_algo == ALGO_BLAKE) {
uint32_t pdata[20] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
uint32_t ptarget[8] = {0,0,0,0,0,0,0,0};
kernel->prepare_blake256(thr_id, pdata, ptarget);
}
#endif
double best_hash_sec = 0.0;
int best_wpb = 0;
// auto-tuning loop
{
// we want to have enough total warps for half the multiprocessors at least
// compute highest MAXWARPS number that we can support based on texture cache mode
int MINTW = props.multiProcessorCount / 2;
int MAXTW = (device_texturecache[thr_id] == 1) ? min(MAXWARPS[thr_id],MW_1D) : MAXWARPS[thr_id];
// we want to have blocks for half the multiprocessors at least
int MINB = props.multiProcessorCount / 2;
int MAXB = MAXTW;
double tmin = 0.05;
applog(LOG_INFO, "GPU #%d: maximum total warps (BxW): %d", (int) device_map[thr_id], MAXTW);
for (int GRID_BLOCKS = MINB; !abort_flag && GRID_BLOCKS <= MAXB; ++GRID_BLOCKS)
{
double Hash[32+1] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
for (WARPS_PER_BLOCK = 1; !abort_flag && WARPS_PER_BLOCK <= kernel->max_warps_per_block(); ++WARPS_PER_BLOCK)
{
double hash_sec = 0;
if (GRID_BLOCKS * WARPS_PER_BLOCK >= MINTW &&
GRID_BLOCKS * WARPS_PER_BLOCK <= MAXTW)
{
// setup execution parameters
dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
struct timeval tv_start, tv_end;
double tdelta = 0;
checkCudaErrors(cudaDeviceSynchronize());
gettimeofday(&tv_start, NULL);
int repeat = 0;
do // average several measurements for better exactness
{
if (IS_SCRYPT() || IS_SCRYPT_JANE())
kernel->run_kernel(
grid, threads, WARPS_PER_BLOCK, thr_id, NULL,
d_idata, d_odata, N, LOOKUP_GAP, device_interactive[thr_id], true, device_texturecache[thr_id]
);
if(cudaDeviceSynchronize() != cudaSuccess)
break;
++repeat;
gettimeofday(&tv_end, NULL);
// for a better result averaging, measure for at least 50ms (10ms for Keccak)
} while ((tdelta=(1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec))) < tmin);
if (cudaGetLastError() != cudaSuccess) continue;
tdelta /= repeat; // BUGFIX: this averaging over multiple measurements was missing
// for scrypt: in interactive mode only find launch configs where kernel launch times are short enough
// TODO: instead we could reduce the batchsize parameter to meet the launch time requirement.
if (IS_SCRYPT() && device_interactive[thr_id] && GRID_BLOCKS > 2*props.multiProcessorCount && tdelta > 1.0/30)
if (WARPS_PER_BLOCK == 1) goto skip; else goto skip2;
hash_sec = (double)WU_PER_LAUNCH / tdelta;
Hash[WARPS_PER_BLOCK] = hash_sec;
if (hash_sec > best_hash_sec) {
optimal_blocks = GRID_BLOCKS;
best_hash_sec = hash_sec;
best_wpb = WARPS_PER_BLOCK;
}
}
}
skip2: ;
if (opt_debug) {
if (GRID_BLOCKS == MINB) {
char line[512] = " ";
for (int i=1; i<=kernel->max_warps_per_block(); ++i) {
char tmp[16]; sprintf(tmp, i < 10 ? " x%-2d" : " x%-2d ", i);
strcat(line, tmp);
if (cw == 80 && (i % 8 == 0 && i != kernel->max_warps_per_block()))
strcat(line, "\n ");
}
applog(LOG_DEBUG, line);
}
char kMGT = ' '; bool flag;
for (int j=0; j < 4; ++j) {
flag=false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 1000, i++);
if (flag) for (int i=1; i<=kernel->max_warps_per_block(); Hash[i] /= 1000, i++);
else break;
if (kMGT == ' ') kMGT = 'k';
else if (kMGT == 'k') kMGT = 'M';
else if (kMGT == 'M') kMGT = 'G';
else if (kMGT == 'G') kMGT = 'T';
}
const char *format = "%5.4f%c";
flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 1, i++); if (flag) format = "%5.3f%c";
flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 10, i++); if (flag) format = "%5.2f%c";
flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 100, i++); if (flag) format = "%5.1f%c";
char line[512]; sprintf(line, "%3d:", GRID_BLOCKS);
for (int i=1; i<=kernel->max_warps_per_block(); ++i) {
char tmp[16];
if (Hash[i]>0)
sprintf(tmp, format, Hash[i], (i<kernel->max_warps_per_block())?'|':' ');
else
sprintf(tmp, " %c", (i<kernel->max_warps_per_block())?'|':' ');
strcat(line, tmp);
if (cw == 80 && (i % 8 == 0 && i != kernel->max_warps_per_block()))
strcat(line, "\n ");
}
int n = strlen(line)-1; line[n++] = '|'; line[n++] = ' '; line[n++] = kMGT; line[n++] = '\0';
strcat(line, "H/s");
applog(LOG_DEBUG, line);
}
}
skip: ;
}
if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
checkCudaErrors(cudaFree(d_odata));
checkCudaErrors(cudaFree(d_idata));
}
WARPS_PER_BLOCK = best_wpb;
applog(LOG_INFO, "GPU #%d: %7.2f hash/s with configuration %c%dx%d", device_map[thr_id], best_hash_sec, kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK);
}
else
{
// Heuristics to find a good kernel launch configuration
// base the initial block estimate on the number of multiprocessors
int device_cores = props.multiProcessorCount * _ConvertSMVer2Cores(props.major, props.minor);
// defaults, in case nothing else is chosen below
optimal_blocks = 4 * device_cores / WU_PER_WARP;
WARPS_PER_BLOCK = 2;
// Based on compute capability, pick a known good block x warp configuration.
if (props.major >= 3)
{
if (props.major == 3 && props.minor == 5) // GK110 (Tesla K20X, K20, GeForce GTX TITAN)
{
// TODO: what to do with Titan and Tesla K20(X)?
// for now, do the same as for GTX 660Ti (2GB)
optimal_blocks = (int)(optimal_blocks * 0.8809524);
WARPS_PER_BLOCK = 2;
}
else // GK104, GK106, GK107 ...
{
if (MAXWARPS[thr_id] > (int)(optimal_blocks * 1.7261905) * 2)
{
// this results in 290x2 configuration on GTX 660Ti (3GB)
// but it requires 3GB memory on the card!
optimal_blocks = (int)(optimal_blocks * 1.7261905);
WARPS_PER_BLOCK = 2;
}
else
{
// this results in 148x2 configuration on GTX 660Ti (2GB)
optimal_blocks = (int)(optimal_blocks * 0.8809524);
WARPS_PER_BLOCK = 2;
}
}
}
// 1st generation Fermi (compute 2.0) GF100, GF110
else if (props.major == 2 && props.minor == 0)
{
// this results in a 60x4 configuration on GTX 570
optimal_blocks = 4 * device_cores / WU_PER_WARP;
WARPS_PER_BLOCK = 4;
}
// 2nd generation Fermi (compute 2.1) GF104,106,108,114,116
else if (props.major == 2 && props.minor == 1)
{
// this results in a 56x2 configuration on GTX 460
optimal_blocks = props.multiProcessorCount * 8;
WARPS_PER_BLOCK = 2;
}
// in case we run out of memory with the automatically chosen configuration,
// first back off with WARPS_PER_BLOCK, then reduce optimal_blocks.
if (WARPS_PER_BLOCK==3 && optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id])
WARPS_PER_BLOCK = 2;
while (optimal_blocks > 0 && optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id])
optimal_blocks--;
}
}
applog(LOG_INFO, "GPU #%d: using launch configuration %c%dx%d", device_map[thr_id], kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK);
if (device_singlememory[thr_id])
{
if (MAXWARPS[thr_id] != optimal_blocks * WARPS_PER_BLOCK)
{
MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK;
if (device_texturecache[thr_id] == 1)
kernel->unbindtexture_1D();
else if (device_texturecache[thr_id] == 2)
kernel->unbindtexture_2D();
checkCudaErrors(cudaFree(d_V)); d_V = NULL;
cudaGetLastError(); // clear the error state
cudaMalloc((void **)&d_V, (size_t)SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t));
if (cudaGetLastError() == cudaSuccess) {
for (int i=0; i < MAXWARPS[thr_id]; ++i)
h_V[thr_id][i] = d_V + SCRATCH * WU_PER_WARP * i;
if (device_texturecache[thr_id] == 1)
{
// bind linear memory to a 1D texture reference
if (kernel->get_texel_width() == 2)
kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t));
else
kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t));
}
else if (device_texturecache[thr_id] == 2)
{
// bind pitch linear memory to a 2D texture reference
if (kernel->get_texel_width() == 2)
kernel->bindtexture_2D(d_V, SCRATCH/2, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t));
else
kernel->bindtexture_2D(d_V, SCRATCH/4, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t));
}
// update pointers to scratch buffer in constant memory after reallocation
if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]);
}
}
else
{
applog(LOG_ERR, "GPU #%d: Unable to allocate enough memory for launch config '%s'.", device_map[thr_id], device_config[thr_id]);
}
}
}
else
{
// back off unnecessary memory allocations to have some breathing room
while (MAXWARPS[thr_id] > 0 && MAXWARPS[thr_id] > optimal_blocks * WARPS_PER_BLOCK) {
(MAXWARPS[thr_id])--;
checkCudaErrors(cudaFree(h_V[thr_id][MAXWARPS[thr_id]]-h_V_extra[thr_id][MAXWARPS[thr_id]]));
h_V[thr_id][MAXWARPS[thr_id]] = NULL; h_V_extra[thr_id][MAXWARPS[thr_id]] = 0;
}
}
return optimal_blocks;
}
// Queue an asynchronous host-to-device copy of the input data for one launch.
//   thr_id  index into the per-thread context tables
//   X       host source buffer; WU_PER_LAUNCH * 32 uint32_t words are read from it
//   stream  selects which context stream / input buffer pair is used
void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream)
{
    // NOTE: WU_PER_LAUNCH expands to an expression over the locals GRID_BLOCKS,
    // WARPS_PER_BLOCK and THREADS_PER_WU, so these three names must be kept.
    unsigned int GRID_BLOCKS = context_blocks[thr_id];
    unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
    unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
    unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * 32;

    // copy host memory to device (asynchronously); route the result through
    // checkCudaErrors like cuda_scrypt_DtoH does, instead of dropping it
    checkCudaErrors(cudaMemcpyAsync(context_idata[stream][thr_id], X, mem_size, cudaMemcpyHostToDevice, context_streams[stream][thr_id]));
}
// Make the given stream wait for the serialization event of the other stream.
void cuda_scrypt_serialize(int thr_id, int stream)
{
    // index of the other stream: 0 <-> 1
    const int other_stream = (stream + 1) & 1;

    // If the device can concurrently execute multiple kernels, we must wait for
    // the serialization event recorded by the other stream. The wait is issued
    // unconditionally; a former guard on context_concurrent / device_interactive
    // is disabled.
    cudaStreamWaitEvent(context_streams[stream][thr_id],
                        context_serialize[other_stream][thr_id],
                        0);
}
// Mark the end of the work queued so far on the given stream: records this
// stream's serialization event, which cuda_scrypt_serialize() makes the other
// stream wait on.
void cuda_scrypt_done(int thr_id, int stream)
{
// record the serialization event in the current stream
cudaEventRecord(context_serialize[stream][thr_id], context_streams[stream][thr_id]);
}
// Block the calling host thread until all work queued on the given stream
// has completed. The return code of the synchronize call is not inspected.
void cuda_scrypt_flush(int thr_id, int stream)
{
// flush the work queue (required for WDDM drivers)
cudaStreamSynchronize(context_streams[stream][thr_id]);
}
// Launch the scrypt core kernel of this thread's selected kernel implementation
// on the given stream, using the stored launch configuration.
//   thr_id  index into the per-thread context tables
//   stream  selects the stream and the input/output device buffers
//   N       passed through to run_kernel() unchanged
void cuda_scrypt_core(int thr_id, int stream, unsigned int N)
{
// NOTE: the WU_PER_LAUNCH / WU_PER_BLOCK macros expand to expressions over the
// locals GRID_BLOCKS, WARPS_PER_BLOCK and THREADS_PER_WU; do not rename them.
unsigned int GRID_BLOCKS = context_blocks[thr_id];
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
unsigned int LOOKUP_GAP = device_lookup_gap[thr_id];
// setup execution parameters
dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
// the bool result of run_kernel() is not inspected here
context_kernel[thr_id]->run_kernel(grid, threads, WARPS_PER_BLOCK, thr_id, context_streams[stream][thr_id], context_idata[stream][thr_id], context_odata[stream][thr_id], N, LOOKUP_GAP, device_interactive[thr_id], opt_benchmark, device_texturecache[thr_id]);
}
// Forward the Keccak-256 preparation request to the kernel implementation
// selected for this thread and hand back its result.
bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8])
{
    const bool prepared = context_kernel[thr_id]->prepare_keccak256(thr_id, host_pdata, ptarget);
    return prepared;
}
// Run the Keccak-256 step of this thread's kernel implementation using the
// stored launch configuration. hash, nonce, throughput and do_d2h are forwarded
// unchanged to the kernel object's do_keccak256().
void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
{
// NOTE: the WU_PER_LAUNCH / WU_PER_BLOCK macros expand to expressions over the
// locals GRID_BLOCKS, WARPS_PER_BLOCK and THREADS_PER_WU; do not rename them.
unsigned int GRID_BLOCKS = context_blocks[thr_id];
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
// setup execution parameters
dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
context_kernel[thr_id]->do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
}
// Forward the Blake-256 preparation request to the kernel implementation
// selected for this thread and hand back its result.
bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8])
{
    const bool prepared = context_kernel[thr_id]->prepare_blake256(thr_id, host_pdata, ptarget);
    return prepared;
}
// Run the Blake-256 step of this thread's kernel implementation using the
// stored launch configuration. hash, nonce, throughput and do_d2h are forwarded
// unchanged to the kernel object's do_blake256().
void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
{
// NOTE: the WU_PER_LAUNCH / WU_PER_BLOCK macros expand to expressions over the
// locals GRID_BLOCKS, WARPS_PER_BLOCK and THREADS_PER_WU; do not rename them.
unsigned int GRID_BLOCKS = context_blocks[thr_id];
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
// setup execution parameters
dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
context_kernel[thr_id]->do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
}
// Queue an asynchronous device-to-host copy of the results of one launch.
//   thr_id   index into the per-thread context tables
//   X        host destination buffer
//   stream   selects the stream and the device buffers
//   postSHA  true:  copy 8 words per work unit from context_hash
//            false: copy 32 words per work unit from context_odata
// The copy is only queued on the stream; X must not be read before the stream
// has been synchronized (see cuda_scrypt_sync).
void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA)
{
// NOTE: WU_PER_LAUNCH expands to an expression over the locals GRID_BLOCKS,
// WARPS_PER_BLOCK and THREADS_PER_WU; do not rename them.
unsigned int GRID_BLOCKS = context_blocks[thr_id];
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * (postSHA ? 8 : 32);
// copy result from device to host (asynchronously)
checkCudaErrors(cudaMemcpyAsync(X, postSHA ? context_hash[stream][thr_id] : context_odata[stream][thr_id], mem_size, cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
}
// Wait (by polling with 1 ms sleeps) until the queued GPU work has finished.
// Returns true on success; logs the CUDA error and returns false otherwise.
bool cuda_scrypt_sync(int thr_id, int stream)
{
    cudaError_t err;
    const bool wait_for_both_streams = device_interactive[thr_id] && !opt_benchmark;

    if (!wait_for_both_streams)
    {
        // Polling replaces cudaStreamSynchronize(context_streams[stream][thr_id])
        // to work around the high CPU usage issue.
        for (;;)
        {
            err = cudaStreamQuery(context_streams[stream][thr_id]);
            if (err != cudaErrorNotReady)
                break;
            usleep(1000);
        }
    }
    else
    {
        // For devices that also do desktop rendering or compositing, we want to
        // free up some time slots. That requires a pause in work submission while
        // no task is active on the GPU, so both streams have to be drained.
        // Polling replaces cudaDeviceSynchronize() to work around the high CPU
        // usage issue.
        for (;;)
        {
            err = cudaStreamQuery(context_streams[0][thr_id]);
            // stream 1 is only queried once stream 0 reports completion
            if (err == cudaSuccess)
                err = cudaStreamQuery(context_streams[1][thr_id]);
            if (err != cudaErrorNotReady)
                break;
            usleep(1000);
        }
        // one extra pause after both streams went idle
        usleep(1000);
    }

    if (err == cudaSuccess)
        return true;

    applog(LOG_ERR, "GPU #%d: CUDA error `%s` while executing the kernel.", device_map[thr_id], cudaGetErrorString(err));
    return false;
}
// Return the context_X buffer registered for the given stream and thread.
uint32_t* cuda_transferbuffer(int thr_id, int stream)
{
    uint32_t *transfer_buf = context_X[stream][thr_id];
    return transfer_buf;
}
// Return the context_H buffer registered for the given stream and thread.
uint32_t* cuda_hashbuffer(int thr_id, int stream)
{
    uint32_t *hash_buf = context_H[stream][thr_id];
    return hash_buf;
}

135
scrypt/salsa_kernel.h

@ -0,0 +1,135 @@ @@ -0,0 +1,135 @@
#ifndef SALSA_KERNEL_H
#define SALSA_KERNEL_H
#include <stdio.h>
#include <stdbool.h>
#include <malloc.h>
#include <string.h>
#include <cuda_runtime.h>
#include "miner.h"
#define MAX_DEVICES MAX_GPUS
#define A_SCRYPT 0
#define A_SCRYPT_JANE 1
// from ccminer.cpp
extern short device_map[MAX_GPUS];
extern int device_interactive[MAX_GPUS];
extern int device_batchsize[MAX_GPUS];
extern int device_backoff[MAX_GPUS];
extern int device_lookup_gap[MAX_GPUS];
extern int device_texturecache[MAX_GPUS];
extern int device_singlememory[MAX_GPUS];
extern char *device_config[MAX_GPUS];
extern char *device_name[MAX_GPUS];
extern bool autotune;
extern int opt_nfactor;
extern char *jane_params;
extern bool abort_flag;
extern bool autotune;
extern int parallel;
extern void get_currentalgo(char* buf, int sz);
typedef unsigned int uint32_t; // define this as 32 bit type derived from int
static char algo[64] = { 0 };
// True when the current algorithm name is exactly "scrypt".
// The name is fetched once via get_currentalgo() and cached in `algo`.
static __inline bool IS_SCRYPT()
{
    if (algo[0] == '\0')
        get_currentalgo(algo, 64);
    return strcmp(algo, "scrypt") == 0;
}
// True when the current algorithm name is exactly "scrypt-jane".
// The name is fetched once via get_currentalgo() and cached in `algo`.
static __inline bool IS_SCRYPT_JANE()
{
    if (algo[0] == '\0')
        get_currentalgo(algo, 64);
    return strcmp(algo, "scrypt-jane") == 0;
}
// CUDA externals
extern int cuda_num_devices();
extern void cuda_shutdown(int thr_id);
extern int cuda_throughput(int thr_id);
extern uint32_t *cuda_transferbuffer(int thr_id, int stream);
extern uint32_t *cuda_hashbuffer(int thr_id, int stream);
extern void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream);
extern void cuda_scrypt_serialize(int thr_id, int stream);
extern void cuda_scrypt_core(int thr_id, int stream, unsigned int N);
extern void cuda_scrypt_done(int thr_id, int stream);
extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA);
extern bool cuda_scrypt_sync(int thr_id, int stream);
extern void cuda_scrypt_flush(int thr_id, int stream);
extern bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
extern void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
extern void computeGold(uint32_t *idata, uint32_t *reference, uchar *scratchpad);
extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
#ifdef __NVCC__
extern void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
extern void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
#endif
// If we're in C++ mode, we're either compiling .cu files or scrypt.cpp
#ifdef __NVCC__
/**
 * A pure virtual interface for a CUDA kernel implementation.
 *
 * Concrete kernels must implement set_scratchbuf_constants(), run_kernel(),
 * get_identifier(), max_warps_per_block() and get_texel_width(); every other
 * member has a default implementation that subclasses may override.
 *
 * TODO: encapsulate the kernel launch parameters in some kind of wrapper.
 */
class KernelInterface
{
public:
    // Polymorphic base class: a virtual destructor is required so that deleting
    // a concrete kernel through a KernelInterface pointer is well defined.
    virtual ~KernelInterface() {}

    // --- scratch buffer setup and kernel launch (must be implemented) ---
    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) = 0;
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) = 0;

    // --- texture binding hooks; the defaults do nothing and report success ---
    virtual bool bindtexture_1D(uint32_t *d_V, size_t size) { return true; }
    virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) { return true; }
    virtual bool unbindtexture_1D() { return true; }
    virtual bool unbindtexture_2D() { return true; }

    // --- kernel properties ---
    virtual char get_identifier() = 0;              // printed with "%c" in launch config messages
    virtual int get_major_version() { return 1; }
    virtual int get_minor_version() { return 0; }
    virtual int max_warps_per_block() = 0;
    virtual int get_texel_width() = 0;
    virtual bool no_textures() { return false; }
    virtual bool single_memory() { return false; }
    virtual int threads_per_wu() { return 1; }      // feeds the WU_PER_WARP macro (32 / THREADS_PER_WU)
    virtual bool support_lookup_gap() { return false; }
    virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; }
    virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; }

    // --- Keccak-256 / Blake-256 hooks; the defaults forward to the default_* functions ---
    virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {
        return default_prepare_keccak256(thr_id, host_pdata, ptarget);
    }
    virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {
        default_do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
    }
    virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {
        return default_prepare_blake256(thr_id, host_pdata, ptarget);
    }
    virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {
        default_do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
    }
};
// Not performing error checking is actually bad, but...
#define checkCudaErrors(x) x
#define getLastCudaError(x)
#endif // #ifdef __NVCC__
// Define work unit size
#define TOTAL_WARP_LIMIT 4096
#define WU_PER_WARP (32 / THREADS_PER_WU)
#define WU_PER_BLOCK (WU_PER_WARP*WARPS_PER_BLOCK)
#define WU_PER_LAUNCH (GRID_BLOCKS*WU_PER_BLOCK)
// make scratchpad size dependent on N and LOOKUP_GAP
#define SCRATCH (((N+LOOKUP_GAP-1)/LOOKUP_GAP)*32)
#endif // #ifndef SALSA_KERNEL_H

29
scrypt/scrypt-jane.h

@ -0,0 +1,29 @@ @@ -0,0 +1,29 @@
#ifndef SCRYPT_JANE_H
#define SCRYPT_JANE_H
/*
Nfactor: Increases CPU & Memory Hardness
N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used
rfactor: Increases Memory Hardness
r = (1 << rfactor): How large a chunk is
pfactor: Increases CPU Hardness
p = (1 << pfactor): Number of times to mix the main chunk
A block is the basic mixing unit (salsa/chacha block = 64 bytes)
A chunk is (2 * r) blocks
~Memory used = (N + 2) * ((2 * r) * block size)
*/
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
typedef void (*scrypt_fatal_errorfn)(const char *msg);
void scrypt_set_fatal_error(scrypt_fatal_errorfn fn);
void scrypt_N_1_1(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, uint32_t N, unsigned char *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V);
#endif /* SCRYPT_JANE_H */

638
scrypt/sha2.c

@ -0,0 +1,638 @@ @@ -0,0 +1,638 @@
/*
* Copyright 2011 ArtForz
* Copyright 2011-2013 pooler
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version. See COPYING for more details.
*/
#include "cpuminer-config.h"
#include "miner.h"
#include <string.h>
#include <stdint.h>
#ifdef WIN32
#define __attribute__(x)
#endif
#if defined(__arm__) && defined(__APCS_32__)
#define EXTERN_SHA256
#endif
static const uint32_t sha256_h[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
static const uint32_t sha256_k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Load the eight initial hash words (sha256_h) into the given state. */
void sha256_init(uint32_t *state)
{
	memcpy(state, sha256_h, 8 * sizeof(uint32_t));
}
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
do { \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1; \
} while (0)
/* Adjusted round function for rotating state */
#define RNDr(S, W, i) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + sha256_k[i])
#ifndef EXTERN_SHA256
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*
* state: 8 words, read and updated in place
* block: 16 input words
* swap:  if nonzero, every input word is passed through swab32() first;
*        otherwise the block is used as is
*/
void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
{
uint32_t W[64];
uint32_t S[8];
uint32_t t0, t1; /* scratch variables used by name inside the RND macro */
int i;
/* 1. Prepare message schedule W. */
if (swap) {
for (i = 0; i < 16; i++)
W[i] = swab32(block[i]);
} else
memcpy(W, block, 64);
/* extend W[16..63], two words per iteration */
for (i = 16; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
/* The 64 rounds are written out so that the round number given to RNDr is a
 * literal constant; RNDr selects the a..h registers as S[(64 - i) % 8] etc.
 * instead of moving values between variables. */
RNDr(S, W, 0);
RNDr(S, W, 1);
RNDr(S, W, 2);
RNDr(S, W, 3);
RNDr(S, W, 4);
RNDr(S, W, 5);
RNDr(S, W, 6);
RNDr(S, W, 7);
RNDr(S, W, 8);
RNDr(S, W, 9);
RNDr(S, W, 10);
RNDr(S, W, 11);
RNDr(S, W, 12);
RNDr(S, W, 13);
RNDr(S, W, 14);
RNDr(S, W, 15);
RNDr(S, W, 16);
RNDr(S, W, 17);
RNDr(S, W, 18);
RNDr(S, W, 19);
RNDr(S, W, 20);
RNDr(S, W, 21);
RNDr(S, W, 22);
RNDr(S, W, 23);
RNDr(S, W, 24);
RNDr(S, W, 25);
RNDr(S, W, 26);
RNDr(S, W, 27);
RNDr(S, W, 28);
RNDr(S, W, 29);
RNDr(S, W, 30);
RNDr(S, W, 31);
RNDr(S, W, 32);
RNDr(S, W, 33);
RNDr(S, W, 34);
RNDr(S, W, 35);
RNDr(S, W, 36);
RNDr(S, W, 37);
RNDr(S, W, 38);
RNDr(S, W, 39);
RNDr(S, W, 40);
RNDr(S, W, 41);
RNDr(S, W, 42);
RNDr(S, W, 43);
RNDr(S, W, 44);
RNDr(S, W, 45);
RNDr(S, W, 46);
RNDr(S, W, 47);
RNDr(S, W, 48);
RNDr(S, W, 49);
RNDr(S, W, 50);
RNDr(S, W, 51);
RNDr(S, W, 52);
RNDr(S, W, 53);
RNDr(S, W, 54);
RNDr(S, W, 55);
RNDr(S, W, 56);
RNDr(S, W, 57);
RNDr(S, W, 58);
RNDr(S, W, 59);
RNDr(S, W, 60);
RNDr(S, W, 61);
RNDr(S, W, 62);
RNDr(S, W, 63);
/* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++)
state[i] += S[i];
}
#endif /* EXTERN_SHA256 */
static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
/*
 * Double SHA-256 over two consecutive 16-word blocks starting at `data`,
 * with every word of the final digest passed through swab32().
 * NOTE(review): the name suggests an 80-byte header, but 32 words are read,
 * so `data` is presumably already padded by the caller -- confirm.
 */
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
	uint32_t inner[16];
	uint32_t *word;

	/* first pass: compress both input blocks */
	sha256_init(inner);
	sha256_transform(inner, data, 0);
	sha256_transform(inner, data + 16, 0);

	/* words 8..15 of sha256d_hash1 complete the digest to a full block */
	memcpy(&inner[8], &sha256d_hash1[8], 8 * sizeof(uint32_t));

	/* second pass: hash the first digest */
	sha256_init(hash);
	sha256_transform(hash, inner, 0);

	for (word = hash; word != hash + 8; ++word)
		*word = swab32(*word);
}
/*
 * Double SHA-256 of a byte string: the message is hashed block by block
 * (padding is generated inline), then the resulting digest is hashed again.
 *
 * hash: receives the 8 result words, written with be32enc()
 * data: message bytes
 * len:  message length in bytes
 */
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
uint32_t S[16], T[16];
int i, r;
sha256_init(S);
/* r is the number of message bytes not yet consumed. The loop keeps going
 * down to r > -9 so that one extra padding-only block is processed when the
 * last data block has 56..63 bytes and the length field no longer fits. */
for (r = len; r > -9; r -= 64) {
if (r < 64)
memset(T, 0, 64);
/* copy at most 64 remaining bytes (nothing once r is negative) */
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
/* padding marker byte right after the last message byte */
if (r >= 0 && r < 64)
((unsigned char *)T)[r] = 0x80;
/* convert the block bytes to words via be32dec() */
for (i = 0; i < 16; i++)
T[i] = be32dec(T + i);
/* message length in bits goes into the last word of the final block;
 * T[14] stays zero, so only the low 32 bits of the length are stored */
if (r < 56)
T[15] = 8 * len;
sha256_transform(S, T, 0);
}
/* second hash: first digest in S[0..7] plus the fixed padding words */
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(T);
sha256_transform(T, S, 0);
for (i = 0; i < 8; i++)
be32enc((uint32_t *)hash + i, T[i]);
}
/*
 * Precompute message schedule words W[16..31] from W[0..15], in place.
 * Compared with the full recurrence
 *   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
 * several terms are left out here: W[16] and W[17] are complete, W[18] lacks
 * s0(W[3]), W[19] lacks W[3], and the later words lack further terms.
 * sha256d_ms() adds or replaces contributions for W[18..31] afterwards.
 * NOTE(review): this looks specialised to a block whose only varying word is
 * W[3] and whose remaining words follow a fixed padding layout -- confirm
 * against the callers before reusing it for other inputs.
 */
static inline void sha256d_preextend(uint32_t *W)
{
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
W[18] = s1(W[16]) + W[11] + W[ 2];
W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
W[20] = W[13] + s0(W[ 5]) + W[ 4];
W[21] = W[14] + s0(W[ 6]) + W[ 5];
W[22] = W[15] + s0(W[ 7]) + W[ 6];
W[23] = W[16] + s0(W[ 8]) + W[ 7];
W[24] = W[17] + s0(W[ 9]) + W[ 8];
W[25] = s0(W[10]) + W[ 9];
W[26] = s0(W[11]) + W[10];
W[27] = s0(W[12]) + W[11];
W[28] = s0(W[13]) + W[12];
W[29] = s0(W[14]) + W[13];
W[30] = s0(W[15]) + W[14];
W[31] = s0(W[16]) + W[15];
}
/*
 * Apply SHA-256 rounds 0, 1 and 2 to the state S using schedule words W[0..2].
 * sha256d_ms() continues from round 3 with the state produced here.
 */
static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
{
uint32_t t0, t1; /* scratch variables used by name inside the RND macro */
RNDr(S, W, 0);
RNDr(S, W, 1);
RNDr(S, W, 2);
}
#ifdef EXTERN_SHA256
void sha256d_ms(uint32_t *hash, uint32_t *W,
const uint32_t *midstate, const uint32_t *prehash);
#else
/*
 * Specialised double SHA-256 that resumes from precomputed partial results.
 *
 * hash:     output; see the note at the end about which words are final
 * W:        message schedule prepared by sha256d_preextend(); modified during
 *           the call, with words 18-20, 22-24, 30 and 31 restored before return
 * midstate: 8 state words added to the result of the first hash
 *           (presumably the state after the preceding block -- confirm)
 * prehash:  8 state words used as the starting state for round 3
 *           (presumably produced by sha256d_prehash())
 */
static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
const uint32_t *midstate, const uint32_t *prehash)
{
uint32_t S[64];
uint32_t t0, t1; /* scratch variables used by name inside the RND macro */
int i;
/* save the W words that are updated in place below; S[18..31] is used as
 * temporary storage (the memcpy into S further down only touches S[0..7]) */
S[18] = W[18];
S[19] = W[19];
S[20] = W[20];
S[22] = W[22];
S[23] = W[23];
S[24] = W[24];
S[30] = W[30];
S[31] = W[31];
/* complete schedule words 18..31 with the contributions that depend on W[3] */
W[18] += s0(W[3]);
W[19] += W[3];
W[20] += s1(W[18]);
W[21] = s1(W[19]);
W[22] += s1(W[20]);
W[23] += s1(W[21]);
W[24] += s1(W[22]);
W[25] = s1(W[23]) + W[18];
W[26] = s1(W[24]) + W[19];
W[27] = s1(W[25]) + W[20];
W[28] = s1(W[26]) + W[21];
W[29] = s1(W[27]) + W[22];
W[30] += s1(W[28]) + W[23];
W[31] += s1(W[29]) + W[24];
/* remaining schedule words use the full recurrence */
for (i = 32; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
/* first hash: start from the prehash state and run rounds 3..63 */
memcpy(S, prehash, 32);
RNDr(S, W, 3);
RNDr(S, W, 4);
RNDr(S, W, 5);
RNDr(S, W, 6);
RNDr(S, W, 7);
RNDr(S, W, 8);
RNDr(S, W, 9);
RNDr(S, W, 10);
RNDr(S, W, 11);
RNDr(S, W, 12);
RNDr(S, W, 13);
RNDr(S, W, 14);
RNDr(S, W, 15);
RNDr(S, W, 16);
RNDr(S, W, 17);
RNDr(S, W, 18);
RNDr(S, W, 19);
RNDr(S, W, 20);
RNDr(S, W, 21);
RNDr(S, W, 22);
RNDr(S, W, 23);
RNDr(S, W, 24);
RNDr(S, W, 25);
RNDr(S, W, 26);
RNDr(S, W, 27);
RNDr(S, W, 28);
RNDr(S, W, 29);
RNDr(S, W, 30);
RNDr(S, W, 31);
RNDr(S, W, 32);
RNDr(S, W, 33);
RNDr(S, W, 34);
RNDr(S, W, 35);
RNDr(S, W, 36);
RNDr(S, W, 37);
RNDr(S, W, 38);
RNDr(S, W, 39);
RNDr(S, W, 40);
RNDr(S, W, 41);
RNDr(S, W, 42);
RNDr(S, W, 43);
RNDr(S, W, 44);
RNDr(S, W, 45);
RNDr(S, W, 46);
RNDr(S, W, 47);
RNDr(S, W, 48);
RNDr(S, W, 49);
RNDr(S, W, 50);
RNDr(S, W, 51);
RNDr(S, W, 52);
RNDr(S, W, 53);
RNDr(S, W, 54);
RNDr(S, W, 55);
RNDr(S, W, 56);
RNDr(S, W, 57);
RNDr(S, W, 58);
RNDr(S, W, 59);
RNDr(S, W, 60);
RNDr(S, W, 61);
RNDr(S, W, 62);
RNDr(S, W, 63);
/* finish the first hash: S[0..7] becomes the first digest */
for (i = 0; i < 8; i++)
S[i] += midstate[i];
/* restore the W words saved at the top */
W[18] = S[18];
W[19] = S[19];
W[20] = S[20];
W[22] = S[22];
W[23] = S[23];
W[24] = S[24];
W[30] = S[30];
W[31] = S[31];
/* second hash: the message block is the first digest (S[0..7]) followed by
 * the constant padding words sha256d_hash1[8..15]; S now doubles as the
 * message schedule of that block, with the constant words read directly
 * from sha256d_hash1 */
memcpy(S + 8, sha256d_hash1 + 8, 32);
S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15];
/* schedule words are only needed up to S[60], because rounds 61..63 are
 * not executed below */
for (i = 32; i < 60; i += 2) {
S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
}
S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];
sha256_init(hash);
/* rounds 0..56 of the second hash are executed in full */
RNDr(hash, S, 0);
RNDr(hash, S, 1);
RNDr(hash, S, 2);
RNDr(hash, S, 3);
RNDr(hash, S, 4);
RNDr(hash, S, 5);
RNDr(hash, S, 6);
RNDr(hash, S, 7);
RNDr(hash, S, 8);
RNDr(hash, S, 9);
RNDr(hash, S, 10);
RNDr(hash, S, 11);
RNDr(hash, S, 12);
RNDr(hash, S, 13);
RNDr(hash, S, 14);
RNDr(hash, S, 15);
RNDr(hash, S, 16);
RNDr(hash, S, 17);
RNDr(hash, S, 18);
RNDr(hash, S, 19);
RNDr(hash, S, 20);
RNDr(hash, S, 21);
RNDr(hash, S, 22);
RNDr(hash, S, 23);
RNDr(hash, S, 24);
RNDr(hash, S, 25);
RNDr(hash, S, 26);
RNDr(hash, S, 27);
RNDr(hash, S, 28);
RNDr(hash, S, 29);
RNDr(hash, S, 30);
RNDr(hash, S, 31);
RNDr(hash, S, 32);
RNDr(hash, S, 33);
RNDr(hash, S, 34);
RNDr(hash, S, 35);
RNDr(hash, S, 36);
RNDr(hash, S, 37);
RNDr(hash, S, 38);
RNDr(hash, S, 39);
RNDr(hash, S, 40);
RNDr(hash, S, 41);
RNDr(hash, S, 42);
RNDr(hash, S, 43);
RNDr(hash, S, 44);
RNDr(hash, S, 45);
RNDr(hash, S, 46);
RNDr(hash, S, 47);
RNDr(hash, S, 48);
RNDr(hash, S, 49);
RNDr(hash, S, 50);
RNDr(hash, S, 51);
RNDr(hash, S, 52);
RNDr(hash, S, 53);
RNDr(hash, S, 54);
RNDr(hash, S, 55);
RNDr(hash, S, 56);
/* rounds 57..60 are reduced to the "d += t0" half of RND (the "h = t0 + t1"
 * half is skipped), and rounds 61..63 are omitted entirely. Only hash[7]
 * receives the final "+ sha256_h[7]" feed-forward, so only hash[7] is a
 * finished digest word; the other hash words hold intermediate values.
 * NOTE(review): callers presumably inspect hash[7] only -- confirm. */
hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
+ S[57] + sha256_k[57];
hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
+ S[58] + sha256_k[58];
hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
+ S[59] + sha256_k[59];
hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
+ S[60] + sha256_k[60]
+ sha256_h[7];
}
#endif /* EXTERN_SHA256 */
#ifdef HAVE_SHA256_4WAY
/* 4-way double-SHA256 midstate kernel; declared here, defined elsewhere. */
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
	const uint32_t *midstate, const uint32_t *prehash);

/*
 * Scan nonces four at a time using sha256d_ms_4way().
 *
 * pdata       20-word block header; pdata[19] is the starting nonce and is
 *             updated to the winning (or last tried) nonce on return.
 * ptarget     8-word target; ptarget[7] is used as a quick pre-filter.
 * max_nonce   scanning stops once the nonce counter reaches this value.
 * tv_start    written with the time at which scanning began.
 * tv_end      written with the time at which scanning finished.
 * hashes_done receives the number of nonces that were tried.
 *
 * Returns 1 if a hash passing fulltest() was found, 0 otherwise.
 */
static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done)
{
	gettimeofday(tv_start, NULL);

	uint32_t data[4 * 64] __attribute__((aligned(128)));
	uint32_t hash[4 * 8] __attribute__((aligned(32)));
	uint32_t midstate[4 * 8] __attribute__((aligned(32)));
	uint32_t prehash[4 * 8] __attribute__((aligned(32)));
	uint32_t n = pdata[19] - 1;
	const uint32_t first_nonce = pdata[19];
	const uint32_t Htarg = ptarget[7];
	int i, j;

	memcpy(data, pdata + 16, 64);
	sha256d_preextend(data);
	/* Replicate each word into 4 interleaved lanes, in place. Walking i
	 * downwards guarantees a source word is read before it is overwritten. */
	for (i = 31; i >= 0; i--)
		for (j = 0; j < 4; j++)
			data[i * 4 + j] = data[i];

	sha256_init(midstate);
	sha256_transform(midstate, pdata, 0);
	memcpy(prehash, midstate, 32);
	sha256d_prehash(prehash, pdata + 16);
	/* Same in-place lane replication for the midstate and prehash words. */
	for (i = 7; i >= 0; i--) {
		for (j = 0; j < 4; j++) {
			midstate[i * 4 + j] = midstate[i];
			prehash[i * 4 + j] = prehash[i];
		}
	}

	do {
		/* Word 3 of the second header block is the nonce: one per lane. */
		for (i = 0; i < 4; i++)
			data[4 * 3 + i] = ++n;

		sha256d_ms_4way(hash, data, midstate, prehash);

		for (i = 0; i < 4; i++) {
			if (swab32(hash[4 * 7 + i]) <= Htarg) {
				pdata[19] = data[4 * 3 + i];
				sha256d_80_swap(hash, pdata);
				if (fulltest(hash, ptarget)) {
					*hashes_done = n - first_nonce + 1;
					/* tv_end is already a pointer; the former
					 * '&tv_end' passed a struct timeval **. */
					gettimeofday(tv_end, NULL);
					return 1;
				}
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = n - first_nonce + 1;
	pdata[19] = n;
	gettimeofday(tv_end, NULL);
	return 0;
}
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
/* 8-way double-SHA256 midstate kernel; declared here, defined elsewhere. */
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
	const uint32_t *midstate, const uint32_t *prehash);

/*
 * Scan nonces eight at a time using sha256d_ms_8way().
 * Starts at pdata[19], stops at max_nonce or when a work restart is
 * requested. Stores the number of tried nonces in *hashes_done and
 * returns 1 when a hash passes fulltest(), 0 otherwise.
 */
static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t data[8 * 64] __attribute__((aligned(128)));
	uint32_t hash[8 * 8] __attribute__((aligned(32)));
	uint32_t midstate[8 * 8] __attribute__((aligned(32)));
	uint32_t prehash[8 * 8] __attribute__((aligned(32)));
	const uint32_t first_nonce = pdata[19];
	const uint32_t Htarg = ptarget[7];
	uint32_t n = first_nonce - 1;
	int word, lane;

	memcpy(data, pdata + 16, 64);
	sha256d_preextend(data);
	/* Spread every word over 8 interleaved lanes, highest word first so
	 * that no source word is clobbered before it has been read. */
	for (word = 31; word >= 0; word--) {
		const uint32_t value = data[word];
		for (lane = 0; lane < 8; lane++)
			data[word * 8 + lane] = value;
	}

	sha256_init(midstate);
	sha256_transform(midstate, pdata, 0);
	memcpy(prehash, midstate, 32);
	sha256d_prehash(prehash, pdata + 16);
	for (word = 7; word >= 0; word--) {
		const uint32_t mid = midstate[word];
		const uint32_t pre = prehash[word];
		for (lane = 0; lane < 8; lane++) {
			midstate[word * 8 + lane] = mid;
			prehash[word * 8 + lane] = pre;
		}
	}

	for (;;) {
		/* One fresh nonce per lane. */
		for (lane = 0; lane < 8; lane++)
			data[8 * 3 + lane] = ++n;

		sha256d_ms_8way(hash, data, midstate, prehash);

		for (lane = 0; lane < 8; lane++) {
			if (swab32(hash[8 * 7 + lane]) > Htarg)
				continue;
			/* Candidate: recompute the full hash and verify it. */
			pdata[19] = data[8 * 3 + lane];
			sha256d_80_swap(hash, pdata);
			if (fulltest(hash, ptarget)) {
				*hashes_done = n - first_nonce + 1;
				return 1;
			}
		}

		if (n >= max_nonce || work_restart[thr_id].restart)
			break;
	}

	*hashes_done = n - first_nonce + 1;
	pdata[19] = n;
	return 0;
}
#endif /* HAVE_SHA256_8WAY */
/*
 * Scan a nonce range for a double-SHA256 hash that meets ptarget.
 *
 * Dispatches to the 8-way or 4-way implementation when those are compiled
 * in and reported usable, otherwise scans one nonce at a time.
 *
 * pdata       20-word block header; pdata[19] holds the first nonce and is
 *             updated to the winning (or last tried) nonce.
 * ptarget     8-word target.
 * max_nonce   upper bound for the nonce counter.
 * tv_start,
 * tv_end      forwarded to the 4-way path, which records timestamps in
 *             them; the 8-way and scalar paths do not write them.
 * hashes_done receives the number of nonces tried.
 *
 * Returns 1 when a valid hash was found, 0 otherwise.
 */
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done)
{
	uint32_t data[64] __attribute__((aligned(128)));
	uint32_t hash[8] __attribute__((aligned(32)));
	uint32_t midstate[8] __attribute__((aligned(32)));
	uint32_t prehash[8] __attribute__((aligned(32)));
	uint32_t n = pdata[19] - 1;
	const uint32_t first_nonce = pdata[19];
	const uint32_t Htarg = ptarget[7];

#ifdef HAVE_SHA256_8WAY
	if (sha256_use_8way())
		return scanhash_sha256d_8way(thr_id, pdata, ptarget,
			max_nonce, hashes_done);
#endif
#ifdef HAVE_SHA256_4WAY
	/* The 4-way scanner takes the two timeval pointers; the previous call
	 * omitted them and did not match its 7-parameter signature. */
	if (sha256_use_4way())
		return scanhash_sha256d_4way(thr_id, pdata, ptarget,
			max_nonce, tv_start, tv_end, hashes_done);
#endif

	memcpy(data, pdata + 16, 64);
	sha256d_preextend(data);

	sha256_init(midstate);
	sha256_transform(midstate, pdata, 0);
	memcpy(prehash, midstate, 32);
	sha256d_prehash(prehash, pdata + 16);

	do {
		data[3] = ++n;
		sha256d_ms(hash, data, midstate, prehash);
		/* Cheap pre-filter on the top word before the full check. */
		if (swab32(hash[7]) <= Htarg) {
			pdata[19] = data[3];
			sha256d_80_swap(hash, pdata);
			if (fulltest(hash, ptarget)) {
				*hashes_done = n - first_nonce + 1;
				return 1;
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = n - first_nonce + 1;
	pdata[19] = n;
	return 0;
}

441
scrypt/sha256.cu

@ -0,0 +1,441 @@ @@ -0,0 +1,441 @@
//
// =============== SHA256 part on nVidia GPU ======================
//
// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
//
#include <map>
#include "cuda_runtime.h"
#include "miner.h"
#include "salsa_kernel.h"
#include "sha256.h"
// define some error checking macros
#undef checkCudaErrors

// Path separator native to the build platform. (Both branches used to be
// '/', which made the #if pointless and left Windows paths unstripped.)
#if WIN32
#define DELIMITER '\\'
#else
#define DELIMITER '/'
#endif

// Yields the part of 'path' after the last occurrence of 'sep', or 'path'
// itself when 'sep' does not occur.
#define BASENAME_AFTER(path, sep) ( strrchr(path, sep) != NULL ? strrchr(path, sep)+1 : path )

// Source file name without its directory part. On Windows both '/' and '\\'
// are stripped because either may appear in __FILE__.
#if WIN32
#define __FILENAME__ BASENAME_AFTER(BASENAME_AFTER(__FILE__, '/'), DELIMITER)
#else
#define __FILENAME__ BASENAME_AFTER(__FILE__, DELIMITER)
#endif

// Runs statement/expression x and logs any CUDA error it leaves behind.
// Expects a variable 'thr_id' to be in scope at the point of use.
// The leading cudaGetLastError() clears any stale error first.
// Wrapped in do { } while (0) so it behaves as one statement in if/else.
#define checkCudaErrors(x) do { \
	cudaGetLastError(); \
	x; \
	cudaError_t err = cudaGetLastError(); \
	if (err != cudaSuccess) \
		applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", (int) device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \
} while (0)
// from salsa_kernel.cu
extern std::map<int, uint32_t *> context_idata[2];
extern std::map<int, uint32_t *> context_odata[2];
extern std::map<int, cudaStream_t> context_streams[2];
extern std::map<int, uint32_t *> context_tstate[2];
extern std::map<int, uint32_t *> context_ostate[2];
extern std::map<int, uint32_t *> context_hash[2];
// Host-side copy of the eight SHA-256 initial state words.
// Uploaded to the device symbol sha256_h by prepare_sha256().
static const uint32_t host_sha256_h[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
// Host-side copy of the 64 SHA-256 round constants.
// Uploaded to the device symbol sha256_k by prepare_sha256().
static const uint32_t host_sha256_k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Elementary functions used by SHA256.
 * All macro arguments are parenthesized so that expression arguments
 * (e.g. "a ^ b") are evaluated as a unit. */
#define Ch(x, y, z)     (((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z)    (((x) & ((y) | (z))) | ((y) & (z)))
#define ROTR(x, n)      (((x) >> (n)) | ((x) << (32 - (n))))
#define S0(x)           (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)           (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x)           (ROTR(x, 7) ^ ROTR(x, 18) ^ ((x) >> 3))
#define s1(x)           (ROTR(x, 17) ^ ROTR(x, 19) ^ ((x) >> 10))

/* SHA256 round function.
 * Requires uint32_t temporaries t0 and t1 in the calling scope;
 * d and h must be lvalues and are updated in place. */
#define RND(a, b, c, d, e, f, g, h, k) \
	do { \
		t0 = (h) + S1(e) + Ch(e, f, g) + (k); \
		t1 = S0(a) + Maj(a, b, c); \
		d += t0; \
		h = t0 + t1; \
	} while (0)

/* Adjusted round function for rotating state: instead of shuffling the
 * eight working variables after each round, the indices into S rotate
 * with the round number i. Reads the round constant from sha256_k. */
#define RNDr(S, W, i) \
	RND(S[(64 - (i)) % 8], S[(65 - (i)) % 8], \
	    S[(66 - (i)) % 8], S[(67 - (i)) % 8], \
	    S[(68 - (i)) % 8], S[(69 - (i)) % 8], \
	    S[(70 - (i)) % 8], S[(71 - (i)) % 8], \
	    (W)[i] + sha256_k[i])
// Tail of the second key block: 12 words copied behind the 4 remaining
// header words (see mycpy48(pad + 4, keypad) in cuda_HMAC_SHA256_80_init).
// Starts with the 0x80000000 marker word; the last word is 0x280 (= 640),
// presumably the message length in bits (80 bytes) - TODO confirm.
static const uint32_t host_keypad[12] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
// 11 words copied to ibuf[5..15] in cuda_PBKDF2_SHA256_80_128.
// Last word is 0x4a0 (= 1184); presumably a bit length - TODO confirm.
static const uint32_t host_innerpad[11] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
// 8 words appended after an 8-word digest to form a full 16-word block.
// Last word is 0x300 (= 768); presumably a bit length - TODO confirm.
static const uint32_t host_outerpad[8] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
// Complete 16-word block fed to the transform after the 32 salt words in
// cuda_post_sha256. Last word is 0x620 (= 1568); presumably a bit length
// - TODO confirm.
static const uint32_t host_finalblk[16] = {
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
//
// CUDA code
//
// Device constant-memory copies of the host tables above.
// The first six are uploaded once per thr_id by prepare_sha256().
__constant__ uint32_t sha256_h[8];
__constant__ uint32_t sha256_k[64];
__constant__ uint32_t keypad[12];
__constant__ uint32_t innerpad[11];
__constant__ uint32_t outerpad[8];
__constant__ uint32_t finalblk[16];
// Per-work data, uploaded on every prepare_sha256() call:
// the 20-word header and the 8-word midstate supplied by the host.
__constant__ uint32_t pdata[20];
__constant__ uint32_t midstate[8];
/* Copy 12 bytes (three 32-bit words) from s to d. */
__device__ void mycpy12(uint32_t *d, const uint32_t *s) {
	d[0] = s[0];
	d[1] = s[1];
	d[2] = s[2];
}
/* Copy 16 bytes (four 32-bit words) from s to d. */
__device__ void mycpy16(uint32_t *d, const uint32_t *s) {
	d[0] = s[0];
	d[1] = s[1];
	d[2] = s[2];
	d[3] = s[3];
}
/* Copy 32 bytes (eight 32-bit words) from s to d. */
__device__ void mycpy32(uint32_t *d, const uint32_t *s) {
	#pragma unroll 8
	for (int word = 0; word != 8; ++word)
		d[word] = s[word];
}
/* Copy 44 bytes (eleven 32-bit words) from s to d. */
__device__ void mycpy44(uint32_t *d, const uint32_t *s) {
	#pragma unroll 11
	for (int word = 0; word != 11; ++word)
		d[word] = s[word];
}
/* Copy 48 bytes (twelve 32-bit words) from s to d. */
__device__ void mycpy48(uint32_t *d, const uint32_t *s) {
	#pragma unroll 12
	for (int word = 0; word != 12; ++word)
		d[word] = s[word];
}
/* Copy 64 bytes (sixteen 32-bit words) from s to d. */
__device__ void mycpy64(uint32_t *d, const uint32_t *s) {
	#pragma unroll 16
	for (int word = 0; word != 16; ++word)
		d[word] = s[word];
}
/* Reverse the byte order of a 32-bit word. */
__device__ uint32_t cuda_swab32(uint32_t x)
{
	const uint32_t lowest  = (x & 0x000000ffu) << 24;
	const uint32_t low_mid = (x & 0x0000ff00u) << 8;
	const uint32_t high_mid = (x >> 8) & 0x0000ff00u;
	const uint32_t highest = (x >> 24) & 0x000000ffu;
	return lowest | low_mid | high_mid | highest;
}
/* Copy eight 32-bit words from s to d, byte-swapping each word. */
__device__ void mycpy32_swab32(uint32_t *d, const uint32_t *s) {
	#pragma unroll 8
	for (int word = 0; word != 8; ++word) {
		const uint32_t swapped = cuda_swab32(s[word]);
		d[word] = swapped;
	}
}
/* Copy sixteen 32-bit words from s to d, byte-swapping each word. */
__device__ void mycpy64_swab32(uint32_t *d, const uint32_t *s) {
	#pragma unroll 16
	for (int word = 0; word != 16; ++word) {
		const uint32_t swapped = cuda_swab32(s[word]);
		d[word] = swapped;
	}
}
/* Load the eight initial state words from constant memory (sha256_h)
 * into state. */
__device__ void cuda_sha256_init(uint32_t *state)
{
	#pragma unroll 8
	for (int word = 0; word < 8; word++)
		state[word] = sha256_h[word];
}
/*
 * SHA256 block compression function. The 256-bit state is transformed via
 * the 512-bit input block to produce a new state. Modified for lower register use.
 *
 * state: 8 words, updated in place (the working variables are added to it).
 * block: 16 input words, read only.
 *
 * The message schedule is not built up front: four words of W are produced
 * (copied from the block for rounds 0-15, extended for rounds 16-63) and
 * are immediately consumed by four RNDr rounds.
 */
__device__ void cuda_sha256_transform(uint32_t *state, const uint32_t *block)
{
uint32_t W[64]; // only 4 of these are accessed during each partial Mix
uint32_t S[8];
uint32_t t0, t1; // temporaries required by the RND macro
int i;
/* 1. Initialize working variables. */
mycpy32(S, state);
/* 2. Prepare message schedule W and Mix. */
/* Rounds 0-15: W comes straight from the input block, four words at a time. */
mycpy16(W, block);
RNDr(S, W, 0); RNDr(S, W, 1); RNDr(S, W, 2); RNDr(S, W, 3);
mycpy16(W+4, block+4);
RNDr(S, W, 4); RNDr(S, W, 5); RNDr(S, W, 6); RNDr(S, W, 7);
mycpy16(W+8, block+8);
RNDr(S, W, 8); RNDr(S, W, 9); RNDr(S, W, 10); RNDr(S, W, 11);
mycpy16(W+12, block+12);
RNDr(S, W, 12); RNDr(S, W, 13); RNDr(S, W, 14); RNDr(S, W, 15);
/* Rounds 16-63: each group extends W by four words (two per loop pass)
 * from earlier entries, then runs the four rounds that use them. */
#pragma unroll 2
for (i = 16; i < 20; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 16); RNDr(S, W, 17); RNDr(S, W, 18); RNDr(S, W, 19);
#pragma unroll 2
for (i = 20; i < 24; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 20); RNDr(S, W, 21); RNDr(S, W, 22); RNDr(S, W, 23);
#pragma unroll 2
for (i = 24; i < 28; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 24); RNDr(S, W, 25); RNDr(S, W, 26); RNDr(S, W, 27);
#pragma unroll 2
for (i = 28; i < 32; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 28); RNDr(S, W, 29); RNDr(S, W, 30); RNDr(S, W, 31);
#pragma unroll 2
for (i = 32; i < 36; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 32); RNDr(S, W, 33); RNDr(S, W, 34); RNDr(S, W, 35);
#pragma unroll 2
for (i = 36; i < 40; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 36); RNDr(S, W, 37); RNDr(S, W, 38); RNDr(S, W, 39);
#pragma unroll 2
for (i = 40; i < 44; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 40); RNDr(S, W, 41); RNDr(S, W, 42); RNDr(S, W, 43);
#pragma unroll 2
for (i = 44; i < 48; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 44); RNDr(S, W, 45); RNDr(S, W, 46); RNDr(S, W, 47);
#pragma unroll 2
for (i = 48; i < 52; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 48); RNDr(S, W, 49); RNDr(S, W, 50); RNDr(S, W, 51);
#pragma unroll 2
for (i = 52; i < 56; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 52); RNDr(S, W, 53); RNDr(S, W, 54); RNDr(S, W, 55);
#pragma unroll 2
for (i = 56; i < 60; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 56); RNDr(S, W, 57); RNDr(S, W, 58); RNDr(S, W, 59);
#pragma unroll 2
for (i = 60; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; }
RNDr(S, W, 60); RNDr(S, W, 61); RNDr(S, W, 62); RNDr(S, W, 63);
/* 3. Mix local working variables into global state */
#pragma unroll 8
for (i = 0; i < 8; i++)
state[i] += S[i];
}
//
// HMAC SHA256 functions, modified to work with pdata and nonce directly
//
/*
 * Set up the inner (tstate) and outer (ostate) HMAC-SHA256 states for the
 * 80-byte header held in constant memory (pdata), with the given nonce
 * substituted for header word 19.
 *
 * On entry tstate must hold the midstate of the first 64 header bytes.
 * On exit tstate is the state after the 0x36-padded key block and ostate
 * the state after the 0x5c-padded key block.
 */
__device__ void cuda_HMAC_SHA256_80_init(uint32_t *tstate, uint32_t *ostate, uint32_t nonce)
{
	uint32_t keyhash[8];
	uint32_t blk[16];

	/* Finish hashing the key: last 3 header words + nonce + padding. */
	mycpy12(blk, pdata + 16);
	blk[3] = nonce;
	mycpy48(blk + 4, keypad);
	cuda_sha256_transform(tstate, blk);
	mycpy32(keyhash, tstate);

	/* Outer state: hashed key XOR 0x5c, remainder of the block is 0x5c. */
	cuda_sha256_init(ostate);
	#pragma unroll 8
	for (int w = 0; w < 8; w++) {
		blk[w] = keyhash[w] ^ 0x5c5c5c5c;
		blk[w + 8] = 0x5c5c5c5c;
	}
	cuda_sha256_transform(ostate, blk);

	/* Inner state: hashed key XOR 0x36, remainder of the block is 0x36. */
	cuda_sha256_init(tstate);
	#pragma unroll 8
	for (int w = 0; w < 8; w++) {
		blk[w] = keyhash[w] ^ 0x36363636;
		blk[w + 8] = 0x36363636;
	}
	cuda_sha256_transform(tstate, blk);
}
/*
 * Produce 32 output words (four 8-word blocks) from the HMAC states
 * tstate/ostate, using the header in constant memory (pdata) with the given
 * nonce as salt. Block k (1..4) is derived with counter word k and written
 * byte-swapped to output[8*(k-1) .. 8*(k-1)+7].
 */
__device__ void cuda_PBKDF2_SHA256_80_128(const uint32_t *tstate,
	const uint32_t *ostate, uint32_t *output, uint32_t nonce)
{
	uint32_t inner_base[8], outer[8];
	uint32_t msg[16], digest_blk[16];

	/* Inner state after absorbing the first 64 header bytes. */
	mycpy32(inner_base, tstate);
	cuda_sha256_transform(inner_base, pdata);

	/* Second message block: header tail, nonce, counter slot, padding. */
	mycpy12(msg, pdata + 16);
	msg[3] = nonce;
	mycpy44(msg + 5, innerpad);

	/* Upper half of the outer block is constant padding; the transform only
	 * rewrites the lower 8 words, so this is set up once. */
	mycpy32(digest_blk + 8, outerpad);

	#pragma unroll 4
	for (int blk = 0; blk < 4; blk++) {
		msg[4] = blk + 1;
		mycpy32(digest_blk, inner_base);
		cuda_sha256_transform(digest_blk, msg);
		mycpy32(outer, ostate);
		cuda_sha256_transform(outer, digest_blk);
		mycpy32_swab32(output + 8 * blk, outer); // TODO: coalescing would be desired
	}
}
/*
 * Kernel: one thread per nonce. Thread g (flat global index) handles
 * nonce + g, stores its HMAC inner/outer states in g_tstate_ext[8*g..] and
 * g_ostate_ext[8*g..], and writes 32 derived words to g_inp[32*g..].
 * Reads the midstate and header from constant memory.
 * NOTE(review): there is no bounds check against the work size; the buffers
 * are presumably sized for the full grid - confirm against the allocator.
 */
__global__ void cuda_pre_sha256(uint32_t g_inp[32], uint32_t g_tstate_ext[8], uint32_t g_ostate_ext[8], uint32_t nonce)
{
	const unsigned int gid = (blockIdx.x * blockDim.x) + threadIdx.x;
	const uint32_t my_nonce = nonce + gid;
	uint32_t *inp_out = g_inp + 32 * gid;
	uint32_t *tstate_out = g_tstate_ext + 8 * gid;
	uint32_t *ostate_out = g_ostate_ext + 8 * gid;

	uint32_t tstate[8], ostate[8];
	mycpy32(tstate, midstate);

	cuda_HMAC_SHA256_80_init(tstate, ostate, my_nonce);

	mycpy32(tstate_out, tstate); // TODO: coalescing would be desired
	mycpy32(ostate_out, ostate); // TODO: coalescing would be desired

	cuda_PBKDF2_SHA256_80_128(tstate, ostate, inp_out, my_nonce);
}
/*
 * Kernel: one thread per hash. Thread g (flat global index) reloads its
 * saved inner/outer states, absorbs its 32 salt words (byte-swapped) plus
 * the final block into the inner state, finishes the outer hash and writes
 * the 8-word result, byte-swapped, to g_output[8*g..].
 * NOTE(review): there is no bounds check against the work size; the buffers
 * are presumably sized for the full grid - confirm against the allocator.
 */
__global__ void cuda_post_sha256(uint32_t g_output[8], uint32_t g_tstate_ext[8], uint32_t g_ostate_ext[8], uint32_t g_salt_ext[32])
{
	const unsigned int gid = (blockIdx.x * blockDim.x) + threadIdx.x;
	uint32_t *result = g_output + 8 * gid;
	const uint32_t *saved_tstate = g_tstate_ext + 8 * gid;
	const uint32_t *saved_ostate = g_ostate_ext + 8 * gid;
	const uint32_t *salt = g_salt_ext + 32 * gid;

	/* Only the first 8 words of tstate/ostate are ever used. */
	uint32_t tstate[16];
	uint32_t ostate[16];
	uint32_t halfsalt[16];
	uint32_t buf[16];

	mycpy32(tstate, saved_tstate); // TODO: coalescing would be desired

	/* Inner hash: two 16-word salt halves, then the fixed final block. */
	mycpy64_swab32(halfsalt, salt); // TODO: coalescing would be desired
	cuda_sha256_transform(tstate, halfsalt);
	mycpy64_swab32(halfsalt, salt + 16); // TODO: coalescing would be desired
	cuda_sha256_transform(tstate, halfsalt);
	cuda_sha256_transform(tstate, finalblk);

	/* Outer hash over the inner digest followed by the outer padding. */
	mycpy32(buf, tstate);
	mycpy32(buf + 8, outerpad);

	mycpy32(ostate, saved_ostate);
	cuda_sha256_transform(ostate, buf);

	mycpy32_swab32(result, ostate); // TODO: coalescing would be desired
}
//
// callable host code to initialize constants and to call kernels
//
/*
 * Upload the SHA-256 tables (once per tracked thr_id) and the per-work
 * header and midstate (on every call) to device constant memory.
 *
 * thr_id        mining thread index; also used by checkCudaErrors for logging.
 * host_pdata    20-word block header.
 * host_midstate 8-word midstate of the first header block.
 *
 * The one-time-upload flags cover thr_id 0..7 only. For any other thr_id the
 * tables are uploaded on every call rather than indexing outside the flag
 * array, which the previous code did.
 */
void prepare_sha256(int thr_id, uint32_t host_pdata[20], uint32_t host_midstate[8])
{
	static bool init[8] = {false, false, false, false, false, false, false, false};
	const int tracked_slots = (int)(sizeof(init) / sizeof(init[0]));
	const bool tracked = (thr_id >= 0 && thr_id < tracked_slots);

	if (!tracked || !init[thr_id])
	{
		checkCudaErrors(cudaMemcpyToSymbol(sha256_h, host_sha256_h, sizeof(host_sha256_h), 0, cudaMemcpyHostToDevice));
		checkCudaErrors(cudaMemcpyToSymbol(sha256_k, host_sha256_k, sizeof(host_sha256_k), 0, cudaMemcpyHostToDevice));
		checkCudaErrors(cudaMemcpyToSymbol(keypad, host_keypad, sizeof(host_keypad), 0, cudaMemcpyHostToDevice));
		checkCudaErrors(cudaMemcpyToSymbol(innerpad, host_innerpad, sizeof(host_innerpad), 0, cudaMemcpyHostToDevice));
		checkCudaErrors(cudaMemcpyToSymbol(outerpad, host_outerpad, sizeof(host_outerpad), 0, cudaMemcpyHostToDevice));
		checkCudaErrors(cudaMemcpyToSymbol(finalblk, host_finalblk, sizeof(host_finalblk), 0, cudaMemcpyHostToDevice));
		if (tracked)
			init[thr_id] = true;
	}
	/* Work-specific data changes with every job, so always upload it. */
	checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpyToSymbol(midstate, host_midstate, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
}
void pre_sha256(int thr_id, int stream, uint32_t nonce, int throughput)
{
dim3 block(128);
dim3 grid((throughput+127)/128);
cuda_pre_sha256<<<grid, block, 0, context_streams[stream][thr_id]>>>(context_idata[stream][thr_id], context_tstate[stream][thr_id], context_ostate[stream][thr_id], nonce);
}
void post_sha256(int thr_id, int stream, int throughput)
{
dim3 block(128);
dim3 grid((throughput+127)/128);
cuda_post_sha256<<<grid, block, 0, context_streams[stream][thr_id]>>>(context_hash[stream][thr_id], context_tstate[stream][thr_id], context_ostate[stream][thr_id], context_odata[stream][thr_id]);
}

10
scrypt/sha256.h

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
#ifndef SHA256_H
#define SHA256_H
#include <stdint.h>
// Host entry points of the GPU SHA256 stage (implemented in sha256.cu).
// Uploads the 20-word header and 8-word midstate for the given thread.
extern "C" void prepare_sha256(int thr_id, uint32_t cpu_pdata[20], uint32_t cpu_midstate[8]);
// Launches the pre-hash kernel for 'throughput' nonces starting at 'nonce'.
extern "C" void pre_sha256(int thr_id, int stream, uint32_t nonce, int throughput);
// Launches the post-hash kernel for 'throughput' hashes.
extern "C" void post_sha256(int thr_id, int stream, int throughput);
#endif // #ifndef SHA256_H

781
scrypt/test_kernel.cu

@ -0,0 +1,781 @@ @@ -0,0 +1,781 @@
/* Copyright (C) 2013 David G. Andersen. All rights reserved.
* with modifications by Christian Buchner
*
* Use of this code is covered under the Apache 2.0 license, which
* can be found in the file "LICENSE"
*
* The array notation for b[] and bx[] arrays was converted to uint4,
* in preparation for some experimental changes to memory access patterns.
* Also this kernel is going to be a testbed for adaptation to Fermi devices.
*/
// TODO: experiment with different memory access patterns in write/read_keys_direct functions
// TODO: attempt V.Volkov style ILP (factor 4)
#include <map>
#include "cuda_runtime.h"
#include "miner.h"
#include "salsa_kernel.h"
#include "test_kernel.h"
#define TEXWIDTH 32768
#define THREADS_PER_WU 4 // four threads per hash
// Scratchpad access scheme, used as a template parameter of
// write_keys_direct / read_keys_direct.
typedef enum
{
ANDERSEN, // lanes exchange bx through shared memory before accessing the scratchpad
SIMPLE // each thread accesses its own b at [start] and bx at [start+16]
} MemoryAccess;
// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes)
// Indexed by (global thread index / 32) in write_keys_direct / read_keys_direct.
__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT];
// iteration count N
__constant__ uint32_t c_N;
__constant__ uint32_t c_N_1; // N-1
// scratch buffer size SCRATCH
__constant__ uint32_t c_SCRATCH;
__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP)
__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP) - 1
// using texture references for the "tex" variants of the B kernels
// (read in read_keys_direct when TEX_DIM is 1 or 2 respectively)
texture<uint4, 1, cudaReadModeElementType> texRef1D_4_V;
texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V;
// Forward declaration; the definition is further down in this file.
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3);
/* Component-wise XOR-assign for uint4; returns the updated left operand. */
static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) {
	left.x = left.x ^ right.x;
	left.y = left.y ^ right.y;
	left.z = left.z ^ right.z;
	left.w = left.w ^ right.w;
	return left;
}
/* Component-wise add-assign (wrapping, unsigned) for uint4; returns the
 * updated left operand. */
static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) {
	left.x = left.x + right.x;
	left.y = left.y + right.y;
	left.z = left.z + right.z;
	left.w = left.w + right.w;
	return left;
}
/* write_keys_direct stores a thread's b and bx values into the scratchpad
 * of its warp (c_V[global thread index / 32]).
 *
 * "start" is the absolute word offset inside that scratchpad (the caller
 * computes it, to avoid recomputation).
 *
 * SCHEME == ANDERSEN:
 *   To make better use of memory bandwidth, each thread first hands its
 *   four bx words and its start offset to another lane of the warp and
 *   receives those of lane (lane + 4) % 32. In this kernel the hand-over
 *   goes through a per-warp row of dynamic shared memory (this variant
 *   does not use __shfl). A conditional select on bit 2 of the thread index
 *   then decides which of the two uint4 values goes to which location, so
 *   that the right parts are written together.
 *   Requires dynamic shared memory of at least
 *   (blockDim.x / 32) * 33 * sizeof(uint32_t) bytes.
 *
 * SCHEME == SIMPLE:
 *   b is written at [start] and bx at [start + 16].
 *
 * The lane exchange relies on the threads of a warp executing in lockstep
 * (no barrier between store and load). The shared-memory pointers are
 * therefore volatile, so the compiler can neither cache nor reorder these
 * accesses. NOTE(review): on GPUs with independent thread scheduling an
 * explicit warp barrier would additionally be needed - confirm the targets.
 *
 * Thanks to Babu for helping design the 128-bit-per-write version.
 */
template <MemoryAccess SCHEME> __device__ __forceinline__
void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
{
	uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
	if (SCHEME == ANDERSEN) {
		uint4 t=b, t2;
		extern __shared__ unsigned char shared[];
		volatile uint32_t (*tmp)[32+1] = (volatile uint32_t (*)[32+1])(shared);
		volatile uint32_t *s = &tmp[threadIdx.x/32][threadIdx.x%32];
		volatile uint32_t *st = &tmp[threadIdx.x/32][(threadIdx.x + 4)%32];
		/* Publish own word, then pick up the word of lane + 4. */
		*s = bx.x; t2.x = *st;
		*s = bx.y; t2.y = *st;
		*s = bx.z; t2.z = *st;
		*s = bx.w; t2.w = *st;
		*s = start; int t2_start = *st + 4;
		bool c = (threadIdx.x & 0x4);
		*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
		*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);
	} else {
		*((uint4 *)(&scratch[start ])) = b;
		*((uint4 *)(&scratch[start+16])) = bx;
	}
}
/*
 * read_keys_direct is the counterpart of write_keys_direct: it loads b and
 * bx for this thread from absolute word offset "start".
 *
 * TEX_DIM selects the source: 0 reads the warp's scratchpad from c_V,
 * 1 reads texRef1D_4_V, 2 reads texRef2D_4_V (TEXWIDTH texels per row).
 * Texture fetches address uint4 elements, hence the division by 4.
 *
 * SCHEME == ANDERSEN undoes the lane exchange done on write: after loading,
 * b/bx are swapped where bit 2 of the thread index is set and the bx words
 * are fetched back from lane (lane + 28) % 32 through shared memory.
 * Requires dynamic shared memory of at least
 * (blockDim.x / 32) * 33 * sizeof(uint32_t) bytes.
 *
 * The exchange relies on lockstep execution within a warp; the shared-memory
 * pointers are volatile so the compiler keeps every store and load in order.
 * NOTE(review): on GPUs with independent thread scheduling an explicit warp
 * barrier would additionally be needed - confirm the targets.
 */
template <MemoryAccess SCHEME, int TEX_DIM> __device__ __forceinline__
void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
{
	uint32_t *scratch;
	if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
	if (SCHEME == ANDERSEN) {
		extern __shared__ unsigned char shared[];
		volatile uint32_t (*tmp)[32+1] = (volatile uint32_t (*)[32+1])(shared);
		volatile uint32_t *s = &tmp[threadIdx.x/32][threadIdx.x%32];
		/* Learn the start offset of lane + 4. */
		*s = start; int t2_start = tmp[threadIdx.x/32][(threadIdx.x + 4)%32] + 4;
		if (TEX_DIM > 0) { start /= 4; t2_start /= 4; }
		bool c = (threadIdx.x & 0x4);
		if (TEX_DIM == 0) {
			b = *((uint4 *)(&scratch[c ? t2_start : start]));
			bx = *((uint4 *)(&scratch[c ? start : t2_start]));
		} else if (TEX_DIM == 1) {
			b = tex1Dfetch(texRef1D_4_V, c ? t2_start : start);
			bx = tex1Dfetch(texRef1D_4_V, c ? start : t2_start);
		} else if (TEX_DIM == 2) {
			b = tex2D(texRef2D_4_V, 0.5f + ((c ? t2_start : start)%TEXWIDTH), 0.5f + ((c ? t2_start : start)/TEXWIDTH));
			bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH));
		}
		uint4 temp = b; b = (c ? bx : b); bx = (c ? temp : bx);
		/* Hand the bx words back: lane + 28 is lane - 4 modulo 32. */
		volatile uint32_t *st = &tmp[threadIdx.x/32][(threadIdx.x + 28)%32];
		*s = bx.x; bx.x = *st;
		*s = bx.y; bx.y = *st;
		*s = bx.z; bx.z = *st;
		*s = bx.w; bx.w = *st;
	} else {
		if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start]));
		else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4);
		else if (TEX_DIM == 2) b = tex2D(texRef2D_4_V, 0.5f + ((start/4)%TEXWIDTH), 0.5f + ((start/4)/TEXWIDTH));
		if (TEX_DIM == 0) bx = *((uint4 *)(&scratch[start+16]));
		else if (TEX_DIM == 1) bx = tex1Dfetch(texRef1D_4_V, (start+16)/4);
		else if (TEX_DIM == 2) bx = tex2D(texRef2D_4_V, 0.5f + (((start+16)/4)%TEXWIDTH), 0.5f + (((start+16)/4)/TEXWIDTH));
	}
}
/*
 * Exchange the y, z and w components of b and bx between the four threads
 * that share one hash (same threadIdx.x & 0x1c), then swap y and w locally.
 * load_key_salsa / store_key_salsa use this to convert between the order in
 * which words are read from memory and the order used during mixing.
 *
 * The exchange goes through a per-warp row of dynamic shared memory and
 * relies on lockstep execution within a warp. The pointers are volatile so
 * the compiler cannot cache or reorder the store/load pairs.
 * NOTE(review): on GPUs with independent thread scheduling an explicit warp
 * barrier would additionally be needed - confirm the targets.
 * Requires dynamic shared memory of at least
 * (blockDim.x / 32) * 33 * sizeof(uint32_t) bytes.
 */
__device__ __forceinline__
void primary_order_shuffle(uint4 &b, uint4 &bx)
{
	/* Inner loop shuffle targets */
	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	extern __shared__ unsigned char shared[];
	volatile uint32_t (*tmp)[32+1] = (volatile uint32_t (*)[32+1])(shared);
	unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32;
	volatile uint32_t *s = &tmp[wrp][lane];
	volatile uint32_t *s1 = &tmp[wrp][x1];
	volatile uint32_t *s2 = &tmp[wrp][x2];
	volatile uint32_t *s3 = &tmp[wrp][x3];

	*s = b.w; b.w = *s1;
	*s = b.z; b.z = *s2;
	*s = b.y; b.y = *s3;
	uint32_t temp = b.y; b.y = b.w; b.w = temp;

	*s = bx.w; bx.w = *s1;
	*s = bx.z; bx.z = *s2;
	*s = bx.y; bx.y = *s3;
	temp = bx.y; bx.y = bx.w; bx.w = temp;
}
/*
 * load_key_salsa reads one 32-word key from B for the hash this thread works
 * on (four threads per hash; each hash owns 32 consecutive words of B).
 * Every thread reads words from its own group of four in a rotated order,
 * for both the lower (b) and upper (bx, +16 words) half, and then calls
 * primary_order_shuffle() to bring them into internal processing order.
 */
__device__ __forceinline__
void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx)
{
	const int hash_index = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	const int key_offset = hash_index * 32;
	const uint32_t t = threadIdx.x % 4;

	// Start of this thread's group of four words inside the key.
	const uint32_t *lo = B + key_offset + 4*t;
	const uint32_t *hi = lo + 16;

	// Read in permuted order. Key loads are not our bottleneck right now.
	b.x = lo[(t+0)%4];
	b.y = lo[(t+1)%4];
	b.z = lo[(t+2)%4];
	b.w = lo[(t+3)%4];

	bx.x = hi[(t+0)%4];
	bx.y = hi[(t+1)%4];
	bx.z = hi[(t+2)%4];
	bx.w = hi[(t+3)%4];

	primary_order_shuffle(b, bx);
}
/*
 * store_key_salsa is the inverse of load_key_salsa: it first runs
 * primary_order_shuffle() on b and bx (which modifies them) and then writes
 * the words back to this hash's 32-word region of B, using the same rotated
 * index pattern as the load.
 */
__device__ __forceinline__
void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx)
{
	const int hash_index = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	const int key_offset = hash_index * 32;
	const uint32_t t = threadIdx.x % 4;

	primary_order_shuffle(b, bx);

	// Start of this thread's group of four words inside the key.
	uint32_t *lo = B + key_offset + 4*t;
	uint32_t *hi = lo + 16;

	lo[(t+0)%4] = b.x;
	lo[(t+1)%4] = b.y;
	lo[(t+2)%4] = b.z;
	lo[(t+3)%4] = b.w;

	hi[(t+0)%4] = bx.x;
	hi[(t+1)%4] = bx.y;
	hi[(t+2)%4] = bx.z;
	hi[(t+3)%4] = bx.w;
}
/*
 * load_key_chacha reads one 32-word key from B for the hash this thread works
 * on (four threads per hash; each hash owns 32 consecutive words of B).
 * Thread t of the hash takes words t, t+4, t+8, t+12 of the lower half into
 * b and the same positions of the upper half (+16 words) into bx.
 * No shuffle is applied afterwards.
 */
__device__ __forceinline__
void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx)
{
	const int hash_index = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	const int key_offset = hash_index * 32;
	const uint32_t t = threadIdx.x % 4;

	// First word of this thread's column; the rest follow at stride 4.
	const uint32_t *lo = B + key_offset + t;
	const uint32_t *hi = lo + 16;

	// Read in permuted order. Key loads are not our bottleneck right now.
	b.x = lo[4*0];
	b.y = lo[4*1];
	b.z = lo[4*2];
	b.w = lo[4*3];

	bx.x = hi[4*0];
	bx.y = hi[4*1];
	bx.z = hi[4*2];
	bx.w = hi[4*3];
}
/*
 * store_key_chacha is the inverse of load_key_chacha: thread t of a hash
 * writes b to words t, t+4, t+8, t+12 of the hash's 32-word region of B and
 * bx to the same positions 16 words further on.
 */
__device__ __forceinline__
void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx)
{
	const int hash_index = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	const int key_offset = hash_index * 32;
	const uint32_t t = threadIdx.x % 4;

	// First word of this thread's column; the rest follow at stride 4.
	uint32_t *lo = B + key_offset + t;
	uint32_t *hi = lo + 16;

	lo[4*0] = b.x;
	lo[4*1] = b.y;
	lo[4*2] = b.z;
	lo[4*3] = b.w;

	hi[4*0] = bx.x;
	hi[4*1] = bx.y;
	hi[4*2] = bx.z;
	hi[4*3] = bx.w;
}
/*
 * Compile-time dispatch of the key load: A_SCRYPT uses the salsa layout,
 * A_SCRYPT_JANE the chacha layout. Any other ALGO value loads nothing.
 */
template <int ALGO> __device__ __forceinline__
void load_key(const uint32_t *B, uint4 &b, uint4 &bx)
{
	if (ALGO == A_SCRYPT) {
		load_key_salsa(B, b, bx);
	} else if (ALGO == A_SCRYPT_JANE) {
		load_key_chacha(B, b, bx);
	}
}
/*
 * Compile-time dispatch of the key store: A_SCRYPT uses the salsa layout,
 * A_SCRYPT_JANE the chacha layout. Any other ALGO value stores nothing.
 */
template <int ALGO> __device__ __forceinline__
void store_key(uint32_t *B, uint4 &b, uint4 &bx)
{
	if (ALGO == A_SCRYPT) {
		store_key_salsa(B, b, bx);
	} else if (ALGO == A_SCRYPT_JANE) {
		store_key_chacha(B, b, bx);
	}
}
/*
 * salsa_xor_core (Salsa20/8 cypher)
 * The original scrypt called:
 * xor_salsa8(&X[0], &X[16]); <-- the "b" loop
 * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop
 * This version is unrolled to handle both of these loops in a single
 * call to avoid unnecessary data movement.
 *
 * b, bx       this thread's four words of the lower / upper half, in
 *             "primary order"; both are updated in place.
 * x1, x2, x3  lanes (within the warp) of the three peer threads that words
 *             are exchanged with.
 *
 * Words are exchanged between the threads of a hash through a per-warp row
 * of dynamic shared memory, relying on lockstep execution within a warp (no
 * barrier between a store and the following load). The pointers are volatile
 * so the compiler cannot cache or reorder these accesses.
 * NOTE(review): on GPUs with independent thread scheduling an explicit warp
 * barrier would additionally be needed - confirm the targets.
 * Requires dynamic shared memory of at least
 * (blockDim.x / 32) * 33 * sizeof(uint32_t) bytes.
 */
#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<<amt)|(tmp>>(32-amt))); }
__device__ __forceinline__
void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
	extern __shared__ unsigned char shared[];
	volatile uint32_t (*tmp)[32+1] = (volatile uint32_t (*)[32+1])(shared);
	unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32;
	volatile uint32_t *s = &tmp[wrp][lane];
	volatile uint32_t *s1 = &tmp[wrp][x1];
	volatile uint32_t *s2 = &tmp[wrp][x2];
	volatile uint32_t *s3 = &tmp[wrp][x3];
	uint4 x;

	b ^= bx;
	x = b;

	// Enter in "primary order" (t0 has 0, 4, 8, 12)
	//                          (t1 has 5, 9, 13, 1)
	//                          (t2 has 10, 14, 2, 6)
	//                          (t3 has 15, 3, 7, 11)
	#pragma unroll
	for (int j = 0; j < 4; j++) {
		// Mixing phase of salsa
		XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
		XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);

		/* Transpose rows and columns. */
		/* Unclear if this optimization is needed: These are ordered based
		 * upon the dependencies needed in the later xors. Compiler should be
		 * able to figure this out, but might as well give it a hand. */
		*s = x.y; x.y = *s3;
		*s = x.w; x.w = *s1;
		*s = x.z; x.z = *s2;

		/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
		 * but the register targets are rewritten here to swap x[1] and x[3] so that
		 * they can be directly shuffled to and from our peer threads without
		 * reassignment. The reverse shuffle then puts them back in the right place.
		 */
		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

		*s = x.w; x.w = *s3;
		*s = x.y; x.y = *s1;
		*s = x.z; x.z = *s2;
	}

	b += x;
	// The next two lines are the beginning of the BX-centric loop iteration
	bx ^= b;
	x = bx;

	// This is a copy of the same loop above, identical but stripped of comments.
	// Duplicated so that we can complete a bx-based loop with fewer register moves.
	#pragma unroll
	for (int j = 0; j < 4; j++) {
		XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
		XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);

		*s = x.y; x.y = *s3;
		*s = x.w; x.w = *s1;
		*s = x.z; x.z = *s2;

		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

		*s = x.w; x.w = *s3;
		*s = x.y; x.y = *s1;
		*s = x.z; x.z = *s2;
	}

	// At the end of these iterations, the data is in primary order again.
#undef XOR_ROTATE_ADD

	bx += x;
}
/*
* chacha_xor_core (ChaCha20/8 cypher)
* This version is unrolled to handle both of these loops in a single
* call to avoid unnecessary data movement.
*
* load_key and store_key must not use primary order when
* using ChaCha20/8, but rather the basic transposed order
* (referred to as "column mode" below)
*/
// One add-xor-rotate step on 32-bit words: pt += ps, then rt = rotate-left(rt ^ pt, amt).
// Expands to a braced block, which is why the call sites below carry no trailing semicolon.
// The macro-local "tmp" shadows the function-local "tmp" pointer inside that block.
#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<<amt)|(tmp>>(32-amt))); }
// Mixes b and bx in place: b ^= bx, four iterations of column + diagonal mixing,
// b += result; then the same sequence with the roles of b and bx exchanged.
// x1, x2, x3 are the lane indices (within the warp) of the peer threads whose
// slots this thread reads during the exchanges.
__device__ __forceinline__
void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
// Dynamic shared memory viewed as rows of 32+1 words, indexed by warp number.
extern __shared__ unsigned char shared[];
uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32;
// s is this thread's own slot; s1/s2/s3 are the slots of the peers x1/x2/x3.
uint32_t *s = &tmp[wrp][lane];
uint32_t *s1 = &tmp[wrp][x1];
uint32_t *s2 = &tmp[wrp][x2];
uint32_t *s3 = &tmp[wrp][x3];
uint4 x;
b ^= bx;
x = b;
// Enter in "column" mode (t0 has 0, 4, 8, 12)
// (t1 has 1, 5, 9, 13)
// (t2 has 2, 6, 10, 14)
// (t3 has 3, 7, 11, 15)
#pragma unroll
for (int j = 0; j < 4; j++) {
// Column Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
// Exchange through shared memory: publish the own value, then read the peer's slot.
// NOTE(review): there is no barrier between the write and the read and the
// pointers are not volatile; presumably this relies on all lanes of a warp
// executing in lockstep - confirm for the targeted architectures.
*s = x.y; x.y = *s1;
*s = x.z; x.z = *s2;
*s = x.w; x.w = *s3;
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
// Reverse exchange (x.y reads from x3, x.w from x1) to return to column mode.
*s = x.y; x.y = *s3;
*s = x.z; x.z = *s2;
*s = x.w; x.w = *s1;
}
b += x;
// The next two lines are the beginning of the BX-centric loop iteration
bx ^= b;
x = bx;
// Same loop as above, now operating on the bx half.
#pragma unroll
for (int j = 0; j < 4; j++) {
// Column Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
*s = x.y; x.y = *s1;
*s = x.z; x.z = *s2;
*s = x.w; x.w = *s3;
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
*s = x.y; x.y = *s3;
*s = x.z; x.z = *s2;
*s = x.w; x.w = *s1;
}
// The macro is only needed by this function; drop it again.
#undef CHACHA_PRIMITIVE
bx += x;
}
// Compile-time dispatch to the mixing core that belongs to ALGO:
// Salsa for A_SCRYPT, ChaCha for A_SCRYPT_JANE. Any other ALGO is a no-op.
template <int ALGO> __device__ __forceinline__
void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
    if (ALGO == A_SCRYPT) {
        salsa_xor_core(b, bx, x1, x2, x3);
    } else if (ALGO == A_SCRYPT_JANE) {
        chacha_xor_core(b, bx, x1, x2, x3);
    }
}
/*
* The hasher_gen_kernel operates on a group of 1024-bit input keys
* in B, stored as:
* B = { k1B k1Bx k2B k2Bx ... }
* and fills up the scratchpad with the iterative hashes derived from
* those keys:
* scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... }
* scratch is 1024 times larger than the input keys B.
* It is extremely important to stream writes effectively into scratch;
* less important to coalesce the reads from B.
*
* Key ordering note: Keys are input from B in "original" order:
* K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 }
* After inputting into kernel_gen, each component k and kx of the
* key is transmuted into a permuted internal order to make processing faster:
* K = k, kx with:
* k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11
* and similarly for kx.
*/
// First scrypt phase, iterations [begin, end): every iteration result is
// written to the scratchpad. begin == 0 starts from the input keys in d_idata,
// any other begin resumes from scratchpad entry begin-1.
template <int ALGO, MemoryAccess SCHEME> __global__
void test_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end)
{
    // lane indices of the three other threads in this thread's group of four
    const unsigned int quad_base = threadIdx.x & 0x1c;
    const unsigned int quad_lane = threadIdx.x & 0x03;
    const int x1 = quad_base + ((quad_lane+1)&0x3);
    const int x2 = quad_base + ((quad_lane+2)&0x3);
    const int x3 = quad_base + ((quad_lane+3)&0x3);

    // word offset of this thread's slice inside the warp's scratch buffer
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

    uint4 b, bx;
    int i = begin;
    if (i != 0) {
        // resume: pick up the state written by the previous batch
        read_keys_direct<SCHEME,0>(b, bx, start+32*(i-1));
    } else {
        // fresh start: entry 0 is the input key itself
        load_key<ALGO>(d_idata, b, bx);
        write_keys_direct<SCHEME>(b, bx, start);
        ++i;
    }

    for (; i < end; ++i) {
        block_mixer<ALGO>(b, bx, x1, x2, x3);
        write_keys_direct<SCHEME>(b, bx, start+32*i);
    }
}
// First scrypt phase with a lookup gap: only every LOOKUP_GAP-th iteration
// result is written to the scratchpad (at index i/LOOKUP_GAP). When resuming
// (begin != 0) the state of iteration begin-1 is rebuilt from the nearest
// stored entry by re-running the mixer.
template <int ALGO, MemoryAccess SCHEME> __global__
void test_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP)
{
    // lane indices of the three other threads in this thread's group of four
    const unsigned int quad_base = threadIdx.x & 0x1c;
    const unsigned int quad_lane = threadIdx.x & 0x03;
    const int x1 = quad_base + ((quad_lane+1)&0x3);
    const int x2 = quad_base + ((quad_lane+2)&0x3);
    const int x3 = quad_base + ((quad_lane+3)&0x3);

    // word offset of this thread's slice inside the warp's scratch buffer
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

    uint4 b, bx;
    int i = begin;
    if (i != 0) {
        // nearest stored entry at or before iteration i-1, plus the number of
        // mixer steps still missing to reach iteration i-1
        int pos = (i-1)/LOOKUP_GAP;
        int loop = (i-1)-pos*LOOKUP_GAP;
        read_keys_direct<SCHEME,0>(b, bx, start+32*pos);
        while (loop--) {
            block_mixer<ALGO>(b, bx, x1, x2, x3);
        }
    } else {
        load_key<ALGO>(d_idata, b, bx);
        write_keys_direct<SCHEME>(b, bx, start);
        ++i;
    }

    for (; i < end; ++i) {
        block_mixer<ALGO>(b, bx, x1, x2, x3);
        if (i % LOOKUP_GAP == 0) {
            write_keys_direct<SCHEME>(b, bx, start+32*(i/LOOKUP_GAP));
        }
    }
}
/*
* hasher_hash_kernel runs the second phase of scrypt after the scratch
* buffer is filled with the iterative hashes: It bounces through
* the scratch buffer in pseudorandom order, mixing the key as it goes.
*/
// Second scrypt phase, iterations [begin, end).
//   d_odata : per-hash state; read when begin != 0, always written at the end
//   TEX_DIM : forwarded to read_keys_direct; 0 additionally wraps "start" into
//             the per-warp scratch buffer
template <int ALGO, MemoryAccess SCHEME, int TEX_DIM> __global__
void test_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
{
// Dynamic shared memory viewed as rows of 32+1 words, indexed by warp number.
extern __shared__ unsigned char shared[];
uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
uint4 b, bx;
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4);
// NOTE(review): for TEX_DIM != 0 the offset is left unwrapped; presumably the
// texture paths address the scratchpad as one range - confirm in read_keys_direct.
if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP;
// Lane indices of the three other threads in this thread's group of four.
int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
if (begin == 0) {
// First batch: start from scratchpad entry N-1 and mix once.
read_keys_direct<SCHEME,TEX_DIM>(b, bx, start+32*c_N_1);
block_mixer<ALGO>(b, bx, x1, x2, x3);
} else load_key<ALGO>(d_odata, b, bx); // later batch: resume from the state stored in d_odata
for (int i = begin; i < end; i++) {
// Every thread publishes its bx.x, then all four threads of a group read the
// value of the group's first lane; masked with N-1 it selects the scratchpad entry.
// NOTE(review): no barrier between the shared-memory write and read;
// presumably relies on lockstep execution within a warp - confirm.
tmp[threadIdx.x/32][threadIdx.x%32] = bx.x;
int j = (tmp[threadIdx.x/32][(threadIdx.x & 0x1c)] & (c_N_1));
uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*j);
b ^= t; bx ^= tx;
block_mixer<ALGO>(b, bx, x1, x2, x3);
}
// Write the (intermediate or final) state back for the next batch or the host.
store_key<ALGO>(d_odata, b, bx);
}
// Second scrypt phase with a lookup gap, iterations [begin, end). Scratchpad
// entries that were not stored are rebuilt from the nearest stored entry by
// re-running the mixer.
//   d_odata : per-hash state; read when begin != 0, always written at the end
//   TEX_DIM : forwarded to read_keys_direct; 0 additionally wraps "start" into
//             the per-warp scratch buffer
template <int ALGO, MemoryAccess SCHEME, int TEX_DIM> __global__
void test_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP)
{
// Dynamic shared memory viewed as rows of 32+1 words, indexed by warp number.
extern __shared__ unsigned char shared[];
uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
uint4 b, bx;
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4);
if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP;
// Lane indices of the three other threads in this thread's group of four.
int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
if (begin == 0) {
// First batch: rebuild entry N-1 from the last stored entry, then mix once more
// (hence the "1 +" in the step count).
int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP);
read_keys_direct<SCHEME,TEX_DIM>(b, bx, start+32*pos);
while(loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
} else load_key<ALGO>(d_odata, b, bx); // later batch: resume from the state stored in d_odata
for (int i = begin; i < end; i++) {
// Every thread publishes its bx.x, then all four threads of a group read the
// value of the group's first lane; masked with N-1 it selects the wanted entry j.
// NOTE(review): no barrier between the shared-memory write and read;
// presumably relies on lockstep execution within a warp - confirm.
tmp[threadIdx.x/32][threadIdx.x%32] = bx.x;
int j = (tmp[threadIdx.x/32][(threadIdx.x & 0x1c)] & (c_N_1));
// Stored entry at or before j, and the number of mixer steps needed to reach j.
int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
b ^= t; bx ^= tx;
block_mixer<ALGO>(b, bx, x1, x2, x3);
}
// Write the (intermediate or final) state back for the next batch or the host.
store_key<ALGO>(d_odata, b, bx);
}
// Nothing to set up here; the KernelInterface base is default-constructed.
TestKernel::TestKernel()
{}
// Binds the scratchpad d_V (size bytes) to the 1D uint4 texture reference.
// Always returns true; a failing bind is only handled by checkCudaErrors.
bool TestKernel::bindtexture_1D(uint32_t *d_V, size_t size)
{
    // unfiltered, clamped access with unnormalized coordinates
    texRef1D_4_V.addressMode[0] = cudaAddressModeClamp;
    texRef1D_4_V.filterMode     = cudaFilterModePoint;
    texRef1D_4_V.normalized     = 0;

    // one texel is a uint4
    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
    checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size));

    return true;
}
// Binds the scratchpad d_V to the 2D uint4 texture reference.
//   width, height : requested texture extent; reshaped below towards a width of TEXWIDTH
//   pitch         : row pitch belonging to width; scaled together with it
// Returns false without binding when width is not positive, true otherwise
// (a failing bind is only handled by checkCudaErrors).
bool TestKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch)
{
    // A non-positive width can never be doubled up to TEXWIDTH: the widening
    // loop below would not terminate (0*2 == 0, negatives overflow).
    if (width <= 0)
        return false;

    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
    texRef2D_4_V.normalized = 0;
    texRef2D_4_V.filterMode = cudaFilterModePoint;
    texRef2D_4_V.addressMode[0] = cudaAddressModeClamp;
    texRef2D_4_V.addressMode[1] = cudaAddressModeClamp;

    // maintain texture width of TEXWIDTH (max. limit is 65000):
    // halve the width (doubling the height) while too wide, double it while too narrow
    while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; }
    while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; }

    checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch));
    return true;
}
// Releases the binding of the 1D texture reference.
// Always returns true; a failure is only handled by checkCudaErrors.
bool TestKernel::unbindtexture_1D()
{
checkCudaErrors(cudaUnbindTexture(texRef1D_4_V));
return true;
}
// Releases the binding of the 2D texture reference.
// Always returns true; a failure is only handled by checkCudaErrors.
bool TestKernel::unbindtexture_2D()
{
checkCudaErrors(cudaUnbindTexture(texRef2D_4_V));
return true;
}
// Copies MAXWARPS pointers from the host array h_V into the device symbol c_V.
// NOTE(review): presumably one scratch buffer pointer per warp - confirm against the caller.
void TestKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
{
checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
}
/*
 * Enqueues both scrypt core phases on "stream".
 *
 *   grid, threads : launch configuration used for every kernel launch
 *   thr_id        : selects the per-device batch size and the cached N
 *   d_idata       : input keys, read by the first phase
 *   d_odata       : state buffer, read and written by the second phase
 *   N             : iteration count; device constants are refreshed when it changes
 *   LOOKUP_GAP    : 1 selects the plain kernels, any other value the _LG variants
 *   texture_cache : 0, 1 or 2, passed to the second-phase kernels as TEX_DIM;
 *                   any other value launches no second-phase kernel
 *
 * WARPS_PER_BLOCK, interactive and benchmark are not used by this implementation.
 * Each phase is split into launches of at most device_batchsize[thr_id] iterations.
 * Returns true; kernel launches are not checked for errors here.
 */
bool TestKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
{
    bool success = true;

    // compute required shared memory per block for __shfl() emulation
    size_t shared = ((threads.x + 31) / 32) * (32+1) * sizeof(uint32_t);

    // make some constants available to kernel, update only initially and when changing
    // NOTE(review): the cache is keyed on N alone; if SCRATCH also depends on
    // LOOKUP_GAP, a change of LOOKUP_GAP by itself would not refresh the constants - confirm.
    static unsigned int prev_N[MAX_DEVICES] = {0};
    if (N != prev_N[thr_id]) {
        uint32_t h_N = N;
        uint32_t h_N_1 = N-1;
        uint32_t h_SCRATCH = SCRATCH;
        uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
        uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;

        // report failing uploads the same way as every other runtime call in this file
        checkCudaErrors(cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream));
        checkCudaErrors(cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream));
        checkCudaErrors(cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream));
        checkCudaErrors(cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream));
        checkCudaErrors(cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream));

        prev_N[thr_id] = N;
    }

    // First phase: Sequential writes to scratchpad.
    // NOTE(review): a batch size of 0 would keep this loop (and the next) from
    // terminating; presumably device_batchsize is validated elsewhere - confirm.
    int batch = device_batchsize[thr_id];
    unsigned int pos = 0;
    do {
        if (LOOKUP_GAP == 1) {
            if (IS_SCRYPT())      test_scrypt_core_kernelA<A_SCRYPT,      ANDERSEN> <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N));
            if (IS_SCRYPT_JANE()) test_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N));
        } else {
            if (IS_SCRYPT())      test_scrypt_core_kernelA_LG<A_SCRYPT,      ANDERSEN> <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
            if (IS_SCRYPT_JANE()) test_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
        }
        pos += batch;
    } while (pos < N);

    // Second phase: Random read access from scratchpad.
    pos = 0;
    do {
        if (LOOKUP_GAP == 1) {
            if (texture_cache == 0) {
                if (IS_SCRYPT())      test_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 0><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
                if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   0><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
            }
            else if (texture_cache == 1) {
                if (IS_SCRYPT())      test_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 1><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
                if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   1><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
            }
            else if (texture_cache == 2) {
                if (IS_SCRYPT())      test_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 2><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
                if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   2><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
            }
        } else {
            if (texture_cache == 0) {
                if (IS_SCRYPT())      test_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 0><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
                if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   0><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
            }
            else if (texture_cache == 1) {
                if (IS_SCRYPT())      test_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 1><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
                if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   1><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
            }
            else if (texture_cache == 2) {
                if (IS_SCRYPT())      test_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 2><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
                if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   2><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
            }
        }
        pos += batch;
    } while (pos < N);

    return success;
}

30
scrypt/test_kernel.h

@ -0,0 +1,30 @@ @@ -0,0 +1,30 @@
#ifndef TEST_KERNEL_H
#define TEST_KERNEL_H
#include "salsa_kernel.h"
// KernelInterface implementation declared for scrypt/test_kernel.cu.
class TestKernel : public KernelInterface
{
public:
    TestKernel();

    // scratchpad setup and kernel execution
    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);

    // texture binding of the scratchpad
    virtual bool bindtexture_1D(uint32_t *d_V, size_t size);
    virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch);
    virtual bool unbindtexture_1D();
    virtual bool unbindtexture_2D();

    // fixed properties of this kernel
    virtual char get_identifier()     { return 'f'; }
    virtual int get_major_version()   { return 1; }
    virtual int get_minor_version()   { return 0; }
    virtual int max_warps_per_block() { return 32; }
    virtual int get_texel_width()     { return 4; }
    virtual int threads_per_wu()      { return 4; }
    virtual bool support_lookup_gap() { return true; }

    // preferred device configuration
    virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
    virtual cudaFuncCache cache_config()            { return cudaFuncCachePreferL1; }
};
#endif // #ifndef TEST_KERNEL_H

731
scrypt/titan_kernel.cu

@ -0,0 +1,731 @@ @@ -0,0 +1,731 @@
/* Copyright (C) 2013 David G. Andersen. All rights reserved.
* with modifications by Christian Buchner
*
* Use of this code is covered under the Apache 2.0 license, which
* can be found in the file "LICENSE"
*/
// attempt V.Volkov style ILP (factor 4)
#include <map>
#include "cuda_runtime.h"
#include "miner.h"
#include "salsa_kernel.h"
#include "titan_kernel.h"
#define THREADS_PER_WU 4 // four threads per hash
// Scratchpad layout selector, used as the SCHEME template argument of the
// read/write helpers and kernels below.
typedef enum
{
ANDERSEN,
SIMPLE
} MemoryAccess;
// Below compute capability 3.5 (and in the host pass, where __CUDA_ARCH__ is
// undefined and evaluates to 0) __ldg is replaced by a plain dereference.
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define __ldg(x) (*(x))
#endif
// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes)
__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT];
// iteration count N
__constant__ uint32_t c_N;
__constant__ uint32_t c_N_1; // N-1
// scratch buffer size SCRATCH
__constant__ uint32_t c_SCRATCH;
__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP)
__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP)-1
// Forward declaration; the definition follows the salsa/chacha cores further down.
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3);
// Component-wise XOR of two uint4 values; the result is stored in (and returned as) left.
static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) {
    left.x = left.x ^ right.x;
    left.y = left.y ^ right.y;
    left.z = left.z ^ right.z;
    left.w = left.w ^ right.w;
    return left;
}
// Component-wise (wrapping) addition of two uint4 values; the result is stored
// in (and returned as) left.
static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) {
    left.x = left.x + right.x;
    left.y = left.y + right.y;
    left.z = left.z + right.z;
    left.w = left.w + right.w;
    return left;
}
// uint4 overload of the warp shuffle: fetches all four components of bx from
// lane target_thread, one 32-bit shuffle per component.
static __device__ uint4 __shfl(const uint4 bx, int target_thread) {
    uint4 fetched;
    fetched.x = __shfl((int)bx.x, target_thread);
    fetched.y = __shfl((int)bx.y, target_thread);
    fetched.z = __shfl((int)bx.z, target_thread);
    fetched.w = __shfl((int)bx.w, target_thread);
    return fetched;
}
/* write_keys writes the 8 keys being processed by a warp to the global
* scratchpad. To effectively use memory bandwidth, it performs the writes
* (and reads, for read_keys) 128 bytes at a time per memory location
* by __shfl'ing the 4 entries in bx to the threads in the next-up
* thread group. It then has eight threads together perform uint4
* (128 bit) writes to the destination region. This seems to make
* quite effective use of memory bandwidth. An approach that spread
* uint32s across more threads was slower because of the increased
* computation it required.
*
* "start" is the loop iteration producing the write - the offset within
* the block's memory.
*
* Internally, this algorithm first __shfl's the 4 bx entries to
* the next up thread group, and then uses a conditional move to
* ensure that odd-numbered thread groups exchange the b/bx ordering
* so that the right parts are written together.
*
* Thanks to Babu for helping design the 128-bit-per-write version.
*
* _direct lets the caller specify the absolute start location instead of
* the relative start location, as an attempt to reduce some recomputation.
*/
// Stores b and bx into the scratch buffer of the calling thread's warp.
//   start : word offset inside that buffer
// SIMPLE writes b at start and bx at start+16. ANDERSEN first exchanges bx and
// the start offset with the lane four positions up, then issues two uint4
// stores whose targets are selected per lane by bit 2 of the thread index.
template <MemoryAccess SCHEME> __device__ __forceinline__
void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
{
    uint32_t *warp_scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];

    if (SCHEME != ANDERSEN) {
        *((uint4 *)(&warp_scratch[start   ])) = b;
        *((uint4 *)(&warp_scratch[start+16])) = bx;
        return;
    }

    // values handed over from the lane four positions up (wrapping inside the warp)
    const int peer_lane = (threadIdx.x + 4)%32;
    const uint4 own_b = b;
    const uint4 peer_bx = __shfl(bx, peer_lane);
    const int peer_start = __shfl((int)start, peer_lane) + 4;

    // lanes with bit 2 set issue the two stores in exchanged order
    const bool swapped = (threadIdx.x & 0x4);
    *((uint4 *)(&warp_scratch[swapped ? peer_start : start])) = (swapped ? peer_bx : own_b);
    *((uint4 *)(&warp_scratch[swapped ? start : peer_start])) = (swapped ? own_b : peer_bx);
}
// Loads b and bx from the scratch buffer of the calling thread's warp; the
// counterpart of write_keys_direct.
//   start : word offset inside that buffer
// SIMPLE reads b at start and bx at start+16. ANDERSEN issues two uint4 loads
// selected per lane by bit 2 of the thread index, undoes the exchange and then
// fetches bx from the lane 28 positions up (i.e. four positions down, wrapping).
template <MemoryAccess SCHEME> __device__ __forceinline__
void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
{
    uint32_t *warp_scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];

    if (SCHEME != ANDERSEN) {
        b  = *((uint4 *)(&warp_scratch[start]));
        bx = *((uint4 *)(&warp_scratch[start+16]));
        return;
    }

    const int peer_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
    const bool swapped = (threadIdx.x & 0x4);

    const uint4 first  = __ldg((uint4 *)(&warp_scratch[swapped ? peer_start : start]));
    const uint4 second = __ldg((uint4 *)(&warp_scratch[swapped ? start : peer_start]));

    // lanes with bit 2 set loaded the two halves in exchanged order
    b = swapped ? second : first;
    const uint4 foreign_bx = swapped ? first : second;

    // hand the bx half back to the lane it belongs to
    bx = __shfl(foreign_bx, (threadIdx.x + 28)%32);
}
// Array flavour of primary_order_shuffle: exchanges elements 1..3 of b and bx
// with the other three lanes of this thread's group of four and then swaps
// elements 1 and 3. Element 0 stays untouched.
__device__ __forceinline__
void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) {
    /* Inner loop shuffle targets.
     * The group base is masked with 0x1c (lane bits 2..4) like everywhere else
     * in this file. The previous mask 0xfc produced source lanes >= 32 for
     * threads beyond the first warp and only worked because __shfl reduces the
     * source lane modulo the warp width. */
    int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
    int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
    int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

    b[3] = __shfl((int)b[3], x1);
    b[2] = __shfl((int)b[2], x2);
    b[1] = __shfl((int)b[1], x3);
    uint32_t tmp = b[1]; b[1] = b[3]; b[3] = tmp;

    bx[3] = __shfl((int)bx[3], x1);
    bx[2] = __shfl((int)bx[2], x2);
    bx[1] = __shfl((int)bx[1], x3);
    tmp = bx[1]; bx[1] = bx[3]; bx[3] = tmp;
}
// uint4 flavour of primary_order_shuffle: exchanges y, z and w of b and bx with
// the other three lanes of this thread's group of four; afterwards y holds the
// w fetched from lane x1 and w holds the y fetched from lane x3. x stays untouched.
__device__ __forceinline__
void primary_order_shuffle(uint4 &b, uint4 &bx) {
    /* Inner loop shuffle targets */
    const unsigned int quad_base = threadIdx.x & 0x1c;
    const unsigned int quad_lane = threadIdx.x & 0x03;
    const int x1 = quad_base + ((quad_lane+1)&0x3);
    const int x2 = quad_base + ((quad_lane+2)&0x3);
    const int x3 = quad_base + ((quad_lane+3)&0x3);

    const uint32_t b_w_from_x1 = __shfl((int)b.w, x1);
    b.z = __shfl((int)b.z, x2);
    const uint32_t b_y_from_x3 = __shfl((int)b.y, x3);
    b.y = b_w_from_x1;
    b.w = b_y_from_x3;

    const uint32_t bx_w_from_x1 = __shfl((int)bx.w, x1);
    bx.z = __shfl((int)bx.z, x2);
    const uint32_t bx_y_from_x3 = __shfl((int)bx.y, x3);
    bx.y = bx_w_from_x1;
    bx.w = bx_y_from_x3;
}
/*
* load_key loads a 32*32bit key from a contiguous region of memory in B.
* The input keys are in external order (i.e., 0, 1, 2, 3, ...).
* After loading, each thread has its four b and four bx keys stored
* in internal processing order.
*/
// Salsa variant of load_key: each of the four threads of a hash reads four
// words of the b half and four of the bx half (16 words further on) from its
// own row of B, rotated by the thread's position, and then converts them to
// the internal order with primary_order_shuffle.
__device__ __forceinline__
void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx)
{
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int key_offset = scrypt_block * 32;
    const uint32_t thread_in_block = threadIdx.x % 4;

    // first word of this thread's row inside the 32-word key
    const uint32_t row = key_offset + 4*thread_in_block;

    // Read in permuted order. Key loads are not our bottleneck right now.
    b.x = B[row + (thread_in_block+0)%4];
    b.y = B[row + (thread_in_block+1)%4];
    b.z = B[row + (thread_in_block+2)%4];
    b.w = B[row + (thread_in_block+3)%4];

    bx.x = B[row + (thread_in_block+0)%4 + 16];
    bx.y = B[row + (thread_in_block+1)%4 + 16];
    bx.z = B[row + (thread_in_block+2)%4 + 16];
    bx.w = B[row + (thread_in_block+3)%4 + 16];

    primary_order_shuffle(b, bx);
}
/*
* store_key performs the opposite transform as load_key, taking
* internally-ordered b and bx and storing them into a contiguous
* region of B in external order.
*/
// Salsa variant of store_key: applies primary_order_shuffle to b and bx (which
// modifies the caller's values) and writes them to the positions of B that
// load_key_salsa reads from.
__device__ __forceinline__
void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx)
{
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int key_offset = scrypt_block * 32;
    const uint32_t thread_in_block = threadIdx.x % 4;

    // first word of this thread's row inside the 32-word key
    const uint32_t row = key_offset + 4*thread_in_block;

    primary_order_shuffle(b, bx);

    B[row + (thread_in_block+0)%4] = b.x;
    B[row + (thread_in_block+1)%4] = b.y;
    B[row + (thread_in_block+2)%4] = b.z;
    B[row + (thread_in_block+3)%4] = b.w;

    B[row + (thread_in_block+0)%4 + 16] = bx.x;
    B[row + (thread_in_block+1)%4 + 16] = bx.y;
    B[row + (thread_in_block+2)%4 + 16] = bx.z;
    B[row + (thread_in_block+3)%4 + 16] = bx.w;
}
/*
* load_key loads a 32*32bit key from a contiguous region of memory in B.
* The input keys are in external order (i.e., 0, 1, 2, 3, ...).
* After loading, each thread has its four b and four bx keys stored
* in internal processing order.
*/
// ChaCha variant of load_key: thread t of a hash reads words t, t+4, t+8 and
// t+12 of the b half and the same positions of the bx half (16 words further
// on). No shuffle follows.
__device__ __forceinline__
void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx)
{
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int key_offset = scrypt_block * 32;
    const uint32_t thread_in_block = threadIdx.x % 4;

    // this thread's column inside the 32-word key
    const uint32_t column = key_offset + thread_in_block;

    // Read in permuted order. Key loads are not our bottleneck right now.
    b.x = B[column + 4*0];
    b.y = B[column + 4*1];
    b.z = B[column + 4*2];
    b.w = B[column + 4*3];

    bx.x = B[column + 4*0 + 16];
    bx.y = B[column + 4*1 + 16];
    bx.z = B[column + 4*2 + 16];
    bx.w = B[column + 4*3 + 16];
}
/*
* store_key performs the opposite transform as load_key, taking
* internally-ordered b and bx and storing them into a contiguous
* region of B in external order.
*/
// ChaCha variant of store_key: thread t of a hash writes its b components to
// words t, t+4, t+8 and t+12 and its bx components to the same positions
// 16 words further on, i.e. the positions load_key_chacha reads from.
__device__ __forceinline__
void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx)
{
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int key_offset = scrypt_block * 32;
    const uint32_t thread_in_block = threadIdx.x % 4;

    // this thread's column inside the 32-word key
    const uint32_t column = key_offset + thread_in_block;

    B[column + 4*0] = b.x;
    B[column + 4*1] = b.y;
    B[column + 4*2] = b.z;
    B[column + 4*3] = b.w;

    B[column + 4*0 + 16] = bx.x;
    B[column + 4*1 + 16] = bx.y;
    B[column + 4*2 + 16] = bx.z;
    B[column + 4*3 + 16] = bx.w;
}
// Compile-time dispatch to the key loader that belongs to ALGO.
// Any ALGO other than A_SCRYPT / A_SCRYPT_JANE leaves b and bx untouched.
template <int ALGO> __device__ __forceinline__
void load_key(const uint32_t *B, uint4 &b, uint4 &bx)
{
    if (ALGO == A_SCRYPT) {
        load_key_salsa(B, b, bx);
    } else if (ALGO == A_SCRYPT_JANE) {
        load_key_chacha(B, b, bx);
    }
}
// Compile-time dispatch to the key writer that belongs to ALGO.
// Any ALGO other than A_SCRYPT / A_SCRYPT_JANE writes nothing.
template <int ALGO> __device__ __forceinline__
void store_key(uint32_t *B, uint4 &b, uint4 &bx)
{
    if (ALGO == A_SCRYPT) {
        store_key_salsa(B, b, bx);
    } else if (ALGO == A_SCRYPT_JANE) {
        store_key_chacha(B, b, bx);
    }
}
/*
* salsa_xor_core (Salsa20/8 cypher)
* The original scrypt called:
* xor_salsa8(&X[0], &X[16]); <-- the "b" loop
* xor_salsa8(&X[16], &X[0]); <-- the "bx" loop
* This version is unrolled to handle both of these loops in a single
* call to avoid unnecessary data movement.
*/
// XOR_ROTATE_ADD(dst, s1, s2, amt): dst ^= rotate-left(s1 + s2, amt) on 32-bit words.
#if __CUDA_ARCH__ < 350
// Below compute capability 3.5 (and in the host pass): rotate built from two shifts
#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<<amt)|(tmp>>(32-amt))); }
#else
// Compute capability 3.5 and above: rotate through the funnel-shift intrinsic.
// NOTE(review): ROTL carries its own trailing semicolon and is not #undef'd
// after use, so it is only usable as the last part of a statement.
#define ROTL(a, b) __funnelshift_l( a, a, b );
#define XOR_ROTATE_ADD(dst, s1, s2, amt) dst ^= ROTL(s1+s2, amt);
#endif
// Mixes b and bx in place: b ^= bx, four loop iterations, b += result; then the
// same sequence with the roles of b and bx exchanged.
// x1, x2, x3 are the lanes this thread shuffles with.
__device__ __forceinline__
void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
uint4 x;
b ^= bx;
x = b;
// Enter in "primary order" (t0 has 0, 4, 8, 12)
// (t1 has 5, 9, 13, 1)
// (t2 has 10, 14, 2, 6)
// (t3 has 15, 3, 7, 11)
#pragma unroll
for (int j = 0; j < 4; j++)
{
// Mixing phase of salsa
XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
/* Transpose rows and columns. */
/* Unclear if this optimization is needed: These are ordered based
* upon the dependencies needed in the later xors. Compiler should be
* able to figure this out, but might as well give it a hand. */
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
* but the register targets are rewritten here to swap x[1] and x[3] so that
* they can be directly shuffled to and from our peer threads without
* reassignment. The reverse shuffle then puts them back in the right place.
*/
XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
}
b += x;
// The next two lines are the beginning of the BX-centric loop iteration
bx ^= b;
x = bx;
// This is a copy of the same loop above, identical but stripped of comments.
// Duplicated so that we can complete a bx-based loop with fewer register moves.
#pragma unroll 4
for (int j = 0; j < 4; j++)
{
XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
}
// At the end of these iterations, the data is in primary order again.
// The macro is only needed by this function; drop it again.
#undef XOR_ROTATE_ADD
bx += x;
}
/*
* chacha_xor_core (ChaCha20/8 cypher)
* This version is unrolled to handle both of these loops in a single
* call to avoid unnecessary data movement.
*
* load_key and store_key must not use primary order when
* using ChaCha20/8, but rather the basic transposed order
* (referred to as "column mode" below)
*/
// CHACHA_PRIMITIVE(pt, rt, ps, amt): pt += ps, then rt = rotate-left(rt ^ pt, amt)
// on 32-bit words. Both variants expand to a braced block, which is why the
// call sites below carry no trailing semicolon.
#if __CUDA_ARCH__ < 320
// Below compute capability 3.2 (and in the host pass): rotate built from two shifts
#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<<amt)|(tmp>>(32-amt))); }
#else
// Compute capability 3.2 and above: rotate through the funnel-shift intrinsic.
// NOTE(review): the salsa core above switches at 350 instead of 320 - confirm
// which threshold is intended. ROTL repeats the definition used for the salsa
// core token for token, including its trailing semicolon.
#define ROTL(a, b) __funnelshift_l( a, a, b );
#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { pt += ps; rt = ROTL(rt ^ pt,amt); }
#endif
// Mixes b and bx in place: b ^= bx, four iterations of column + diagonal mixing,
// b += result; then the same sequence with the roles of b and bx exchanged.
// x1, x2, x3 are the lanes this thread shuffles with.
__device__ __forceinline__
void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
uint4 x;
b ^= bx;
x = b;
// Enter in "column" mode (t0 has 0, 4, 8, 12)
// (t1 has 1, 5, 9, 13)
// (t2 has 2, 6, 10, 14)
// (t3 has 3, 7, 11, 15)
#pragma unroll 4
for (int j = 0; j < 4; j++) {
// Column Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
// Fetch y, z, w from the lanes x1, x2, x3 for the diagonal phase.
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
// Reverse shuffle (y from x3, w from x1) to return to column mode.
x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
}
b += x;
// The next two lines are the beginning of the BX-centric loop iteration
bx ^= b;
x = bx;
// Same loop as above, now operating on the bx half.
#pragma unroll
for (int j = 0; j < 4; j++)
{
// Column Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
}
// The macro is only needed by this function; drop it again.
#undef CHACHA_PRIMITIVE
bx += x;
}
// Compile-time dispatch to the mixing core that belongs to ALGO:
// Salsa for A_SCRYPT, ChaCha for A_SCRYPT_JANE. Any other ALGO is a no-op.
template <int ALGO> __device__ __forceinline__
void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
    if (ALGO == A_SCRYPT) {
        salsa_xor_core(b, bx, x1, x2, x3);
    } else if (ALGO == A_SCRYPT_JANE) {
        chacha_xor_core(b, bx, x1, x2, x3);
    }
}
/*
* The hasher_gen_kernel operates on a group of 1024-bit input keys
* in B, stored as:
* B = { k1B k1Bx k2B k2Bx ... }
* and fills up the scratchpad with the iterative hashes derived from
* those keys:
* scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... }
* scratch is 1024 times larger than the input keys B.
* It is extremely important to stream writes effectively into scratch;
* less important to coalesce the reads from B.
*
* Key ordering note: Keys are input from B in "original" order:
* K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 }
* After inputting into kernel_gen, each component k and kx of the
* key is transmuted into a permuted internal order to make processing faster:
* K = k, kx with:
* k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11
* and similarly for kx.
*/
// First scrypt phase, iterations [begin, end): every iteration result is
// written to the scratchpad. begin == 0 starts from the input keys in d_idata,
// any other begin resumes from scratchpad entry begin-1.
template <int ALGO, MemoryAccess SCHEME> __global__
void titan_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end)
{
    // lane indices of the three other threads in this thread's group of four
    const unsigned int quad_base = threadIdx.x & 0x1c;
    const unsigned int quad_lane = threadIdx.x & 0x03;
    const int x1 = quad_base + ((quad_lane+1)&0x3);
    const int x2 = quad_base + ((quad_lane+2)&0x3);
    const int x3 = quad_base + ((quad_lane+3)&0x3);

    // word offset of this thread's slice inside the warp's scratch buffer
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

    uint4 b, bx;
    int i = begin;
    if (i != 0) {
        // resume: pick up the state written by the previous batch
        read_keys_direct<SCHEME>(b, bx, start+32*(i-1));
    } else {
        // fresh start: entry 0 is the input key itself
        load_key<ALGO>(d_idata, b, bx);
        write_keys_direct<SCHEME>(b, bx, start);
        ++i;
    }

    for (; i < end; ++i) {
        block_mixer<ALGO>(b, bx, x1, x2, x3);
        write_keys_direct<SCHEME>(b, bx, start+32*i);
    }
}
// First scrypt phase with a lookup gap: only every LOOKUP_GAP-th iteration
// result is written to the scratchpad (at index i/LOOKUP_GAP). When resuming
// (begin != 0) the state of iteration begin-1 is rebuilt from the nearest
// stored entry by re-running the mixer.
template <int ALGO, MemoryAccess SCHEME> __global__
void titan_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP)
{
    // lane indices of the three other threads in this thread's group of four
    const unsigned int quad_base = threadIdx.x & 0x1c;
    const unsigned int quad_lane = threadIdx.x & 0x03;
    const int x1 = quad_base + ((quad_lane+1)&0x3);
    const int x2 = quad_base + ((quad_lane+2)&0x3);
    const int x3 = quad_base + ((quad_lane+3)&0x3);

    // word offset of this thread's slice inside the warp's scratch buffer
    const int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    const int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

    uint4 b, bx;
    int i = begin;
    if (i != 0) {
        // nearest stored entry at or before iteration i-1, plus the number of
        // mixer steps still missing to reach iteration i-1
        int pos = (i-1)/LOOKUP_GAP;
        int loop = (i-1)-pos*LOOKUP_GAP;
        read_keys_direct<SCHEME>(b, bx, start+32*pos);
        while (loop--) {
            block_mixer<ALGO>(b, bx, x1, x2, x3);
        }
    } else {
        load_key<ALGO>(d_idata, b, bx);
        write_keys_direct<SCHEME>(b, bx, start);
        ++i;
    }

    for (; i < end; ++i) {
        block_mixer<ALGO>(b, bx, x1, x2, x3);
        if (i % LOOKUP_GAP == 0) {
            write_keys_direct<SCHEME>(b, bx, start+32*(i/LOOKUP_GAP));
        }
    }
}
/*
 * titan_scrypt_core_kernelB runs the second phase of scrypt after the
 * scratch buffer has been filled with the iterative hashes: it bounces
 * through the scratch buffer in pseudorandom order, mixing the key as it
 * goes.
 *
 *   d_odata  intermediate state is loaded from here when begin != 0, and
 *            the state is stored back here at the end of every launch
 *   begin    first iteration of this batch
 *   end      one past the last iteration handled by this launch
 */
template <int ALGO, MemoryAccess SCHEME> __global__
void titan_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
{
	uint4 b, bx;

	// Lane numbers of the other three threads in this thread's aligned
	// group of four, rotated by 1, 2 and 3 positions.
	const int quad = threadIdx.x & 0x1c;
	const int lane = threadIdx.x & 0x03;
	int x1 = quad + ((lane + 1) & 0x3);
	int x2 = quad + ((lane + 2) & 0x3);
	int x3 = quad + ((lane + 3) & 0x3);

	// Scratchpad offset of this thread's slice of its work unit.
	int wu_index = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int base = ((wu_index*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

	if (begin != 0) {
		// Continue from the state saved by the previous batch.
		load_key<ALGO>(d_odata, b, bx);
	} else {
		// First batch: start from the last scratchpad entry, mixed once more.
		read_keys_direct<SCHEME>(b, bx, base + 32*c_N_1);
		block_mixer<ALGO>(b, bx, x1, x2, x3);
	}

	int iter = begin;
	while (iter < end) {
		// The next scratchpad index comes from bx.x of the first lane of
		// the group, masked with c_N_1.
		int j = (__shfl((int)bx.x, quad) & (c_N_1));
		uint4 t, tx;
		read_keys_direct<SCHEME>(t, tx, base + 32*j);
		b ^= t;
		bx ^= tx;
		block_mixer<ALGO>(b, bx, x1, x2, x3);
		++iter;
	}

	store_key<ALGO>(d_odata, b, bx);
}
/*
 * Second scrypt core phase with a lookup gap. Works like the variant
 * without a gap, except that a scratchpad index j is resolved by reading
 * the stored entry at j/LOOKUP_GAP and applying block_mixer another
 * j - (j/LOOKUP_GAP)*LOOKUP_GAP times to reconstruct the wanted state.
 *
 *   d_odata     intermediate state is loaded from here when begin != 0, and
 *               the state is stored back here at the end of every launch
 *   begin       first iteration of this batch
 *   end         one past the last iteration handled by this launch
 *   LOOKUP_GAP  spacing between stored scratchpad entries
 */
template <int ALGO, MemoryAccess SCHEME> __global__
void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP)
{
uint4 b, bx;
// scratchpad offset of this thread's slice of its work unit
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
int start = ((scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;
// lane numbers of the other three threads in this thread's aligned group
// of four, rotated by 1, 2 and 3 positions
int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
if (begin == 0) {
// first batch: rebuild the state of index c_N_1 from the nearest stored
// entry, then mix once more (hence the "1 +" in the loop count)
int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP);
read_keys_direct<SCHEME>(b, bx, start+32*pos);
while(loop--)
block_mixer<ALGO>(b, bx, x1, x2, x3);
}
else
// later batches continue from the state saved in d_odata
load_key<ALGO>(d_odata, b, bx);
if (SCHEME == SIMPLE)
{
// better divergent thread handling submitted by nVidia engineers, but
// supposedly this does not run with the ANDERSEN memory access scheme
//
// The loop below is a small state machine driven by 'loop':
//   loop == -1 : fetch the next index j and read the stored entry
//   loop  >  0 : t/tx still needs that many recomputation steps
//   loop ==  0 : t/tx is the wanted entry; fold it into b/bx, mix,
//                and count one finished iteration
// Every pass executes exactly one block_mixer call.
// j and pos are assigned again on the first pass (loop starts at -1),
// so the values computed here are not used.
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int pos = j/LOOKUP_GAP;
int loop = -1;
uint4 t, tx;
int i = begin;
while(i < end)
{
if (loop == -1) {
// index comes from bx.x of the first lane of the group of four
j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
pos = j/LOOKUP_GAP;
loop = j-pos*LOOKUP_GAP;
read_keys_direct<SCHEME>(t, tx, start+32*pos);
}
if (loop == 0) {
// reconstruction finished: XOR into the running state and let the
// shared block_mixer call below operate on that result
b ^= t; bx ^= tx;
t=b;tx=bx;
}
block_mixer<ALGO>(t, tx, x1, x2, x3);
if (loop == 0) {
// commit the mixed state and advance to the next iteration
b=t;bx=tx;
i++;
}
// 0 becomes -1 here, which triggers the next fetch
loop--;
}
}
else
{
// this is my original implementation, now used with the ANDERSEN
// memory access scheme only.
for (int i = begin; i < end; i++) {
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
uint4 t, tx; read_keys_direct<SCHEME>(t, tx, start+32*pos);
// recompute the entries that were skipped when filling the scratchpad
while (loop--)
block_mixer<ALGO>(t, tx, x1, x2, x3);
b ^= t; bx ^= tx;
block_mixer<ALGO>(b, bx, x1, x2, x3);
}
}
store_key<ALGO>(d_odata, b, bx);
}
// Default constructor: nothing to set up beyond the KernelInterface base.
TitanKernel::TitanKernel()
	: KernelInterface()
{
}
/*
 * Copies MAXWARPS pointers from the host array h_V into the device symbol
 * c_V. The copy is the blocking cudaMemcpyToSymbol variant and its result
 * is handed to checkCudaErrors.
 */
void TitanKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
{
checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
}
/*
 * Enqueues both scrypt core phases on the given stream.
 *
 * The work is split into launches of at most device_batchsize[thr_id]
 * iterations each: first the sequential scratchpad fill (kernelA), then the
 * random-access pass (kernelB). The *_LG kernel variants are used whenever
 * LOOKUP_GAP != 1. scrypt uses the ANDERSEN memory access scheme and
 * scrypt-jane the SIMPLE one.
 *
 *   grid, threads  launch configuration for every kernel launch
 *   thr_id         index into the per-device tables (prev_N, device_batchsize)
 *   stream         stream for the constant uploads and kernel launches
 *   d_idata        device input buffer for phase one
 *   d_odata        device buffer used for the phase-two state and result
 *   N              number of iterations per phase
 *   LOOKUP_GAP     spacing of stored scratchpad entries
 *
 * WARPS_PER_BLOCK, interactive, benchmark and texture_cache are not used by
 * this implementation.
 *
 * Returns false if the kernel constants could not be uploaded; in that case
 * nothing is launched and the upload is retried on the next call. Returns
 * true otherwise. Kernel launch errors are not checked here; callers must
 * query the CUDA error state themselves.
 */
bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream,
	uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
{
	bool success = true;

	// make some constants available to kernel, update only initially and when changing
	static int prev_N[MAX_DEVICES] = {0};
	if (N != (unsigned int)prev_N[thr_id]) {
		uint32_t h_N = N;
		uint32_t h_N_1 = N-1;
		uint32_t h_SCRATCH = SCRATCH;
		uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
		uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;

		// Stop at the first failing upload instead of ignoring the result.
		cudaError_t err = cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		if (err == cudaSuccess)
			err = cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		if (err == cudaSuccess)
			err = cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		if (err == cudaSuccess)
			err = cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		if (err == cudaSuccess)
			err = cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);

		// Without valid constants the kernels would compute garbage. Leave
		// prev_N untouched so that the next call uploads them again.
		if (err != cudaSuccess)
			return false;

		prev_N[thr_id] = (int)N;
	}

	// First phase: Sequential writes to scratchpad.
	int batch = device_batchsize[thr_id];
	unsigned int pos = 0;
	do {
		if (LOOKUP_GAP == 1) {
			if (IS_SCRYPT()) titan_scrypt_core_kernelA<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
		} else {
			if (IS_SCRYPT()) titan_scrypt_core_kernelA_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
		}
		pos += batch;
	} while (pos < N);

	// Second phase: Random read access from scratchpad.
	pos = 0;
	do {
		if (LOOKUP_GAP == 1) {
			if (IS_SCRYPT()) titan_scrypt_core_kernelB<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
		} else {
			if (IS_SCRYPT()) titan_scrypt_core_kernelB_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
		}
		pos += batch;
	} while (pos < N);

	return success;
}

26
scrypt/titan_kernel.h

@ -0,0 +1,26 @@ @@ -0,0 +1,26 @@
#ifndef TITAN_KERNEL_H
#define TITAN_KERNEL_H
#include "salsa_kernel.h"
/*
 * KernelInterface implementation backed by the titan_scrypt_core_* kernels.
 * Apart from the two out-of-line methods, every override just reports a
 * fixed property of this kernel.
 */
class TitanKernel : public KernelInterface
{
public:
	TitanKernel();

	// Defined in the accompanying .cu file.
	virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
	virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);

	// Single-character tag of this kernel.
	virtual char get_identifier()
	{
		return 't';
	}

	// Reported version 3.5 -- presumably the targeted compute capability;
	// confirm against the callers of these getters.
	virtual int get_major_version()
	{
		return 3;
	}
	virtual int get_minor_version()
	{
		return 5;
	}

	virtual int max_warps_per_block()
	{
		return 32;
	}

	virtual int get_texel_width()
	{
		return 4;
	}

	// This kernel reports that it does not use textures.
	virtual bool no_textures()
	{
		return true;
	}

	virtual int threads_per_wu()
	{
		return 4;
	}

	// The *_LG kernel variants take a LOOKUP_GAP argument.
	virtual bool support_lookup_gap()
	{
		return true;
	}

	virtual cudaFuncCache cache_config()
	{
		return cudaFuncCachePreferL1;
	}
};
#endif // #ifndef TITAN_KERNEL_H

3
util.cpp

@ -1788,6 +1788,9 @@ void print_hash_tests(void) @@ -1788,6 +1788,9 @@ void print_hash_tests(void)
qubithash(&hash[0], &buf[0]);
printpfx("qubit", hash);
scrypthash(&hash[0], &buf[0]);
printpfx("scrypt", hash);
skeincoinhash(&hash[0], &buf[0]);
printpfx("skein", hash);

Loading…
Cancel
Save