diff --git a/Makefile.am b/Makefile.am index e8a35ca..d6edb34 100644 --- a/Makefile.am +++ b/Makefile.am @@ -18,7 +18,7 @@ bin_PROGRAMS = ccminer ccminer_SOURCES = elist.h miner.h compat.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/sys/time.h compat/getopt/getopt.h \ - crc32.c hefty1.c scrypt.c \ + crc32.c hefty1.c \ ccminer.cpp util.cpp \ api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \ heavy/heavy.cu \ @@ -57,6 +57,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \ x11/s3.cu +# scrypt +ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \ + scrypt/blake.cu scrypt/keccak.cu scrypt/sha256.cu \ + scrypt/salsa_kernel.cu scrypt/test_kernel.cu \ + scrypt/fermi_kernel.cu scrypt/kepler_kernel.cu \ + scrypt/nv_kernel.cu scrypt/nv_kernel2.cu scrypt/titan_kernel.cu + if HAVE_NVML nvml_defs = -DUSE_WRAPNVML nvml_libs = -ldl @@ -118,6 +125,10 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $< +# This kernel need also an older SM to be able to autotune kernels +scrypt/salsa_kernel.o: scrypt/salsa_kernel.cu + $(NVCC) $(nvcc_FLAGS) -gencode=arch=compute_20,code=\"sm_21,compute_20\" --maxrregcount=80 -o $@ -c $< + skein.o: skein.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< diff --git a/ccminer.cpp b/ccminer.cpp index def1420..d320325 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -102,6 +102,8 @@ enum sha_algos { ALGO_PLUCK, ALGO_QUARK, ALGO_QUBIT, + ALGO_SCRYPT, + ALGO_SCRYPT_JANE, ALGO_SKEIN, ALGO_SKEIN2, ALGO_S3, @@ -137,6 +139,8 @@ static const char *algo_names[] = { "pluck", "quark", "qubit", + "scrypt", + "scrypt-jane", "skein", "skein2", "s3", @@ -184,6 +188,20 @@ char * device_name[MAX_GPUS]; short device_map[MAX_GPUS] = { 0 }; long device_sm[MAX_GPUS] = { 0 }; uint32_t gpus_intensity[MAX_GPUS] = { 0 }; + +int device_interactive[MAX_GPUS] = { 0 }; +int device_batchsize[MAX_GPUS] = { 0 }; +int device_backoff[MAX_GPUS] = { 0 }; +int device_lookup_gap[MAX_GPUS] = { 0 }; +int device_texturecache[MAX_GPUS] = { 0 }; +int device_singlememory[MAX_GPUS] = { 0 }; +char *device_config[MAX_GPUS] = { 0 }; +int opt_nfactor = 0; +int parallel = 2; +bool autotune = true; +bool abort_flag = false; +char *jane_params = NULL; + char *rpc_user = NULL; static char *rpc_pass; static char *rpc_userpass = NULL; @@ -255,6 +273,8 @@ Options:\n\ pluck SupCoin\n\ quark Quark\n\ qubit Qubit\n\ + scrypt Scrypt\n\ + scrypt-jane Scrypt-jane Chacha\n\ skein Skein SHA2 (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ s3 S3 (1Coin)\n\ @@ -439,6 +459,7 @@ void get_currentalgo(char* buf, int sz) */ void proper_exit(int reason) { + abort_flag = true; cuda_devicereset(); if (check_dups) @@ -1173,6 +1194,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) switch (opt_algo) { case ALGO_JACKPOT: case ALGO_PLUCK: + case ALGO_SCRYPT: + case ALGO_SCRYPT_JANE: diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); break; case ALGO_DMD_GR: @@ -1386,6 +1409,8 @@ static void *miner_thread(void *userdata) minmax = 0x400000; break; case ALGO_LYRA2: + case ALGO_SCRYPT: + case ALGO_SCRYPT_JANE: minmax = 0x100000; break; case ALGO_PLUCK: @@ -1526,6 +1551,16 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_SCRYPT: + rc = scanhash_scrypt(thr_id, work.data, work.target, NULL, + max_nonce, &hashes_done, &tv_start, 
&tv_end); + break; + + case ALGO_SCRYPT_JANE: + rc = scanhash_scrypt_jane(thr_id, work.data, work.target, NULL, + max_nonce, &hashes_done, &tv_start, &tv_end); + break; + case ALGO_SKEIN: rc = scanhash_skeincoin(thr_id, work.data, work.target, max_nonce, &hashes_done); @@ -1942,15 +1977,29 @@ void parse_arg(int key, char *arg) switch(key) { case 'a': + p = strstr(arg, ":"); // optional factor + if (p) *p = '\0'; for (i = 0; i < ARRAY_SIZE(algo_names); i++) { - if (algo_names[i] && - !strcmp(arg, algo_names[i])) { + if (algo_names[i] && !strcasecmp(arg, algo_names[i])) { opt_algo = (enum sha_algos)i; break; } } if (i == ARRAY_SIZE(algo_names)) show_usage_and_exit(1); + if (p) { + opt_nfactor = atoi(p + 1); + if (opt_algo == ALGO_SCRYPT_JANE) { + free(jane_params); + jane_params = strdup(p+1); + } + } + if (!opt_nfactor) { + switch (opt_algo) { + case ALGO_SCRYPT: opt_nfactor = 9; break; + case ALGO_SCRYPT_JANE: opt_nfactor = 14; break; + } + } break; case 'b': p = strstr(arg, ":"); @@ -2404,6 +2453,8 @@ int main(int argc, char *argv[]) rpc_pass = strdup(""); rpc_url = strdup(""); + jane_params = strdup(""); + pthread_mutex_init(&applog_lock, NULL); // number of cpus for thread affinity @@ -2423,9 +2474,17 @@ int main(int argc, char *argv[]) if (num_cpus < 1) num_cpus = 1; - // default thread to device map for (i = 0; i < MAX_GPUS; i++) { device_map[i] = i; + device_name[i] = NULL; + // for future use, maybe + device_interactive[i] = -1; + device_batchsize[i] = 1024; + device_backoff[i] = is_windows() ? 12 : 2; + device_lookup_gap[i] = 1; + device_texturecache[i] = -1; + device_singlememory[i] = -1; + device_config[i] = NULL; } // number of gpus diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 528d5c0..f8b3a6c 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -250,6 +250,8 @@ false Full + + @@ -261,10 +263,6 @@ - - Full - /Tp %(AdditionalOptions) - @@ -322,6 +320,7 @@ + @@ -352,6 +351,22 @@ + + + compute_20,sm_21;compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52 + + + + + + compute_35,sm_35;compute_50,sm_50;compute_52,sm_52 + + + compute_20,sm_21 + + + + @@ -510,4 +525,4 @@ - + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 3d1fd91..9f62c54 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -73,6 +73,9 @@ {f5117ccb-a70d-411a-b7ea-d6faed230bc7} + + {c26f5b02-37b5-4420-a4e8-ee1ad517dc95} + @@ -111,9 +114,6 @@ Source Files - - Source Files - Source Files @@ -225,6 +225,12 @@ Source Files\sph + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + @@ -377,6 +383,9 @@ Ressources + + Source Files\CUDA\scrypt + @@ -580,6 +589,36 @@ Source Files\CUDA + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + @@ -596,4 +635,4 @@ Ressources - + \ No newline at end of file diff --git a/miner.h b/miner.h index 71bf153..2b8a7b4 100644 --- a/miner.h +++ b/miner.h @@ -272,8 +272,6 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); extern int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); -extern unsigned char *scrypt_buffer_alloc(); - extern int scanhash_deep(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); 
@@ -343,8 +341,12 @@ extern int scanhash_qubit(int thr_id, uint32_t *pdata, unsigned long *hashes_done); extern int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done); + const uint32_t *ptarget, unsigned char *scratchbuf, uint32_t max_nonce, + unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end); + +extern int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, unsigned char *scratchbuf, uint32_t max_nonce, + unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end); extern int scanhash_skeincoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, @@ -683,6 +685,7 @@ void pentablakehash(void *output, const void *input); void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); +void scrypthash(void* output, const void* input); void skeincoinhash(void *output, const void *input); void skein2hash(void *output, const void *input); void s3hash(void *output, const void *input); diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp new file mode 100644 index 0000000..ce21ea2 --- /dev/null +++ b/scrypt-jane.cpp @@ -0,0 +1,626 @@ +/* + scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane + + Public Domain or MIT License, whichever is easier +*/ + +#include "miner.h" + +#include "scrypt/scrypt-jane.h" +#include "scrypt/code/scrypt-jane-portable.h" +#include "scrypt/code/scrypt-jane-romix.h" +#include "scrypt/keccak.h" + +#include "scrypt/salsa_kernel.h" + +#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */ +#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ +#define scrypt_maxr scrypt_r_32kb /* 32kb */ +#define scrypt_maxp 25 /* (1 << 25) = ~33 million */ + +// ---------------------------- BEGIN keccak functions ------------------------------------ + +#define SCRYPT_HASH "Keccak-512" +#define SCRYPT_HASH_DIGEST_SIZE 64 +#define SCRYPT_KECCAK_F 1600 +#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */ +#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */ +#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t state[SCRYPT_KECCAK_F / 64]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +static void +keccak_block(scrypt_hash_state *S, const uint8_t *in) { + size_t i; + uint64_t *s = S->state, t[5], u[5], v, w; + + /* absorb input */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8) + s[i] ^= U8TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + memset(S, 0, sizeof(*S)); +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + keccak_block(S, S->buffer); + } + + /* handle the current data */ + while (inlen >= SCRYPT_HASH_BLOCK_SIZE) { + keccak_block(S, in); + in += SCRYPT_HASH_BLOCK_SIZE; + inlen -= SCRYPT_HASH_BLOCK_SIZE; + } + + /* handle leftover data */ + S->leftover = (uint32_t)inlen; + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + size_t i; + + S->buffer[S->leftover] = 0x01; + memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1)); + S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80; + keccak_block(S, S->buffer); + + for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) { + U64TO8_LE(&hash[i], S->state[i / 8]); + } +} + +// ---------------------------- END keccak functions ------------------------------------ + +// ---------------------------- BEGIN PBKDF2 functions ------------------------------------ + +typedef struct scrypt_hmac_state_t { + scrypt_hash_state inner, outer; +} scrypt_hmac_state; + + +static void +scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) { + scrypt_hash_state st; + scrypt_hash_init(&st); + scrypt_hash_update(&st, m, mlen); + scrypt_hash_finish(&st, hash); +} + +/* hmac */ +static void +scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + scrypt_hash_init(&st->inner); + scrypt_hash_init(&st->outer); + + if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + scrypt_hash(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void +scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) */ + scrypt_hash_update(&st->inner, m, mlen); +} + +static void +scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { + /* h(inner || m) */ + scrypt_hash_digest innerhash; + scrypt_hash_finish(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); + scrypt_hash_finish(&st->outer, mac); +} + +/* + * Special version where N = 1 + * - mikaelh + */ +static void +scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) { + scrypt_hmac_state hmac_pw, hmac_pw_salt, work; + scrypt_hash_digest ti, u; + uint8_t be[4]; + uint32_t i, /*j,*/ blocks; +// uint64_t c; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(&hmac_pw, password, password_len); + + /* hmac(password, salt...) 
*/ + hmac_pw_salt = hmac_pw; + scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for (i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + scrypt_hmac_update(&work, be, 4); + scrypt_hmac_finish(&work, ti); + memcpy(u, ti, sizeof(u)); + + memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes); + out += SCRYPT_HASH_DIGEST_SIZE; + bytes -= SCRYPT_HASH_DIGEST_SIZE; + } +} + +// ---------------------------- END PBKDF2 functions ------------------------------------ + +static void +scrypt_fatal_error_default(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(1); +} + +static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; + +void +scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) { + scrypt_fatal_error = fn; +} + +typedef struct scrypt_aligned_alloc_t { + uint8_t *mem, *ptr; +} scrypt_aligned_alloc; + +#if defined(SCRYPT_TEST_SPEED) +static uint8_t *mem_base = (uint8_t *)0; +static size_t mem_bump = 0; + +/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ +static scrypt_aligned_alloc +scrypt_alloc(uint64_t size) { + scrypt_aligned_alloc aa; + if (!mem_base) { + mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); + if (!mem_base) + scrypt_fatal_error("scrypt: out of memory"); + mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + } + aa.mem = mem_base + mem_bump; + aa.ptr = aa.mem; + mem_bump += (size_t)size; + return aa; +} + +static void +scrypt_free(scrypt_aligned_alloc *aa) { + mem_bump = 0; +} +#else +static scrypt_aligned_alloc +scrypt_alloc(uint64_t size) { + static const size_t max_alloc = (size_t)-1; + scrypt_aligned_alloc aa; + size += (SCRYPT_BLOCK_BYTES - 1); + if (size > max_alloc) + scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); + aa.mem = (uint8_t *)malloc((size_t)size); + aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + if (!aa.mem) + scrypt_fatal_error("scrypt: out of memory"); + return aa; +} + +static void +scrypt_free(scrypt_aligned_alloc *aa) { + free(aa->mem); +} +#endif + + +// yacoin: increasing Nfactor gradually +unsigned char GetNfactor(unsigned int nTimestamp) { + int l = 0; + + unsigned int Nfactor = 0; + + // Yacoin defaults + unsigned int Ntimestamp = 1367991200; + unsigned int minN = 4; + unsigned int maxN = 30; + + if (strlen(jane_params) > 0) { + if (!strcmp(jane_params, "YAC") || !strcasecmp(jane_params, "Yacoin")) {} // No-Op + // + // NO WARRANTY FOR CORRECTNESS. 
Look for the int64 nChainStartTime constant + // in the src/main.cpp file of the official wallet clients as well as the + // const unsigned char minNfactor and const unsigned char maxNfactor + // + else if (!strcmp(jane_params, "YBC") || !strcasecmp(jane_params, "YBCoin")) { + // YBCoin: 1372386273, minN: 4, maxN: 30 + Ntimestamp = 1372386273; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "ZZC") || !strcasecmp(jane_params, "ZZCoin")) { + // ZcCoin: 1375817223, minN: 12, maxN: 30 + Ntimestamp = 1375817223; minN= 12; maxN= 30; + } else if (!strcmp(jane_params, "FEC") || !strcasecmp(jane_params, "FreeCoin")) { + // FreeCoin: 1375801200, minN: 6, maxN: 32 + Ntimestamp = 1375801200; minN= 6; maxN= 32; + } else if (!strcmp(jane_params, "ONC") || !strcasecmp(jane_params, "OneCoin")) { + // OneCoin: 1371119462, minN: 6, maxN: 30 + Ntimestamp = 1371119462; minN= 6; maxN= 30; + } else if (!strcmp(jane_params, "QQC") || !strcasecmp(jane_params, "QQCoin")) { + // QQCoin: 1387769316, minN: 4, maxN: 30 + Ntimestamp = 1387769316; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "GPL") || !strcasecmp(jane_params, "GoldPressedLatinum")) { + // GoldPressedLatinum:1377557832, minN: 4, maxN: 30 + Ntimestamp = 1377557832; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "MRC") || !strcasecmp(jane_params, "MicroCoin")) { + // MicroCoin:1389028879, minN: 4, maxN: 30 + Ntimestamp = 1389028879; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "APC") || !strcasecmp(jane_params, "AppleCoin")) { + // AppleCoin:1384720832, minN: 4, maxN: 30 + Ntimestamp = 1384720832; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "CPR") || !strcasecmp(jane_params, "Copperbars")) { + // Copperbars:1376184687, minN: 4, maxN: 30 + Ntimestamp = 1376184687; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "CACH") || !strcasecmp(jane_params, "CacheCoin")) { + // CacheCoin:1388949883, minN: 4, maxN: 30 + Ntimestamp = 1388949883; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "UTC") || !strcasecmp(jane_params, "UltraCoin")) { + // MicroCoin:1388361600, minN: 4, maxN: 30 + Ntimestamp = 1388361600; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "VEL") || !strcasecmp(jane_params, "VelocityCoin")) { + // VelocityCoin:1387769316, minN: 4, maxN: 30 + Ntimestamp = 1387769316; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "ITC") || !strcasecmp(jane_params, "InternetCoin")) { + // InternetCoin:1388385602, minN: 4, maxN: 30 + Ntimestamp = 1388385602; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "RAD") || !strcasecmp(jane_params, "RadioactiveCoin")) { + // InternetCoin:1389196388, minN: 4, maxN: 30 + Ntimestamp = 1389196388; minN= 4; maxN= 30; + } else { + if (sscanf(jane_params, "%u,%u,%u", &Ntimestamp, &minN, &maxN) != 3) + if (sscanf(jane_params, "%u", &Nfactor) == 1) return Nfactor; // skip bounding against minN, maxN + else applog(LOG_INFO, "Unable to parse scrypt-jane parameters: '%s'. 
Defaulting to Yacoin.", jane_params); + } + } + // determination based on the constants determined above + if (nTimestamp <= Ntimestamp) + return minN; + + unsigned long int s = nTimestamp - Ntimestamp; + while ((s >> 1) > 3) { + l += 1; + s >>= 1; + } + + s &= 3; + + int n = (l * 170 + s * 25 - 2320) / 100; + + if (n < 0) n = 0; + + if (n > 255) + printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); + + Nfactor = n; + if (NfactormaxN) return maxN; + return Nfactor; +} + +#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ + | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +static int s_Nfactor = 0; + +int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf, + uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end) +{ + const uint32_t Htarg = ptarget[7]; + + if (s_Nfactor == 0 && strlen(jane_params) > 0) + applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params); + + int Nfactor = GetNfactor(bswap_32x4(pdata[17])); + if (Nfactor > scrypt_maxN) { + scrypt_fatal_error("scrypt: N out of range"); + } + + if (Nfactor != s_Nfactor) + { + // all of this isn't very thread-safe... + opt_nfactor = (1 << (Nfactor + 1)); + + applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, opt_nfactor); + + if (s_Nfactor != 0) { + // handle N-factor increase at runtime + // by adjusting the lookup_gap by factor 2 + if (s_Nfactor == Nfactor-1) + for (int i=0; i < 8; ++i) + device_lookup_gap[i] *= 2; + } + s_Nfactor = Nfactor; + } + + int throughput = cuda_throughput(thr_id); + + if(throughput == 0) + return -1; + + gettimeofday(tv_start, NULL); + + uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] }; + uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) }; + + uint32_t n = pdata[19]; + + /* byte swap pdata into data[0]/[1] arrays */ + for (int k=0; k<2; ++k) { + for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]); + for(int i=1;i 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput); + } +#endif + } else { + n += throughput; + + cuda_scrypt_serialize(thr_id, nxt); + pre_keccak512(thr_id, nxt, nonce[nxt], throughput); + cuda_scrypt_core(thr_id, nxt, opt_nfactor); + + cuda_scrypt_flush(thr_id, nxt); + + post_keccak512(thr_id, nxt, nonce[nxt], throughput); + cuda_scrypt_done(thr_id, nxt); + + cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true); + + if(!cuda_scrypt_sync(thr_id, cur)) + { + return -1; + } + } + + if(iteration > 0) + { + for(int i=0;i -#include -#include - -static const uint32_t keypad[12] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 -}; -static const uint32_t innerpad[11] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 -}; -static const uint32_t outerpad[8] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 -}; -static const uint32_t finalblk[16] = { - 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8]; - uint32_t pad[16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 
8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; - - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, - const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); -} - - -#if HAVE_SHA256_4WAY - -static const uint32_t keypad_4way[4 * 12] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000280, 0x00000280, 0x00000280, 0x00000280 -}; -static const uint32_t innerpad_4way[4 * 11] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 -}; -static const uint32_t outerpad_4way[4 * 8] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000300, 0x00000300, 0x00000300, 0x00000300 -}; -static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 
0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[4 * 8] __attribute__((aligned(16))); - uint32_t pad[4 * 16] __attribute__((aligned(16))); - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[4 * 8] __attribute__((aligned(16))); - uint32_t ostate2[4 * 8] __attribute__((aligned(16))); - uint32_t ibuf[4 * 16] __attribute__((aligned(16))); - uint32_t obuf[4 * 16] __attribute__((aligned(16))); - int i, j; - - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); - - memcpy(ibuf, salt + 4 * 16, 4 * 16); - memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); - memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); - ibuf[4 * 4 + 0] = i + 1; - ibuf[4 * 4 + 1] = i + 1; - ibuf[4 * 4 + 2] = i + 1; - ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[4 * 16] __attribute__((aligned(16))); - int i; - - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); - memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_4WAY */ - - -#if HAVE_SHA256_8WAY - -static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8 * 8] __attribute__((aligned(32))); - uint32_t pad[8 * 16] __attribute__((aligned(32))); - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8 * 8] __attribute__((aligned(32))); - uint32_t ostate2[8 * 8] __attribute__((aligned(32))); - uint32_t ibuf[8 * 16] __attribute__((aligned(32))); - uint32_t obuf[8 * 16] __attribute__((aligned(32))); - int i, j; - - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; - - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); - ibuf[8 * 4 + 0] = i + 1; - ibuf[8 * 4 + 1] = i + 1; - ibuf[8 * 4 + 2] = i + 1; - ibuf[8 * 4 + 3] = i + 1; - ibuf[8 * 4 + 4] = i + 1; - ibuf[8 * 4 + 5] = i + 1; - ibuf[8 * 4 + 6] = i + 1; - ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[8 * 16] __attribute__((aligned(32))); - int i; - - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - - for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); -} - 
-#endif /* HAVE_SHA256_8WAY */ - - -#if defined(__x86_64__) - -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#if defined(USE_AVX2) -#undef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 21 -#define HAVE_SCRYPT_6WAY 0 -void scrypt_core_6way(uint32_t *X, uint32_t *V); -#endif - -#elif defined(__i386__) - -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); - -#elif defined(__arm__) && defined(__APCS_32__) - -static void scrypt_core(uint32_t *X, uint32_t *V); -#if defined(__ARM_NEON__) -#undef HAVE_SHA256_4WAY -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#endif - -#endif - -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) -{ - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - int i; - - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); - x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); - x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { -#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns. */ - x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); - x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); - - x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); - x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); - - x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); - x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); - - x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); - x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); - - /* Operate on rows. 
*/ - x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); - x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); - - x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); - x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); - - x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); - x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); - - x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); - x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); -#undef R - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -} - -static inline void scrypt_core(uint32_t *X, uint32_t *V) -{ - uint32_t i, j, k; - - for (i = 0; i < 1024; i++) { - memcpy(&V[i * 32], X, 128); - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } - for (i = 0; i < 1024; i++) { - j = 32 * (X[16] & 1023); - for (k = 0; k < 32; k++) - X[k] ^= V[j + k]; - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } -} - -#ifndef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -#endif - -#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) - -unsigned char *scrypt_buffer_alloc() -{ - return (unsigned char *)malloc(SCRYPT_BUFFER_SIZE); -} - -static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[8], ostate[8]; - uint32_t X[32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate, midstate, 32); - HMAC_SHA256_80_init(input, tstate, ostate); - PBKDF2_SHA256_80_128(tstate, ostate, input, X); - - scrypt_core(X, V); - - PBKDF2_SHA256_128_32(tstate, ostate, X, output); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[4 * 8] __attribute__((aligned(128))); - uint32_t ostate[4 * 8] __attribute__((aligned(128))); - uint32_t W[4 * 32] __attribute__((aligned(128))); - uint32_t X[4 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W, tstate, ostate); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; - scrypt_core(X + 0 * 32, V); - scrypt_core(X + 1 * 32, V); - scrypt_core(X + 2 * 32, V); - scrypt_core(X + 3 * 32, V); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#if HAVE_SCRYPT_3WAY - -static void scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[3 * 8], ostate[3 * 8]; - uint32_t X[3 * 32] __attribute__((aligned(64))); - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - 
HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - scrypt_core_3way(X, V); - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[12 * 8] __attribute__((aligned(128))); - uint32_t ostate[12 * 8] __attribute__((aligned(128))); - uint32_t W[12 * 32] __attribute__((aligned(128))); - uint32_t X[12 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - scrypt_core_3way(X + 0 * 96, V); - scrypt_core_3way(X + 1 * 96, V); - scrypt_core_3way(X + 2 * 96, V); - scrypt_core_3way(X + 3 * 96, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ - -#if HAVE_SCRYPT_6WAY -static void scrypt_1024_1_1_256_24way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[24 * 8] __attribute__((aligned(128))); - uint32_t ostate[24 * 8] __attribute__((aligned(128))); - uint32_t W[24 * 32] __attribute__((aligned(128))); - uint32_t X[24 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); - HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); - PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); - 
PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - scrypt_core_6way(X + 0 * 32, V); - scrypt_core_6way(X + 6 * 32, V); - scrypt_core_6way(X + 12 * 32, V); - scrypt_core_6way(X + 18 * 32, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; -} -#endif /* HAVE_SCRYPT_6WAY */ - -int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; - const uint32_t Htarg = ptarget[7]; - uint32_t throughput = scrypt_best_throughput(); - uint32_t i; - -#if HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - - do { - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if HAVE_SHA256_4WAY - if (throughput == 4) - scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_3WAY && HAVE_SHA256_4WAY - if (throughput == 12) - scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_6WAY - if (throughput == 24) - scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_3WAY - if (throughput == 3) - scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); - else -#endif - scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); - - for (i = 0; i < throughput; i++) { - if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[i * 20 + 19]; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} diff --git a/scrypt.cpp b/scrypt.cpp new file mode 100644 index 0000000..f7a3422 --- /dev/null +++ b/scrypt.cpp @@ -0,0 +1,1097 @@ +/* + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#ifdef WIN32 +#include +using namespace Concurrency; +#else +#include +#endif + +#include "miner.h" +#include "scrypt/salsa_kernel.h" +#include "scrypt/sha256.h" + +#include +#include +#include + +#include +#include +#include + +// A thin wrapper around the builtin __m128i type +class uint32x4_t +{ +public: +#if WIN32 + void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); } + void operator delete(void *p) { _aligned_free(p); } + void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); } + void operator delete[](void *p) { _aligned_free(p); } +#else + void * operator new(size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } + void operator delete(void *p) { free(p); } + void * operator new[](size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } + void operator delete[](void *p) { free(p); } +#endif + uint32x4_t() { }; + uint32x4_t(const __m128i init) { val = init; } + uint32x4_t(const uint32_t init) { val = _mm_set1_epi32((int)init); } + uint32x4_t(const uint32_t a, const uint32_t b, const uint32_t c, const uint32_t d) { val = _mm_setr_epi32((int)a,(int)b,(int)c,(int)d); } + inline operator const __m128i() const { return val; } + inline const uint32x4_t operator+(const uint32x4_t &other) const { return _mm_add_epi32(val, other); } + inline const uint32x4_t operator+(const uint32_t other) const { return _mm_add_epi32(val, _mm_set1_epi32((int)other)); } + inline uint32x4_t& operator+=(const uint32x4_t other) { val = _mm_add_epi32(val, other); return *this; } + inline uint32x4_t& operator+=(const uint32_t other) { val = _mm_add_epi32(val, _mm_set1_epi32((int)other)); return *this; } + inline const uint32x4_t operator&(const uint32_t other) const { return _mm_and_si128(val, _mm_set1_epi32((int)other)); } + inline const uint32x4_t operator&(const uint32x4_t &other) const { return _mm_and_si128(val, other); } + inline const uint32x4_t operator|(const uint32x4_t &other) const { return _mm_or_si128(val, other); } + inline const uint32x4_t operator^(const uint32x4_t &other) const { return _mm_xor_si128(val, other); } + inline const uint32x4_t operator<<(const int num) const { return _mm_slli_epi32(val, num); } + inline const uint32x4_t operator>>(const int num) const { return _mm_srli_epi32(val, num); } + inline const uint32_t operator[](const int num) const { return ((uint32_t*)&val)[num]; } + protected: + __m128i val; +}; + +// non-member overload +inline const uint32x4_t operator+(const uint32_t left, const uint32x4_t &right) { 
return _mm_add_epi32(_mm_set1_epi32((int)left), right); } + + +// +// Code taken from sha2.cpp and vectorized, with minimal changes where required +// Not all subroutines are actually used. +// + +#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ + | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +static __inline uint32x4_t swab32x4(const uint32x4_t &v) +{ + return bswap_32x4(v); +} + +static const uint32_t sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +void sha256_initx4(uint32x4_t *statex4) +{ + for (int i=0; i<8; ++i) + statex4[i] = sha256_h[i]; +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +void sha256_transformx4(uint32x4_t *state, const uint32x4_t *block, int swap) +{ + uint32x4_t W[64]; + uint32x4_t S[8]; + uint32x4_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + if (swap) { + for (i = 0; i < 16; i++) + W[i] = swab32x4(block[i]); + } else + memcpy(W, block, 4*64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 4*32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static const uint32_t sha256d_hash1[16] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + +static void sha256dx4(uint32x4_t *hash, uint32x4_t *data) +{ + uint32x4_t S[16]; + + sha256_initx4(S); + sha256_transformx4(S, data, 0); + sha256_transformx4(S, data + 16, 0); + for (int i=8; i<16; ++i) + S[i] = sha256d_hash1[i]; + sha256_initx4(hash); + sha256_transformx4(hash, S, 0); +} + +static inline void sha256d_preextendx4(uint32x4_t *W) +{ + W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; + W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; + W[18] = s1(W[16]) + W[11] + W[ 2]; + W[19] = s1(W[17]) + W[12] + s0(W[ 4]); + W[20] = W[13] + s0(W[ 5]) + W[ 4]; + W[21] = W[14] + s0(W[ 6]) + W[ 5]; + W[22] = W[15] + s0(W[ 7]) + W[ 6]; + W[23] = W[16] + s0(W[ 8]) + W[ 7]; + W[24] = W[17] + s0(W[ 9]) + W[ 8]; + W[25] = s0(W[10]) + W[ 9]; + W[26] = s0(W[11]) + W[10]; + W[27] = s0(W[12]) + W[11]; + W[28] = s0(W[13]) + W[12]; + W[29] = s0(W[14]) + W[13]; + W[30] = s0(W[15]) + W[14]; + W[31] = s0(W[16]) + W[15]; +} + +static inline void sha256d_prehashx4(uint32x4_t *S, const uint32x4_t *W) +{ + uint32x4_t t0, t1; + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); +} + +static inline void sha256d_msx4(uint32x4_t *hash, uint32x4_t *W, + const uint32_t *midstate, const uint32_t *prehash) +{ + uint32x4_t S[64]; + uint32x4_t t0, t1; + int i; + + S[18] = W[18]; + S[19] = W[19]; + S[20] = W[20]; + S[22] = W[22]; + S[23] = W[23]; + S[24] = W[24]; + S[30] = W[30]; + S[31] = W[31]; + + W[18] += s0(W[3]); + W[19] += W[3]; + W[20] += s1(W[18]); + W[21] = s1(W[19]); + W[22] += s1(W[20]); + W[23] += s1(W[21]); + W[24] += s1(W[22]); + W[25] = s1(W[23]) + W[18]; + W[26] = s1(W[24]) + W[19]; + W[27] = s1(W[25]) + W[20]; + W[28] = s1(W[26]) + W[21]; + W[29] = s1(W[27]) + W[22]; + W[30] += s1(W[28]) + W[23]; + W[31] += s1(W[29]) + W[24]; + for (i = 32; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + for (i=0; i<8; ++i) + S[i] = prehash[i]; + + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); 
+ RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + for (i = 0; i < 8; i++) + S[i] += midstate[i]; + + W[18] = S[18]; + W[19] = S[19]; + W[20] = S[20]; + W[22] = S[22]; + W[23] = S[23]; + W[24] = S[24]; + W[30] = S[30]; + W[31] = S[31]; + + for (i=8; i<16; ++i) + S[i] = sha256d_hash1[i]; + S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; + S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1]; + S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; + S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; + S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; + S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; + S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; + S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; + S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; + S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; + S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; + S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; + S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; + S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; + S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; + S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; + for (i = 32; i < 60; i += 2) { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + } + S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; + + sha256_initx4(hash); + + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); + RNDr(hash, S, 10); + RNDr(hash, S, 11); + RNDr(hash, S, 12); + RNDr(hash, S, 13); + RNDr(hash, S, 14); + RNDr(hash, S, 15); + RNDr(hash, S, 16); + RNDr(hash, S, 17); + RNDr(hash, S, 18); + RNDr(hash, S, 19); + RNDr(hash, S, 20); + RNDr(hash, S, 21); + RNDr(hash, S, 22); + RNDr(hash, S, 23); + RNDr(hash, S, 24); + RNDr(hash, S, 25); + RNDr(hash, S, 26); + RNDr(hash, S, 27); + RNDr(hash, S, 28); + RNDr(hash, S, 29); + RNDr(hash, S, 30); + RNDr(hash, S, 31); + RNDr(hash, S, 32); + RNDr(hash, S, 33); + RNDr(hash, S, 34); + RNDr(hash, S, 35); + RNDr(hash, S, 36); + RNDr(hash, S, 37); + RNDr(hash, S, 38); + RNDr(hash, S, 39); + RNDr(hash, S, 40); + RNDr(hash, S, 41); + RNDr(hash, S, 42); + RNDr(hash, S, 43); + RNDr(hash, S, 44); + RNDr(hash, S, 45); + RNDr(hash, S, 46); + RNDr(hash, S, 47); + 
RNDr(hash, S, 48); + RNDr(hash, S, 49); + RNDr(hash, S, 50); + RNDr(hash, S, 51); + RNDr(hash, S, 52); + RNDr(hash, S, 53); + RNDr(hash, S, 54); + RNDr(hash, S, 55); + RNDr(hash, S, 56); + + hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) + + S[57] + sha256_k[57]; + hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) + + S[58] + sha256_k[58]; + hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) + + S[59] + sha256_k[59]; + hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) + + S[60] + sha256_k[60] + + sha256_h[7]; +} + +// +// Code taken from original scrypt.cpp and vectorized with minimal changes. +// + +static const uint32x4_t keypadx4[12] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 +}; +static const uint32x4_t innerpadx4[11] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 +}; +static const uint32x4_t outerpadx4[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; +static const uint32x4_t finalblkx4[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +static inline void HMAC_SHA256_80_initx4(const uint32x4_t *key, + uint32x4_t *tstate, uint32x4_t *ostate) +{ + uint32x4_t ihash[8]; + uint32x4_t pad[16]; + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 4*16); + memcpy(pad + 4, keypadx4, 4*48); + sha256_transformx4(tstate, pad, 0); + memcpy(ihash, tstate, 4*32); + + sha256_initx4(ostate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transformx4(ostate, pad, 0); + + sha256_initx4(tstate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + sha256_transformx4(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128x4(const uint32x4_t *tstate, + const uint32x4_t *ostate, const uint32x4_t *salt, uint32x4_t *output) +{ + uint32x4_t istate[8], ostate2[8]; + uint32x4_t ibuf[16], obuf[16]; + int i, j; + + memcpy(istate, tstate, 4*32); + sha256_transformx4(istate, salt, 0); + + memcpy(ibuf, salt + 16, 4*16); + memcpy(ibuf + 5, innerpadx4, 4*44); + memcpy(obuf + 8, outerpadx4, 4*32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 4*32); + ibuf[4] = i + 1; + sha256_transformx4(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 4*32); + sha256_transformx4(ostate2, obuf, 0); + for (j = 0; j < 8; j++) + output[8 * i + j] = swab32x4(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32x4(uint32x4_t *tstate, uint32x4_t *ostate, + const uint32x4_t *salt, uint32x4_t *output) +{ + uint32x4_t buf[16]; + int i; + + sha256_transformx4(tstate, salt, 1); + sha256_transformx4(tstate, salt + 16, 1); + sha256_transformx4(tstate, finalblkx4, 0); + memcpy(buf, tstate, 4*32); + memcpy(buf + 8, outerpadx4, 4*32); + + sha256_transformx4(ostate, buf, 0); + for (i = 0; i < 8; i++) + output[i] = swab32x4(ostate[i]); +} + + +// +// Original scrypt.cpp HMAC SHA256 functions +// + +static const uint32_t keypad[12] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 +}; +static const uint32_t innerpad[11] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 +}; +static const uint32_t outerpad[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; +static const uint32_t finalblk[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[8]; + uint32_t pad[16]; + int i; + + /* 
tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); + sha256_transform(tstate, pad, 0); + memcpy(ihash, tstate, 32); + + sha256_init(ostate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform(ostate, pad, 0); + + sha256_init(tstate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + sha256_transform(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i, j; + + memcpy(istate, tstate, 32); + sha256_transform(istate, salt, 0); + + memcpy(ibuf, salt + 16, 16); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + sha256_transform(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 32); + sha256_transform(ostate2, obuf, 0); + for (j = 0; j < 8; j++) + output[8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, + const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[16]; + + sha256_transform(tstate, salt, 1); + sha256_transform(tstate, salt + 16, 1); + sha256_transform(tstate, finalblk, 0); + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); + + sha256_transform(ostate, buf, 0); + for (int i = 0; i < 8; i++) + output[i] = swab32(ostate[i]); +} + +static int lastFactor = 0; +// +// Scrypt proof of work algorithm +// using SSE2 vectorized HMAC SHA256 on CPU and +// a salsa core implementation on GPU with CUDA +// + +int scanhash_scrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf, + uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end) +{ + int result = 0; + int throughput = cuda_throughput(thr_id); + + if(throughput == 0) + return -1; + + gettimeofday(tv_start, NULL); + + uint32_t n = pdata[19]; + const uint32_t Htarg = ptarget[7]; + + // no default set with --cputest + if (opt_nfactor == 0) opt_nfactor = 9; + uint32_t N = (1UL<<(opt_nfactor+1)); + uint32_t *scratch = new uint32_t[N*32]; // scratchbuffer for CPU based validation + + uint32_t nonce[2]; + uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) }; + uint32_t* X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) }; + + bool sha_on_cpu = (parallel < 2); + bool sha_multithreaded = (parallel == 1); + uint32x4_t* datax4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 20] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 20] : NULL }; + uint32x4_t* hashx4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL }; + uint32x4_t* tstatex4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL }; + uint32x4_t* ostatex4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL }; + uint32x4_t* Xx4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 32] : NULL, sha_on_cpu ? 
new uint32x4_t[throughput/4 * 32] : NULL }; + + // log n-factor + if (!opt_quiet && lastFactor != opt_nfactor) { + applog(LOG_WARNING, "scrypt factor set to %d (%u)", opt_nfactor, N); + lastFactor = opt_nfactor; + } + + uint32_t _ALIGN(64) midstate[8]; + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + + if (sha_on_cpu) { + for (int i = 0; i < throughput/4; ++i) { + for (int j = 0; j < 20; j++) { + datax4[0][20*i+j] = uint32x4_t(pdata[j]); + datax4[1][20*i+j] = uint32x4_t(pdata[j]); + } + } + } + else prepare_sha256(thr_id, pdata, midstate); + + int cur = 1, nxt = 0; + int iteration = 0; + int num_shares = (4*opt_n_threads) || 1; // opt_n_threads can be 0 with --cputest + int share_workload = ((((throughput + num_shares-1) / num_shares) + 3) / 4) * 4; + + do { + nonce[nxt] = n; + + if (sha_on_cpu) + { + for (int i = 0; i < throughput/4; i++) { + datax4[nxt][i * 20 + 19] = uint32x4_t(n+0, n+1, n+2, n+3); + n += 4; + } + if (sha_multithreaded) + { +#ifdef WIN32 + parallel_for (0, num_shares, [&](int share) { + for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) { + for (int l = 0; l < 8; l++) + tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]); + HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]); + PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]); + } + } ); +#else + #pragma omp parallel for + for (int share = 0; share < num_shares; share++) { + for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) { + for (int l = 0; l < 8; l++) + tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]); + HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]); + PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]); + } + } +#endif + } + else /* sha_multithreaded */ + { + for (int k = 0; k < throughput/4; k++) { + for (int l = 0; l < 8; l++) + tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]); + HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]); + PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]); + } + } + + for (int i = 0; i < throughput/4; i++) { + for (int j = 0; j < 32; j++) { + uint32x4_t &t = Xx4[nxt][i * 32 + j]; + X[nxt][(4*i+0)*32+j] = t[0]; X[nxt][(4*i+1)*32+j] = t[1]; + X[nxt][(4*i+2)*32+j] = t[2]; X[nxt][(4*i+3)*32+j] = t[3]; + } + } + + cuda_scrypt_serialize(thr_id, nxt); + cuda_scrypt_HtoD(thr_id, X[nxt], nxt); + + cuda_scrypt_core(thr_id, nxt, N); + cuda_scrypt_done(thr_id, nxt); + + cuda_scrypt_DtoH(thr_id, X[nxt], nxt, false); + cuda_scrypt_flush(thr_id, nxt); + + if(!cuda_scrypt_sync(thr_id, cur)) + { + result = -1; + break; + } + + for (int i = 0; i < throughput/4; i++) { + for (int j = 0; j < 32; j++) { + Xx4[cur][i * 32 + j] = uint32x4_t( + X[cur][(4*i+0)*32+j], X[cur][(4*i+1)*32+j], + X[cur][(4*i+2)*32+j], X[cur][(4*i+3)*32+j] + ); + } + } + + if (sha_multithreaded) + { +#ifdef WIN32 + parallel_for (0, num_shares, [&](int share) { + for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) { + PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]); + } + } ); +#else + #pragma omp parallel for + for (int share = 0; share < num_shares; share++) { + for (int k = (share_workload*share)/4; k < 
(share_workload*(share+1))/4 && k < throughput/4; k++) { + PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]); + } + } +#endif + } else { + + for (int k = 0; k < throughput/4; k++) { + PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]); + } + } + + for (int i = 0; i < throughput/4; i++) { + for (int j = 0; j < 8; j++) { + uint32x4_t &t = hashx4[cur][i * 8 + j]; + hash[cur][(4*i+0)*8+j] = t[0]; hash[cur][(4*i+1)*8+j] = t[1]; + hash[cur][(4*i+2)*8+j] = t[2]; hash[cur][(4*i+3)*8+j] = t[3]; + } + } + } + else /* sha_on_cpu */ + { + n += throughput; + + cuda_scrypt_serialize(thr_id, nxt); + pre_sha256(thr_id, nxt, nonce[nxt], throughput); + + cuda_scrypt_core(thr_id, nxt, N); + cuda_scrypt_flush(thr_id, nxt); // required here ? + + post_sha256(thr_id, nxt, throughput); + cuda_scrypt_done(thr_id, nxt); + + cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true); + cuda_scrypt_flush(thr_id, nxt); // required here ? + + if (!cuda_scrypt_sync(thr_id, cur)) { + printf("error\n"); + result = -1; + break; + } + } + + if (iteration > 0 || opt_n_threads == 0) + { + for (int i = 0; i < throughput; i++) + { + if (hash[cur][i * 8 + 7] <= Htarg && fulltest(hash[cur] + i * 8, ptarget)) + { + // CPU based validation to rule out GPU errors (scalar CPU code) + uint32_t _ALIGN(64) inp[32], ref[32], tstate[8], ostate[8], refhash[8], ldata[20]; + + memcpy(ldata, pdata, 80); ldata[19] = nonce[cur] + i; + memcpy(tstate, midstate, 32); + HMAC_SHA256_80_init(ldata, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, ldata, inp); + computeGold(inp, ref, (uchar*)scratch); + bool good = true; + + if (sha_on_cpu) { + if (memcmp(&X[cur][i * 32], ref, 32*sizeof(uint32_t)) != 0) good = false; + } else { + PBKDF2_SHA256_128_32(tstate, ostate, ref, refhash); + if (memcmp(&hash[cur][i * 8], refhash, 32) != 0) good = false; + } + + if (!good) + applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur); + else { + *hashes_done = n - pdata[19]; + pdata[19] = nonce[cur] + i; + result = 1; + goto byebye; + } + } + } + } + + cur = (cur+1)&1; + nxt = (nxt+1)&1; + ++iteration; + + //printf("n=%d, thr=%d, max=%d, rest=%d\n", n, throughput, max_nonce, work_restart[thr_id].restart); + } while (n <= max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19]; + pdata[19] = n; +byebye: + delete[] datax4[0]; delete[] datax4[1]; delete[] hashx4[0]; delete[] hashx4[1]; + delete[] tstatex4[0]; delete[] tstatex4[1]; delete[] ostatex4[0]; delete[] ostatex4[1]; + delete[] Xx4[0]; delete[] Xx4[1]; + delete [] scratch; + gettimeofday(tv_end, NULL); + return result; +} + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), x1 = (B[ 1] ^= C[ 1]), x2 = (B[ 2] ^= C[ 2]), x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), x5 = (B[ 5] ^= C[ 5]), x6 = (B[ 6] ^= C[ 6]), x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), x9 = (B[ 9] ^= C[ 9]), xa = (B[10] ^= C[10]), xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), xd = (B[13] ^= C[13]), xe = (B[14] ^= C[14]), xf = (B[15] ^= C[15]); + + /* Operate on columns. 
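   (The four column/row passes that follow are the unrolled double-rounds of
   Salsa20/8, scrypt's block-mix primitive; this scalar routine is what
   computeGold() uses for the CPU-side validation of GPU results.)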
*/ + x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. 
*/ + x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18); + + B[ 0] += x0; B[ 1] += x1; B[ 2] += x2; B[ 3] += x3; B[ 4] += x4; B[ 5] += x5; B[ 6] += x6; B[ 7] += x7; + B[ 8] += x8; B[ 9] += x9; B[10] += xa; B[11] += xb; B[12] += xc; B[13] += xd; B[14] += xe; B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor + */ +static void scrypt_core(uint32_t *X, uint32_t *V, int N) +{ + for (int i = 0; i < N; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (int i = 0; i < N; i++) { + uint32_t j = 32 * (X[16] & (N - 1)); + for (uint8_t k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + +/** + * Compute reference data set on the CPU + * @param input input data as provided to device + * @param reference reference data, computed but preallocated + * @param scratchpad scrypt scratchpad + **/ +void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad) +{ + uint32_t X[32] = { 0 }; + uint32_t *V = (uint32_t*) scratchpad; + int N = (1<<(opt_nfactor+1)); // default 9 = 1024 + + for (int k = 0; k < 32; k++) + X[k] = input[k]; + + scrypt_core(X, V, N); + + for (int k = 0; k < 32; k++) + reference[k] = X[k]; +} + +static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N) +{ + uint32_t tstate[8], ostate[8]; + uint32_t X[32] = { 0 }; + uint32_t *V = (uint32_t *) scratchpad; + + memcpy(tstate, midstate, 32); + HMAC_SHA256_80_init(input, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, input, X); + + scrypt_core(X, V, N); + + PBKDF2_SHA256_128_32(tstate, ostate, X, output); +} + +/* cputest */ +void scrypthash(void* output, const void* input) +{ + uint32_t _ALIGN(64) X[32], ref[32] = { 0 }, tstate[8], ostate[8], midstate[8]; + uint32_t _ALIGN(64) data[20]; + uchar *scratchbuf = (uchar *) calloc(4 * 128 + 63, 1024); + + // no default set with --cputest + if (opt_nfactor == 0) opt_nfactor = 9; + + memcpy(data, input, 80); + + sha256_init(midstate); + sha256_transform(midstate, data, 0); /* ok */ + + memcpy(tstate, midstate, 32); + HMAC_SHA256_80_init(data, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, data, X); /* ok */ + + if (scratchbuf) { + computeGold(X, ref, scratchbuf); + PBKDF2_SHA256_128_32(tstate, ostate, ref, (uint32_t*) output); + } else { + memset(output, 0, 32); + } + + free(scratchbuf); +} + +#define SCRYPT_MAX_WAYS 1 +/* cputest */ +void scrypthash2(void* output, const void* input) +{ + uint32_t midstate[8] = { 0 }; + uint32_t data[SCRYPT_MAX_WAYS * 20] = { 0 }; + uint32_t hash[SCRYPT_MAX_WAYS * 8] = { 0 }; + uint32_t N = 1U << ((opt_nfactor ? 
opt_nfactor : 9) + 1); // default 1024 + + uchar* scratch = (uchar*) calloc(4 * 128 + 63, N); // scrypt_buffer_alloc(N); + + memcpy(data, input, 80); + + sha256_init(midstate); + sha256_transform(midstate, data, 0); + + scrypt_1024_1_1_256(data, hash, midstate, scratch, N); + + memcpy(output, hash, 32); + + free(scratch); +} diff --git a/scrypt/blake.cu b/scrypt/blake.cu new file mode 100644 index 0000000..bcaa965 --- /dev/null +++ b/scrypt/blake.cu @@ -0,0 +1,454 @@ +// +// =============== BLAKE part on nVidia GPU ====================== +// +// This is the generic "default" implementation when no architecture +// specific implementation is available in the kernel. +// +// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64 +// +// TODO: CUDA porting work remains to be done. +// + +#include +#include + +#include "cuda_runtime.h" +#include "salsa_kernel.h" +#include "miner.h" + +typedef uint32_t sph_u32; +#define SPH_C32(x) ((sph_u32)(x)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +__constant__ uint64_t ptarget64[4]; +__constant__ uint32_t pdata[20]; + +// define some error checking macros +#undef checkCudaErrors + +#if WIN32 +#define DELIMITER '/' +#else +#define DELIMITER '/' +#endif +#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) + +#define checkCudaErrors(x) \ +{ \ + cudaGetLastError(); \ + x; \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) \ + applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \ +} + +// from salsa_kernel.cu +extern std::map context_idata[2]; +extern std::map context_odata[2]; +extern std::map context_streams[2]; +extern std::map context_hash[2]; + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x) +{ + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). 
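+ * (On the little-endian GPU this amounts to a byte swap via
+ * cuda_sph_bswap32 followed by a plain 32-bit store.)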
+ * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static __device__ void +cuda_sph_enc32be(void *dst, sph_u32 val) +{ + *(sph_u32 *)dst = cuda_sph_bswap32(val); +} + +#define Z00 0 +#define Z01 1 +#define Z02 2 +#define Z03 3 +#define Z04 4 +#define Z05 5 +#define Z06 6 +#define Z07 7 +#define Z08 8 +#define Z09 9 +#define Z0A A +#define Z0B B +#define Z0C C +#define Z0D D +#define Z0E E +#define Z0F F + +#define Z10 E +#define Z11 A +#define Z12 4 +#define Z13 8 +#define Z14 9 +#define Z15 F +#define Z16 D +#define Z17 6 +#define Z18 1 +#define Z19 C +#define Z1A 0 +#define Z1B 2 +#define Z1C B +#define Z1D 7 +#define Z1E 5 +#define Z1F 3 + +#define Z20 B +#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define Z87 8 +#define Z88 C +#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + +#define CSx(r, i) CSx_(Z ## r ## i) +#define CSx_(n) CSx__(n) +#define CSx__(n) CS ## n + +#define CS0 SPH_C32(0x243F6A88) +#define CS1 SPH_C32(0x85A308D3) +#define CS2 SPH_C32(0x13198A2E) +#define CS3 SPH_C32(0x03707344) +#define CS4 SPH_C32(0xA4093822) +#define CS5 SPH_C32(0x299F31D0) +#define CS6 SPH_C32(0x082EFA98) +#define CS7 SPH_C32(0xEC4E6C89) +#define CS8 SPH_C32(0x452821E6) +#define CS9 SPH_C32(0x38D01377) +#define CSA SPH_C32(0xBE5466CF) +#define CSB SPH_C32(0x34E90C6C) +#define CSC SPH_C32(0xC0AC29B7) +#define CSD SPH_C32(0xC97C50DD) +#define CSE SPH_C32(0x3F84D5B5) +#define CSF SPH_C32(0xB5470917) + +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = 
SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#define ROUND_S(r) do { \ + GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ + } while (0) + +#define COMPRESS32 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = input[0]; \ + M1 = input[1]; \ + M2 = input[2]; \ + M3 = input[3]; \ + M4 = input[4]; \ + M5 = input[5]; \ + M6 = input[6]; \ + M7 = input[7]; \ + M8 = input[8]; \ + M9 = input[9]; \ + MA = input[10]; \ + MB = input[11]; \ + MC = input[12]; \ + MD = input[13]; \ + ME = input[14]; \ + MF = input[15]; \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +__global__ void cuda_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate ) +{ + uint32_t input[16]; + uint64_t output[4]; + +#pragma unroll 16 + for (int i=0; i < 16; ++i) input[i] = pdata[i]; + + sph_u32 H0 = 0x6A09E667; + sph_u32 H1 = 0xBB67AE85; + sph_u32 H2 = 0x3C6EF372; + sph_u32 H3 = 0xA54FF53A; + sph_u32 H4 = 0x510E527F; + sph_u32 H5 = 0x9B05688C; + sph_u32 H6 = 0x1F83D9AB; + sph_u32 H7 = 0x5BE0CD19; + sph_u32 S0 = 0; + sph_u32 S1 = 0; + sph_u32 S2 = 0; + sph_u32 S3 = 0; + sph_u32 T0 = 0; + sph_u32 T1 = 0; + T0 = SPH_T32(T0 + 512); + COMPRESS32; + +#pragma unroll 3 + for (int i=0; i < 3; ++i) input[i] = pdata[16+i]; + input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + input[4] = 0x80000000; +#pragma unroll 8 + for (int i=5; i < 13; ++i) input[i] = 0; + input[13] = 0x00000001; + input[14] = T1; + input[15] = T0 + 128; + + T0 = SPH_T32(T0 + 128); + COMPRESS32; + + cuda_sph_enc32be((unsigned char*)output + 4*6, H6); + cuda_sph_enc32be((unsigned char*)output + 4*7, H7); + if (validate || output[3] <= ptarget64[3]) + { + // this data is only needed when we actually need to save the hashes + cuda_sph_enc32be((unsigned char*)output + 4*0, H0); + cuda_sph_enc32be((unsigned char*)output + 4*1, H1); + cuda_sph_enc32be((unsigned char*)output + 4*2, H2); + cuda_sph_enc32be((unsigned char*)output + 4*3, H3); + cuda_sph_enc32be((unsigned char*)output + 4*4, H4); + cuda_sph_enc32be((unsigned char*)output + 4*5, H5); + } + + if (validate) + { + g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); +#pragma unroll 4 + for (int i=0; i < 4; ++i) g_out[i] = output[i]; + } + + if (output[3] <= ptarget64[3]) { + uint64_t *g_good64 = (uint64_t*)g_good; + if (output[3] < g_good64[3]) { + g_good64[3] = output[3]; + g_good64[2] = 
output[2]; + g_good64[1] = output[1]; + g_good64[0] = output[0]; + g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + } + } +} + +static bool init[MAX_GPUS] = { 0 }; +static std::map context_good[2]; + +bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) +{ + if (!init[thr_id]) + { + // allocate pinned host memory for good hashes + uint32_t *tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; + + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 80, 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 32, 0, cudaMemcpyHostToDevice)); + + return context_good[0][thr_id] && context_good[1][thr_id]; +} + +void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) +{ + checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); + + cuda_blake256_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); + + // copy hashes from device memory to host (ALL hashes, lots of data...) + if (do_d2h && hash != NULL) { + size_t mem_size = throughput * sizeof(uint32_t) * 8; + checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } + else if (hash != NULL) { + // asynchronous copy of winning nonce (just 4 bytes...) + checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } +} diff --git a/scrypt/code/scrypt-conf.h b/scrypt/code/scrypt-conf.h new file mode 100644 index 0000000..46685a5 --- /dev/null +++ b/scrypt/code/scrypt-conf.h @@ -0,0 +1,28 @@ +/* + pick the best algo at runtime or compile time? + ---------------------------------------------- + SCRYPT_CHOOSE_COMPILETIME (gcc only!) 
+ SCRYPT_CHOOSE_RUNTIME +*/ +#define SCRYPT_CHOOSE_RUNTIME + + +/* + hash function to use + ------------------------------- + SCRYPT_BLAKE256 + SCRYPT_BLAKE512 + SCRYPT_SHA256 + SCRYPT_SHA512 + SCRYPT_SKEIN512 +*/ +//#define SCRYPT_SHA256 + + +/* + block mixer to use + ----------------------------- + SCRYPT_CHACHA + SCRYPT_SALSA +*/ +//#define SCRYPT_SALSA diff --git a/scrypt/code/scrypt-jane-chacha.h b/scrypt/code/scrypt-jane-chacha.h new file mode 100644 index 0000000..7536004 --- /dev/null +++ b/scrypt/code/scrypt-jane-chacha.h @@ -0,0 +1,58 @@ +#define SCRYPT_MIX_BASE "ChaCha20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_chacha.h" + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN chacha_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t cpuflags = detect_cpu(); + size_t flags = 0; + + return flags; +} +#endif + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_CHACHA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/scrypt/code/scrypt-jane-mix_chacha.h b/scrypt/code/scrypt-jane-mix_chacha.h new file mode 100644 index 0000000..85ee9c1 --- /dev/null +++ b/scrypt/code/scrypt-jane-mix_chacha.h @@ -0,0 +1,69 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "ChaCha20/8 Ref" + +#undef SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_BASIC + +static void +chacha_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + a += b; t = d^a; d = ROTL32(t,16); \ + c += d; t = b^c; b = ROTL32(t,12); \ + a += b; t = d^a; d = ROTL32(t, 8); \ + c += d; t = b^c; b = ROTL32(t, 7); + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x1, x5, x9,x13) + quarter( x2, x6,x10,x14) + quarter( x3, x7,x11,x15) + quarter( x0, x5,x10,x15) + quarter( x1, x6,x11,x12) + quarter( x2, x7, x8,x13) + quarter( x3, x4, x9,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; 
+ state[15] += x15; + + #undef quarter +} + +#endif \ No newline at end of file diff --git a/scrypt/code/scrypt-jane-portable-x86.h b/scrypt/code/scrypt-jane-portable-x86.h new file mode 100644 index 0000000..44f97f9 --- /dev/null +++ b/scrypt/code/scrypt-jane-portable-x86.h @@ -0,0 +1,32 @@ + +typedef enum cpu_flags_x86_t { }cpu_flags_x86; + +typedef enum cpu_vendors_x86_t { + cpu_nobody, + cpu_intel, + cpu_amd +} cpu_vendors_x86; + +typedef struct x86_regs_t { + uint32_t eax, ebx, ecx, edx; +} x86_regs; + + +#if defined(SCRYPT_TEST_SPEED) +size_t cpu_detect_mask = (size_t)-1; +#endif + +static size_t +detect_cpu(void) { + size_t cpu_flags = 0; + return cpu_flags; +} + +#if defined(SCRYPT_TEST_SPEED) +static const char * +get_top_cpuflag_desc(size_t flag) { + return "Basic"; +} +#endif + +#define asm_calling_convention diff --git a/scrypt/code/scrypt-jane-portable.h b/scrypt/code/scrypt-jane-portable.h new file mode 100644 index 0000000..ef5b93d --- /dev/null +++ b/scrypt/code/scrypt-jane-portable.h @@ -0,0 +1,284 @@ +/* determine os */ +#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) + #include + #include + #define OS_WINDOWS +#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) + #include + #include + #include + + #define OS_SOLARIS +#else + #include + #include + #include /* need this to define BSD */ + #include + #include + + #define OS_NIX + #if defined(__linux__) + #include + #define OS_LINUX + #elif defined(BSD) + #define OS_BSD + + #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) + #define OS_OSX + #elif defined(macintosh) || defined(Macintosh) + #define OS_MAC + #elif defined(__OpenBSD__) + #define OS_OPENBSD + #endif + #endif +#endif + + +/* determine compiler */ +#if defined(_MSC_VER) + #define COMPILER_MSVC _MSC_VER + #if ((COMPILER_MSVC > 1200) || defined(_mm_free)) + #define COMPILER_MSVC6PP_AND_LATER + #endif + #if (COMPILER_MSVC >= 1500) + #define COMPILER_HAS_TMMINTRIN + #endif + + #pragma warning(disable : 4127) /* conditional expression is constant */ + #pragma warning(disable : 4100) /* unreferenced formal parameter */ + + #ifndef _CRT_SECURE_NO_WARNINGS + #define _CRT_SECURE_NO_WARNINGS + #endif + + #include + #include /* _rotl */ + #include + + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef signed int int32_t; + typedef unsigned __int64 uint64_t; + typedef signed __int64 int64_t; + + #define ROTL32(a,b) _rotl(a,b) + #define ROTR32(a,b) _rotr(a,b) + #define ROTL64(a,b) _rotl64(a,b) + #define ROTR64(a,b) _rotr64(a,b) + #undef NOINLINE + #define NOINLINE __declspec(noinline) + #undef INLINE + #define INLINE __forceinline + #undef FASTCALL + #define FASTCALL __fastcall + #undef CDECL + #define CDECL __cdecl + #undef STDCALL + #define STDCALL __stdcall + #undef NAKED + #define NAKED __declspec(naked) + #define MM16 __declspec(align(16)) +#endif +#if defined(__ICC) + #define COMPILER_INTEL +#endif +#if defined(__GNUC__) + #if (__GNUC__ >= 3) + #define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__ + #else + #define COMPILER_GCC_PATCHLEVEL 0 + #endif + #define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) + #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) + #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) + #define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b))) + #undef NOINLINE + #if (COMPILER_GCC >= 30000) + #define NOINLINE 
__attribute__((noinline)) + #else + #define NOINLINE + #endif + #undef INLINE + #if (COMPILER_GCC >= 30000) + #define INLINE __attribute__((always_inline)) + #else + #define INLINE inline + #endif + #undef FASTCALL + #if (COMPILER_GCC >= 30400) + #define FASTCALL __attribute__((fastcall)) + #else + #define FASTCALL + #endif + #undef CDECL + #define CDECL __attribute__((cdecl)) + #undef STDCALL + #define STDCALL __attribute__((stdcall)) + #define MM16 __attribute__((aligned(16))) + #include +#endif +#if defined(__MINGW32__) || defined(__MINGW64__) + #define COMPILER_MINGW +#endif +#if defined(__PATHCC__) + #define COMPILER_PATHCC +#endif + +#define OPTIONAL_INLINE +#if defined(OPTIONAL_INLINE) + #undef OPTIONAL_INLINE + #define OPTIONAL_INLINE INLINE +#else + #define OPTIONAL_INLINE +#endif + +#define CRYPTO_FN NOINLINE STDCALL + +/* determine cpu */ +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64) + #define CPU_X86_64 +#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) + #define CPU_X86 500 +#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400)) + #define CPU_X86 400 +#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__) + #define CPU_X86 300 +#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64) + #define CPU_IA64 +#endif + +#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9) + #define CPU_SPARC + #if defined(__sparcv9) + #define CPU_SPARC64 + #endif +#endif + +#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64)) + #define CPU_64BITS + #undef FASTCALL + #define FASTCALL + #undef CDECL + #define CDECL + #undef STDCALL + #define STDCALL +#endif + +#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC) + #define CPU_PPC + #if defined(_ARCH_PWR7) + #define CPU_POWER7 + #elif defined(__64BIT__) + #define CPU_PPC64 + #else + #define CPU_PPC32 + #endif +#endif + +#if defined(__hppa__) || defined(__hppa) + #define CPU_HPPA +#endif + +#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) + #define CPU_ALPHA +#endif + +/* endian */ + +#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \ + (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \ + (defined(CPU_X86) || defined(CPU_X86_64)) || \ + (defined(vax) || defined(MIPSEL) || defined(_MIPSEL))) +#define CPU_LE +#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \ + (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \ + (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB)) +#define CPU_BE +#else + /* unknown endian! 
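   (neither CPU_LE nor CPU_BE gets defined in this case; the ROMix code then
   relies on the runtime byte-order test in scrypt_romix_convert_endian(),
   defined in scrypt-jane-romix-basic.h)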
*/ +#endif + + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U8TO32_LE(p) \ + (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U32TO8_LE(p, v) \ + (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); + +#define U8TO64_BE(p) \ + (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) + +#define U8TO64_LE(p) \ + (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (uint32_t)((v) )); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); + +#define U32_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ + (v) = ((v) << 16) | ((v) >> 16); \ +} + +#define U64_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \ + (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ + (v) = ((v) << 32) | ((v) >> 32); \ +} + +static int +scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { + uint32_t differentbits = 0; + while (len--) + differentbits |= (*x++ ^ *y++); + return (1 & ((differentbits - 1) >> 8)); +} + +void +scrypt_ensure_zero(void *p, size_t len) { +#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) + __stosb((unsigned char *)p, 0, len); +#elif (defined(CPU_X86) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushl %%edi;\n" + "pushl %%ecx;\n" + "rep stosb;\n" + "popl %%ecx;\n" + "popl %%edi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushq %%rdi;\n" + "pushq %%rcx;\n" + "rep stosb;\n" + "popq %%rcx;\n" + "popq %%rdi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#else + volatile uint8_t *b = (volatile uint8_t *)p; + size_t i; + for (i = 0; i < len; i++) + b[i] = 0; +#endif +} + +#include "scrypt-jane-portable-x86.h" + diff --git a/scrypt/code/scrypt-jane-romix-basic.h b/scrypt/code/scrypt-jane-romix-basic.h new file mode 100644 index 0000000..1cdb3fb --- /dev/null +++ b/scrypt/code/scrypt-jane-romix-basic.h @@ -0,0 +1,67 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +/* function type returned by scrypt_getROMix, used with cpu detection */ +typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); +#endif + +/* romix pre/post nop function */ +static void asm_calling_convention +scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { +} + +/* romix pre/post endian conversion function */ +static void asm_calling_convention +scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { +#if !defined(CPU_LE) + static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; + size_t i; + if (endian_test.w == 0x100) { + nblocks *= SCRYPT_BLOCK_WORDS; + for (i = 0; i < nblocks; i++) { + SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); + } + } +#endif +} + +/* chunkmix test function */ +typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t 
*Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); + +static int +scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { + /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ + const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; + scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; + uint8_t final[16]; + size_t i; + + for (i = 0; i < words; i++) { + v = (scrypt_mix_word_t)i; + v = (v << 8) | v; + v = (v << 16) | v; + chunk[0][i] = v; + } + + prefn(chunk[0], blocks); + mixfn(chunk[1], chunk[0], NULL, r); + postfn(chunk[1], blocks); + + /* grab the last 16 bytes of the final block */ + for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { + SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); + } + + return scrypt_verify(expected, final, 16); +} + +/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ +static scrypt_mix_word_t * +scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { + return base + (i * len); +} + +/* returns a pointer to block i */ +static scrypt_mix_word_t * +scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { + return base + (i * SCRYPT_BLOCK_WORDS); +} diff --git a/scrypt/code/scrypt-jane-romix-template.h b/scrypt/code/scrypt-jane-romix-template.h new file mode 100644 index 0000000..7879c58 --- /dev/null +++ b/scrypt/code/scrypt-jane-romix-template.h @@ -0,0 +1,179 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) + +#if defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_ROMIX_FN +#define SCRYPT_ROMIX_FN scrypt_ROMix +#endif + +#undef SCRYPT_HAVE_ROMIX +#define SCRYPT_HAVE_ROMIX + +#if !defined(SCRYPT_CHUNKMIX_FN) + +#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic + +/* + Bout = ChunkMix(Bin) + + 2*r: number of blocks in the chunk +*/ +static void asm_calling_convention +SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { + scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; + uint32_t i, j, blocksPerChunk = r * 2, half = 0; + + /* 1: X = B_{2r - 1} */ + block = scrypt_block(Bin, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] = block[i]; + + if (Bxor) { + block = scrypt_block(Bxor, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] ^= block[i]; + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + block = scrypt_block(Bin, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + + if (Bxor) { + block = scrypt_block(Bxor, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + } + SCRYPT_MIX_FN(X); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + block = scrypt_block(Bout, (i / 2) + half); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + block[j] = X[j]; + } +} +#endif + +/* + X = ROMix(X) + + X: chunk to mix + Y: scratch chunk + N: number of rounds + V[N]: array of chunks to randomly index in to + 2*r: number of blocks in a chunk +*/ + +static void NOINLINE FASTCALL +SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * 
chunkWords]*/, uint32_t N, uint32_t r) { + uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; + scrypt_mix_word_t *block = V; + + SCRYPT_ROMIX_TANGLE_FN(X, r * 2); + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); + for (i = 0; i < N - 1; i++, block += chunkWords) { + /* 3: V_i = X */ + /* 4: X = H(X) */ + SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); + } + SCRYPT_CHUNKMIX_FN(X, block, NULL, r); + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j = Integerify(X) % N */ + j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); + + /* 7: j = Integerify(Y) % N */ + j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); + } + + /* 10: B' = X */ + /* implicit */ + + SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); +} + +/* + * Special version with hard-coded r = 1 + * - mikaelh + */ +static void NOINLINE FASTCALL +scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) { + const uint32_t r = 1; + uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; + scrypt_mix_word_t *block = V; + + SCRYPT_ROMIX_TANGLE_FN(X, r * 2); + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); + for (i = 0; i < N - 1; i++, block += chunkWords) { + /* 3: V_i = X */ + /* 4: X = H(X) */ +#ifdef SCRYPT_CHUNKMIX_1_FN + SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block); +#else + SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); +#endif + } +#ifdef SCRYPT_CHUNKMIX_1_FN + SCRYPT_CHUNKMIX_1_FN(X, block); +#else + SCRYPT_CHUNKMIX_FN(X, block, NULL, r); +#endif + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j = Integerify(X) % N */ + j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ +#ifdef SCRYPT_CHUNKMIX_1_XOR_FN + SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords)); +#else + SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); +#endif + + /* 7: j = Integerify(Y) % N */ + j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ +#ifdef SCRYPT_CHUNKMIX_1_XOR_FN + SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords)); +#else + SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); +#endif + } + + /* 10: B' = X */ + /* implicit */ + + SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); +} + +#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ + + +#undef SCRYPT_CHUNKMIX_FN +#undef SCRYPT_ROMIX_FN +#undef SCRYPT_MIX_FN +#undef SCRYPT_ROMIX_TANGLE_FN +#undef SCRYPT_ROMIX_UNTANGLE_FN + diff --git a/scrypt/code/scrypt-jane-romix.h b/scrypt/code/scrypt-jane-romix.h new file mode 100644 index 0000000..478e9cb --- /dev/null +++ b/scrypt/code/scrypt-jane-romix.h @@ -0,0 +1 @@ +#include "scrypt-jane-chacha.h" diff --git a/scrypt/fermi_kernel.cu b/scrypt/fermi_kernel.cu new file mode 100644 index 0000000..c7f9026 --- /dev/null +++ b/scrypt/fermi_kernel.cu @@ -0,0 +1,907 @@ +// +// Kernel that runs best on Fermi devices +// +// - shared memory use reduced by nearly factor 2 over legacy kernel +// by transferring only half work units (16 x uint32_t) at once. +// - uses ulong2/uint4 based memory transfers (each thread moves 16 bytes), +// allowing for shorter unrolled loops. 
This relies on Fermi's better +// memory controllers to get high memory troughput. +// +// NOTE: compile this .cu module for compute_20,sm_20 with --maxrregcount=63 +// +// TODO: batch-size support for this kernel +// + +#include + +#include "cuda_runtime.h" +#include "miner.h" + +#include "salsa_kernel.h" +#include "fermi_kernel.h" + +#define THREADS_PER_WU 1 // single thread per hash + +#define TEXWIDTH 32768 + +// forward references +template __global__ void fermi_scrypt_core_kernelA(uint32_t *g_idata, unsigned int N); +template __global__ void fermi_scrypt_core_kernelB(uint32_t *g_odata, unsigned int N); +template __global__ void fermi_scrypt_core_kernelB_tex(uint32_t *g_odata, unsigned int N); +template __global__ void fermi_scrypt_core_kernelA_LG(uint32_t *g_idata, unsigned int N, unsigned int LOOKUP_GAP); +template __global__ void fermi_scrypt_core_kernelB_LG(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP); +template __global__ void fermi_scrypt_core_kernelB_LG_tex(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP); + +// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// using texture references for the "tex" variants of the B kernels +texture texRef1D_4_V; +texture texRef2D_4_V; + +FermiKernel::FermiKernel() : KernelInterface() +{ +} + +bool FermiKernel::bindtexture_1D(uint32_t *d_V, size_t size) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef1D_4_V.normalized = 0; + texRef1D_4_V.filterMode = cudaFilterModePoint; + texRef1D_4_V.addressMode[0] = cudaAddressModeClamp; + checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size)); + return true; +} + +bool FermiKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef2D_4_V.normalized = 0; + texRef2D_4_V.filterMode = cudaFilterModePoint; + texRef2D_4_V.addressMode[0] = cudaAddressModeClamp; + texRef2D_4_V.addressMode[1] = cudaAddressModeClamp; + // maintain texture width of TEXWIDTH (max. limit is 65000) + while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; } + while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; } +// fprintf(stderr, "total size: %u, %u bytes\n", pitch * height, width * sizeof(uint32_t) * 4 * height); +// fprintf(stderr, "binding width width=%d, height=%d, pitch=%d\n", width, height,pitch); + checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch)); + return true; +} + +bool FermiKernel::unbindtexture_1D() +{ + checkCudaErrors(cudaUnbindTexture(texRef1D_4_V)); + return true; +} + +bool FermiKernel::unbindtexture_2D() +{ + checkCudaErrors(cudaUnbindTexture(texRef2D_4_V)); + return true; +} + +void FermiKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool FermiKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + int shared = WARPS_PER_BLOCK * WU_PER_WARP * (16+4) * sizeof(uint32_t); + + // First phase: Sequential writes to scratchpad. 
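+ // Note on the dispatch below, inferred from the kernel variants in this
+ // file: with LOOKUP_GAP > 1 the *_LG kernels store only every
+ // LOOKUP_GAP-th scratchpad entry during this phase and recompute the
+ // skipped states during phase B (a time/memory trade-off). texture_cache
+ // appears to select how phase B reads the scratchpad: 0 = plain global
+ // loads, 1 = the 1D texture bound by bindtexture_1D(), 2 = the 2D texture
+ // bound by bindtexture_2D().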
+ + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelA<<< grid, threads, shared, stream >>>(d_idata, N); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelA<<< grid, threads, shared, stream >>>(d_idata, N); + } else { + if (IS_SCRYPT()) fermi_scrypt_core_kernelA_LG<<< grid, threads, shared, stream >>>(d_idata, N, LOOKUP_GAP); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelA_LG<<< grid, threads, shared, stream >>>(d_idata, N, LOOKUP_GAP); + } + + // Second phase: Random read access from scratchpad. + + if (LOOKUP_GAP == 1) { + if (texture_cache) { + if (texture_cache == 1) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_tex<<< grid, threads, shared, stream >>>(d_odata, N); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_tex<<< grid, threads, shared, stream >>>(d_odata, N); + } else if (texture_cache == 2) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_tex<<< grid, threads, shared, stream >>>(d_odata, N); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_tex<<< grid, threads, shared, stream >>>(d_odata, N); + } + else success = false; + } else { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, N); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, N); + } + } else { + if (texture_cache) { + if (texture_cache == 1) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_LG_tex<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG_tex<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + } else if (texture_cache == 2) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_LG_tex<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG_tex<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + } + else success = false; + } else { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + } + } + + return success; +} + +#if 0 + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +#define QUARTER(a,b,c,d) \ + a += b; d ^= a; d = ROTL(d,16); \ + c += d; b ^= c; b = ROTL(b,12); \ + a += b; d ^= a; d = ROTL(d,8); \ + c += d; b ^= c; b = ROTL(b,7); + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. 
*/ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#else + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +#define ADD4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 += s1; d2 += s2; d3 += s3; d4 += s4; + +#define XOR4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 ^= s1; d2 ^= s2; d3 ^= s3; d4 ^= s4; + +#define ROTL4(d1,d2,d3,d4,amt) \ + d1 = ROTL(d1, amt); d2 = ROTL(d2, amt); d3 = ROTL(d3, amt); d4 = ROTL(d4, amt); + +#define QROUND(a1,a2,a3,a4, b1,b2,b3,b4, c1,c2,c3,c4, amt) \ + ADD4 (a1,a2,a3,a4, c1,c2,c3,c4) \ + XOR4 (b1,b2,b3,b4, a1,a2,a3,a4) \ + ROTL4(b1,b2,b3,b4, amt) + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. 
*/ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#endif + +#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=(((a00)<<7) | ((a00)>>25) );\ +a1^=(((a10)<<7) | ((a10)>>25) );\ +a2^=(((a20)<<7) | ((a20)>>25) );\ +a3^=(((a30)<<7) | ((a30)>>25) );\ +};\ + +#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=(((a00)<<9) | ((a00)>>23) );\ +a1^=(((a10)<<9) | ((a10)>>23) );\ +a2^=(((a20)<<9) | ((a20)>>23) );\ +a3^=(((a30)<<9) | ((a30)>>23) );\ +};\ + +#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=(((a00)<<13) | ((a00)>>19) );\ +a1^=(((a10)<<13) | ((a10)>>19) );\ +a2^=(((a20)<<13) | ((a20)>>19) );\ +a3^=(((a30)<<13) | ((a30)>>19) );\ +};\ + +#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=(((a00)<<18) | ((a00)>>14) );\ +a1^=(((a10)<<18) | ((a10)>>14) );\ +a2^=(((a20)<<18) | ((a20)>>14) );\ +a3^=(((a30)<<18) | ((a30)>>14) );\ +};\ + +static __device__ void xor_salsa8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= 
C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +static __device__ __forceinline__ uint4& operator^=(uint4& left, const uint4& right) +{ + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Scrypt core kernel for Fermi class devices. +//! 
@param g_idata input data in global memory +//! @param g_odata output data in global memory +//////////////////////////////////////////////////////////////////////////////// +template __global__ +void fermi_scrypt_core_kernelA(uint32_t *g_idata, unsigned int N) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + const unsigned int LOOKUP_GAP = 1; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+Z])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu+16])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+16+Z])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + for (int i = 1; i < N; i++) { + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu + i*32])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu + i*32 + 16])) = *((ulonglong2*)XB[wu]); + } +} + +template __global__ +void fermi_scrypt_core_kernelB(uint32_t *g_odata, unsigned int N) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + const unsigned int LOOKUP_GAP = 1; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + (N-1)*32])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + (N-1)*32 + 16])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, 
B); break; + } + + for (int i = 0; i < N; i++) { + + XX[16] = 32 * (C[0].x & (N-1)); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z]])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] ^= *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z] + 16])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] ^= *((uint4*)&XX[4*idx]); + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); + +} + +template __global__ void +fermi_scrypt_core_kernelB_tex(uint32_t *g_odata, unsigned int N) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + const unsigned int LOOKUP_GAP = 1; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_odata += 32 * offset; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + (N-1)*32 + Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + (N-1)*32 + 16+Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + + for (int i = 0; i < N; i++) { + + XX[16] = 32 * (C[0].x & (N-1)); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] ^= *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + 16+Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? 
+ tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] ^= *((uint4*)&XX[4*idx]); + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); +} + +// +// Lookup-Gap variations of the above functions +// + +template __global__ void +fermi_scrypt_core_kernelA_LG(uint32_t *g_idata, unsigned int N, unsigned int LOOKUP_GAP) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+Z])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu+16])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+16+Z])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + for (int i = 1; i < N; i++) { + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + + if (i % LOOKUP_GAP == 0) { +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu + (i/LOOKUP_GAP)*32])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu + (i/LOOKUP_GAP)*32 + 16])) = *((ulonglong2*)XB[wu]); + } + } +} + +template __global__ void +fermi_scrypt_core_kernelB_LG(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; + + // registers to store an entire work unit 
+ uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + + uint32_t pos = (N-1)/LOOKUP_GAP; uint32_t loop = 1 + (N-1)-pos*LOOKUP_GAP; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + pos*32])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + pos*32 + 16])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + while (loop--) + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + + for (int i = 0; i < N; i++) { + + uint32_t j = C[0].x & (N-1); + uint32_t pos = j / LOOKUP_GAP; uint32_t loop = j - pos*LOOKUP_GAP; + XX[16] = 32 * pos; + + uint4 b[4], c[4]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z]])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) b[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z] + 16])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) c[idx] = *((uint4*)&XX[4*idx]); + + while (loop--) + switch(ALGO) { + case A_SCRYPT: xor_salsa8(b, c); xor_salsa8(c, b); break; + case A_SCRYPT_JANE: xor_chacha8(b, c); xor_chacha8(c, b); break; + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] ^= b[idx]; +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] ^= c[idx]; + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); + +} + +template __global__ void +fermi_scrypt_core_kernelB_LG_tex(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_odata += 32 * offset; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + + uint32_t pos = (N-1)/LOOKUP_GAP; uint32_t loop = 1 + (N-1)-pos*LOOKUP_GAP; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + pos*32 + Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? 
+ tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + pos*32 + 16+Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + while (loop--) + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + + for (int i = 0; i < N; i++) { + + uint32_t j = C[0].x & (N-1); + uint32_t pos = j / LOOKUP_GAP; uint32_t loop = j - pos*LOOKUP_GAP; + XX[16] = 32 * pos; + + uint4 b[4], c[4]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) b[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + 16+Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) c[idx] = *((uint4*)&XX[4*idx]); + + while (loop--) + switch(ALGO) { + case A_SCRYPT: xor_salsa8(b, c); xor_salsa8(c, b); break; + case A_SCRYPT_JANE: xor_chacha8(b, c); xor_chacha8(c, b); break; + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] ^= b[idx]; +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] ^= c[idx]; + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); +} diff --git a/scrypt/fermi_kernel.h b/scrypt/fermi_kernel.h new file mode 100644 index 0000000..54f822d --- /dev/null +++ b/scrypt/fermi_kernel.h @@ -0,0 +1,28 @@ +#ifndef FERMI_KERNEL_H +#define FERMI_KERNEL_H + +#include "salsa_kernel.h" + +class FermiKernel : public KernelInterface +{ +public: + FermiKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + virtual bool bindtexture_1D(uint32_t *d_V, size_t size); + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch); + virtual bool unbindtexture_1D(); + virtual bool unbindtexture_2D(); + + virtual char get_identifier() { return 'F'; }; + virtual int get_major_version() { return 1; } + virtual int get_minor_version() { return 0; } + virtual int max_warps_per_block() { return 16; }; + virtual int get_texel_width() { return 4; }; + 
virtual bool support_lookup_gap() { return true; } + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferShared; } +}; + +#endif // #ifndef FERMI_KERNEL_H diff --git a/scrypt/keccak.cu b/scrypt/keccak.cu new file mode 100644 index 0000000..aa23e50 --- /dev/null +++ b/scrypt/keccak.cu @@ -0,0 +1,837 @@ +// +// =============== KECCAK part on nVidia GPU ====================== +// +// The keccak512 (SHA-3) is used in the PBKDF2 for scrypt-jane coins +// in place of the SHA2 based PBKDF2 used in scrypt coins. +// +// The keccak256 is used exclusively in Maxcoin and clones. This module +// holds the generic "default" implementation when no architecture +// specific implementation is available in the kernel. +// +// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64 +// + +#include +#include + +#include "salsa_kernel.h" +#include "cuda_runtime.h" +#include "miner.h" + +#include "keccak.h" + +// define some error checking macros +#undef checkCudaErrors + +#if WIN32 +#define DELIMITER '/' +#else +#define DELIMITER '/' +#endif +#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) + +#define checkCudaErrors(x) \ +{ \ + cudaGetLastError(); \ + x; \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) \ + applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \ +} + +// from salsa_kernel.cu +extern std::map context_idata[2]; +extern std::map context_odata[2]; +extern std::map context_streams[2]; +extern std::map context_hash[2]; + +#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) + +// CB +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static __device__ void mycpy64(uint32_t *d, const uint32_t *s) { +#pragma unroll 16 + for (int k=0; k < 16; ++k) d[k] = s[k]; +} + +static __device__ void mycpy56(uint32_t *d, const uint32_t *s) { +#pragma unroll 14 + for (int k=0; k < 14; ++k) d[k] = s[k]; +} + +static __device__ void mycpy32(uint32_t *d, const uint32_t *s) { +#pragma unroll 8 + for (int k=0; k < 8; ++k) d[k] = s[k]; +} + +static __device__ void mycpy8(uint32_t *d, const uint32_t *s) { +#pragma unroll 2 + for (int k=0; k < 2; ++k) d[k] = s[k]; +} + +static __device__ void mycpy4(uint32_t *d, const uint32_t *s) { + *d = *s; +} + +// ---------------------------- BEGIN keccak functions ------------------------------------ + +#define KECCAK_HASH "Keccak-512" + +typedef struct keccak_hash_state_t { + uint64_t state[25]; // 25*2 + uint32_t buffer[72/4]; // 72 +} keccak_hash_state; + +__device__ void statecopy0(keccak_hash_state *d, keccak_hash_state *s) +{ +#pragma unroll 25 + for (int i=0; i < 25; ++i) + d->state[i] = s->state[i]; +} + +__device__ void statecopy8(keccak_hash_state *d, keccak_hash_state *s) +{ +#pragma unroll 25 + for (int i=0; i < 25; ++i) + d->state[i] = s->state[i]; +#pragma unroll 2 + for (int i=0; i < 2; ++i) + d->buffer[i] = s->buffer[i]; +} + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 
0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t c_keccak_round_constants[24]; +__constant__ uint32_t pdata[20]; + +__device__ +void keccak_block(keccak_hash_state *S, const uint32_t *in) { + size_t i; + uint64_t *s = S->state, t[5], u[5], v, w; + + /* absorb input */ + #pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= c_keccak_round_constants[i]; + } +} + +__device__ +void keccak_hash_init(keccak_hash_state *S) { +#pragma unroll 25 + for (int i=0; i<25; ++i) + S->state[i] = 0ULL; +} + +// assuming there is no leftover data and exactly 72 bytes are incoming +// we can directly call into the block hashing function +__device__ void keccak_hash_update72(keccak_hash_state *S, const uint32_t *in) { + keccak_block(S, in); +} 
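Keccak-512 keeps a capacity of 2×512 bits, so its rate is 200 − 2·64 = 72 bytes (9 of the 25 lanes) — hence the 72/8 absorb loop in keccak_block() above and the 72/4-word buffer. The finish variants defined below pad whatever tail is buffered before the final permutation. A host-side sketch of that padding step — assuming, as those functions do, a 4-byte-aligned tail length, little-endian words, and the original Keccak 0x01…0x80 padding rather than the NIST SHA-3 0x06 variant — is:

#include <stdint.h>

/* Sketch of the tail padding done by keccak_hash_finish8/12/60/64: byte 0x01 goes
 * right after the data, the rest of the 72-byte rate is zeroed, and the top bit of
 * the final rate byte is set before the last keccak_block() call. */
static void keccak512_pad_sketch(uint32_t buffer[72 / 4], unsigned tail_bytes)
{
    buffer[tail_bytes / 4] = 0x01;              /* tail_bytes is a multiple of 4 in these kernels */
    for (unsigned i = tail_bytes / 4 + 1; i < 72 / 4; i++)
        buffer[i] = 0;
    buffer[72 / 4 - 1] |= 0x80000000;           /* 0x80 in the last byte of the rate (little endian) */
}

The HMAC code further below builds the usual 0x36/0x5c key pads from this; note it derives the outer pad by XORing 0x36363636 and then 0x6a6a6a6a into the same buffer (0x36 ^ 0x6a = 0x5c).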
+ +__device__ void keccak_hash_update8(keccak_hash_state *S, const uint32_t *in) { + mycpy8(S->buffer, in); +} + +__device__ void keccak_hash_update4_8(keccak_hash_state *S, const uint32_t *in) { + mycpy4(S->buffer+8/4, in); +} + +__device__ void keccak_hash_update4_56(keccak_hash_state *S, const uint32_t *in) { + mycpy4(S->buffer+56/4, in); +} + +__device__ void keccak_hash_update56(keccak_hash_state *S, const uint32_t *in) { + mycpy56(S->buffer, in); +} + +__device__ void keccak_hash_update64(keccak_hash_state *S, const uint32_t *in) { + mycpy64(S->buffer, in); +} + +__device__ void keccak_hash_finish8(keccak_hash_state *S, uint32_t *hash) { + S->buffer[8/4] = 0x01; +#pragma unroll 15 + for (int i=8/4+1; i < 72/4; ++i) S->buffer[i] = 0; + S->buffer[72/4 - 1] |= 0x80000000; + keccak_block(S, (const uint32_t*)S->buffer); +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), S->state[i / 8]); + } +} + +__device__ void keccak_hash_finish12(keccak_hash_state *S, uint32_t *hash) { + S->buffer[12/4] = 0x01; +#pragma unroll 14 + for (int i=12/4+1; i < 72/4; ++i) S->buffer[i] = 0; + S->buffer[72/4 - 1] |= 0x80000000; + keccak_block(S, (const uint32_t*)S->buffer); +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), S->state[i / 8]); + } +} + +__device__ void keccak_hash_finish60(keccak_hash_state *S, uint32_t *hash) { + S->buffer[60/4] = 0x01; +#pragma unroll 2 + for (int i=60/4+1; i < 72/4; ++i) S->buffer[i] = 0; + S->buffer[72/4 - 1] |= 0x80000000; + keccak_block(S, (const uint32_t*)S->buffer); +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), S->state[i / 8]); + } +} + +__device__ void keccak_hash_finish64(keccak_hash_state *S, uint32_t *hash) { + S->buffer[64/4] = 0x01; +#pragma unroll 1 + for (int i=64/4+1; i < 72/4; ++i) S->buffer[i] = 0; + S->buffer[72/4 - 1] |= 0x80000000; + keccak_block(S, (const uint32_t*)S->buffer); +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), S->state[i / 8]); + } +} + +// ---------------------------- END keccak functions ------------------------------------ + +// ---------------------------- BEGIN PBKDF2 functions ------------------------------------ + +typedef struct pbkdf2_hmac_state_t { + keccak_hash_state inner, outer; +} pbkdf2_hmac_state; + + +__device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m) { + keccak_hash_state st; + keccak_hash_init(&st); + keccak_hash_update72(&st, m); + keccak_hash_update8(&st, m+72/4); + keccak_hash_finish8(&st, hash); +} + +/* hmac */ +__device__ void pbkdf2_hmac_init80(pbkdf2_hmac_state *st, const uint32_t *key) { + uint32_t pad[72/4]; + size_t i; + + keccak_hash_init(&st->inner); + keccak_hash_init(&st->outer); + +#pragma unroll 18 + for (i = 0; i < 72/4; i++) + pad[i] = 0; + + /* key > blocksize bytes, hash it */ + pbkdf2_hash(pad, key); + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ +#pragma unroll 18 + for (i = 0; i < 72/4; i++) + pad[i] ^= 0x36363636; + keccak_hash_update72(&st->inner, pad); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ +#pragma unroll 18 + for (i = 0; i < 72/4; i++) + pad[i] ^= 0x6a6a6a6a; + keccak_hash_update72(&st->outer, pad); +} + +// assuming there is no leftover data and exactly 72 bytes are incoming +// we can directly call into the block hashing function +__device__ void pbkdf2_hmac_update72(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) 
*/ + keccak_hash_update72(&st->inner, m); +} + +__device__ void pbkdf2_hmac_update8(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) */ + keccak_hash_update8(&st->inner, m); +} + +__device__ void pbkdf2_hmac_update4_8(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) */ + keccak_hash_update4_8(&st->inner, m); +} + +__device__ void pbkdf2_hmac_update4_56(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) */ + keccak_hash_update4_56(&st->inner, m); +} + +__device__ void pbkdf2_hmac_update56(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) */ + keccak_hash_update56(&st->inner, m); +} + +__device__ void pbkdf2_hmac_finish12(pbkdf2_hmac_state *st, uint32_t *mac) { + /* h(inner || m) */ + uint32_t innerhash[16]; + keccak_hash_finish12(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + keccak_hash_update64(&st->outer, innerhash); + keccak_hash_finish64(&st->outer, mac); +} + +__device__ void pbkdf2_hmac_finish60(pbkdf2_hmac_state *st, uint32_t *mac) { + /* h(inner || m) */ + uint32_t innerhash[16]; + keccak_hash_finish60(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + keccak_hash_update64(&st->outer, innerhash); + keccak_hash_finish64(&st->outer, mac); +} + +__device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s) { + statecopy8(&d->inner, &s->inner); + statecopy0(&d->outer, &s->outer); +} + +// ---------------------------- END PBKDF2 functions ------------------------------------ + +static __device__ uint32_t cuda_swab32(uint32_t x) { + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +__global__ __launch_bounds__(128) +void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce) +{ + nonce += (blockIdx.x * blockDim.x) + threadIdx.x; + g_idata += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x); + + uint32_t data[20]; + + #pragma unroll + for (int i=0; i <19; ++i) + data[i] = cuda_swab32(pdata[i]); + data[19] = cuda_swab32(nonce); + +// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)data, 80, (uint8_t*)g_idata, 128); + + pbkdf2_hmac_state hmac_pw, work; + uint32_t ti[16]; + uint32_t be; + + /* hmac(password, ...) */ + pbkdf2_hmac_init80(&hmac_pw, data); + + /* hmac(password, salt...) */ + pbkdf2_hmac_update72(&hmac_pw, data); + pbkdf2_hmac_update8(&hmac_pw, data+72/4); + + /* U1 = hmac(password, salt || be(i)) */ + be = cuda_swab32(1); + pbkdf2_statecopy8(&work, &hmac_pw); + pbkdf2_hmac_update4_8(&work, &be); + pbkdf2_hmac_finish12(&work, ti); + mycpy64(g_idata, ti); + + be = cuda_swab32(2); + pbkdf2_statecopy8(&work, &hmac_pw); + pbkdf2_hmac_update4_8(&work, &be); + pbkdf2_hmac_finish12(&work, ti); + mycpy64(g_idata+16, ti); +} + + +__global__ __launch_bounds__(128) +void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce) +{ + nonce += (blockIdx.x * blockDim.x) + threadIdx.x; + g_odata += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_hash += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + + uint32_t data[20]; + +#pragma unroll 19 + for (int i=0; i <19; ++i) + data[i] = cuda_swab32(pdata[i]); + data[19] = cuda_swab32(nonce); + +// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)g_odata, 128, (uint8_t*)g_hash, 32); + + pbkdf2_hmac_state hmac_pw; + uint32_t ti[16]; + uint32_t be; + + /* hmac(password, ...) */ + pbkdf2_hmac_init80(&hmac_pw, data); + + /* hmac(password, salt...) 
*/ + pbkdf2_hmac_update72(&hmac_pw, g_odata); + pbkdf2_hmac_update56(&hmac_pw, g_odata+72/4); + + /* U1 = hmac(password, salt || be(i)) */ + be = cuda_swab32(1); + pbkdf2_hmac_update4_56(&hmac_pw, &be); + pbkdf2_hmac_finish60(&hmac_pw, ti); + mycpy32(g_hash, ti); +} + +// +// callable host code to initialize constants and to call kernels +// + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20]) +{ + if (!init[thr_id]) + { + checkCudaErrors(cudaMemcpyToSymbol(c_keccak_round_constants, host_keccak_round_constants, sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice)); + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); +} + +extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput) +{ + dim3 block(128); + dim3 grid((throughput+127)/128); + + cuda_pre_keccak512<<>>(context_idata[stream][thr_id], nonce); +} + +extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throughput) +{ + dim3 block(128); + dim3 grid((throughput+127)/128); + + cuda_post_keccak512<<>>(context_odata[stream][thr_id], context_hash[stream][thr_id], nonce); +} + + +// +// Maxcoin related Keccak implementation (Keccak256) +// + +#include + +#include +extern std::map context_blocks; +extern std::map context_wpb; +extern std::map context_kernel; + +__constant__ uint64_t ptarget64[4]; + +#define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64)))) +#define ROL_mult8(a, offset) ROL(a, offset) + +__constant__ uint64_t KeccakF_RoundConstants[24]; + +static uint64_t host_KeccakF_RoundConstants[24] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +__constant__ uint64_t pdata64[10]; + +__global__ +void crypto_hash(uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate) +{ + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = pdata64[0]; + Abe = pdata64[1]; + Abi = pdata64[2]; + Abo = pdata64[3]; + Abu = pdata64[4]; + Aga = pdata64[5]; + Age = pdata64[6]; + Agi = pdata64[7]; + Ago = pdata64[8]; + Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32); + Aka = 0x0000000000000001ULL; + Ake = 0; + Aki = 0; + Ako = 0; + Aku = 0; 
+ Ama = 0; + Ame = 0x8000000000000000ULL; + Ami = 0; + Amo = 0; + Amu = 0; + Asa = 0; + Ase = 0; + Asi = 0; + Aso = 0; + Asu = 0; + +#pragma unroll 12 + for( int laneCount = 0; laneCount < 24; laneCount += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL_mult8(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL_mult8(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL_mult8(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& 
BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL_mult8(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + if (validate) { + g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_out[3] = Abo; + g_out[2] = Abi; + g_out[1] = Abe; + g_out[0] = Aba; + } + + // the likelyhood of meeting the hashing target is so low, that we're not guarding this + // with atomic writes, locks or similar... + uint64_t *g_good64 = (uint64_t*)g_good; + if (Abo <= ptarget64[3]) { + if (Abo < g_good64[3]) { + g_good64[3] = Abo; + g_good64[2] = Abi; + g_good64[1] = Abe; + g_good64[0] = Aba; + g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + } + } +} + +static std::map context_good[2]; + +// ... keccak??? +bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) +{ + static bool init[MAX_DEVICES] = {false}; + if (!init[thr_id]) + { + checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice)); + + // allocate pinned host memory for good hashes + uint32_t *tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; + + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + + return context_good[0][thr_id] && context_good[1][thr_id]; +} + +void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) +{ + checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); + + crypto_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); + + // copy hashes from device memory to host (ALL hashes, lots of data...) + if (do_d2h && hash != NULL) { + size_t mem_size = throughput * sizeof(uint32_t) * 8; + checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } + else if (hash != NULL) { + // asynchronous copy of winning nonce (just 4 bytes...) 
+ checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } +} diff --git a/scrypt/keccak.h b/scrypt/keccak.h new file mode 100644 index 0000000..62ac1cd --- /dev/null +++ b/scrypt/keccak.h @@ -0,0 +1,8 @@ +#ifndef KECCAK_H +#define KEKKAC_H + +extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20]); +extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput); +extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throughput); + +#endif // #ifndef KEKKAC_H diff --git a/scrypt/kepler_kernel.cu b/scrypt/kepler_kernel.cu new file mode 100644 index 0000000..45b94ee --- /dev/null +++ b/scrypt/kepler_kernel.cu @@ -0,0 +1,781 @@ +/* Copyright (C) 2013 David G. Andersen. All rights reserved. + * with modifications by Christian Buchner + * + * Use of this code is covered under the Apache 2.0 license, which + * can be found in the file "LICENSE" + */ + +// TODO: attempt V.Volkov style ILP (factor 4) + +#include + +#include "cuda_runtime.h" +#include "miner.h" + +#include "salsa_kernel.h" +#include "kepler_kernel.h" + +#define TEXWIDTH 32768 +#define THREADS_PER_WU 4 // four threads per hash + +typedef enum +{ + ANDERSEN, + SIMPLE +} MemoryAccess; + +// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N-1 +// scratch buffer size SCRATCH +__constant__ uint32_t c_SCRATCH; +__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP) +__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP) - 1 + +// using texture references for the "tex" variants of the B kernels +texture texRef1D_4_V; +texture texRef2D_4_V; + +template __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); + +static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) { + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) { + left.x += right.x; + left.y += right.y; + left.z += right.z; + left.w += right.w; + return left; +} + +static __device__ uint4 __shfl(const uint4 bx, int target_thread) { + return make_uint4( + __shfl((int)bx.x, target_thread), + __shfl((int)bx.y, target_thread), + __shfl((int)bx.z, target_thread), + __shfl((int)bx.w, target_thread) + ); +} + +/* write_keys writes the 8 keys being processed by a warp to the global + * scratchpad. To effectively use memory bandwidth, it performs the writes + * (and reads, for read_keys) 128 bytes at a time per memory location + * by __shfl'ing the 4 entries in bx to the threads in the next-up + * thread group. It then has eight threads together perform uint4 + * (128 bit) writes to the destination region. This seems to make + * quite effective use of memory bandwidth. An approach that spread + * uint32s across more threads was slower because of the increased + * computation it required. + * + * "start" is the loop iteration producing the write - the offset within + * the block's memory. 
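For reference, the warp exchange that write_keys_direct builds on can be isolated as a small device helper: each lane pulls the bx vector and its scratchpad offset from the lane four positions ahead ((threadIdx.x + 4) % 32), one 32-bit component at a time, using the legacy __shfl() intrinsic the surrounding code already relies on. A minimal sketch follows; the helper name is ours, not part of the patch.

// Sketch of the lane exchange used by write_keys_direct. __shfl() has pull
// semantics: each lane reads the value held by lane (threadIdx.x + 4) % 32.
// Afterwards, neighbouring lanes hold a b/bx pair that can be written as two
// adjacent 16-byte (uint4) stores to the scratchpad.
__device__ __forceinline__
void fetch_partner_bx(const uint4 &bx, uint32_t start,
                      uint4 &partner_bx, uint32_t &partner_start)
{
    int src = (threadIdx.x + 4) % 32;
    partner_bx = make_uint4(__shfl((int)bx.x, src),
                            __shfl((int)bx.y, src),
                            __shfl((int)bx.z, src),
                            __shfl((int)bx.w, src));
    partner_start = __shfl((int)start, src) + 4;   // partner's offset, advanced by one uint4
}

read_keys_direct inverts this exchange after loading by shuffling from lane (threadIdx.x + 28) % 32.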
+ * + * Internally, this algorithm first __shfl's the 4 bx entries to + * the next up thread group, and then uses a conditional move to + * ensure that odd-numbered thread groups exchange the b/bx ordering + * so that the right parts are written together. + * + * Thanks to Babu for helping design the 128-bit-per-write version. + * + * _direct lets the caller specify the absolute start location instead of + * the relative start location, as an attempt to reduce some recomputation. + */ + +template __device__ __forceinline__ +void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) +{ + uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + + if (SCHEME == ANDERSEN) { + int target_thread = (threadIdx.x + 4)%32; + uint4 t=b, t2=__shfl(bx, target_thread); + int t2_start = __shfl((int)start, target_thread) + 4; + bool c = (threadIdx.x & 0x4); + *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); + *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2); + } else if (SCHEME == SIMPLE) { + *((uint4 *)(&scratch[start ])) = b; + *((uint4 *)(&scratch[start+16])) = bx; + } +} + +template __device__ __forceinline__ +void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) +{ + uint32_t *scratch; + + if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + if (SCHEME == ANDERSEN) { + int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4; + if (TEX_DIM > 0) { start /= 4; t2_start /= 4; } + bool c = (threadIdx.x & 0x4); + if (TEX_DIM == 0) { + b = *((uint4 *)(&scratch[c ? t2_start : start])); + bx = *((uint4 *)(&scratch[c ? start : t2_start])); + } else if (TEX_DIM == 1) { + b = tex1Dfetch(texRef1D_4_V, c ? t2_start : start); + bx = tex1Dfetch(texRef1D_4_V, c ? start : t2_start); + } else if (TEX_DIM == 2) { + b = tex2D(texRef2D_4_V, 0.5f + ((c ? t2_start : start)%TEXWIDTH), 0.5f + ((c ? t2_start : start)/TEXWIDTH)); + bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH)); + } + uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx); + bx = __shfl(bx, (threadIdx.x + 28)%32); + } else { + if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start])); + else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4); + else if (TEX_DIM == 2) b = tex2D(texRef2D_4_V, 0.5f + ((start/4)%TEXWIDTH), 0.5f + ((start/4)/TEXWIDTH)); + if (TEX_DIM == 0) bx = *((uint4 *)(&scratch[start+16])); + else if (TEX_DIM == 1) bx = tex1Dfetch(texRef1D_4_V, (start+16)/4); + else if (TEX_DIM == 2) bx = tex2D(texRef2D_4_V, 0.5f + (((start+16)/4)%TEXWIDTH), 0.5f + (((start+16)/4)/TEXWIDTH)); + } +} + + +__device__ __forceinline__ +void primary_order_shuffle(uint4 &b, uint4 &bx) +{ + /* Inner loop shuffle targets */ + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + b.w = __shfl((int)b.w, x1); + b.z = __shfl((int)b.z, x2); + b.y = __shfl((int)b.y, x3); + uint32_t tmp = b.y; b.y = b.w; b.w = tmp; + + bx.w = __shfl((int)bx.w, x1); + bx.z = __shfl((int)bx.z, x2); + bx.y = __shfl((int)bx.y, x3); + tmp = bx.y; bx.y = bx.w; bx.w = tmp; +} + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. 
+ */ + +__device__ __forceinline__ +void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4]; + b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4]; + b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4]; + b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4]; + bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16]; + bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16]; + bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16]; + bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16]; + + primary_order_shuffle(b, bx); +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. + */ + +__device__ __forceinline__ +void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + primary_order_shuffle(b, bx); + + B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x; + B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y; + B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z; + B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w; + B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x; + B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y; + B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z; + B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w; +} + + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ + +__device__ __forceinline__ +void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + 4*0 + thread_in_block%4]; + b.y = B[key_offset + 4*1 + thread_in_block%4]; + b.z = B[key_offset + 4*2 + thread_in_block%4]; + b.w = B[key_offset + 4*3 + thread_in_block%4]; + bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16]; + bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16]; + bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16]; + bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16]; +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. 
+ */ + +__device__ __forceinline__ +void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + B[key_offset + 4*0 + thread_in_block%4] = b.x; + B[key_offset + 4*1 + thread_in_block%4] = b.y; + B[key_offset + 4*2 + thread_in_block%4] = b.z; + B[key_offset + 4*3 + thread_in_block%4] = b.w; + B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x; + B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y; + B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z; + B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w; +} + + +template __device__ __forceinline__ +void load_key(const uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: load_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break; + } +} + +template __device__ __forceinline__ +void store_key(uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: store_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break; + } +} + + +/* + * salsa_xor_core (Salsa20/8 cypher) + * The original scrypt called: + * xor_salsa8(&X[0], &X[16]); <-- the "b" loop + * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. + */ + +#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<>(32-amt))); } + +__device__ __forceinline__ +void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + uint4 x; + + b ^= bx; + x = b; + + // Enter in "primary order" (t0 has 0, 4, 8, 12) + // (t1 has 5, 9, 13, 1) + // (t2 has 10, 14, 2, 6) + // (t3 has 15, 3, 7, 11) + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Mixing phase of salsa + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + /* Transpose rows and columns. */ + /* Unclear if this optimization is needed: These are ordered based + * upon the dependencies needed in the later xors. Compiler should be + * able to figure this out, but might as well give it a hand. */ + x.y = __shfl((int)x.y, x3); + x.w = __shfl((int)x.w, x1); + x.z = __shfl((int)x.z, x2); + + /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first, + * but the register targets are rewritten here to swap x[1] and x[3] so that + * they can be directly shuffled to and from our peer threads without + * reassignment. The reverse shuffle then puts them back in the right place. + */ + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + x.w = __shfl((int)x.w, x3); + x.y = __shfl((int)x.y, x1); + x.z = __shfl((int)x.z, x2); + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + // This is a copy of the same loop above, identical but stripped of comments. + // Duplicated so that we can complete a bx-based loop with fewer register moves. 
+ #pragma unroll + for (int j = 0; j < 4; j++) { + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + x.y = __shfl((int)x.y, x3); + x.w = __shfl((int)x.w, x1); + x.z = __shfl((int)x.z, x2); + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + x.w = __shfl((int)x.w, x3); + x.y = __shfl((int)x.y, x1); + x.z = __shfl((int)x.z, x2); + } + + // At the end of these iterations, the data is in primary order again. +#undef XOR_ROTATE_ADD + + bx += x; +} + + +/* + * chacha_xor_core (ChaCha20/8 cypher) + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. + * + * load_key and store_key must not use primary order when + * using ChaCha20/8, but rather the basic transposed order + * (referred to as "column mode" below) + */ + +#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<>(32-amt))); } + +__device__ __forceinline__ +void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + uint4 x; + + b ^= bx; + x = b; + + // Enter in "column" mode (t0 has 0, 4, 8, 12) + // (t1 has 1, 5, 9, 13) + // (t2 has 2, 6, 10, 14) + // (t3 has 3, 7, 11, 15) + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl((int)x.y, x1); + x.z = __shfl((int)x.z, x2); + x.w = __shfl((int)x.w, x3); + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl((int)x.y, x3); + x.z = __shfl((int)x.z, x2); + x.w = __shfl((int)x.w, x1); + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl((int)x.y, x1); + x.z = __shfl((int)x.z, x2); + x.w = __shfl((int)x.w, x3); + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl((int)x.y, x3); + x.z = __shfl((int)x.z, x2); + x.w = __shfl((int)x.w, x1); + } + +#undef CHACHA_PRIMITIVE + + bx += x; +} + + +template __device__ __forceinline__ +void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + switch(ALGO) { + case A_SCRYPT: salsa_xor_core(b, bx, x1, x2, x3); break; + case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break; + } +} + + +/* + * The hasher_gen_kernel operates on a group of 1024-bit input keys + * in B, stored as: + * B = { k1B k1Bx k2B k2Bx ... } + * and fills up the scratchpad with the iterative hashes derived from + * those keys: + * scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... } + * scratch is 1024 times larger than the input keys B. + * It is extremely important to stream writes effectively into scratch; + * less important to coalesce the reads from B. 
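+ * (That factor of 1024 is the iteration count N: one 128-byte state is kept
+ * per iteration. The *_LG kernel variants below store only every
+ * LOOKUP_GAP-th state, shrinking scratch to about N/LOOKUP_GAP times the
+ * size of B at the cost of recomputing the skipped states in phase two.)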
+ * + * Key ordering note: Keys are input from B in "original" order: + * K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 } + * After inputting into kernel_gen, each component k and kx of the + * key is transmuted into a permuted internal order to make processing faster: + * K = k, kx with: + * k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11 + * and similarly for kx. + */ + +template __global__ +void kepler_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else read_keys_direct(b, bx, start+32*(i-1)); + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + write_keys_direct(b, bx, start+32*i); + ++i; + } +} + +template __global__ +void kepler_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + if (i % LOOKUP_GAP == 0) + write_keys_direct(b, bx, start+32*(i/LOOKUP_GAP)); + ++i; + } +} + + +/* + * hasher_hash_kernel runs the second phase of scrypt after the scratch + * buffer is filled with the iterative hashes: It bounces through + * the scratch buffer in pseudorandom order, mixing the key as it goes. 
+ */ + +template __global__ +void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end) +{ + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4); + if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + read_keys_direct(b, bx, start+32*c_N_1); + block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + for (int i = begin; i < end; i++) { + int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + uint4 t, tx; read_keys_direct(t, tx, start+32*j); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + + store_key(d_odata, b, bx); +} + +template __global__ +void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4); + if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + if (SCHEME == SIMPLE) + { + // better divergent thread handling submitted by nVidia engineers, but + // supposedly this does not run with the ANDERSEN memory access scheme + int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int pos = j/LOOKUP_GAP; + int loop = -1; + uint4 t, tx; + + int i = begin; + while(i < end) { + if (loop==-1) { + j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + pos = j/LOOKUP_GAP; + loop = j-pos*LOOKUP_GAP; + read_keys_direct(t, tx, start+32*pos); + } + if (loop==0) { + b ^= t; bx ^= tx; + t=b;tx=bx; + } + block_mixer(t, tx, x1, x2, x3); + if (loop==0) { + b=t;bx=tx; + i++; + } + loop--; + } + } + else + { + // this is my original implementation, now used with the ANDERSEN + // memory access scheme only. 
+ for (int i = begin; i < end; i++) { + int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; + uint4 t, tx; read_keys_direct(t, tx, start+32*pos); + while(loop--) block_mixer(t, tx, x1, x2, x3); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + } + +//for (int i = begin; i < end; i++) { +// int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); +// int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; +// uint4 t, tx; read_keys_direct(t, tx, start+32*pos); +// while(loop--) block_mixer(t, tx, x1, x2, x3); +// b ^= t; bx ^= tx; +// block_mixer(b, bx, x1, x2, x3); +//} + + store_key(d_odata, b, bx); +} + +KeplerKernel::KeplerKernel() : KernelInterface() +{ +} + +bool KeplerKernel::bindtexture_1D(uint32_t *d_V, size_t size) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef1D_4_V.normalized = 0; + texRef1D_4_V.filterMode = cudaFilterModePoint; + texRef1D_4_V.addressMode[0] = cudaAddressModeClamp; + checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size)); + return true; +} + +bool KeplerKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef2D_4_V.normalized = 0; + texRef2D_4_V.filterMode = cudaFilterModePoint; + texRef2D_4_V.addressMode[0] = cudaAddressModeClamp; + texRef2D_4_V.addressMode[1] = cudaAddressModeClamp; + // maintain texture width of TEXWIDTH (max. limit is 65000) + while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; } + while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; } + checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch)); + return true; +} + +bool KeplerKernel::unbindtexture_1D() +{ + checkCudaErrors(cudaUnbindTexture(texRef1D_4_V)); + return true; +} + +bool KeplerKernel::unbindtexture_2D() +{ + checkCudaErrors(cudaUnbindTexture(texRef2D_4_V)); + return true; +} + +void KeplerKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool KeplerKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, + uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + // make some constants available to kernel, update only initially and when changing + static int prev_N[MAX_DEVICES] = {0}; + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_SCRATCH = SCRATCH; + uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP); + uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1; + + cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. 
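The phases below are issued in slices of device_batchsize iterations rather than as one monolithic launch, presumably to keep individual kernel launches short. A plain host-side sketch of the slicing arithmetic, with a hypothetical function name, is:

#include <algorithm>
#include <cstdio>

// Sketch of the launch batching used below: N iterations are covered by
// successive half-open ranges [pos, min(pos + batch, N)), so the final
// slice is clamped to N.
void launch_in_batches(unsigned int N, unsigned int batch)
{
    unsigned int pos = 0;
    do {
        unsigned int end = std::min(pos + batch, N);
        std::printf("launch covering iterations [%u, %u)\n", pos, end);
        pos += batch;
    } while (pos < N);
}

// launch_in_batches(1024, 300) would report [0,300) [300,600) [600,900) [900,1024).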
+ + int batch = device_batchsize[thr_id]; + //int num_sleeps = 2* ((N + (batch-1)) / batch); + //int sleeptime = 100; + + unsigned int pos = 0; + do + { + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + } else { + if (IS_SCRYPT()) kepler_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + pos += batch; + } while (pos < N); + + // Second phase: Random read access from scratchpad. + + pos = 0; + do + { + if (LOOKUP_GAP == 1) { + + if (texture_cache == 0) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } else if (texture_cache == 1) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } else if (texture_cache == 2) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } + + } else { + + if (texture_cache == 0) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } else if (texture_cache == 1) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } else if (texture_cache == 2) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + } + + pos += batch; + } while (pos < N); + + return success; +} diff --git a/scrypt/kepler_kernel.h b/scrypt/kepler_kernel.h new file mode 100644 index 0000000..afe78da --- /dev/null +++ b/scrypt/kepler_kernel.h @@ -0,0 +1,29 @@ +#ifndef KEPLER_KERNEL_H +#define KEPLER_KERNEL_H + +#include "salsa_kernel.h" + +class KeplerKernel : public KernelInterface +{ +public: + KeplerKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + virtual bool bindtexture_1D(uint32_t *d_V, size_t size); + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch); + virtual bool unbindtexture_1D(); + virtual bool unbindtexture_2D(); + + virtual char get_identifier() { return 'k'; }; + virtual int get_major_version() { return 3; }; + virtual int get_minor_version() 
{ return 0; }; + + virtual int max_warps_per_block() { return 32; }; + virtual int get_texel_width() { return 4; }; + virtual int threads_per_wu() { return 4; } + virtual bool support_lookup_gap() { return true; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } +}; + +#endif // #ifndef KEPLER_KERNEL_H diff --git a/scrypt/nv_kernel.cu b/scrypt/nv_kernel.cu new file mode 100644 index 0000000..28a2708 --- /dev/null +++ b/scrypt/nv_kernel.cu @@ -0,0 +1,1488 @@ +// +// Experimental Kernel for Kepler (Compute 3.5) devices +// code submitted by nVidia performance engineer Alexey Panteleev +// with modifications by Christian Buchner +// +// for Compute 3.5 +// NOTE: compile this .cu module for compute_35,sm_35 with --maxrregcount=80 +// for Compute 3.0 +// NOTE: compile this .cu module for compute_30,sm_30 with --maxrregcount=63 +// + +#include + +#include "cuda_runtime.h" + +#include "miner.h" +#include "salsa_kernel.h" +#include "nv_kernel.h" + +#define THREADS_PER_WU 1 // single thread per hash + +#define TEXWIDTH 32768 + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define __ldg(x) (*(x)) +#endif + +// grab lane ID +static __device__ __inline__ unsigned int __laneId() { unsigned int laneId; asm( "mov.u32 %0, %%laneid;" : "=r"( laneId ) ); return laneId; } + +// forward references +template __global__ void nv_scrypt_core_kernelA(uint32_t *g_idata, int begin, int end); +template __global__ void nv_scrypt_core_kernelB(uint32_t *g_odata, int begin, int end); +template __global__ void nv_scrypt_core_kernelA_LG(uint32_t *g_idata, int begin, int end, unsigned int LOOKUP_GAP); +template __global__ void nv_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned int LOOKUP_GAP); + +// scratchbuf constants (pointers to scratch buffer for each work unit) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// using texture references for the "tex" variants of the B kernels +texture texRef1D_4_V; +texture texRef2D_4_V; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N - 1 +__constant__ uint32_t c_spacing; // (N+LOOKUP_GAP-1)/LOOKUP_GAP + +NVKernel::NVKernel() : KernelInterface() +{ +} + +bool NVKernel::bindtexture_1D(uint32_t *d_V, size_t size) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef1D_4_V.normalized = 0; + texRef1D_4_V.filterMode = cudaFilterModePoint; + texRef1D_4_V.addressMode[0] = cudaAddressModeClamp; + checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size)); + return true; +} + +bool NVKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef2D_4_V.normalized = 0; + texRef2D_4_V.filterMode = cudaFilterModePoint; + texRef2D_4_V.addressMode[0] = cudaAddressModeClamp; + texRef2D_4_V.addressMode[1] = cudaAddressModeClamp; + // maintain texture width of TEXWIDTH (max. 
limit is 65000) + while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; } + while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; } + checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch)); + return true; +} + +bool NVKernel::unbindtexture_1D() +{ + checkCudaErrors(cudaUnbindTexture(texRef1D_4_V)); + return true; +} + +bool NVKernel::unbindtexture_2D() +{ + checkCudaErrors(cudaUnbindTexture(texRef2D_4_V)); + return true; +} + +void NVKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool NVKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + // make some constants available to kernel, update only initially and when changing + static int prev_N[MAX_DEVICES] = {0}; + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_spacing = (N+LOOKUP_GAP-1)/LOOKUP_GAP; + + cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_spacing, &h_spacing, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. + const int batch = device_batchsize[thr_id]; + unsigned int pos = 0; + + do + { + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) nv_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelA<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + } + else { + if (IS_SCRYPT()) nv_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelA_LG<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + + pos += batch; + } while (pos < N); + + // Second phase: Random read access from scratchpad. 
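The spacing constant uploaded a few lines up, (N + LOOKUP_GAP - 1) / LOOKUP_GAP, is a ceiling division over the stored states. A quick sketch of what that implies for per-hash scratch size, assuming the 128-byte state per stored iteration that these kernels write and ignoring any allocator padding, is:

#include <cstdint>
#include <cstdio>

// Per-hash scratchpad footprint: ceil(N / LOOKUP_GAP) stored states of
// 32 x uint32_t (128 bytes) each, matching c_spacing above.
static size_t scratch_bytes_per_hash(uint32_t N, uint32_t lookup_gap)
{
    uint32_t spacing = (N + lookup_gap - 1) / lookup_gap;  // ceiling division
    return (size_t)spacing * 32u * sizeof(uint32_t);
}

int main()
{
    // classic scrypt, N = 1024: 131072 bytes (128 KiB) with no gap,
    // 65536 bytes (64 KiB) with a lookup gap of 2
    std::printf("%zu %zu\n", scratch_bytes_per_hash(1024, 1),
                             scratch_bytes_per_hash(1024, 2));
    return 0;
}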
+ pos = 0; + do + { + if (LOOKUP_GAP == 1) { + if (texture_cache == 0) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } + else if (texture_cache == 1) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } + else if (texture_cache == 2) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } + } else { + if (texture_cache == 0) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + else if (texture_cache == 1) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + else if (texture_cache == 2) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + } + + pos += batch; + } while (pos < N); + + return success; +} + +static __device__ uint4& operator^=(uint4& left, const uint4& right) +{ + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +__device__ __forceinline__ uint4 __shfl(const uint4 val, unsigned int lane, unsigned int width) +{ + return make_uint4( + (unsigned int)__shfl((int)val.x, lane, width), + (unsigned int)__shfl((int)val.y, lane, width), + (unsigned int)__shfl((int)val.z, lane, width), + (unsigned int)__shfl((int)val.w, lane, width)); +} + +__device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[4], uint4 *D, int spacing) +{ + unsigned int laneId = __laneId(); + + unsigned int lane8 = laneId%8; + unsigned int tile = laneId/8; + + uint4 T1[8], T2[8]; + + /* Source matrix, A-H are threads, 0-7 are data items, thread A is marked with `*`: + + *A0 B0 C0 D0 E0 F0 G0 H0 + *A1 B1 C1 D1 E1 F1 G1 H1 + *A2 B2 C2 D2 E2 F2 G2 H2 + *A3 B3 C3 D3 E3 F3 G3 H3 + *A4 B4 C4 D4 E4 F4 G4 H4 + *A5 B5 C5 D5 E5 F5 G5 H5 + *A6 B6 C6 D6 E6 F6 G6 H6 + *A7 B7 C7 D7 E7 F7 G7 H7 + */ + + // rotate rows + T1[0] = B[0]; + T1[1] = __shfl(B[1], lane8 + 7, 8); + T1[2] = __shfl(B[2], lane8 + 6, 8); + T1[3] = __shfl(B[3], lane8 + 5, 8); + T1[4] = __shfl(C[0], lane8 + 4, 8); + T1[5] = __shfl(C[1], lane8 + 3, 8); + T1[6] = __shfl(C[2], lane8 + 2, 8); + T1[7] = __shfl(C[3], lane8 + 1, 8); + + /* Matrix after row rotates: + + *A0 B0 C0 D0 E0 F0 G0 H0 + H1 *A1 B1 C1 D1 E1 F1 G1 + G2 H2 *A2 B2 C2 D2 E2 F2 + F3 G3 H3 *A3 B3 C3 D3 E3 + E4 F4 G4 H4 *A4 B4 C4 D4 + D5 E5 F5 G5 H5 *A5 B5 C5 + C6 D6 E6 F6 G6 H6 *A6 B6 + B7 C7 D7 E7 F7 G7 H7 *A7 + */ + + // rotate columns up using a barrel shifter simulation + // column X is rotated up by (X+1) items +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((lane8+1) & 1) ? 
T1[(n+1) % 8] : T1[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T1[n] = ((lane8+1) & 2) ? T2[(n+2) % 8] : T2[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((lane8+1) & 4) ? T1[(n+4) % 8] : T1[n]; + + /* Matrix after column rotates: + + H1 H2 H3 H4 H5 H6 H7 H0 + G2 G3 G4 G5 G6 G7 G0 G1 + F3 F4 F5 F6 F7 F0 F1 F2 + E4 E5 E6 E7 E0 E1 E2 E3 + D5 D6 D7 D0 D1 D2 D3 D4 + C6 C7 C0 C1 C2 C3 C4 C5 + B7 B0 B1 B2 B3 B4 B5 B6 + *A0 *A1 *A2 *A3 *A4 *A5 *A6 *A7 + */ + + // rotate rows again using address math and write to D, in reverse row order + D[spacing*2*(32*tile )+ lane8 ] = T2[7]; + D[spacing*2*(32*tile+4 )+(lane8+7)%8] = T2[6]; + D[spacing*2*(32*tile+8 )+(lane8+6)%8] = T2[5]; + D[spacing*2*(32*tile+12)+(lane8+5)%8] = T2[4]; + D[spacing*2*(32*tile+16)+(lane8+4)%8] = T2[3]; + D[spacing*2*(32*tile+20)+(lane8+3)%8] = T2[2]; + D[spacing*2*(32*tile+24)+(lane8+2)%8] = T2[1]; + D[spacing*2*(32*tile+28)+(lane8+1)%8] = T2[0]; +} + +template __device__ __forceinline__ void __transposed_read_BC(const uint4 *S, uint4 (&B)[4], uint4 (&C)[4], int spacing, int row) +{ + unsigned int laneId = __laneId(); + + unsigned int lane8 = laneId%8; + unsigned int tile = laneId/8; + + // Perform the same transposition as in __transposed_write_BC, but in reverse order. + // See the illustrations in comments for __transposed_write_BC. + + // read and rotate rows, in reverse row order + uint4 T1[8], T2[8]; + const uint4 *loc; + loc = &S[(spacing*2*(32*tile ) + lane8 + 8*__shfl(row, 0, 8))]; + T1[7] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+4 ) + (lane8+7)%8 + 8*__shfl(row, 1, 8))]; + T1[6] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+8 ) + (lane8+6)%8 + 8*__shfl(row, 2, 8))]; + T1[5] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+12) + (lane8+5)%8 + 8*__shfl(row, 3, 8))]; + T1[4] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+16) + (lane8+4)%8 + 8*__shfl(row, 4, 8))]; + T1[3] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+20) + (lane8+3)%8 + 8*__shfl(row, 5, 8))]; + T1[2] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+24) + (lane8+2)%8 + 8*__shfl(row, 6, 8))]; + T1[1] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+28) + (lane8+1)%8 + 8*__shfl(row, 7, 8))]; + T1[0] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? 
tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + + // rotate columns down using a barrel shifter simulation + // column X is rotated down by (X+1) items, or up by (8-(X+1)) = (7-X) items +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((7-lane8) & 1) ? T1[(n+1) % 8] : T1[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T1[n] = ((7-lane8) & 2) ? T2[(n+2) % 8] : T2[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((7-lane8) & 4) ? T1[(n+4) % 8] : T1[n]; + + // rotate rows + B[0] = T2[0]; + B[1] = __shfl(T2[1], lane8 + 1, 8); + B[2] = __shfl(T2[2], lane8 + 2, 8); + B[3] = __shfl(T2[3], lane8 + 3, 8); + C[0] = __shfl(T2[4], lane8 + 4, 8); + C[1] = __shfl(T2[5], lane8 + 5, 8); + C[2] = __shfl(T2[6], lane8 + 6, 8); + C[3] = __shfl(T2[7], lane8 + 7, 8); + +} + +template __device__ __forceinline__ void __transposed_xor_BC(const uint4 *S, uint4 (&B)[4], uint4 (&C)[4], int spacing, int row) +{ + uint4 BT[4], CT[4]; + __transposed_read_BC(S, BT, CT, spacing, row); + +#pragma unroll 4 + for(int n = 0; n < 4; n++) + { + B[n] ^= BT[n]; + C[n] ^= CT[n]; + } +} + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define ROTL(a, b) ((a)<<(b))|((a)>>(32-(b))) +#else + // Kepler (Compute 3.5) + #define ROTL(a, b) __funnelshift_l( a, a, b ); +#endif + + + +#if 0 + +#define QUARTER(a,b,c,d) \ + a += b; d ^= a; d = ROTL(d,16); \ + c += d; b ^= c; b = ROTL(b,12); \ + a += b; d ^= a; d = ROTL(d,8); \ + c += d; b ^= c; b = ROTL(b,7); + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. 
*/ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#else + +#define ADD4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 += s1; d2 += s2; d3 += s3; d4 += s4; + +#define XOR4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 ^= s1; d2 ^= s2; d3 ^= s3; d4 ^= s4; + +#define ROTL4(d1,d2,d3,d4,amt) \ + d1 = ROTL(d1, amt); d2 = ROTL(d2, amt); d3 = ROTL(d3, amt); d4 = ROTL(d4, amt); + +#define QROUND(a1,a2,a3,a4, b1,b2,b3,b4, c1,c2,c3,c4, amt) \ + ADD4 (a1,a2,a3,a4, c1,c2,c3,c4) \ + XOR4 (b1,b2,b3,b4, a1,a2,a3,a4) \ + ROTL4(b1,b2,b3,b4, amt) + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. 
*/ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#endif + + +#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 7); a1^=ROTL(a10, 7); a2^=ROTL(a20, 7); a3^=ROTL(a30, 7);\ +};\ + +#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 9); a1^=ROTL(a10, 9); a2^=ROTL(a20, 9); a3^=ROTL(a30, 9);\ +};\ + +#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 13); a1^=ROTL(a10, 13); a2^=ROTL(a20, 13); a3^=ROTL(a30, 13);\ +};\ + +#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 18); a1^=ROTL(a10, 18); a2^=ROTL(a20, 18); a3^=ROTL(a30, 18);\ +};\ + +static __device__ void xor_salsa8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. 
*/ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + + +template static __device__ void block_mixer(uint4 *B, uint4 *C) +{ + switch (ALGO) { + case A_SCRYPT: xor_salsa8(B, C); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); break; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Experimental Scrypt core kernel for Kepler devices. +//! @param g_idata input data in global memory +//! 
@param g_odata output data in global memory +//////////////////////////////////////////////////////////////////////////////// +template __global__ +void nv_scrypt_core_kernelA(uint32_t *g_idata, int begin, int end) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + int i = begin; + + if(i == 0) { + __transposed_read_BC<0>((uint4*)g_idata, B, C, 1, 0); + __transposed_write_BC(B, C, (uint4*)V, c_N); + ++i; + } else + __transposed_read_BC<0>((uint4*)(V + (i-1)*32), B, C, c_N, 0); + + while(i < end) { + block_mixer(B, C); block_mixer(C, B); + __transposed_write_BC(B, C, (uint4*)(V + i*32), c_N); + ++i; + } +} + +template __global__ +void nv_scrypt_core_kernelA_LG(uint32_t *g_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + int i = begin; + + if(i == 0) { + __transposed_read_BC<0>((uint4*)g_idata, B, C, 1, 0); + __transposed_write_BC(B, C, (uint4*)V, c_spacing); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + __transposed_read_BC<0>((uint4*)(V + pos*32), B, C, c_spacing, 0); + while(loop--) { block_mixer(B, C); block_mixer(C, B); } + } + + while(i < end) { + block_mixer(B, C); block_mixer(C, B); + if (i % LOOKUP_GAP == 0) + __transposed_write_BC(B, C, (uint4*)(V + (i/LOOKUP_GAP)*32), c_spacing); + ++i; + } +} + +template __global__ +void nv_scrypt_core_kernelB(uint32_t *g_odata, int begin, int end) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + + if(begin == 0) { + __transposed_read_BC((uint4*)V, B, C, c_N, c_N_1); + block_mixer(B, C); block_mixer(C, B); + } else + __transposed_read_BC<0>((uint4*)g_odata, B, C, 1, 0); + + for (int i = begin; i < end; i++) { + int slot = C[0].x & c_N_1; + __transposed_xor_BC((uint4*)(V), B, C, c_N, slot); + block_mixer(B, C); block_mixer(C, B); + } + + __transposed_write_BC(B, C, (uint4*)(g_odata), 1); +} + +template __global__ +void nv_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + + if(begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + __transposed_read_BC((uint4*)V, B, C, c_spacing, pos); + while(loop--) { block_mixer(B, C); block_mixer(C, B); } + } else { + __transposed_read_BC((uint4*)g_odata, B, C, 1, 0); + } + + for (int i = begin; i < end; i++) { + int slot = C[0].x & c_N_1; + int pos = slot/LOOKUP_GAP, loop = slot-pos*LOOKUP_GAP; + uint4 b[4], c[4]; __transposed_read_BC((uint4*)(V), b, c, c_spacing, pos); + while(loop--) { block_mixer(b, c); block_mixer(c, b); } +#pragma unroll 4 + for(int n = 0; n < 4; n++) { B[n] ^= b[n]; C[n] ^= c[n]; } + block_mixer(B, C); block_mixer(C, B); + } + + __transposed_write_BC(B, C, (uint4*)(g_odata), 1); +} + + + +// +// Maxcoin related Keccak implementation (Keccak256) +// + +// from salsa_kernel.cu +extern std::map context_blocks; +extern std::map context_wpb; +extern std::map context_kernel; +extern std::map context_streams[2]; +extern std::map context_hash[2]; + +__constant__ uint64_t ptarget64[4]; + +#define ROL(a, offset) ((((uint64_t)a) << ((offset) 
% 64)) ^ (((uint64_t)a) >> (64-((offset) % 64)))) +#define ROL_mult8(a, offset) ROL(a, offset) + +__constant__ uint64_t KeccakF_RoundConstants[24]; +static uint64_t host_KeccakF_RoundConstants[24] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +__constant__ uint64_t pdata64[10]; + +static __device__ uint32_t cuda_swab32(uint32_t x) +{ + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +__global__ +void kepler_crypto_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate ) +{ + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = pdata64[0]; + Abe = pdata64[1]; + Abi = pdata64[2]; + Abo = pdata64[3]; + Abu = pdata64[4]; + Aga = pdata64[5]; + Age = pdata64[6]; + Agi = pdata64[7]; + Ago = pdata64[8]; + Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32); + Aka = 0x0000000000000001ULL; + Ake = 0; + Aki = 0; + Ako = 0; + Aku = 0; + Ama = 0; + Ame = 0x8000000000000000ULL; + Ami = 0; + Amo = 0; + Amu = 0; + Asa = 0; + Ase = 0; + Asi = 0; + Aso = 0; + Asu = 0; + +#pragma unroll 12 + for( int laneCount = 0; laneCount < 24; laneCount += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 
25); + Amu ^= Du; + BCo = ROL_mult8(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL_mult8(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL_mult8(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL_mult8(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + if (validate) { + g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_out[3] = Abo; + g_out[2] = Abi; + g_out[1] = Abe; + g_out[0] = Aba; + } + + // the likelyhood of meeting the hashing target is so low, that we're not guarding this + // with atomic writes, locks or similar... 
+ uint64_t *g_good64 = (uint64_t*)g_good; + if (Abo <= ptarget64[3]) { + if (Abo < g_good64[3]) { + g_good64[3] = Abo; + g_good64[2] = Abi; + g_good64[1] = Abe; + g_good64[0] = Aba; + g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + } + } +} + +static std::map context_good[2]; + +bool NVKernel::prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) +{ + static bool init[MAX_DEVICES] = {false}; + if (!init[thr_id]) + { + checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice)); + + // allocate pinned host memory for good hashes + uint32_t *tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; + + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + + return context_good[0][thr_id] && context_good[1][thr_id]; +} + +void NVKernel::do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) +{ + checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); + + kepler_crypto_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); + + // copy hashes from device memory to host (ALL hashes, lots of data...) + if (do_d2h && hash != NULL) { + size_t mem_size = throughput * sizeof(uint32_t) * 8; + checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } + else if (hash != NULL) { + // asynchronous copy of winning nonce (just 4 bytes...) + checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } +} + + +// +// Blakecoin related Keccak implementation (Keccak256) +// + +typedef uint32_t sph_u32; +#define SPH_C32(x) ((sph_u32)(x)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define SPH_ROTL32(a, b) ((a)<<(b))|((a)>>(32-(b))) +#else + // Kepler (Compute 3.5) + #define SPH_ROTL32(a, b) __funnelshift_l( a, a, b ); +#endif +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +__constant__ uint32_t pdata[20]; + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x) +{ + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). 
+ * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static __device__ void +cuda_sph_enc32be(void *dst, sph_u32 val) +{ + *(sph_u32 *)dst = cuda_sph_bswap32(val); +} + +#define Z00 0 +#define Z01 1 +#define Z02 2 +#define Z03 3 +#define Z04 4 +#define Z05 5 +#define Z06 6 +#define Z07 7 +#define Z08 8 +#define Z09 9 +#define Z0A A +#define Z0B B +#define Z0C C +#define Z0D D +#define Z0E E +#define Z0F F + +#define Z10 E +#define Z11 A +#define Z12 4 +#define Z13 8 +#define Z14 9 +#define Z15 F +#define Z16 D +#define Z17 6 +#define Z18 1 +#define Z19 C +#define Z1A 0 +#define Z1B 2 +#define Z1C B +#define Z1D 7 +#define Z1E 5 +#define Z1F 3 + +#define Z20 B +#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define Z87 8 +#define Z88 C +#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + +#define CSx(r, i) CSx_(Z ## r ## i) +#define CSx_(n) CSx__(n) +#define CSx__(n) CS ## n + +#define CS0 SPH_C32(0x243F6A88) +#define CS1 SPH_C32(0x85A308D3) +#define CS2 SPH_C32(0x13198A2E) +#define CS3 SPH_C32(0x03707344) +#define CS4 SPH_C32(0xA4093822) +#define CS5 SPH_C32(0x299F31D0) +#define CS6 SPH_C32(0x082EFA98) +#define CS7 SPH_C32(0xEC4E6C89) +#define CS8 SPH_C32(0x452821E6) +#define CS9 SPH_C32(0x38D01377) +#define CSA SPH_C32(0xBE5466CF) +#define CSB SPH_C32(0x34E90C6C) +#define CSC SPH_C32(0xC0AC29B7) +#define CSD SPH_C32(0xC97C50DD) +#define CSE SPH_C32(0x3F84D5B5) +#define CSF SPH_C32(0xB5470917) + +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = 
SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#define ROUND_S(r) do { \ + GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ + } while (0) + +#define COMPRESS32 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = input[0]; \ + M1 = input[1]; \ + M2 = input[2]; \ + M3 = input[3]; \ + M4 = input[4]; \ + M5 = input[5]; \ + M6 = input[6]; \ + M7 = input[7]; \ + M8 = input[8]; \ + M9 = input[9]; \ + MA = input[10]; \ + MB = input[11]; \ + MC = input[12]; \ + MD = input[13]; \ + ME = input[14]; \ + MF = input[15]; \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + + +__global__ +void kepler_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate) +{ + uint32_t input[16]; + uint64_t output[4]; + + #pragma unroll + for (int i=0; i < 16; ++i) input[i] = pdata[i]; + + sph_u32 H0 = 0x6A09E667; + sph_u32 H1 = 0xBB67AE85; + sph_u32 H2 = 0x3C6EF372; + sph_u32 H3 = 0xA54FF53A; + sph_u32 H4 = 0x510E527F; + sph_u32 H5 = 0x9B05688C; + sph_u32 H6 = 0x1F83D9AB; + sph_u32 H7 = 0x5BE0CD19; + sph_u32 S0 = 0; + sph_u32 S1 = 0; + sph_u32 S2 = 0; + sph_u32 S3 = 0; + sph_u32 T0 = 0; + sph_u32 T1 = 0; + T0 = SPH_T32(T0 + 512); + COMPRESS32; + + #pragma unroll + for (int i=0; i < 3; ++i) input[i] = pdata[16+i]; + + input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + input[4] = 0x80000000; + + #pragma unroll 8 + for (int i=5; i < 13; ++i) input[i] = 0; + + input[13] = 0x00000001; + input[14] = T1; + input[15] = T0 + 128; + + T0 = SPH_T32(T0 + 128); + COMPRESS32; + + cuda_sph_enc32be((unsigned char*)output + 4*6, H6); + cuda_sph_enc32be((unsigned char*)output + 4*7, H7); + if (validate || output[3] <= ptarget64[3]) + { + // this data is only needed when we actually need to save the hashes + cuda_sph_enc32be((unsigned char*)output + 4*0, H0); + cuda_sph_enc32be((unsigned char*)output + 4*1, H1); + cuda_sph_enc32be((unsigned char*)output + 4*2, H2); + cuda_sph_enc32be((unsigned char*)output + 4*3, H3); + cuda_sph_enc32be((unsigned char*)output + 4*4, H4); + cuda_sph_enc32be((unsigned char*)output + 4*5, H5); + } + + if (validate) + { + g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); + #pragma unroll + for (int i=0; i < 4; ++i) g_out[i] = output[i]; + } + + if (output[3] <= ptarget64[3]) { + uint64_t *g_good64 = (uint64_t*)g_good; + if (output[3] < g_good64[3]) { + g_good64[3] = output[3]; + 
g_good64[2] = output[2]; + g_good64[1] = output[1]; + g_good64[0] = output[0]; + g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + } + } +} + +bool NVKernel::prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) +{ + static bool init[MAX_DEVICES] = {false}; + if (!init[thr_id]) + { + // allocate pinned host memory for good hashes + uint32_t *tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; + + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + + return context_good[0][thr_id] && context_good[1][thr_id]; +} + +void NVKernel::do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) +{ + checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); + + kepler_blake256_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); + + // copy hashes from device memory to host (ALL hashes, lots of data...) + if (do_d2h && hash != NULL) { + size_t mem_size = throughput * sizeof(uint32_t) * 8; + checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } + else if (hash != NULL) { + // asynchronous copy of winning nonce (just 4 bytes...) + checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } +} diff --git a/scrypt/nv_kernel.h b/scrypt/nv_kernel.h new file mode 100644 index 0000000..e45ed9b --- /dev/null +++ b/scrypt/nv_kernel.h @@ -0,0 +1,36 @@ +#ifndef NV_KERNEL_H +#define NV_KERNEL_H + +#include "salsa_kernel.h" + +class NVKernel : public KernelInterface +{ +public: + NVKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + + virtual bool bindtexture_1D(uint32_t *d_V, size_t size); + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch); + virtual bool unbindtexture_1D(); + virtual bool unbindtexture_2D(); + + virtual char get_identifier() { return 'K'; }; + virtual int get_major_version() { return 3; }; + virtual int get_minor_version() { return 0; }; + + virtual int max_warps_per_block() { return 32; }; + virtual int get_texel_width() { return 4; }; + virtual bool support_lookup_gap() { return true; } + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } + + virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); + virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false); + + virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]); + virtual void do_blake256(dim3 grid, dim3 threads, int 
thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false); +}; + +#endif // #ifndef NV_KERNEL_H diff --git a/scrypt/nv_kernel2.cu b/scrypt/nv_kernel2.cu new file mode 100644 index 0000000..cc01843 --- /dev/null +++ b/scrypt/nv_kernel2.cu @@ -0,0 +1,1723 @@ +// +// Experimental Kernel for Kepler (Compute 3.5) devices +// code submitted by nVidia performance engineer Alexey Panteleev +// with modifications by Christian Buchner +// +// for Compute 3.5 +// NOTE: compile this .cu module for compute_35,sm_35 with --maxrregcount=80 +// for Compute 3.0 +// NOTE: compile this .cu module for compute_30,sm_30 with --maxrregcount=63 +// + +#include + +#include "cuda_runtime.h" +#include "miner.h" + +#include "salsa_kernel.h" +#include "nv_kernel2.h" + +#define THREADS_PER_WU 1 // single thread per hash + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define __ldg(x) (*(x)) +#endif + +// grab lane ID +static __device__ __inline__ unsigned int __laneId() { unsigned int laneId; asm( "mov.u32 %0, %%laneid;" : "=r"( laneId ) ); return laneId; } + +// forward references +template __global__ void nv2_scrypt_core_kernelA(uint32_t *g_idata, int begin, int end); +template __global__ void nv2_scrypt_core_kernelB(uint32_t *g_odata, int begin, int end); +template __global__ void nv2_scrypt_core_kernelA_LG(uint32_t *g_idata, int begin, int end, unsigned int LOOKUP_GAP); +template __global__ void nv2_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned int LOOKUP_GAP); + +// scratchbuf constants (pointers to scratch buffer for each work unit) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N - 1 +__constant__ uint32_t c_spacing; // (N+LOOKUP_GAP-1)/LOOKUP_GAP + + +NV2Kernel::NV2Kernel() : KernelInterface() +{ +} + +void NV2Kernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + // make some constants available to kernel, update only initially and when changing + static int prev_N[MAX_DEVICES] = {0}; + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_spacing = (N+LOOKUP_GAP-1)/LOOKUP_GAP; + + cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_spacing, &h_spacing, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. 
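+ // Batching note (illustrative; the concrete numbers are assumed, not taken from a run):
+ // each phase is issued in chunks of device_batchsize[thr_id] iterations rather than one
+ // monolithic launch, which keeps individual kernels short -- presumably so a card that
+ // also drives a display stays responsive. For classic scrypt with N = 1024 and a batch
+ // of 1024 that is a single launch per phase; with a lookup gap of 2 only every second
+ // scratchpad row is written, c_spacing = (1024 + 2 - 1) / 2 = 512 rows are kept per
+ // work unit, and kernel B recomputes the skipped rows on the fly.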
+ const int batch = device_batchsize[thr_id]; + unsigned int pos = 0; + + do + { + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) nv2_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelA<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + } else { + if (IS_SCRYPT()) nv2_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelA_LG<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + pos += batch; + } while (pos < N); + + // Second phase: Random read access from scratchpad. + pos = 0; + do + { + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) nv2_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } else { + if (IS_SCRYPT()) nv2_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + + pos += batch; + } while (pos < N); + + return success; +} + +static __device__ uint4& operator^=(uint4& left, const uint4& right) +{ + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +__device__ __forceinline__ uint4 __shfl(const uint4 val, unsigned int lane, unsigned int width) +{ + return make_uint4( + (unsigned int)__shfl((int)val.x, lane, width), + (unsigned int)__shfl((int)val.y, lane, width), + (unsigned int)__shfl((int)val.z, lane, width), + (unsigned int)__shfl((int)val.w, lane, width)); +} + +__device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[4], uint4 *D, int spacing) +{ + unsigned int laneId = __laneId(); + + unsigned int lane8 = laneId%8; + unsigned int tile = laneId/8; + + uint4 T1[8], T2[8]; + + /* Source matrix, A-H are threads, 0-7 are data items, thread A is marked with `*`: + + *A0 B0 C0 D0 E0 F0 G0 H0 + *A1 B1 C1 D1 E1 F1 G1 H1 + *A2 B2 C2 D2 E2 F2 G2 H2 + *A3 B3 C3 D3 E3 F3 G3 H3 + *A4 B4 C4 D4 E4 F4 G4 H4 + *A5 B5 C5 D5 E5 F5 G5 H5 + *A6 B6 C6 D6 E6 F6 G6 H6 + *A7 B7 C7 D7 E7 F7 G7 H7 + */ + + // rotate rows + T1[0] = B[0]; + T1[1] = __shfl(B[1], lane8 + 7, 8); + T1[2] = __shfl(B[2], lane8 + 6, 8); + T1[3] = __shfl(B[3], lane8 + 5, 8); + T1[4] = __shfl(C[0], lane8 + 4, 8); + T1[5] = __shfl(C[1], lane8 + 3, 8); + T1[6] = __shfl(C[2], lane8 + 2, 8); + T1[7] = __shfl(C[3], lane8 + 1, 8); + + /* Matrix after row rotates: + + *A0 B0 C0 D0 E0 F0 G0 H0 + H1 *A1 B1 C1 D1 E1 F1 G1 + G2 H2 *A2 B2 C2 D2 E2 F2 + F3 G3 H3 *A3 B3 C3 D3 E3 + E4 F4 G4 H4 *A4 B4 C4 D4 + D5 E5 F5 G5 H5 *A5 B5 C5 + C6 D6 E6 F6 G6 H6 *A6 B6 + B7 C7 D7 E7 F7 G7 H7 *A7 + */ + + // rotate columns up using a barrel shifter simulation + // column X is rotated up by (X+1) items +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((lane8+1) & 1) ? T1[(n+1) % 8] : T1[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T1[n] = ((lane8+1) & 2) ? T2[(n+2) % 8] : T2[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((lane8+1) & 4) ? 
T1[(n+4) % 8] : T1[n]; + + /* Matrix after column rotates: + + H1 H2 H3 H4 H5 H6 H7 H0 + G2 G3 G4 G5 G6 G7 G0 G1 + F3 F4 F5 F6 F7 F0 F1 F2 + E4 E5 E6 E7 E0 E1 E2 E3 + D5 D6 D7 D0 D1 D2 D3 D4 + C6 C7 C0 C1 C2 C3 C4 C5 + B7 B0 B1 B2 B3 B4 B5 B6 + *A0 *A1 *A2 *A3 *A4 *A5 *A6 *A7 + */ + + // rotate rows again using address math and write to D, in reverse row order + D[spacing*2*(32*tile )+ lane8 ] = T2[7]; + D[spacing*2*(32*tile+4 )+(lane8+7)%8] = T2[6]; + D[spacing*2*(32*tile+8 )+(lane8+6)%8] = T2[5]; + D[spacing*2*(32*tile+12)+(lane8+5)%8] = T2[4]; + D[spacing*2*(32*tile+16)+(lane8+4)%8] = T2[3]; + D[spacing*2*(32*tile+20)+(lane8+3)%8] = T2[2]; + D[spacing*2*(32*tile+24)+(lane8+2)%8] = T2[1]; + D[spacing*2*(32*tile+28)+(lane8+1)%8] = T2[0]; +} + +__device__ __forceinline__ void __transposed_read_BC(const uint4 *S, uint4 (&B)[4], uint4 (&C)[4], int spacing, int row) +{ + unsigned int laneId = __laneId(); + + unsigned int lane8 = laneId%8; + unsigned int tile = laneId/8; + + // Perform the same transposition as in __transposed_write_BC, but in reverse order. + // See the illustrations in comments for __transposed_write_BC. + + // read and rotate rows, in reverse row order + uint4 T1[8], T2[8]; + T1[7] = __ldg(&S[(spacing*2*(32*tile ) + lane8 + 8*__shfl(row, 0, 8))]); + T1[6] = __ldg(&S[(spacing*2*(32*tile+4 ) + (lane8+7)%8 + 8*__shfl(row, 1, 8))]); + T1[5] = __ldg(&S[(spacing*2*(32*tile+8 ) + (lane8+6)%8 + 8*__shfl(row, 2, 8))]); + T1[4] = __ldg(&S[(spacing*2*(32*tile+12) + (lane8+5)%8 + 8*__shfl(row, 3, 8))]); + T1[3] = __ldg(&S[(spacing*2*(32*tile+16) + (lane8+4)%8 + 8*__shfl(row, 4, 8))]); + T1[2] = __ldg(&S[(spacing*2*(32*tile+20) + (lane8+3)%8 + 8*__shfl(row, 5, 8))]); + T1[1] = __ldg(&S[(spacing*2*(32*tile+24) + (lane8+2)%8 + 8*__shfl(row, 6, 8))]); + T1[0] = __ldg(&S[(spacing*2*(32*tile+28) + (lane8+1)%8 + 8*__shfl(row, 7, 8))]); + + // rotate columns down using a barrel shifter simulation + // column X is rotated down by (X+1) items, or up by (8-(X+1)) = (7-X) items +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((7-lane8) & 1) ? T1[(n+1) % 8] : T1[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T1[n] = ((7-lane8) & 2) ? T2[(n+2) % 8] : T2[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((7-lane8) & 4) ? 
T1[(n+4) % 8] : T1[n]; + + // rotate rows + B[0] = T2[0]; + B[1] = __shfl(T2[1], lane8 + 1, 8); + B[2] = __shfl(T2[2], lane8 + 2, 8); + B[3] = __shfl(T2[3], lane8 + 3, 8); + C[0] = __shfl(T2[4], lane8 + 4, 8); + C[1] = __shfl(T2[5], lane8 + 5, 8); + C[2] = __shfl(T2[6], lane8 + 6, 8); + C[3] = __shfl(T2[7], lane8 + 7, 8); + +} + +__device__ __forceinline__ void __transposed_xor_BC(const uint4 *S, uint4 (&B)[4], uint4 (&C)[4], int spacing, int row) +{ + uint4 BT[4], CT[4]; + __transposed_read_BC(S, BT, CT, spacing, row); + +#pragma unroll 4 + for(int n = 0; n < 4; n++) + { + B[n] ^= BT[n]; + C[n] ^= CT[n]; + } +} + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define ROTL(a, b) ((a)<<(b))|((a)>>(32-(b))) +#else + // Kepler (Compute 3.5) + #define ROTL(a, b) __funnelshift_l( a, a, b ); +#endif + + + +#if 0 + +#define QUARTER(a,b,c,d) \ + a += b; d ^= a; d = ROTL(d,16); \ + c += d; b ^= c; b = ROTL(b,12); \ + a += b; d ^= a; d = ROTL(d,8); \ + c += d; b ^= c; b = ROTL(b,7); + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. 
*/ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#else + +#define ADD4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 += s1; d2 += s2; d3 += s3; d4 += s4; + +#define XOR4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 ^= s1; d2 ^= s2; d3 ^= s3; d4 ^= s4; + +#define ROTL4(d1,d2,d3,d4,amt) \ + d1 = ROTL(d1, amt); d2 = ROTL(d2, amt); d3 = ROTL(d3, amt); d4 = ROTL(d4, amt); + +#define QROUND(a1,a2,a3,a4, b1,b2,b3,b4, c1,c2,c3,c4, amt) \ + ADD4 (a1,a2,a3,a4, c1,c2,c3,c4) \ + XOR4 (b1,b2,b3,b4, a1,a2,a3,a4) \ + ROTL4(b1,b2,b3,b4, amt) + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. 
*/ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#endif + + +#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 7); a1^=ROTL(a10, 7); a2^=ROTL(a20, 7); a3^=ROTL(a30, 7);\ +};\ + +#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 9); a1^=ROTL(a10, 9); a2^=ROTL(a20, 9); a3^=ROTL(a30, 9);\ +};\ + +#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 13); a1^=ROTL(a10, 13); a2^=ROTL(a20, 13); a3^=ROTL(a30, 13);\ +};\ + +#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 18); a1^=ROTL(a10, 18); a2^=ROTL(a20, 18); a3^=ROTL(a30, 18);\ +};\ + +static __device__ void xor_salsa8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. 
*/ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + + +template static __device__ void block_mixer(uint4 *B, uint4 *C) +{ + switch (ALGO) + { + case A_SCRYPT: xor_salsa8(B, C); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); break; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Experimental Scrypt core kernel for Titan devices. +//! @param g_idata input data in global memory +//! 
@param g_odata output data in global memory +//////////////////////////////////////////////////////////////////////////////// +template __global__ void nv2_scrypt_core_kernelA(uint32_t *g_idata, int begin, int end) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + int i = begin; + + if(i == 0) { + __transposed_read_BC((uint4*)g_idata, B, C, 1, 0); + __transposed_write_BC(B, C, (uint4*)V, c_N); + ++i; + } else + __transposed_read_BC((uint4*)(V + (i-1)*32), B, C, c_N, 0); + + while(i < end) { + block_mixer(B, C); block_mixer(C, B); + __transposed_write_BC(B, C, (uint4*)(V + i*32), c_N); + ++i; + } +} + +template __global__ void nv2_scrypt_core_kernelA_LG(uint32_t *g_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + int i = begin; + + if(i == 0) { + __transposed_read_BC((uint4*)g_idata, B, C, 1, 0); + __transposed_write_BC(B, C, (uint4*)V, c_spacing); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + __transposed_read_BC((uint4*)(V + pos*32), B, C, c_spacing, 0); + while(loop--) { block_mixer(B, C); block_mixer(C, B); } + } + + while(i < end) { + block_mixer(B, C); block_mixer(C, B); + if (i % LOOKUP_GAP == 0) + __transposed_write_BC(B, C, (uint4*)(V + (i/LOOKUP_GAP)*32), c_spacing); + ++i; + } +} + +template __global__ void nv2_scrypt_core_kernelB(uint32_t *g_odata, int begin, int end) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + + if(begin == 0) { + __transposed_read_BC((uint4*)V, B, C, c_N, c_N_1); + block_mixer(B, C); block_mixer(C, B); + } else + __transposed_read_BC((uint4*)g_odata, B, C, 1, 0); + + for (int i = begin; i < end; i++) { + int slot = C[0].x & c_N_1; + __transposed_xor_BC((uint4*)(V), B, C, c_N, slot); + block_mixer(B, C); block_mixer(C, B); + } + + __transposed_write_BC(B, C, (uint4*)(g_odata), 1); +} + +template __global__ void nv2_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + + if(begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + __transposed_read_BC((uint4*)V, B, C, c_spacing, pos); + while(loop--) { block_mixer(B, C); block_mixer(C, B); } + } else { + __transposed_read_BC((uint4*)g_odata, B, C, 1, 0); + } + + for (int i = begin; i < end; i++) { + int slot = C[0].x & c_N_1; + int pos = slot/LOOKUP_GAP, loop = slot-pos*LOOKUP_GAP; + uint4 b[4], c[4]; __transposed_read_BC((uint4*)(V), b, c, c_spacing, pos); + while(loop--) { block_mixer(b, c); block_mixer(c, b); } +#pragma unroll 4 + for(int n = 0; n < 4; n++) { B[n] ^= b[n]; C[n] ^= c[n]; } + block_mixer(B, C); block_mixer(C, B); + } + + __transposed_write_BC(B, C, (uint4*)(g_odata), 1); +} + + +// +// Maxcoin related Keccak implementation (Keccak256) +// + +// from salsa_kernel.cu +extern std::map context_blocks; +extern std::map context_wpb; +extern std::map context_kernel; +extern std::map context_streams[2]; +extern std::map context_hash[2]; + +__constant__ uint64_t ptarget64[4]; + +// ROL macro replaced with the inline assembly code below to work 
around a performance issue +//#define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64)))) +__inline__ __device__ uint2 ROL(const uint2 a, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#define ROL_mult8(a, offset) ROL(a, offset) + +__inline__ __device__ uint64_t devectorize(uint2 v) { return __double_as_longlong(__hiloint2double(v.y, v.x)); } +__inline__ __device__ uint2 vectorize(uint64_t v) { return make_uint2(__double2loint(__longlong_as_double(v)), __double2hiint(__longlong_as_double(v))); } +__inline__ __device__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +__inline__ __device__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +__inline__ __device__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } +__inline__ __device__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } +__inline__ __device__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } + +__constant__ uint64_t KeccakF_RoundConstants[24]; + +static uint64_t host_KeccakF_RoundConstants[24] = +{ + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +__constant__ uint64_t pdata64[10]; + +static __device__ uint32_t cuda_swab32(uint32_t x) +{ + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +// in this implementation the first and last iteration of the for() loop were explicitly +// unrolled and redundant operations were removed (e.g. 
operations on zero inputs, and +// computation of unnecessary outputs) +__global__ void titan_crypto_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate ) +{ + uint2 Aba, Abe, Abi, Abo, Abu; + uint2 Aga, Age, Agi, Ago, Agu; + uint2 Aka, Ake, Aki, Ako, Aku; + uint2 Ama, Ame, Ami, Amo, Amu; + uint2 Asa, Ase, Asi, Aso, Asu; + uint2 BCa, BCe, BCi, BCo, BCu; + uint2 Da, De, Di, Do, Du; + uint2 Eba, Ebe, Ebi, Ebo, Ebu; + uint2 Ega, Ege, Egi, Ego, Egu; + uint2 Eka, Eke, Eki, Eko, Eku; + uint2 Ema, Eme, Emi, Emo, Emu; + uint2 Esa, Ese, Esi, Eso, Esu; + + // embed unique nonce into source data stream in pdata[] + Agu = vectorize((pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32)); + + // prepareTheta + BCa = vectorize(pdata64[0]^pdata64[5]^0x0000000000000001ULL); + BCe = vectorize(pdata64[1]^pdata64[6]^0x8000000000000000ULL); + BCi = vectorize(pdata64[2]^pdata64[7]); + BCo = vectorize(pdata64[3]^pdata64[8]); + BCu = vectorize(pdata64[4])^Agu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba = vectorize(pdata64[0]) ^ Da; + BCa = Aba; + Age = vectorize(pdata64[6]) ^ De; + BCe = ROL(Age, 44); + Aki = Di; + BCi = ROL(Aki, 43); + Amo = Do; + BCo = ROL(Amo, 21); + Asu = Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= vectorize((uint64_t)KeccakF_RoundConstants[0]); + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo = vectorize(pdata64[3]) ^ Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka = vectorize(0x0000000000000001ULL) ^ Da; + BCi = ROL(Aka, 3); + Ame = vectorize(0x8000000000000000ULL) ^ De; + BCo = ROL(Ame, 45); + Asi = Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe = vectorize(pdata64[1]) ^ De; + BCa = ROL(Abe, 1); + Agi = vectorize(pdata64[7]) ^ Di; + BCe = ROL(Agi, 6); + Ako = Do; + BCi = ROL(Ako, 25); + Amu = Du; + BCo = ROL(Amu, 8); + Asa = Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu = vectorize(pdata64[4]) ^ Du; + BCa = ROL(Abu, 27); + Aga = vectorize(pdata64[5]) ^ Da; + BCe = ROL(Aga, 36); + Ake = De; + BCi = ROL(Ake, 10); + Ami = Di; + BCo = ROL(Ami, 15); + Aso = Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi = vectorize(pdata64[2]) ^ Di; + BCa = ROL(Abi, 62); + Ago = vectorize(pdata64[8]) ^ Do; + BCe = ROL(Ago, 55); + Aku = Du; + BCi = ROL(Aku, 39); + Ama = Da; + BCo = ROL(Ama, 41); + Ase = De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 
43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= vectorize((uint64_t)KeccakF_RoundConstants[1]); + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + +//#pragma unroll 10 + for( int laneCount = 2; laneCount < 22; laneCount += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= vectorize((uint64_t)KeccakF_RoundConstants[laneCount]); + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + 
Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= vectorize((uint64_t)KeccakF_RoundConstants[laneCount+1]); + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= vectorize((uint64_t)KeccakF_RoundConstants[22]); + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; 
+ BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= vectorize((uint64_t)KeccakF_RoundConstants[23]); + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + + if (validate) { + g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_out[3] = devectorize(Abo); + g_out[2] = devectorize(Abi); + g_out[1] = devectorize(Abe); + g_out[0] = devectorize(Aba); + } + + // the likelyhood of meeting the hashing target is so low, that we're not guarding this + // with atomic writes, locks or similar... + uint64_t *g_good64 = (uint64_t*)g_good; + if (devectorize(Abo) <= ptarget64[3]) { + if (devectorize(Abo) < g_good64[3]) { + g_good64[3] = devectorize(Abo); + g_good64[2] = devectorize(Abi); + g_good64[1] = devectorize(Abe); + g_good64[0] = devectorize(Aba); + g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + } + } +} + +static std::map context_good[2]; + +bool NV2Kernel::prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) +{ + static bool init[MAX_DEVICES] = {false}; + if (!init[thr_id]) + { + checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice)); + + // allocate pinned host memory for good hashes + uint32_t *tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; + + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + + return context_good[0][thr_id] && context_good[1][thr_id]; +} + +void NV2Kernel::do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) +{ + checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); + + titan_crypto_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); + + // copy hashes from device memory to host (ALL hashes, lots of data...) 
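+ // Size note (rough arithmetic, not a measurement): the validation path below moves
+ // throughput * 8 * sizeof(uint32_t) = 32 bytes per work item, e.g. about 32 MiB per
+ // launch at a throughput of 2^20 hashes, whereas the normal mining path only fetches
+ // the 4-byte winning nonce from slot 8 of the good-hash buffer. Both copies are
+ // asynchronous on context_streams[stream][thr_id], so the caller is expected to
+ // synchronize that stream before inspecting the result.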
+ if (do_d2h && hash != NULL) { + size_t mem_size = throughput * sizeof(uint32_t) * 8; + checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } + else if (hash != NULL) { + // asynchronous copy of winning nonce (just 4 bytes...) + checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } +} + + +// +// Blakecoin related Keccak implementation (Keccak256) +// + +typedef uint32_t sph_u32; +#define SPH_C32(x) ((sph_u32)(x)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define SPH_ROTL32(a, b) ((a)<<(b))|((a)>>(32-(b))) +#else + // Kepler (Compute 3.5) + #define SPH_ROTL32(a, b) __funnelshift_l( a, a, b ); +#endif +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +__constant__ uint32_t pdata[20]; + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x) +{ + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). + * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static __device__ void +cuda_sph_enc32be(void *dst, sph_u32 val) +{ + *(sph_u32 *)dst = cuda_sph_bswap32(val); +} + +#define Z00 0 +#define Z01 1 +#define Z02 2 +#define Z03 3 +#define Z04 4 +#define Z05 5 +#define Z06 6 +#define Z07 7 +#define Z08 8 +#define Z09 9 +#define Z0A A +#define Z0B B +#define Z0C C +#define Z0D D +#define Z0E E +#define Z0F F + +#define Z10 E +#define Z11 A +#define Z12 4 +#define Z13 8 +#define Z14 9 +#define Z15 F +#define Z16 D +#define Z17 6 +#define Z18 1 +#define Z19 C +#define Z1A 0 +#define Z1B 2 +#define Z1C B +#define Z1D 7 +#define Z1E 5 +#define Z1F 3 + +#define Z20 B +#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define Z87 8 +#define Z88 C 
+#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + +#define CSx(r, i) CSx_(Z ## r ## i) +#define CSx_(n) CSx__(n) +#define CSx__(n) CS ## n + +#define CS0 SPH_C32(0x243F6A88) +#define CS1 SPH_C32(0x85A308D3) +#define CS2 SPH_C32(0x13198A2E) +#define CS3 SPH_C32(0x03707344) +#define CS4 SPH_C32(0xA4093822) +#define CS5 SPH_C32(0x299F31D0) +#define CS6 SPH_C32(0x082EFA98) +#define CS7 SPH_C32(0xEC4E6C89) +#define CS8 SPH_C32(0x452821E6) +#define CS9 SPH_C32(0x38D01377) +#define CSA SPH_C32(0xBE5466CF) +#define CSB SPH_C32(0x34E90C6C) +#define CSC SPH_C32(0xC0AC29B7) +#define CSD SPH_C32(0xC97C50DD) +#define CSE SPH_C32(0x3F84D5B5) +#define CSF SPH_C32(0xB5470917) + +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#define ROUND_S(r) do { \ + GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ + } while (0) + +#define COMPRESS32 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = input[0]; \ + M1 = input[1]; \ + M2 = input[2]; \ + M3 = input[3]; \ + M4 = input[4]; \ + M5 = input[5]; \ + M6 = input[6]; \ + M7 = input[7]; \ + M8 = input[8]; \ + M9 = input[9]; \ + MA = input[10]; \ + MB = input[11]; \ + MC = input[12]; \ + MD = input[13]; \ + ME = input[14]; \ + MF = input[15]; \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + + +__global__ void titan_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate ) +{ + uint32_t input[16]; + uint64_t output[4]; + +#pragma unroll 16 + for (int i=0; i < 16; ++i) input[i] = pdata[i]; + + sph_u32 H0 = 0x6A09E667; + sph_u32 H1 = 0xBB67AE85; + sph_u32 H2 = 0x3C6EF372; + sph_u32 H3 = 0xA54FF53A; + sph_u32 H4 = 0x510E527F; + sph_u32 H5 = 0x9B05688C; + sph_u32 H6 = 0x1F83D9AB; + sph_u32 H7 = 0x5BE0CD19; + sph_u32 S0 = 0; + sph_u32 S1 = 0; + sph_u32 S2 = 0; + sph_u32 S3 = 0; + 
sph_u32 T0 = 0;
+ sph_u32 T1 = 0;
+ T0 = SPH_T32(T0 + 512);
+ COMPRESS32;
+
+#pragma unroll 3
+ for (int i=0; i < 3; ++i) input[i] = pdata[16+i];
+ input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
+ input[4] = 0x80000000;
+#pragma unroll 8
+ for (int i=5; i < 13; ++i) input[i] = 0;
+ input[13] = 0x00000001;
+ input[14] = T1;
+ input[15] = T0 + 128;
+
+ T0 = SPH_T32(T0 + 128);
+ COMPRESS32;
+
+ cuda_sph_enc32be((unsigned char*)output + 4*6, H6);
+ cuda_sph_enc32be((unsigned char*)output + 4*7, H7);
+ if (validate || output[3] <= ptarget64[3])
+ {
+ // this data is only needed when we actually need to save the hashes
+ cuda_sph_enc32be((unsigned char*)output + 4*0, H0);
+ cuda_sph_enc32be((unsigned char*)output + 4*1, H1);
+ cuda_sph_enc32be((unsigned char*)output + 4*2, H2);
+ cuda_sph_enc32be((unsigned char*)output + 4*3, H3);
+ cuda_sph_enc32be((unsigned char*)output + 4*4, H4);
+ cuda_sph_enc32be((unsigned char*)output + 4*5, H5);
+ }
+
+ if (validate)
+ {
+ g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
+#pragma unroll 4
+ for (int i=0; i < 4; ++i) g_out[i] = output[i];
+ }
+
+ if (output[3] <= ptarget64[3]) {
+ uint64_t *g_good64 = (uint64_t*)g_good;
+ if (output[3] < g_good64[3]) {
+ g_good64[3] = output[3];
+ g_good64[2] = output[2];
+ g_good64[1] = output[1];
+ g_good64[0] = output[0];
+ g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
+ }
+ }
+}
+
+bool NV2Kernel::prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
+{
+ static bool init[MAX_DEVICES] = {false};
+ if (!init[thr_id])
+ {
+ // allocate device memory for the "good hash" buffers
+ uint32_t *tmp;
+ checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
+ checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;
+
+ init[thr_id] = true;
+ }
+ checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+
+ return context_good[0][thr_id] && context_good[1][thr_id];
+}
+
+void NV2Kernel::do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
+{
+ checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));
+
+ titan_blake256_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);
+
+ // copy hashes from device memory to host (ALL hashes, lots of data...)
+ if (do_d2h && hash != NULL) {
+ size_t mem_size = throughput * sizeof(uint32_t) * 8;
+ checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
+ cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
+ }
+ else if (hash != NULL) {
+ // asynchronous copy of winning nonce (just 4 bytes...)
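+ // word 8 of the 9-word "good" buffer receives the winning nonce; words 0..7 hold the best hash found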
+ checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } +} diff --git a/scrypt/nv_kernel2.h b/scrypt/nv_kernel2.h new file mode 100644 index 0000000..a67c65f --- /dev/null +++ b/scrypt/nv_kernel2.h @@ -0,0 +1,36 @@ +#ifndef NV2_KERNEL_H +#define NV2_KERNEL_H + +#include "miner.h" +#include + +#include "salsa_kernel.h" + +class NV2Kernel : public KernelInterface +{ +public: + NV2Kernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + + virtual char get_identifier() { return 'T'; }; + virtual int get_major_version() { return 3; }; + virtual int get_minor_version() { return 5; }; + + virtual int max_warps_per_block() { return 24; }; + virtual int get_texel_width() { return 4; }; + virtual bool no_textures() { return true; } + virtual bool support_lookup_gap() { return true; } + + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } + + virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); + virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false); + + virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]); + virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false); +}; + +#endif // #ifndef NV2_KERNEL_H diff --git a/scrypt/salsa_kernel.cu b/scrypt/salsa_kernel.cu new file mode 100644 index 0000000..e82de02 --- /dev/null +++ b/scrypt/salsa_kernel.cu @@ -0,0 +1,939 @@ + +// +// Contains the autotuning logic and some utility functions. +// Note that all CUDA kernels have been moved to other .cu files +// +// NOTE: compile this .cu module for compute_20,sm_21 with --maxrregcount=64 +// + +#include +#include +#include +#include // usleep +#include // tolower +#include "cuda_helper.h" + +#include "salsa_kernel.h" + +#include "titan_kernel.h" +#include "fermi_kernel.h" +#include "test_kernel.h" +#include "nv_kernel.h" +#include "nv_kernel2.h" +#include "kepler_kernel.h" + +#include "miner.h" + +#if WIN32 +#ifdef _WIN64 +#define _64BIT 1 +#endif +#else +#if __x86_64__ +#define _64BIT 1 +#endif +#endif + +#if _64BIT +#define MAXMEM 0x300000000ULL // 12 GB (the largest Kepler) +#else +#define MAXMEM 0xFFFFFFFFULL // nearly 4 GB (32 bit limitations) +#endif + +// require CUDA 5.5 driver API +#define DMAJ 5 +#define DMIN 5 + +// define some error checking macros +#undef checkCudaErrors + +#if WIN32 +#define DELIMITER '/' +#else +#define DELIMITER '/' +#endif +#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? 
strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
+
+#define checkCudaErrors(x) \
+{ \
+ cudaGetLastError(); \
+ x; \
+ cudaError_t err = cudaGetLastError(); \
+ if (err != cudaSuccess) \
+ applog(LOG_ERR, "GPU #%d: Err %d: %s (%s:%d)", device_map[thr_id], err, cudaGetErrorString(err), __FILENAME__, __LINE__); \
+}
+
+// some globals containing pointers to device memory (for chunked allocation)
+// [MAX_DEVICES] indexes up to MAX_DEVICES threads (0...MAX_DEVICES-1)
+int MAXWARPS[MAX_GPUS];
+uint32_t* h_V[MAX_GPUS][TOTAL_WARP_LIMIT*64]; // NOTE: the *64 prevents buffer overflow for --keccak
+uint32_t h_V_extra[MAX_GPUS][TOTAL_WARP_LIMIT*64]; // with really large kernel launch configurations
+
+KernelInterface *Best_Kernel_Heuristics(cudaDeviceProp *props)
+{
+ KernelInterface *kernel = NULL;
+ uint32_t N = (1UL << (opt_nfactor+1)); // not sure
+
+ if (IS_SCRYPT() || (IS_SCRYPT_JANE() && N <= 8192))
+ {
+ // high register count kernels (scrypt, low N-factor scrypt-jane)
+ if (props->major > 3 || (props->major == 3 && props->minor >= 5))
+ kernel = new NV2Kernel(); // we don't want this for Keccak though
+ else if (props->major == 3 && props->minor == 0)
+ kernel = new NVKernel();
+ else if (props->major == 2 || props->major == 1)
+ kernel = new FermiKernel();
+ }
+ else
+ {
+ // low register count kernels (high N-factor scrypt-jane)
+ if (props->major > 3 || (props->major == 3 && props->minor >= 5))
+ kernel = new TitanKernel();
+ else if (props->major == 3 && props->minor == 0)
+ kernel = new KeplerKernel();
+ else if (props->major == 2 || props->major == 1)
+ kernel = new TestKernel();
+ }
+ return kernel;
+}
+
+
+bool validate_config(char *config, int &b, int &w, KernelInterface **kernel = NULL, cudaDeviceProp *props = NULL)
+{
+ bool success = false;
+ char kernelid = ' ';
+ if (config != NULL)
+ {
+ if (config[0] == 'T' || config[0] == 'K' || config[0] == 'F' || config[0] == 'L' ||
+ config[0] == 't' || config[0] == 'k' || config[0] == 'f' ||
+ config[0] == 'Z' || config[0] == 'Y' || config[0] == 'X') {
+ kernelid = config[0];
+ config++;
+ }
+
+ if (config[0] >= '0' && config[0] <= '9')
+ if (sscanf(config, "%dx%d", &b, &w) == 2)
+ success = true;
+
+ if (success && kernel != NULL)
+ {
+ switch (kernelid)
+ {
+ case 'T': case 'Z': *kernel = new NV2Kernel(); break;
+ case 't': *kernel = new TitanKernel(); break;
+ case 'K': case 'Y': *kernel = new NVKernel(); break;
+ case 'k': *kernel = new KeplerKernel(); break;
+ case 'F': case 'L': *kernel = new FermiKernel(); break;
+ case 'f': case 'X': *kernel = new TestKernel(); break;
+ case ' ': // choose based on device architecture
+ *kernel = Best_Kernel_Heuristics(props);
+ break;
+ }
+ }
+ }
+ return success;
+}
+
+std::map<int, int> context_blocks;
+std::map<int, int> context_wpb;
+std::map<int, bool> context_concurrent;
+std::map<int, KernelInterface *> context_kernel;
+std::map<int, uint32_t *> context_idata[2];
+std::map<int, uint32_t *> context_odata[2];
+std::map<int, cudaStream_t> context_streams[2];
+std::map<int, uint32_t *> context_X[2];
+std::map<int, uint32_t *> context_H[2];
+std::map<int, cudaEvent_t> context_serialize[2];
+
+// for SHA256 hashing on GPU
+std::map<int, uint32_t *> context_tstate[2];
+std::map<int, uint32_t *> context_ostate[2];
+std::map<int, uint32_t *> context_hash[2];
+
+int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &wpb);
+
+void cuda_shutdown(int thr_id)
+{
+ cudaDeviceSynchronize();
+ cudaDeviceReset();
+ cudaThreadExit();
+}
+
+int cuda_throughput(int thr_id)
+{
+ int GRID_BLOCKS, WARPS_PER_BLOCK;
+ if (context_blocks.find(thr_id) == context_blocks.end())
+ {
+#if 0
+ CUcontext ctx;
+ cuCtxCreate( &ctx, CU_CTX_SCHED_YIELD, device_map[thr_id] );
+ cuCtxSetCurrent(ctx);
+#else
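+ // runtime API path: request yield-based scheduling, select the device and let
+ // cudaFree(0) establish the CUDA context lazily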
+ checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield)); + checkCudaErrors(cudaSetDevice(device_map[thr_id])); + checkCudaErrors(cudaFree(0)); +#endif + + KernelInterface *kernel; + bool concurrent; + GRID_BLOCKS = find_optimal_blockcount(thr_id, kernel, concurrent, WARPS_PER_BLOCK); + + if(GRID_BLOCKS == 0) + return 0; + + unsigned int THREADS_PER_WU = kernel->threads_per_wu(); + unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * 32; + unsigned int state_size = WU_PER_LAUNCH * sizeof(uint32_t) * 8; + + // allocate device memory for scrypt_core inputs and outputs + uint32_t *tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_idata[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_idata[1][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_odata[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_odata[1][thr_id] = tmp; + + // allocate pinned host memory for scrypt hashes + checkCudaErrors(cudaHostAlloc((void **) &tmp, state_size, cudaHostAllocDefault)); context_H[0][thr_id] = tmp; + checkCudaErrors(cudaHostAlloc((void **) &tmp, state_size, cudaHostAllocDefault)); context_H[1][thr_id] = tmp; + + if (IS_SCRYPT()) + { + if (parallel < 2) + { + // allocate pinned host memory for scrypt_core input/output + checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp; + checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[1][thr_id] = tmp; + } + else + { + // allocate tstate, ostate, scrypt hash device memory + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_tstate[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_tstate[1][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_ostate[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_ostate[1][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp; + } + } + else if (IS_SCRYPT_JANE()) + { + // allocate pinned host memory for scrypt_core input/output + checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp; + checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[1][thr_id] = tmp; + + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp; + } + + // create two CUDA streams + cudaStream_t tmp2; + checkCudaErrors( cudaStreamCreate(&tmp2) ); context_streams[0][thr_id] = tmp2; + checkCudaErrors( cudaStreamCreate(&tmp2) ); context_streams[1][thr_id] = tmp2; + + // events used to serialize the kernel launches (we don't want any overlapping of kernels) + cudaEvent_t tmp4; + checkCudaErrors(cudaEventCreateWithFlags(&tmp4, cudaEventDisableTiming)); context_serialize[0][thr_id] = tmp4; + checkCudaErrors(cudaEventCreateWithFlags(&tmp4, cudaEventDisableTiming)); context_serialize[1][thr_id] = tmp4; + checkCudaErrors(cudaEventRecord(context_serialize[1][thr_id])); + + context_kernel[thr_id] = kernel; + context_concurrent[thr_id] = concurrent; + context_blocks[thr_id] = GRID_BLOCKS; + context_wpb[thr_id] = WARPS_PER_BLOCK; + } + + GRID_BLOCKS = context_blocks[thr_id]; + WARPS_PER_BLOCK = 
context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + return WU_PER_LAUNCH; +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class + { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class + { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class + { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs + { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class + { 0x50, 128 }, // Maxwell Generation (SM 5.0) GTX750/750Ti + { 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs + { -1, -1 }, + }; + + int index = 0; + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + index++; + } + + // If we don't find the values, we default use the previous one to run properly + applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128); + return 128; +} + +#ifdef WIN32 +#include +static int console_width() { + CONSOLE_SCREEN_BUFFER_INFO csbi; + GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi); + return csbi.srWindow.Right - csbi.srWindow.Left + 1; +} +#else +static inline int console_width() { + return 999; +} +#endif + +int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &WARPS_PER_BLOCK) +{ + int cw = console_width(); + int optimal_blocks = 0; + + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, device_map[thr_id])); + concurrent = (props.concurrentKernels > 0); + + device_name[thr_id] = strdup(props.name); + applog(LOG_INFO, "GPU #%d: %s with SM %d.%d", device_map[thr_id], props.name, props.major, props.minor); + + WARPS_PER_BLOCK = -1; + + // if not specified, use interactive mode for devices that have the watchdog timer enabled + if (device_interactive[thr_id] == -1) + device_interactive[thr_id] = props.kernelExecTimeoutEnabled; + + // turn off texture cache if not otherwise specified + if (device_texturecache[thr_id] == -1) + device_texturecache[thr_id] = 0; + + // if not otherwise specified or required, turn single memory allocations off as they reduce + // the amount of memory that we can allocate on Windows Vista, 7 and 8 (WDDM driver model issue) + if (device_singlememory[thr_id] == -1) device_singlememory[thr_id] = 0; + + // figure out which kernel implementation to use + if (!validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK, &kernel, &props)) { + kernel = NULL; + if (device_config[thr_id] != NULL) { + if (device_config[thr_id][0] == 'T' || device_config[thr_id][0] == 'Z') + kernel = new NV2Kernel(); + else if (device_config[thr_id][0] == 't') + kernel = new TitanKernel(); + else if (device_config[thr_id][0] == 'K' || device_config[thr_id][0] == 'Y') + kernel = new NVKernel(); + else if (device_config[thr_id][0] == 'k') + kernel = new KeplerKernel(); + else if 
(device_config[thr_id][0] == 'F' || device_config[thr_id][0] == 'L') + kernel = new FermiKernel(); + else if (device_config[thr_id][0] == 'f' || device_config[thr_id][0] == 'X') + kernel = new TestKernel(); + } + if (kernel == NULL) kernel = Best_Kernel_Heuristics(&props); + } + + if (kernel->get_major_version() > props.major || kernel->get_major_version() == props.major && kernel->get_minor_version() > props.minor) + { + applog(LOG_ERR, "GPU #%d: FATAL: the '%c' kernel requires %d.%d capability!", device_map[thr_id], kernel->get_identifier(), kernel->get_major_version(), kernel->get_minor_version()); + return 0; + } + + // set whatever cache configuration and shared memory bank mode the kernel prefers + checkCudaErrors(cudaDeviceSetCacheConfig(kernel->cache_config())); + checkCudaErrors(cudaDeviceSetSharedMemConfig(kernel->shared_mem_config())); + + // some kernels (e.g. Titan) do not support the texture cache + if (kernel->no_textures() && device_texturecache[thr_id]) { + applog(LOG_WARNING, "GPU #%d: the '%c' kernel ignores the texture cache argument", device_map[thr_id], kernel->get_identifier()); + device_texturecache[thr_id] = 0; + } + + // Texture caching only works with single memory allocation + if (device_texturecache[thr_id]) device_singlememory[thr_id] = 1; + + if (kernel->single_memory() && !device_singlememory[thr_id]) { + applog(LOG_WARNING, "GPU #%d: the '%c' kernel requires single memory allocation", device_map[thr_id], kernel->get_identifier()); + device_singlememory[thr_id] = 1; + } + + if (device_lookup_gap[thr_id] == 0) device_lookup_gap[thr_id] = 1; + if (!kernel->support_lookup_gap() && device_lookup_gap[thr_id] > 1) + { + applog(LOG_WARNING, "GPU #%d: the '%c' kernel does not support a lookup gap", device_map[thr_id], kernel->get_identifier()); + device_lookup_gap[thr_id] = 1; + } + + applog(LOG_INFO, "GPU #%d: interactive: %d, tex-cache: %d%s, single-alloc: %d", device_map[thr_id], + (device_interactive[thr_id] != 0) ? 1 : 0, + (device_texturecache[thr_id] != 0) ? device_texturecache[thr_id] : 0, (device_texturecache[thr_id] != 0) ? "D" : "", + (device_singlememory[thr_id] != 0) ? 1 : 0 ); + + // number of threads collaborating on one work unit (hash) + unsigned int THREADS_PER_WU = kernel->threads_per_wu(); + unsigned int LOOKUP_GAP = device_lookup_gap[thr_id]; + unsigned int BACKOFF = device_backoff[thr_id]; + unsigned int N = (1 << (opt_nfactor+1)); + double szPerWarp = (double)(SCRATCH * WU_PER_WARP * sizeof(uint32_t)); + //applog(LOG_INFO, "WU_PER_WARP=%u, THREADS_PER_WU=%u, LOOKUP_GAP=%u, BACKOFF=%u, SCRATCH=%u", WU_PER_WARP, THREADS_PER_WU, LOOKUP_GAP, BACKOFF, SCRATCH); + applog(LOG_INFO, "GPU #%d: %d hashes / %.1f MB per warp.", device_map[thr_id], WU_PER_WARP, szPerWarp / (1024.0 * 1024.0)); + + // compute highest MAXWARPS numbers for kernels allowing cudaBindTexture to succeed + int MW_1D_4 = 134217728 / (SCRATCH * WU_PER_WARP / 4); // for uint4_t textures + int MW_1D_2 = 134217728 / (SCRATCH * WU_PER_WARP / 2); // for uint2_t textures + int MW_1D = kernel->get_texel_width() == 2 ? MW_1D_2 : MW_1D_4; + + uint32_t *d_V = NULL; + if (device_singlememory[thr_id]) + { + // if no launch config was specified, we simply + // allocate the single largest memory chunk on the device that we can get + if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) { + MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK; + } + else { + // compute no. 
of warps to allocate the largest number producing a single memory block + // PROBLEM: one some devices, ALL allocations will fail if the first one failed. This sucks. + size_t MEM_LIMIT = (size_t)min((unsigned long long)MAXMEM, (unsigned long long)props.totalGlobalMem); + int warpmax = (int)min((unsigned long long)TOTAL_WARP_LIMIT, (unsigned long long)(MEM_LIMIT / szPerWarp)); + + // run a bisection algorithm for memory allocation (way more reliable than the previous approach) + int best = 0; + int warp = (warpmax+1)/2; + int interval = (warpmax+1)/2; + while (interval > 0) + { + cudaGetLastError(); // clear the error state + cudaMalloc((void **)&d_V, (size_t)(szPerWarp * warp)); + if (cudaGetLastError() == cudaSuccess) { + checkCudaErrors(cudaFree(d_V)); d_V = NULL; + if (warp > best) best = warp; + if (warp == warpmax) break; + interval = (interval+1)/2; + warp += interval; + if (warp > warpmax) warp = warpmax; + } + else + { + interval = interval/2; + warp -= interval; + if (warp < 1) warp = 1; + } + } + // back off a bit from the largest possible allocation size + MAXWARPS[thr_id] = ((100-BACKOFF)*best+50)/100; + } + + // now allocate a buffer for determined MAXWARPS setting + cudaGetLastError(); // clear the error state + cudaMalloc((void **)&d_V, (size_t)SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); + if (cudaGetLastError() == cudaSuccess) { + for (int i=0; i < MAXWARPS[thr_id]; ++i) + h_V[thr_id][i] = d_V + SCRATCH * WU_PER_WARP * i; + + if (device_texturecache[thr_id] == 1) + { + if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) + { + if ( optimal_blocks * WARPS_PER_BLOCK > MW_1D ) { + applog(LOG_ERR, "GPU #%d: '%s' exceeds limits for 1D cache. Using 2D cache instead.", device_map[thr_id], device_config[thr_id]); + device_texturecache[thr_id] = 2; + } + } + // bind linear memory to a 1D texture reference + if (kernel->get_texel_width() == 2) + kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * min(MAXWARPS[thr_id],MW_1D_2) * sizeof(uint32_t)); + else + kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * min(MAXWARPS[thr_id],MW_1D_4) * sizeof(uint32_t)); + } + else if (device_texturecache[thr_id] == 2) + { + // bind pitch linear memory to a 2D texture reference + if (kernel->get_texel_width() == 2) + kernel->bindtexture_2D(d_V, SCRATCH/2, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); + else + kernel->bindtexture_2D(d_V, SCRATCH/4, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); + } + } + else + { + applog(LOG_ERR, "GPU #%d: FATAL: Launch config '%s' requires too much memory!", device_map[thr_id], device_config[thr_id]); + return 0; + } + } + else + { + if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) + MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK; + else + MAXWARPS[thr_id] = TOTAL_WARP_LIMIT; + + // chunked memory allocation up to device limits + int warp; + for (warp = 0; warp < MAXWARPS[thr_id]; ++warp) { + // work around partition camping problems by adding a random start address offset to each allocation + h_V_extra[thr_id][warp] = (props.major == 1) ? 
(16 * (rand()%(16384/16))) : 0; + cudaGetLastError(); // clear the error state + cudaMalloc((void **) &h_V[thr_id][warp], (SCRATCH * WU_PER_WARP + h_V_extra[thr_id][warp])*sizeof(uint32_t)); + if (cudaGetLastError() == cudaSuccess) h_V[thr_id][warp] += h_V_extra[thr_id][warp]; + else { + h_V_extra[thr_id][warp] = 0; + + // back off by several warp allocations to have some breathing room + int remove = (BACKOFF*warp+50)/100; + for (int i=0; warp > 0 && i < remove; ++i) { + warp--; + checkCudaErrors(cudaFree(h_V[thr_id][warp]-h_V_extra[thr_id][warp])); + h_V[thr_id][warp] = NULL; h_V_extra[thr_id][warp] = 0; + } + + break; + } + } + MAXWARPS[thr_id] = warp; + } + if (IS_SCRYPT() || IS_SCRYPT_JANE()) { + kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]); + } + + if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) + { + if (optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id]) + { + applog(LOG_ERR, "GPU #%d: FATAL: Given launch config '%s' requires too much memory.", device_map[thr_id], device_config[thr_id]); + return 0; + } + + if (WARPS_PER_BLOCK > kernel->max_warps_per_block()) + { + applog(LOG_ERR, "GPU #%d: FATAL: Given launch config '%s' exceeds warp limit for '%c' kernel.", device_map[thr_id], device_config[thr_id], kernel->get_identifier()); + return 0; + } + } + else + { + if (device_config[thr_id] != NULL && strcasecmp("auto", device_config[thr_id])) + applog(LOG_WARNING, "GPU #%d: Given launch config '%s' does not validate.", device_map[thr_id], device_config[thr_id]); + + if (autotune) + { + applog(LOG_INFO, "GPU #%d: Performing auto-tuning, please wait 2 minutes...", device_map[thr_id]); + + // allocate device memory + uint32_t *d_idata = NULL, *d_odata = NULL; + if (IS_SCRYPT() || IS_SCRYPT_JANE()) { + unsigned int mem_size = MAXWARPS[thr_id] * WU_PER_WARP * sizeof(uint32_t) * 32; + checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size)); + checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size)); + + // pre-initialize some device memory + uint32_t *h_idata = (uint32_t*)malloc(mem_size); + for (unsigned int i=0; i < mem_size/sizeof(uint32_t); ++i) h_idata[i] = i*2654435761UL; // knuth's method + checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice)); + free(h_idata); + } +#if 0 + else if (opt_algo == ALGO_KECCAK) { + uint32_t pdata[20] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; + uint32_t ptarget[8] = {0,0,0,0,0,0,0,0}; + kernel->prepare_keccak256(thr_id, pdata, ptarget); + } else if (opt_algo == ALGO_BLAKE) { + uint32_t pdata[20] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; + uint32_t ptarget[8] = {0,0,0,0,0,0,0,0}; + kernel->prepare_blake256(thr_id, pdata, ptarget); + } +#endif + double best_hash_sec = 0.0; + int best_wpb = 0; + + // auto-tuning loop + { + // we want to have enough total warps for half the multiprocessors at least + // compute highest MAXWARPS number that we can support based on texture cache mode + int MINTW = props.multiProcessorCount / 2; + int MAXTW = (device_texturecache[thr_id] == 1) ? 
min(MAXWARPS[thr_id],MW_1D) : MAXWARPS[thr_id]; + + // we want to have blocks for half the multiprocessors at least + int MINB = props.multiProcessorCount / 2; + int MAXB = MAXTW; + + double tmin = 0.05; + + applog(LOG_INFO, "GPU #%d: maximum total warps (BxW): %d", (int) device_map[thr_id], MAXTW); + + for (int GRID_BLOCKS = MINB; !abort_flag && GRID_BLOCKS <= MAXB; ++GRID_BLOCKS) + { + double Hash[32+1] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + for (WARPS_PER_BLOCK = 1; !abort_flag && WARPS_PER_BLOCK <= kernel->max_warps_per_block(); ++WARPS_PER_BLOCK) + { + double hash_sec = 0; + if (GRID_BLOCKS * WARPS_PER_BLOCK >= MINTW && + GRID_BLOCKS * WARPS_PER_BLOCK <= MAXTW) + { + // setup execution parameters + dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); + dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); + + struct timeval tv_start, tv_end; + double tdelta = 0; + + checkCudaErrors(cudaDeviceSynchronize()); + gettimeofday(&tv_start, NULL); + int repeat = 0; + do // average several measurements for better exactness + { + if (IS_SCRYPT() || IS_SCRYPT_JANE()) + kernel->run_kernel( + grid, threads, WARPS_PER_BLOCK, thr_id, NULL, + d_idata, d_odata, N, LOOKUP_GAP, device_interactive[thr_id], true, device_texturecache[thr_id] + ); + if(cudaDeviceSynchronize() != cudaSuccess) + break; + ++repeat; + gettimeofday(&tv_end, NULL); + // for a better result averaging, measure for at least 50ms (10ms for Keccak) + } while ((tdelta=(1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec))) < tmin); + if (cudaGetLastError() != cudaSuccess) continue; + + tdelta /= repeat; // BUGFIX: this averaging over multiple measurements was missing + + // for scrypt: in interactive mode only find launch configs where kernel launch times are short enough + // TODO: instead we could reduce the batchsize parameter to meet the launch time requirement. + if (IS_SCRYPT() && device_interactive[thr_id] && GRID_BLOCKS > 2*props.multiProcessorCount && tdelta > 1.0/30) + if (WARPS_PER_BLOCK == 1) goto skip; else goto skip2; + + hash_sec = (double)WU_PER_LAUNCH / tdelta; + Hash[WARPS_PER_BLOCK] = hash_sec; + if (hash_sec > best_hash_sec) { + optimal_blocks = GRID_BLOCKS; + best_hash_sec = hash_sec; + best_wpb = WARPS_PER_BLOCK; + } + } + } +skip2: ; + if (opt_debug) { + if (GRID_BLOCKS == MINB) { + char line[512] = " "; + for (int i=1; i<=kernel->max_warps_per_block(); ++i) { + char tmp[16]; sprintf(tmp, i < 10 ? 
" x%-2d" : " x%-2d ", i); + strcat(line, tmp); + if (cw == 80 && (i % 8 == 0 && i != kernel->max_warps_per_block())) + strcat(line, "\n "); + } + applog(LOG_DEBUG, line); + } + + char kMGT = ' '; bool flag; + for (int j=0; j < 4; ++j) { + flag=false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 1000, i++); + if (flag) for (int i=1; i<=kernel->max_warps_per_block(); Hash[i] /= 1000, i++); + else break; + if (kMGT == ' ') kMGT = 'k'; + else if (kMGT == 'k') kMGT = 'M'; + else if (kMGT == 'M') kMGT = 'G'; + else if (kMGT == 'G') kMGT = 'T'; + } + const char *format = "%5.4f%c"; + flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 1, i++); if (flag) format = "%5.3f%c"; + flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 10, i++); if (flag) format = "%5.2f%c"; + flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 100, i++); if (flag) format = "%5.1f%c"; + + char line[512]; sprintf(line, "%3d:", GRID_BLOCKS); + for (int i=1; i<=kernel->max_warps_per_block(); ++i) { + char tmp[16]; + if (Hash[i]>0) + sprintf(tmp, format, Hash[i], (imax_warps_per_block())?'|':' '); + else + sprintf(tmp, " %c", (imax_warps_per_block())?'|':' '); + strcat(line, tmp); + if (cw == 80 && (i % 8 == 0 && i != kernel->max_warps_per_block())) + strcat(line, "\n "); + } + int n = strlen(line)-1; line[n++] = '|'; line[n++] = ' '; line[n++] = kMGT; line[n++] = '\0'; + strcat(line, "H/s"); + applog(LOG_DEBUG, line); + } + } +skip: ; + } + + if (IS_SCRYPT() || IS_SCRYPT_JANE()) { + checkCudaErrors(cudaFree(d_odata)); + checkCudaErrors(cudaFree(d_idata)); + } + + WARPS_PER_BLOCK = best_wpb; + applog(LOG_INFO, "GPU #%d: %7.2f hash/s with configuration %c%dx%d", device_map[thr_id], best_hash_sec, kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK); + } + else + { + // Heuristics to find a good kernel launch configuration + + // base the initial block estimate on the number of multiprocessors + int device_cores = props.multiProcessorCount * _ConvertSMVer2Cores(props.major, props.minor); + + // defaults, in case nothing else is chosen below + optimal_blocks = 4 * device_cores / WU_PER_WARP; + WARPS_PER_BLOCK = 2; + + // Based on compute capability, pick a known good block x warp configuration. + if (props.major >= 3) + { + if (props.major == 3 && props.minor == 5) // GK110 (Tesla K20X, K20, GeForce GTX TITAN) + { + // TODO: what to do with Titan and Tesla K20(X)? + // for now, do the same as for GTX 660Ti (2GB) + optimal_blocks = (int)(optimal_blocks * 0.8809524); + WARPS_PER_BLOCK = 2; + } + else // GK104, GK106, GK107 ... + { + if (MAXWARPS[thr_id] > (int)(optimal_blocks * 1.7261905) * 2) + { + // this results in 290x2 configuration on GTX 660Ti (3GB) + // but it requires 3GB memory on the card! 
+ optimal_blocks = (int)(optimal_blocks * 1.7261905); + WARPS_PER_BLOCK = 2; + } + else + { + // this results in 148x2 configuration on GTX 660Ti (2GB) + optimal_blocks = (int)(optimal_blocks * 0.8809524); + WARPS_PER_BLOCK = 2; + } + } + } + // 1st generation Fermi (compute 2.0) GF100, GF110 + else if (props.major == 2 && props.minor == 0) + { + // this results in a 60x4 configuration on GTX 570 + optimal_blocks = 4 * device_cores / WU_PER_WARP; + WARPS_PER_BLOCK = 4; + } + // 2nd generation Fermi (compute 2.1) GF104,106,108,114,116 + else if (props.major == 2 && props.minor == 1) + { + // this results in a 56x2 configuration on GTX 460 + optimal_blocks = props.multiProcessorCount * 8; + WARPS_PER_BLOCK = 2; + } + + // in case we run out of memory with the automatically chosen configuration, + // first back off with WARPS_PER_BLOCK, then reduce optimal_blocks. + if (WARPS_PER_BLOCK==3 && optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id]) + WARPS_PER_BLOCK = 2; + while (optimal_blocks > 0 && optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id]) + optimal_blocks--; + } + } + + applog(LOG_INFO, "GPU #%d: using launch configuration %c%dx%d", device_map[thr_id], kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK); + + if (device_singlememory[thr_id]) + { + if (MAXWARPS[thr_id] != optimal_blocks * WARPS_PER_BLOCK) + { + MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK; + if (device_texturecache[thr_id] == 1) + kernel->unbindtexture_1D(); + else if (device_texturecache[thr_id] == 2) + kernel->unbindtexture_2D(); + checkCudaErrors(cudaFree(d_V)); d_V = NULL; + + cudaGetLastError(); // clear the error state + cudaMalloc((void **)&d_V, (size_t)SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); + if (cudaGetLastError() == cudaSuccess) { + for (int i=0; i < MAXWARPS[thr_id]; ++i) + h_V[thr_id][i] = d_V + SCRATCH * WU_PER_WARP * i; + + if (device_texturecache[thr_id] == 1) + { + // bind linear memory to a 1D texture reference + if (kernel->get_texel_width() == 2) + kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); + else + kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); + } + else if (device_texturecache[thr_id] == 2) + { + // bind pitch linear memory to a 2D texture reference + if (kernel->get_texel_width() == 2) + kernel->bindtexture_2D(d_V, SCRATCH/2, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); + else + kernel->bindtexture_2D(d_V, SCRATCH/4, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); + } + + // update pointers to scratch buffer in constant memory after reallocation + if (IS_SCRYPT() || IS_SCRYPT_JANE()) { + kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]); + } + } + else + { + applog(LOG_ERR, "GPU #%d: Unable to allocate enough memory for launch config '%s'.", device_map[thr_id], device_config[thr_id]); + } + } + } + else + { + // back off unnecessary memory allocations to have some breathing room + while (MAXWARPS[thr_id] > 0 && MAXWARPS[thr_id] > optimal_blocks * WARPS_PER_BLOCK) { + (MAXWARPS[thr_id])--; + checkCudaErrors(cudaFree(h_V[thr_id][MAXWARPS[thr_id]]-h_V_extra[thr_id][MAXWARPS[thr_id]])); + h_V[thr_id][MAXWARPS[thr_id]] = NULL; h_V_extra[thr_id][MAXWARPS[thr_id]] = 0; + } + } + + return optimal_blocks; +} + +void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream) +{ + unsigned int GRID_BLOCKS = context_blocks[thr_id]; + unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = 
context_kernel[thr_id]->threads_per_wu(); + unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * 32; + + // copy host memory to device + cudaMemcpyAsync(context_idata[stream][thr_id], X, mem_size, cudaMemcpyHostToDevice, context_streams[stream][thr_id]); +} + +void cuda_scrypt_serialize(int thr_id, int stream) +{ + // if the device can concurrently execute multiple kernels, then we must + // wait for the serialization event recorded by the other stream + //if (context_concurrent[thr_id] || device_interactive[thr_id]) + cudaStreamWaitEvent(context_streams[stream][thr_id], context_serialize[(stream+1)&1][thr_id], 0); +} + +void cuda_scrypt_done(int thr_id, int stream) +{ + // record the serialization event in the current stream + cudaEventRecord(context_serialize[stream][thr_id], context_streams[stream][thr_id]); +} + +void cuda_scrypt_flush(int thr_id, int stream) +{ + // flush the work queue (required for WDDM drivers) + cudaStreamSynchronize(context_streams[stream][thr_id]); +} + +void cuda_scrypt_core(int thr_id, int stream, unsigned int N) +{ + unsigned int GRID_BLOCKS = context_blocks[thr_id]; + unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + unsigned int LOOKUP_GAP = device_lookup_gap[thr_id]; + + // setup execution parameters + dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); + dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); + + context_kernel[thr_id]->run_kernel(grid, threads, WARPS_PER_BLOCK, thr_id, context_streams[stream][thr_id], context_idata[stream][thr_id], context_odata[stream][thr_id], N, LOOKUP_GAP, device_interactive[thr_id], opt_benchmark, device_texturecache[thr_id]); +} + +bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) +{ + return context_kernel[thr_id]->prepare_keccak256(thr_id, host_pdata, ptarget); +} + +void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) +{ + unsigned int GRID_BLOCKS = context_blocks[thr_id]; + unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + + // setup execution parameters + dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); + dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); + + context_kernel[thr_id]->do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); +} + +bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) +{ + return context_kernel[thr_id]->prepare_blake256(thr_id, host_pdata, ptarget); +} + +void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) +{ + unsigned int GRID_BLOCKS = context_blocks[thr_id]; + unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + + // setup execution parameters + dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); + dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); + + context_kernel[thr_id]->do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); +} + +void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA) +{ + unsigned int GRID_BLOCKS = context_blocks[thr_id]; + unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * (postSHA ? 
8 : 32); + + // copy result from device to host (asynchronously) + checkCudaErrors(cudaMemcpyAsync(X, postSHA ? context_hash[stream][thr_id] : context_odata[stream][thr_id], mem_size, cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); +} + +bool cuda_scrypt_sync(int thr_id, int stream) +{ + cudaError_t err; + + if(device_interactive[thr_id] && !opt_benchmark) + { + // For devices that also do desktop rendering or compositing, we want to free up some time slots. + // That requires making a pause in work submission when there is no active task on the GPU, + // and Device Synchronize ensures that. + + // this call was replaced by the loop below to workaround the high CPU usage issue + //err = cudaDeviceSynchronize(); + + while((err = cudaStreamQuery(context_streams[0][thr_id])) == cudaErrorNotReady || + (err == cudaSuccess && (err = cudaStreamQuery(context_streams[1][thr_id])) == cudaErrorNotReady)) + usleep(1000); + + usleep(1000); + } + else + { + // this call was replaced by the loop below to workaround the high CPU usage issue + //err = cudaStreamSynchronize(context_streams[stream][thr_id]); + + while((err = cudaStreamQuery(context_streams[stream][thr_id])) == cudaErrorNotReady) + usleep(1000); + } + + if(err != cudaSuccess) + { + applog(LOG_ERR, "GPU #%d: CUDA error `%s` while executing the kernel.", device_map[thr_id], cudaGetErrorString(err)); + return false; + } + + return true; +} + +uint32_t* cuda_transferbuffer(int thr_id, int stream) +{ + return context_X[stream][thr_id]; +} + +uint32_t* cuda_hashbuffer(int thr_id, int stream) +{ + return context_H[stream][thr_id]; +} diff --git a/scrypt/salsa_kernel.h b/scrypt/salsa_kernel.h new file mode 100644 index 0000000..f25fb01 --- /dev/null +++ b/scrypt/salsa_kernel.h @@ -0,0 +1,135 @@ +#ifndef SALSA_KERNEL_H +#define SALSA_KERNEL_H + +#include +#include +#include +#include +#include + +#include "miner.h" + +#define MAX_DEVICES MAX_GPUS + +#define A_SCRYPT 0 +#define A_SCRYPT_JANE 1 + +// from ccminer.cpp +extern short device_map[MAX_GPUS]; +extern int device_interactive[MAX_GPUS]; +extern int device_batchsize[MAX_GPUS]; +extern int device_backoff[MAX_GPUS]; +extern int device_lookup_gap[MAX_GPUS]; +extern int device_texturecache[MAX_GPUS]; +extern int device_singlememory[MAX_GPUS]; +extern char *device_config[MAX_GPUS]; +extern char *device_name[MAX_GPUS]; +extern bool autotune; + +extern int opt_nfactor; +extern char *jane_params; +extern bool abort_flag; +extern bool autotune; +extern int parallel; + +extern void get_currentalgo(char* buf, int sz); + +typedef unsigned int uint32_t; // define this as 32 bit type derived from int + +static char algo[64] = { 0 }; +static __inline bool IS_SCRYPT() { if (algo[0] == '\0') get_currentalgo(algo, 64); return !strcmp(algo,"scrypt"); } +static __inline bool IS_SCRYPT_JANE() { if (algo[0] == '\0') get_currentalgo(algo, 64); return !strcmp(algo,"scrypt-jane"); } + +// CUDA externals +extern int cuda_num_devices(); +extern void cuda_shutdown(int thr_id); +extern int cuda_throughput(int thr_id); + +extern uint32_t *cuda_transferbuffer(int thr_id, int stream); +extern uint32_t *cuda_hashbuffer(int thr_id, int stream); + +extern void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream); +extern void cuda_scrypt_serialize(int thr_id, int stream); +extern void cuda_scrypt_core(int thr_id, int stream, unsigned int N); +extern void cuda_scrypt_done(int thr_id, int stream); +extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA); +extern bool cuda_scrypt_sync(int thr_id, 
int stream); +extern void cuda_scrypt_flush(int thr_id, int stream); + +extern bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); +extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); + +extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); +extern void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); + +extern void computeGold(uint32_t *idata, uint32_t *reference, uchar *scratchpad); + +extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); +extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); + +#ifdef __NVCC__ +extern void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); +extern void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); +#endif + +// If we're in C++ mode, we're either compiling .cu files or scrypt.cpp + +#ifdef __NVCC__ + +/** + * An pure virtual interface for a CUDA kernel implementation. + * TODO: encapsulate the kernel launch parameters in some kind of wrapper. + */ +class KernelInterface +{ +public: + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) = 0; + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) = 0; + virtual bool bindtexture_1D(uint32_t *d_V, size_t size) { return true; } + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) { return true; } + virtual bool unbindtexture_1D() { return true; } + virtual bool unbindtexture_2D() { return true; } + + virtual char get_identifier() = 0; + virtual int get_major_version() { return 1; } + virtual int get_minor_version() { return 0; } + virtual int max_warps_per_block() = 0; + virtual int get_texel_width() = 0; + virtual bool no_textures() { return false; }; + virtual bool single_memory() { return false; }; + virtual int threads_per_wu() { return 1; } + virtual bool support_lookup_gap() { return false; } + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; } + + virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) { + return default_prepare_keccak256(thr_id, host_pdata, ptarget); + } + virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) { + default_do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); + } + + virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) { + return default_prepare_blake256(thr_id, host_pdata, ptarget); + } + virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) { + default_do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); + } +}; + +// Not performing error checking is actually bad, but... 
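+// (these no-op stubs keep the individual scrypt .cu kernel files compiling without extra checks;
+// salsa_kernel.cu #undefs checkCudaErrors and substitutes a variant that reports errors via applog)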
+#define checkCudaErrors(x) x +#define getLastCudaError(x) + +#endif // #ifdef __NVCC__ + +// Define work unit size +#define TOTAL_WARP_LIMIT 4096 +#define WU_PER_WARP (32 / THREADS_PER_WU) +#define WU_PER_BLOCK (WU_PER_WARP*WARPS_PER_BLOCK) +#define WU_PER_LAUNCH (GRID_BLOCKS*WU_PER_BLOCK) + +// make scratchpad size dependent on N and LOOKUP_GAP +#define SCRATCH (((N+LOOKUP_GAP-1)/LOOKUP_GAP)*32) + +#endif // #ifndef SALSA_KERNEL_H diff --git a/scrypt/scrypt-jane.h b/scrypt/scrypt-jane.h new file mode 100644 index 0000000..e8c270b --- /dev/null +++ b/scrypt/scrypt-jane.h @@ -0,0 +1,29 @@ +#ifndef SCRYPT_JANE_H +#define SCRYPT_JANE_H + +/* + Nfactor: Increases CPU & Memory Hardness + N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used + + rfactor: Increases Memory Hardness + r = (1 << rfactor): How large a chunk is + + pfactor: Increases CPU Hardness + p = (1 << pfactor): Number of times to mix the main chunk + + A block is the basic mixing unit (salsa/chacha block = 64 bytes) + A chunk is (2 * r) blocks + + ~Memory used = (N + 2) * ((2 * r) * block size) +*/ + +#include +#include +#include + +typedef void (*scrypt_fatal_errorfn)(const char *msg); +void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); + +void scrypt_N_1_1(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, uint32_t N, unsigned char *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V); + +#endif /* SCRYPT_JANE_H */ diff --git a/scrypt/sha2.c b/scrypt/sha2.c new file mode 100644 index 0000000..0fb8979 --- /dev/null +++ b/scrypt/sha2.c @@ -0,0 +1,638 @@ +/* + * Copyright 2011 ArtForz + * Copyright 2011-2013 pooler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. 
+ */ + +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include + +#ifdef WIN32 +#define __attribute__(x) +#endif + +#if defined(__arm__) && defined(__APCS_32__) +#define EXTERN_SHA256 +#endif + +static const uint32_t sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +void sha256_init(uint32_t *state) +{ + memcpy(state, sha256_h, 32); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) + +#ifndef EXTERN_SHA256 + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +void sha256_transform(uint32_t *state, const uint32_t *block, int swap) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + if (swap) { + for (i = 0; i < 16; i++) + W[i] = swab32(block[i]); + } else + memcpy(W, block, 64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +#endif /* EXTERN_SHA256 */ + + +static const uint32_t sha256d_hash1[16] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + +static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) +{ + uint32_t S[16]; + int i; + + sha256_init(S); + sha256_transform(S, data, 0); + sha256_transform(S, data + 16, 0); + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(hash); + sha256_transform(hash, S, 0); + for (i = 0; i < 8; i++) + hash[i] = swab32(hash[i]); +} + +void sha256d(unsigned char *hash, const unsigned char *data, int len) +{ + uint32_t S[16], T[16]; + int i, r; + + sha256_init(S); + for (r = len; r > -9; r -= 64) { + if (r < 64) + memset(T, 0, 64); + memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); + if (r >= 0 && r < 64) + ((unsigned char *)T)[r] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + if (r < 56) + T[15] = 8 * len; + sha256_transform(S, T, 0); + } + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(T); + sha256_transform(T, S, 0); + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, T[i]); +} + +static inline void sha256d_preextend(uint32_t *W) +{ + W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; + W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; + W[18] = s1(W[16]) + W[11] + W[ 2]; + W[19] = s1(W[17]) + W[12] + s0(W[ 4]); + W[20] = W[13] + s0(W[ 5]) + W[ 4]; + W[21] = W[14] + s0(W[ 6]) + W[ 5]; + W[22] = W[15] + s0(W[ 7]) + W[ 6]; + W[23] = W[16] + s0(W[ 8]) + W[ 7]; + W[24] = W[17] + s0(W[ 9]) + W[ 8]; + W[25] = s0(W[10]) + W[ 9]; + W[26] = s0(W[11]) + W[10]; + W[27] = s0(W[12]) + W[11]; + W[28] = s0(W[13]) + W[12]; + W[29] = s0(W[14]) + W[13]; + W[30] = s0(W[15]) + W[14]; + W[31] = s0(W[16]) + W[15]; +} + +static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) +{ + uint32_t t0, t1; + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); +} + +#ifdef EXTERN_SHA256 + +void sha256d_ms(uint32_t *hash, uint32_t *W, + const uint32_t *midstate, const uint32_t *prehash); + +#else + +static inline void sha256d_ms(uint32_t *hash, uint32_t *W, + const uint32_t *midstate, const uint32_t *prehash) +{ + uint32_t S[64]; + uint32_t t0, t1; + int i; + + S[18] = W[18]; + S[19] = W[19]; + S[20] = W[20]; + S[22] = W[22]; + S[23] = W[23]; + S[24] = W[24]; + S[30] = W[30]; + S[31] = W[31]; + + W[18] += s0(W[3]); + W[19] += W[3]; + W[20] += s1(W[18]); + W[21] = s1(W[19]); + W[22] += s1(W[20]); + W[23] += s1(W[21]); + W[24] += s1(W[22]); + W[25] = s1(W[23]) + W[18]; + W[26] = s1(W[24]) + W[19]; + W[27] = s1(W[25]) + W[20]; + W[28] = s1(W[26]) + W[21]; + W[29] = s1(W[27]) + W[22]; + W[30] += s1(W[28]) + W[23]; + W[31] += s1(W[29]) + W[24]; + for (i = 32; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + memcpy(S, prehash, 32); + + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + for (i = 0; i < 8; i++) + S[i] += midstate[i]; + + W[18] = S[18]; + W[19] = S[19]; + W[20] = S[20]; + W[22] = S[22]; + W[23] = S[23]; + W[24] = S[24]; + W[30] = S[30]; + W[31] = S[31]; + + memcpy(S + 8, sha256d_hash1 + 8, 32); + S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; + S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + 
S[ 1]; + S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; + S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; + S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; + S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; + S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; + S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; + S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; + S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; + S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; + S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; + S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; + S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; + S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; + S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; + for (i = 32; i < 60; i += 2) { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + } + S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; + + sha256_init(hash); + + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); + RNDr(hash, S, 10); + RNDr(hash, S, 11); + RNDr(hash, S, 12); + RNDr(hash, S, 13); + RNDr(hash, S, 14); + RNDr(hash, S, 15); + RNDr(hash, S, 16); + RNDr(hash, S, 17); + RNDr(hash, S, 18); + RNDr(hash, S, 19); + RNDr(hash, S, 20); + RNDr(hash, S, 21); + RNDr(hash, S, 22); + RNDr(hash, S, 23); + RNDr(hash, S, 24); + RNDr(hash, S, 25); + RNDr(hash, S, 26); + RNDr(hash, S, 27); + RNDr(hash, S, 28); + RNDr(hash, S, 29); + RNDr(hash, S, 30); + RNDr(hash, S, 31); + RNDr(hash, S, 32); + RNDr(hash, S, 33); + RNDr(hash, S, 34); + RNDr(hash, S, 35); + RNDr(hash, S, 36); + RNDr(hash, S, 37); + RNDr(hash, S, 38); + RNDr(hash, S, 39); + RNDr(hash, S, 40); + RNDr(hash, S, 41); + RNDr(hash, S, 42); + RNDr(hash, S, 43); + RNDr(hash, S, 44); + RNDr(hash, S, 45); + RNDr(hash, S, 46); + RNDr(hash, S, 47); + RNDr(hash, S, 48); + RNDr(hash, S, 49); + RNDr(hash, S, 50); + RNDr(hash, S, 51); + RNDr(hash, S, 52); + RNDr(hash, S, 53); + RNDr(hash, S, 54); + RNDr(hash, S, 55); + RNDr(hash, S, 56); + + hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) + + S[57] + sha256_k[57]; + hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) + + S[58] + sha256_k[58]; + hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) + + S[59] + sha256_k[59]; + hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) + + S[60] + sha256_k[60] + + sha256_h[7]; +} + +#endif /* EXTERN_SHA256 */ + +#ifdef HAVE_SHA256_4WAY + +void sha256d_ms_4way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done) +{ + gettimeofday(tv_start, NULL); + + uint32_t data[4 * 64] __attribute__((aligned(128))); + uint32_t hash[4 * 8] __attribute__((aligned(32))); + uint32_t midstate[4 * 8] __attribute__((aligned(32))); + uint32_t prehash[4 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + 
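+ /* Only the nonce (word 3 of this second 64-byte block) changes between
+  * attempts, so the constant part of the message schedule is pre-extended
+  * once here, and data, midstate and prehash are then replicated across
+  * the four SIMD lanes before entering the scan loop. */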
sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 4; j++) + data[i * 4 + j] = data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 4; j++) { + midstate[i * 4 + j] = midstate[i]; + prehash[i * 4 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 4; i++) + data[4 * 3 + i] = ++n; + + sha256d_ms_4way(hash, data, midstate, prehash); + + for (i = 0; i < 4; i++) { + if (swab32(hash[4 * 7 + i]) <= Htarg) { + pdata[19] = data[4 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + gettimeofday(&tv_end, NULL); + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + gettimeofday(&tv_end, NULL); + return 0; +} + +#endif /* HAVE_SHA256_4WAY */ + +#ifdef HAVE_SHA256_8WAY + +void sha256d_ms_8way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[8 * 64] __attribute__((aligned(128))); + uint32_t hash[8 * 8] __attribute__((aligned(32))); + uint32_t midstate[8 * 8] __attribute__((aligned(32))); + uint32_t prehash[8 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 8; j++) + data[i * 8 + j] = data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 8; j++) { + midstate[i * 8 + j] = midstate[i]; + prehash[i * 8 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 8; i++) + data[8 * 3 + i] = ++n; + + sha256d_ms_8way(hash, data, midstate, prehash); + + for (i = 0; i < 8; i++) { + if (swab32(hash[8 * 7 + i]) <= Htarg) { + pdata[19] = data[8 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +#endif /* HAVE_SHA256_8WAY */ + +int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done) +{ + uint32_t data[64] __attribute__((aligned(128))); + uint32_t hash[8] __attribute__((aligned(32))); + uint32_t midstate[8] __attribute__((aligned(32))); + uint32_t prehash[8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + +#ifdef HAVE_SHA256_8WAY + if (sha256_use_8way()) + return scanhash_sha256d_8way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif +#ifdef HAVE_SHA256_4WAY + if (sha256_use_4way()) + return scanhash_sha256d_4way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + + do { + data[3] = ++n; + sha256d_ms(hash, data, midstate, 
prehash); + if (swab32(hash[7]) <= Htarg) { + pdata[19] = data[3]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} diff --git a/scrypt/sha256.cu b/scrypt/sha256.cu new file mode 100644 index 0000000..5b4c808 --- /dev/null +++ b/scrypt/sha256.cu @@ -0,0 +1,441 @@ +// +// =============== SHA256 part on nVidia GPU ====================== +// +// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64 +// + +#include + +#include "cuda_runtime.h" +#include "miner.h" + +#include "salsa_kernel.h" + +#include "sha256.h" + +// define some error checking macros +#undef checkCudaErrors + +#if WIN32 +#define DELIMITER '/' +#else +#define DELIMITER '/' +#endif +#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) + +#define checkCudaErrors(x) { \ + cudaGetLastError(); \ + x; \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) \ + applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", (int) device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \ +} + +// from salsa_kernel.cu +extern std::map context_idata[2]; +extern std::map context_odata[2]; +extern std::map context_streams[2]; +extern std::map context_tstate[2]; +extern std::map context_ostate[2]; +extern std::map context_hash[2]; + +static const uint32_t host_sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t host_sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) + +static const uint32_t host_keypad[12] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 +}; + +static const uint32_t host_innerpad[11] = { + 0x80000000, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0x000004a0 +}; + +static const uint32_t host_outerpad[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; + +static const uint32_t host_finalblk[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +// +// CUDA code +// + +__constant__ uint32_t sha256_h[8]; +__constant__ uint32_t sha256_k[64]; +__constant__ uint32_t keypad[12]; +__constant__ uint32_t innerpad[11]; +__constant__ uint32_t outerpad[8]; +__constant__ uint32_t finalblk[16]; +__constant__ uint32_t pdata[20]; +__constant__ uint32_t midstate[8]; + +__device__ void mycpy12(uint32_t *d, const uint32_t *s) { +#pragma unroll 3 + for (int k=0; k < 3; k++) d[k] = s[k]; +} + +__device__ void mycpy16(uint32_t *d, const uint32_t *s) { +#pragma unroll 4 + for (int k=0; k < 4; k++) d[k] = s[k]; +} + +__device__ void mycpy32(uint32_t *d, const uint32_t *s) { +#pragma unroll 8 + for (int k=0; k < 8; k++) d[k] = s[k]; +} + +__device__ void mycpy44(uint32_t *d, const uint32_t *s) { +#pragma unroll 11 + for (int k=0; k < 11; k++) d[k] = s[k]; +} + +__device__ void mycpy48(uint32_t *d, const uint32_t *s) { +#pragma unroll 12 + for (int k=0; k < 12; k++) d[k] = s[k]; +} + +__device__ void mycpy64(uint32_t *d, const uint32_t *s) { +#pragma unroll 16 + for (int k=0; k < 16; k++) d[k] = s[k]; +} + +__device__ uint32_t cuda_swab32(uint32_t x) +{ + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +__device__ void mycpy32_swab32(uint32_t *d, const uint32_t *s) { +#pragma unroll 8 + for (int k=0; k < 8; k++) d[k] = cuda_swab32(s[k]); +} + +__device__ void mycpy64_swab32(uint32_t *d, const uint32_t *s) { +#pragma unroll 16 + for (int k=0; k < 16; k++) d[k] = cuda_swab32(s[k]); +} + +__device__ void cuda_sha256_init(uint32_t *state) +{ + mycpy32(state, sha256_h); +} + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. Modified for lower register use. + */ +__device__ void cuda_sha256_transform(uint32_t *state, const uint32_t *block) +{ + uint32_t W[64]; // only 4 of these are accessed during each partial Mix + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Initialize working variables. */ + mycpy32(S, state); + + /* 2. Prepare message schedule W and Mix. 
*/ + mycpy16(W, block); + RNDr(S, W, 0); RNDr(S, W, 1); RNDr(S, W, 2); RNDr(S, W, 3); + + mycpy16(W+4, block+4); + RNDr(S, W, 4); RNDr(S, W, 5); RNDr(S, W, 6); RNDr(S, W, 7); + + mycpy16(W+8, block+8); + RNDr(S, W, 8); RNDr(S, W, 9); RNDr(S, W, 10); RNDr(S, W, 11); + + mycpy16(W+12, block+12); + RNDr(S, W, 12); RNDr(S, W, 13); RNDr(S, W, 14); RNDr(S, W, 15); + +#pragma unroll 2 + for (i = 16; i < 20; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 16); RNDr(S, W, 17); RNDr(S, W, 18); RNDr(S, W, 19); + +#pragma unroll 2 + for (i = 20; i < 24; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 20); RNDr(S, W, 21); RNDr(S, W, 22); RNDr(S, W, 23); + +#pragma unroll 2 + for (i = 24; i < 28; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 24); RNDr(S, W, 25); RNDr(S, W, 26); RNDr(S, W, 27); + +#pragma unroll 2 + for (i = 28; i < 32; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 28); RNDr(S, W, 29); RNDr(S, W, 30); RNDr(S, W, 31); + +#pragma unroll 2 + for (i = 32; i < 36; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 32); RNDr(S, W, 33); RNDr(S, W, 34); RNDr(S, W, 35); + +#pragma unroll 2 + for (i = 36; i < 40; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 36); RNDr(S, W, 37); RNDr(S, W, 38); RNDr(S, W, 39); + +#pragma unroll 2 + for (i = 40; i < 44; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 40); RNDr(S, W, 41); RNDr(S, W, 42); RNDr(S, W, 43); + +#pragma unroll 2 + for (i = 44; i < 48; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 44); RNDr(S, W, 45); RNDr(S, W, 46); RNDr(S, W, 47); + +#pragma unroll 2 + for (i = 48; i < 52; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 48); RNDr(S, W, 49); RNDr(S, W, 50); RNDr(S, W, 51); + +#pragma unroll 2 + for (i = 52; i < 56; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 52); RNDr(S, W, 53); RNDr(S, W, 54); RNDr(S, W, 55); + +#pragma unroll 2 + for (i = 56; i < 60; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 56); RNDr(S, W, 57); RNDr(S, W, 58); RNDr(S, W, 59); + +#pragma unroll 2 + for (i = 60; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 60); RNDr(S, W, 61); RNDr(S, W, 62); RNDr(S, W, 63); + + /* 3. 
Mix local working variables into global state */ +#pragma unroll 8 + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +// +// HMAC SHA256 functions, modified to work with pdata and nonce directly +// + +__device__ void cuda_HMAC_SHA256_80_init(uint32_t *tstate, uint32_t *ostate, uint32_t nonce) +{ + uint32_t ihash[8]; + uint32_t pad[16]; + int i; + + /* tstate is assumed to contain the midstate of key */ + mycpy12(pad, pdata + 16); + pad[3] = nonce; + mycpy48(pad + 4, keypad); + cuda_sha256_transform(tstate, pad); + mycpy32(ihash, tstate); + + cuda_sha256_init(ostate); +#pragma unroll 8 + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; +#pragma unroll 8 + for (i=8; i < 16; i++) + pad[i] = 0x5c5c5c5c; + cuda_sha256_transform(ostate, pad); + + cuda_sha256_init(tstate); +#pragma unroll 8 + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; +#pragma unroll 8 + for (i=8; i < 16; i++) + pad[i] = 0x36363636; + cuda_sha256_transform(tstate, pad); +} + +__device__ void cuda_PBKDF2_SHA256_80_128(const uint32_t *tstate, + const uint32_t *ostate, uint32_t *output, uint32_t nonce) +{ + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + + mycpy32(istate, tstate); + cuda_sha256_transform(istate, pdata); + + mycpy12(ibuf, pdata + 16); + ibuf[3] = nonce; + ibuf[4] = 1; + mycpy44(ibuf + 5, innerpad); + + mycpy32(obuf, istate); + mycpy32(obuf + 8, outerpad); + cuda_sha256_transform(obuf, ibuf); + + mycpy32(ostate2, ostate); + cuda_sha256_transform(ostate2, obuf); + mycpy32_swab32(output, ostate2); // TODO: coalescing would be desired + + mycpy32(obuf, istate); + ibuf[4] = 2; + cuda_sha256_transform(obuf, ibuf); + + mycpy32(ostate2, ostate); + cuda_sha256_transform(ostate2, obuf); + mycpy32_swab32(output+8, ostate2); // TODO: coalescing would be desired + + mycpy32(obuf, istate); + ibuf[4] = 3; + cuda_sha256_transform(obuf, ibuf); + + mycpy32(ostate2, ostate); + cuda_sha256_transform(ostate2, obuf); + mycpy32_swab32(output+16, ostate2); // TODO: coalescing would be desired + + mycpy32(obuf, istate); + ibuf[4] = 4; + cuda_sha256_transform(obuf, ibuf); + + mycpy32(ostate2, ostate); + cuda_sha256_transform(ostate2, obuf); + mycpy32_swab32(output+24, ostate2); // TODO: coalescing would be desired +} + +__global__ void cuda_pre_sha256(uint32_t g_inp[32], uint32_t g_tstate_ext[8], uint32_t g_ostate_ext[8], uint32_t nonce) +{ + nonce += (blockIdx.x * blockDim.x) + threadIdx.x; + g_inp += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_tstate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_ostate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + + uint32_t tstate[8], ostate[8]; + mycpy32(tstate, midstate); + + cuda_HMAC_SHA256_80_init(tstate, ostate, nonce); + + mycpy32(g_tstate_ext, tstate); // TODO: coalescing would be desired + mycpy32(g_ostate_ext, ostate); // TODO: coalescing would be desired + + cuda_PBKDF2_SHA256_80_128(tstate, ostate, g_inp, nonce); +} + +__global__ void cuda_post_sha256(uint32_t g_output[8], uint32_t g_tstate_ext[8], uint32_t g_ostate_ext[8], uint32_t g_salt_ext[32]) +{ + g_output += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_tstate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_ostate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_salt_ext += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x); + + uint32_t tstate[16]; + mycpy32(tstate, g_tstate_ext); // TODO: coalescing would be desired + + uint32_t halfsalt[16]; + mycpy64_swab32(halfsalt, g_salt_ext); // TODO: coalescing would be desired + 
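+ /* Fold both 64-byte halves of the scrypt-mixed block B and the final
+  * padding/length block into the inner HMAC state, then apply the outer
+  * state: the result is PBKDF2-HMAC-SHA256(header, B, 1, 32), i.e. the
+  * final 256-bit scrypt output for this nonce. */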
cuda_sha256_transform(tstate, halfsalt); + mycpy64_swab32(halfsalt, g_salt_ext+16); // TODO: coalescing would be desired + cuda_sha256_transform(tstate, halfsalt); + cuda_sha256_transform(tstate, finalblk); + + uint32_t buf[16]; + mycpy32(buf, tstate); + mycpy32(buf + 8, outerpad); + + uint32_t ostate[16]; + mycpy32(ostate, g_ostate_ext); + + cuda_sha256_transform(ostate, buf); + mycpy32_swab32(g_output, ostate); // TODO: coalescing would be desired +} + +// +// callable host code to initialize constants and to call kernels +// + +void prepare_sha256(int thr_id, uint32_t host_pdata[20], uint32_t host_midstate[8]) +{ + static bool init[8] = {false, false, false, false, false, false, false, false}; + if (!init[thr_id]) + { + checkCudaErrors(cudaMemcpyToSymbol(sha256_h, host_sha256_h, sizeof(host_sha256_h), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(sha256_k, host_sha256_k, sizeof(host_sha256_k), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(keypad, host_keypad, sizeof(host_keypad), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(innerpad, host_innerpad, sizeof(host_innerpad), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(outerpad, host_outerpad, sizeof(host_outerpad), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(finalblk, host_finalblk, sizeof(host_finalblk), 0, cudaMemcpyHostToDevice)); + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(midstate, host_midstate, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); +} + +void pre_sha256(int thr_id, int stream, uint32_t nonce, int throughput) +{ + dim3 block(128); + dim3 grid((throughput+127)/128); + + cuda_pre_sha256<<>>(context_idata[stream][thr_id], context_tstate[stream][thr_id], context_ostate[stream][thr_id], nonce); +} + +void post_sha256(int thr_id, int stream, int throughput) +{ + dim3 block(128); + dim3 grid((throughput+127)/128); + + cuda_post_sha256<<>>(context_hash[stream][thr_id], context_tstate[stream][thr_id], context_ostate[stream][thr_id], context_odata[stream][thr_id]); +} diff --git a/scrypt/sha256.h b/scrypt/sha256.h new file mode 100644 index 0000000..cacb08f --- /dev/null +++ b/scrypt/sha256.h @@ -0,0 +1,10 @@ +#ifndef SHA256_H +#define SHA256_H + +#include + +extern "C" void prepare_sha256(int thr_id, uint32_t cpu_pdata[20], uint32_t cpu_midstate[8]); +extern "C" void pre_sha256(int thr_id, int stream, uint32_t nonce, int throughput); +extern "C" void post_sha256(int thr_id, int stream, int throughput); + +#endif // #ifndef SHA256_H diff --git a/scrypt/test_kernel.cu b/scrypt/test_kernel.cu new file mode 100644 index 0000000..f7552d3 --- /dev/null +++ b/scrypt/test_kernel.cu @@ -0,0 +1,781 @@ +/* Copyright (C) 2013 David G. Andersen. All rights reserved. + * with modifications by Christian Buchner + * + * Use of this code is covered under the Apache 2.0 license, which + * can be found in the file "LICENSE" + * + * The array notation for b[] and bx[] arrays was converted to uint4, + * in preparation for some experimental changes to memory access patterns. + * Also this kernel is going to be a testbed for adaptation to Fermi devices. 
+ */ + +// TODO: experiment with different memory access patterns in write/read_keys_direct functions +// TODO: attempt V.Volkov style ILP (factor 4) + +#include + +#include "cuda_runtime.h" +#include "miner.h" + +#include "salsa_kernel.h" +#include "test_kernel.h" + +#define TEXWIDTH 32768 +#define THREADS_PER_WU 4 // four threads per hash + +typedef enum +{ + ANDERSEN, + SIMPLE +} MemoryAccess; + + +// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N-1 +// scratch buffer size SCRATCH +__constant__ uint32_t c_SCRATCH; +__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP) +__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP) - 1 + +// using texture references for the "tex" variants of the B kernels +texture texRef1D_4_V; +texture texRef2D_4_V; + +template __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); + +static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) { + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) { + left.x += right.x; + left.y += right.y; + left.z += right.z; + left.w += right.w; + return left; +} + + +/* write_keys writes the 8 keys being processed by a warp to the global + * scratchpad. To effectively use memory bandwidth, it performs the writes + * (and reads, for read_keys) 128 bytes at a time per memory location + * by __shfl'ing the 4 entries in bx to the threads in the next-up + * thread group. It then has eight threads together perform uint4 + * (128 bit) writes to the destination region. This seems to make + * quite effective use of memory bandwidth. An approach that spread + * uint32s across more threads was slower because of the increased + * computation it required. + * + * "start" is the loop iteration producing the write - the offset within + * the block's memory. + * + * Internally, this algorithm first __shfl's the 4 bx entries to + * the next up thread group, and then uses a conditional move to + * ensure that odd-numbered thread groups exchange the b/bx ordering + * so that the right parts are written together. + * + * Thanks to Babu for helping design the 128-bit-per-write version. + * + * _direct lets the caller specify the absolute start location instead of + * the relative start location, as an attempt to reduce some recomputation. + */ + +template __device__ __forceinline__ +void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) +{ + uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + + if (SCHEME == ANDERSEN) { + uint4 t=b, t2; + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + uint32_t *s = &tmp[threadIdx.x/32][threadIdx.x%32]; + uint32_t *st = &tmp[threadIdx.x/32][(threadIdx.x + 4)%32]; + *s = bx.x; t2.x = *st; + *s = bx.y; t2.y = *st; + *s = bx.z; t2.z = *st; + *s = bx.w; t2.w = *st; + *s = start; int t2_start = *st + 4; + bool c = (threadIdx.x & 0x4); + *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); + *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? 
t : t2); + } else { + *((uint4 *)(&scratch[start ])) = b; + *((uint4 *)(&scratch[start+16])) = bx; + } +} + +template __device__ __forceinline__ +void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) +{ + uint32_t *scratch; + + if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + if (SCHEME == ANDERSEN) { + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + uint32_t *s = &tmp[threadIdx.x/32][threadIdx.x%32]; + *s = start; int t2_start = tmp[threadIdx.x/32][(threadIdx.x + 4)%32] + 4; + if (TEX_DIM > 0) { start /= 4; t2_start /= 4; } + bool c = (threadIdx.x & 0x4); + if (TEX_DIM == 0) { + b = *((uint4 *)(&scratch[c ? t2_start : start])); + bx = *((uint4 *)(&scratch[c ? start : t2_start])); + } else if (TEX_DIM == 1) { + b = tex1Dfetch(texRef1D_4_V, c ? t2_start : start); + bx = tex1Dfetch(texRef1D_4_V, c ? start : t2_start); + } else if (TEX_DIM == 2) { + b = tex2D(texRef2D_4_V, 0.5f + ((c ? t2_start : start)%TEXWIDTH), 0.5f + ((c ? t2_start : start)/TEXWIDTH)); + bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH)); + } + uint4 temp = b; b = (c ? bx : b); bx = (c ? temp : bx); + uint32_t *st = &tmp[threadIdx.x/32][(threadIdx.x + 28)%32]; + *s = bx.x; bx.x = *st; + *s = bx.y; bx.y = *st; + *s = bx.z; bx.z = *st; + *s = bx.w; bx.w = *st; + } else { + if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start])); + else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4); + else if (TEX_DIM == 2) b = tex2D(texRef2D_4_V, 0.5f + ((start/4)%TEXWIDTH), 0.5f + ((start/4)/TEXWIDTH)); + if (TEX_DIM == 0) bx = *((uint4 *)(&scratch[start+16])); + else if (TEX_DIM == 1) bx = tex1Dfetch(texRef1D_4_V, (start+16)/4); + else if (TEX_DIM == 2) bx = tex2D(texRef2D_4_V, 0.5f + (((start+16)/4)%TEXWIDTH), 0.5f + (((start+16)/4)/TEXWIDTH)); + } +} + + +__device__ __forceinline__ +void primary_order_shuffle(uint4 &b, uint4 &bx) +{ + /* Inner loop shuffle targets */ + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32; + uint32_t *s = &tmp[wrp][lane]; + uint32_t *s1 = &tmp[wrp][x1]; + uint32_t *s2 = &tmp[wrp][x2]; + uint32_t *s3 = &tmp[wrp][x3]; + + *s = b.w; b.w = *s1; + *s = b.z; b.z = *s2; + *s = b.y; b.y = *s3; + uint32_t temp = b.y; b.y = b.w; b.w = temp; + + *s = bx.w; bx.w = *s1; + *s = bx.z; bx.z = *s2; + *s = bx.y; bx.y = *s3; + temp = bx.y; bx.y = bx.w; bx.w = temp; +} + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ + +__device__ __forceinline__ +void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. 
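+ // Each of the four threads of a work unit fetches a rotated 4-word slice of
+ // the 32-word key; primary_order_shuffle() below then finishes the exchange
+ // into the internal "primary order" expected by the Salsa rounds.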
+ b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4]; + b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4]; + b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4]; + b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4]; + bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16]; + bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16]; + bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16]; + bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16]; + + primary_order_shuffle(b, bx); + +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. + */ + +__device__ __forceinline__ +void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + primary_order_shuffle(b, bx); + + B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x; + B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y; + B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z; + B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w; + B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x; + B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y; + B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z; + B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w; +} + + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ + +__device__ __forceinline__ +void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + 4*0 + thread_in_block%4]; + b.y = B[key_offset + 4*1 + thread_in_block%4]; + b.z = B[key_offset + 4*2 + thread_in_block%4]; + b.w = B[key_offset + 4*3 + thread_in_block%4]; + bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16]; + bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16]; + bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16]; + bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16]; +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. 
+ */ + +__device__ __forceinline__ +void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + B[key_offset + 4*0 + thread_in_block%4] = b.x; + B[key_offset + 4*1 + thread_in_block%4] = b.y; + B[key_offset + 4*2 + thread_in_block%4] = b.z; + B[key_offset + 4*3 + thread_in_block%4] = b.w; + B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x; + B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y; + B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z; + B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w; +} + + +template __device__ __forceinline__ +void load_key(const uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: load_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break; + } +} + +template __device__ __forceinline__ +void store_key(uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: store_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break; + } +} + + +/* + * salsa_xor_core (Salsa20/8 cypher) + * The original scrypt called: + * xor_salsa8(&X[0], &X[16]); <-- the "b" loop + * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. + */ + +#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<>(32-amt))); } + +__device__ __forceinline__ +void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32; + uint32_t *s = &tmp[wrp][lane]; + uint32_t *s1 = &tmp[wrp][x1]; + uint32_t *s2 = &tmp[wrp][x2]; + uint32_t *s3 = &tmp[wrp][x3]; + + uint4 x; + + b ^= bx; + x = b; + + // Enter in "primary order" (t0 has 0, 4, 8, 12) + // (t1 has 5, 9, 13, 1) + // (t2 has 10, 14, 2, 6) + // (t3 has 15, 3, 7, 11) + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Mixing phase of salsa + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + /* Transpose rows and columns. */ + /* Unclear if this optimization is needed: These are ordered based + * upon the dependencies needed in the later xors. Compiler should be + * able to figure this out, but might as well give it a hand. */ + *s = x.y; x.y = *s3; + *s = x.w; x.w = *s1; + *s = x.z; x.z = *s2; + + /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first, + * but the register targets are rewritten here to swap x[1] and x[3] so that + * they can be directly shuffled to and from our peer threads without + * reassignment. The reverse shuffle then puts them back in the right place. + */ + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + *s = x.w; x.w = *s3; + *s = x.y; x.y = *s1; + *s = x.z; x.z = *s2; + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + // This is a copy of the same loop above, identical but stripped of comments. + // Duplicated so that we can complete a bx-based loop with fewer register moves. 
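+ // As above, each iteration is one Salsa20 double-round; XOR_ROTATE_ADD is the
+ // usual quarter-round primitive dst ^= ROTL32(s1 + s2, amt), so the four
+ // iterations of each loop form the Salsa20/8 core.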
+ #pragma unroll + for (int j = 0; j < 4; j++) { + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + *s = x.y; x.y = *s3; + *s = x.w; x.w = *s1; + *s = x.z; x.z = *s2; + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + *s = x.w; x.w = *s3; + *s = x.y; x.y = *s1; + *s = x.z; x.z = *s2; + } + + // At the end of these iterations, the data is in primary order again. +#undef XOR_ROTATE_ADD + + bx += x; +} + + +/* + * chacha_xor_core (ChaCha20/8 cypher) + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. + * + * load_key and store_key must not use primary order when + * using ChaCha20/8, but rather the basic transposed order + * (referred to as "column mode" below) + */ + +#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<>(32-amt))); } + +__device__ __forceinline__ +void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32; + uint32_t *s = &tmp[wrp][lane]; + uint32_t *s1 = &tmp[wrp][x1]; + uint32_t *s2 = &tmp[wrp][x2]; + uint32_t *s3 = &tmp[wrp][x3]; + + uint4 x; + + b ^= bx; + x = b; + + // Enter in "column" mode (t0 has 0, 4, 8, 12) + // (t1 has 1, 5, 9, 13) + // (t2 has 2, 6, 10, 14) + // (t3 has 3, 7, 11, 15) + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + *s = x.y; x.y = *s1; + *s = x.z; x.z = *s2; + *s = x.w; x.w = *s3; + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + *s = x.y; x.y = *s3; + *s = x.z; x.z = *s2; + *s = x.w; x.w = *s1; + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + *s = x.y; x.y = *s1; + *s = x.z; x.z = *s2; + *s = x.w; x.w = *s3; + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + *s = x.y; x.y = *s3; + *s = x.z; x.z = *s2; + *s = x.w; x.w = *s1; + } + +#undef CHACHA_PRIMITIVE + + bx += x; +} + +template __device__ __forceinline__ +void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + switch(ALGO) { + case A_SCRYPT: salsa_xor_core(b, bx, x1, x2, x3); break; + case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break; + } +} + + +/* + * The hasher_gen_kernel operates on a group of 1024-bit input keys + * in B, stored as: + * B = { k1B k1Bx k2B k2Bx ... } + * and fills up the scratchpad with the iterative hashes derived from + * those keys: + * scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... } + * scratch is 1024 times larger than the input keys B. 
+ * It is extremely important to stream writes effectively into scratch; + * less important to coalesce the reads from B. + * + * Key ordering note: Keys are input from B in "original" order: + * K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 } + * After inputting into kernel_gen, each component k and kx of the + * key is transmuted into a permuted internal order to make processing faster: + * K = k, kx with: + * k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11 + * and similarly for kx. + */ + +template __global__ +void test_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else read_keys_direct(b, bx, start+32*(i-1)); + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + write_keys_direct(b, bx, start+32*i); + ++i; + } +} + +template __global__ +void test_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + if (i % LOOKUP_GAP == 0) + write_keys_direct(b, bx, start+32*(i/LOOKUP_GAP)); + ++i; + } +} + + +/* + * hasher_hash_kernel runs the second phase of scrypt after the scratch + * buffer is filled with the iterative hashes: It bounces through + * the scratch buffer in pseudorandom order, mixing the key as it goes. 
+ */ + +template __global__ +void test_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end) +{ + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4); + if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + read_keys_direct(b, bx, start+32*c_N_1); + block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + for (int i = begin; i < end; i++) { + tmp[threadIdx.x/32][threadIdx.x%32] = bx.x; + int j = (tmp[threadIdx.x/32][(threadIdx.x & 0x1c)] & (c_N_1)); + uint4 t, tx; read_keys_direct(t, tx, start+32*j); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + + store_key(d_odata, b, bx); +} + +template __global__ +void test_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4); + if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + for (int i = begin; i < end; i++) { + tmp[threadIdx.x/32][threadIdx.x%32] = bx.x; + int j = (tmp[threadIdx.x/32][(threadIdx.x & 0x1c)] & (c_N_1)); + int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; + uint4 t, tx; read_keys_direct(t, tx, start+32*pos); + while(loop--) block_mixer(t, tx, x1, x2, x3); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + + store_key(d_odata, b, bx); +} + + +TestKernel::TestKernel() : KernelInterface() +{ +} + +bool TestKernel::bindtexture_1D(uint32_t *d_V, size_t size) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef1D_4_V.normalized = 0; + texRef1D_4_V.filterMode = cudaFilterModePoint; + texRef1D_4_V.addressMode[0] = cudaAddressModeClamp; + checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size)); + return true; +} + +bool TestKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef2D_4_V.normalized = 0; + texRef2D_4_V.filterMode = cudaFilterModePoint; + texRef2D_4_V.addressMode[0] = cudaAddressModeClamp; + texRef2D_4_V.addressMode[1] = cudaAddressModeClamp; + // maintain texture width of TEXWIDTH (max. 
limit is 65000) + while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; } + while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; } + checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch)); + return true; +} + +bool TestKernel::unbindtexture_1D() +{ + checkCudaErrors(cudaUnbindTexture(texRef1D_4_V)); + return true; +} + +bool TestKernel::unbindtexture_2D() +{ + checkCudaErrors(cudaUnbindTexture(texRef2D_4_V)); + return true; +} + +void TestKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool TestKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + // compute required shared memory per block for __shfl() emulation + size_t shared = ((threads.x + 31) / 32) * (32+1) * sizeof(uint32_t); + + // make some constants available to kernel, update only initially and when changing + static int prev_N[MAX_DEVICES] = {0}; + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_SCRATCH = SCRATCH; + uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP); + uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1; + + cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. + + int batch = device_batchsize[thr_id]; + + unsigned int pos = 0; + do { + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) test_scrypt_core_kernelA <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelA <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N)); + } else { + if (IS_SCRYPT()) test_scrypt_core_kernelA_LG <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelA_LG <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + pos += batch; + } while (pos < N); + + // Second phase: Random read access from scratchpad. 
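+ // This is the second half of scrypt's ROMix: N data-dependent reads from the
+ // scratchpad, recomputing skipped entries on the fly when LOOKUP_GAP > 1.
+ // In the notation of RFC 7914 the two phases are:
+ //   for (i = 0; i < N; i++) { V[i] = X; X = BlockMix(X); }
+ //   for (i = 0; i < N; i++) { j = Integerify(X) mod N; X = BlockMix(X ^ V[j]); }
+ // As in the first phase, the iterations are issued in device_batchsize[thr_id]
+ // sized chunks so that individual kernel launches stay short.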
+ pos = 0; + do { + if (LOOKUP_GAP == 1) { + if (texture_cache == 0) { + if (IS_SCRYPT()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + } + else if (texture_cache == 1) { + if (IS_SCRYPT()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + } + else if (texture_cache == 2) { + if (IS_SCRYPT()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + } + } else { + if (texture_cache == 0) { + if (IS_SCRYPT()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + else if (texture_cache == 1) { + if (IS_SCRYPT()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + else if (texture_cache == 2) { + if (IS_SCRYPT()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + } + + pos += batch; + } while (pos < N); + + return success; +} diff --git a/scrypt/test_kernel.h b/scrypt/test_kernel.h new file mode 100644 index 0000000..e084f72 --- /dev/null +++ b/scrypt/test_kernel.h @@ -0,0 +1,30 @@ +#ifndef TEST_KERNEL_H +#define TEST_KERNEL_H + +#include "salsa_kernel.h" + +class TestKernel : public KernelInterface +{ +public: + TestKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + virtual bool bindtexture_1D(uint32_t *d_V, size_t size); + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch); + virtual bool unbindtexture_1D(); + virtual bool unbindtexture_2D(); + + virtual char get_identifier() { return 'f'; }; + virtual int get_major_version() { return 1; }; + virtual int get_minor_version() { return 0; }; + + virtual int max_warps_per_block() { return 32; }; + virtual int get_texel_width() { return 4; }; + virtual int threads_per_wu() { return 4; } + virtual bool support_lookup_gap() { return true; } + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } +}; + +#endif // #ifndef TEST_KERNEL_H diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu new file mode 100644 index 0000000..fe12ea6 --- /dev/null +++ b/scrypt/titan_kernel.cu @@ -0,0 +1,731 @@ +/* Copyright (C) 2013 David G. Andersen. All rights reserved. 
+ * with modifications by Christian Buchner + * + * Use of this code is covered under the Apache 2.0 license, which + * can be found in the file "LICENSE" + */ + +// attempt V.Volkov style ILP (factor 4) + +#include + +#include "cuda_runtime.h" +#include "miner.h" + +#include "salsa_kernel.h" +#include "titan_kernel.h" + +#define THREADS_PER_WU 4 // four threads per hash + +typedef enum +{ + ANDERSEN, + SIMPLE +} MemoryAccess; + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define __ldg(x) (*(x)) +#endif + +// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N-1 +// scratch buffer size SCRATCH +__constant__ uint32_t c_SCRATCH; +__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP) +__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP)-1 + +template __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); + +static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) { + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) { + left.x += right.x; + left.y += right.y; + left.z += right.z; + left.w += right.w; + return left; +} + +static __device__ uint4 __shfl(const uint4 bx, int target_thread) { + return make_uint4(__shfl((int)bx.x, target_thread), __shfl((int)bx.y, target_thread), __shfl((int)bx.z, target_thread), __shfl((int)bx.w, target_thread)); +} + +/* write_keys writes the 8 keys being processed by a warp to the global + * scratchpad. To effectively use memory bandwidth, it performs the writes + * (and reads, for read_keys) 128 bytes at a time per memory location + * by __shfl'ing the 4 entries in bx to the threads in the next-up + * thread group. It then has eight threads together perform uint4 + * (128 bit) writes to the destination region. This seems to make + * quite effective use of memory bandwidth. An approach that spread + * uint32s across more threads was slower because of the increased + * computation it required. + * + * "start" is the loop iteration producing the write - the offset within + * the block's memory. + * + * Internally, this algorithm first __shfl's the 4 bx entries to + * the next up thread group, and then uses a conditional move to + * ensure that odd-numbered thread groups exchange the b/bx ordering + * so that the right parts are written together. + * + * Thanks to Babu for helping design the 128-bit-per-write version. + * + * _direct lets the caller specify the absolute start location instead of + * the relative start location, as an attempt to reduce some recomputation. + */ + +template __device__ __forceinline__ +void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) +{ + uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + if (SCHEME == ANDERSEN) { + int target_thread = (threadIdx.x + 4)%32; + uint4 t=b, t2=__shfl(bx, target_thread); + int t2_start = __shfl((int)start, target_thread) + 4; + bool c = (threadIdx.x & 0x4); + *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); + *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? 
t : t2); + } else { + *((uint4 *)(&scratch[start ])) = b; + *((uint4 *)(&scratch[start+16])) = bx; + } +} + +template __device__ __forceinline__ +void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) +{ + uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + if (SCHEME == ANDERSEN) { + int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4; + bool c = (threadIdx.x & 0x4); + b = __ldg((uint4 *)(&scratch[c ? t2_start : start])); + bx = __ldg((uint4 *)(&scratch[c ? start : t2_start])); + uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx); + bx = __shfl(bx, (threadIdx.x + 28)%32); + } else { + b = *((uint4 *)(&scratch[start])); + bx = *((uint4 *)(&scratch[start+16])); + } +} + +__device__ __forceinline__ +void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) { + /* Inner loop shuffle targets */ + int x1 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+3)&0x3); + + b[3] = __shfl((int)b[3], x1); + b[2] = __shfl((int)b[2], x2); + b[1] = __shfl((int)b[1], x3); + uint32_t tmp = b[1]; b[1] = b[3]; b[3] = tmp; + + bx[3] = __shfl((int)bx[3], x1); + bx[2] = __shfl((int)bx[2], x2); + bx[1] = __shfl((int)bx[1], x3); + tmp = bx[1]; bx[1] = bx[3]; bx[3] = tmp; +} + +__device__ __forceinline__ +void primary_order_shuffle(uint4 &b, uint4 &bx) { + /* Inner loop shuffle targets */ + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + b.w = __shfl((int)b.w, x1); + b.z = __shfl((int)b.z, x2); + b.y = __shfl((int)b.y, x3); + uint32_t tmp = b.y; b.y = b.w; b.w = tmp; + + bx.w = __shfl((int)bx.w, x1); + bx.z = __shfl((int)bx.z, x2); + bx.y = __shfl((int)bx.y, x3); + tmp = bx.y; bx.y = bx.w; bx.w = tmp; +} + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ + +__device__ __forceinline__ +void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4]; + b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4]; + b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4]; + b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4]; + bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16]; + bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16]; + bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16]; + bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16]; + + primary_order_shuffle(b, bx); +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. 
+ */
+
+__device__ __forceinline__
+void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx)
+{
+	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	int key_offset = scrypt_block * 32;
+	uint32_t thread_in_block = threadIdx.x % 4;
+
+	primary_order_shuffle(b, bx);
+
+	B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x;
+	B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y;
+	B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z;
+	B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w;
+	B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x;
+	B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y;
+	B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z;
+	B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w;
+}
+
+
+/*
+ * load_key loads a 32*32bit key from a contiguous region of memory in B.
+ * The input keys are in external order (i.e., 0, 1, 2, 3, ...).
+ * After loading, each thread has its four b and four bx keys stored
+ * in internal processing order.
+ */
+
+__device__ __forceinline__
+void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx)
+{
+	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	int key_offset = scrypt_block * 32;
+	uint32_t thread_in_block = threadIdx.x % 4;
+
+	// Read in permuted order. Key loads are not our bottleneck right now.
+	b.x = B[key_offset + 4*0 + thread_in_block%4];
+	b.y = B[key_offset + 4*1 + thread_in_block%4];
+	b.z = B[key_offset + 4*2 + thread_in_block%4];
+	b.w = B[key_offset + 4*3 + thread_in_block%4];
+	bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16];
+	bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16];
+	bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16];
+	bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16];
+}
+
+/*
+ * store_key performs the opposite transform as load_key, taking
+ * internally-ordered b and bx and storing them into a contiguous
+ * region of B in external order.
+ */
+
+__device__ __forceinline__
+void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx)
+{
+	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	int key_offset = scrypt_block * 32;
+	uint32_t thread_in_block = threadIdx.x % 4;
+
+	B[key_offset + 4*0 + thread_in_block%4] = b.x;
+	B[key_offset + 4*1 + thread_in_block%4] = b.y;
+	B[key_offset + 4*2 + thread_in_block%4] = b.z;
+	B[key_offset + 4*3 + thread_in_block%4] = b.w;
+	B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x;
+	B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y;
+	B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z;
+	B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w;
+}
+
+
+template <int ALGO> __device__ __forceinline__
+void load_key(const uint32_t *B, uint4 &b, uint4 &bx)
+{
+	switch(ALGO) {
+	case A_SCRYPT:      load_key_salsa(B, b, bx); break;
+	case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break;
+	}
+}
+
+template <int ALGO> __device__ __forceinline__
+void store_key(uint32_t *B, uint4 &b, uint4 &bx)
+{
+	switch(ALGO) {
+	case A_SCRYPT:      store_key_salsa(B, b, bx); break;
+	case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break;
+	}
+}
+
+
+/*
+ * salsa_xor_core (Salsa20/8 cypher)
+ * The original scrypt called:
+ * xor_salsa8(&X[0], &X[16]); <-- the "b" loop
+ * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop
+ * This version is unrolled to handle both of these loops in a single
+ * call to avoid unnecessary data movement.
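+ *
+ * The 16-word Salsa state is spread across a 4-thread group, one uint4 per
+ * thread; the __shfl calls after each group of four XOR_ROTATE_ADDs rotate
+ * words between lanes so every thread keeps the operands it needs locally.
+ * XOR_ROTATE_ADD below is a plain 32-bit rotate-add-xor; on the compute 3.5
+ * path it maps to __funnelshift_l.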
+ */
+
+#if __CUDA_ARCH__ < 350
+	// Kepler (Compute 3.0)
+	#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<<amt)|(tmp>>(32-amt))); }
+#else
+	// Kepler (Compute 3.5)
+	#define ROTL(a, b) __funnelshift_l( a, a, b );
+	#define XOR_ROTATE_ADD(dst, s1, s2, amt) dst ^= ROTL(s1+s2, amt);
+#endif
+
+
+__device__ __forceinline__
+void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
+{
+	uint4 x;
+
+	b ^= bx;
+	x = b;
+
+	// Enter in "primary order" (t0 has  0,  4,  8, 12)
+	//                          (t1 has  5,  9, 13,  1)
+	//                          (t2 has 10, 14,  2,  6)
+	//                          (t3 has 15,  3,  7, 11)
+
+	#pragma unroll
+	for (int j = 0; j < 4; j++)
+	{
+		// Mixing phase of salsa
+		XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
+		XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
+		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
+		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
+
+		/* Transpose rows and columns. */
+		/* Unclear if this optimization is needed: These are ordered based
+		 * upon the dependencies needed in the later xors. Compiler should be
+		 * able to figure this out, but might as well give it a hand. */
+		x.y = __shfl((int)x.y, x3);
+		x.w = __shfl((int)x.w, x1);
+		x.z = __shfl((int)x.z, x2);
+
+		/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
+		 * but the register targets are rewritten here to swap x[1] and x[3] so that
+		 * they can be directly shuffled to and from our peer threads without
+		 * reassignment. The reverse shuffle then puts them back in the right place.
+		 */
+
+		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
+		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
+		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
+		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
+
+		x.w = __shfl((int)x.w, x3);
+		x.y = __shfl((int)x.y, x1);
+		x.z = __shfl((int)x.z, x2);
+	}
+
+	b += x;
+	// The next two lines are the beginning of the BX-centric loop iteration
+	bx ^= b;
+	x = bx;
+
+	// This is a copy of the same loop above, identical but stripped of comments.
+	// Duplicated so that we can complete a bx-based loop with fewer register moves.
+	#pragma unroll 4
+	for (int j = 0; j < 4; j++)
+	{
+		XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
+		XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
+		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
+		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
+
+		x.y = __shfl((int)x.y, x3);
+		x.w = __shfl((int)x.w, x1);
+		x.z = __shfl((int)x.z, x2);
+
+		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
+		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
+		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
+		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
+
+		x.w = __shfl((int)x.w, x3);
+		x.y = __shfl((int)x.y, x1);
+		x.z = __shfl((int)x.z, x2);
+	}
+
+	// At the end of these iterations, the data is in primary order again.
+#undef XOR_ROTATE_ADD
+
+	bx += x;
+}
+
+
+/*
+ * chacha_xor_core (ChaCha20/8 cypher)
+ * This version is unrolled to handle both of these loops in a single
+ * call to avoid unnecessary data movement.
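+ *
+ * Each quarter round below uses the usual ChaCha rotation constants
+ * 16, 12, 8 and 7; the __shfl re-mappings between them switch the
+ * 4-thread group from column rounds to diagonal rounds and back.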
+ *
+ * load_key and store_key must not use primary order when
+ * using ChaCha20/8, but rather the basic transposed order
+ * (referred to as "column mode" below)
+ */
+
+#if __CUDA_ARCH__ < 320
+	// Kepler (Compute 3.0)
+	#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<<amt)|(tmp>>(32-amt))); }
+#else
+	// Kepler (Compute 3.5)
+	#define ROTL(a, b) __funnelshift_l( a, a, b );
+	#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { pt += ps; rt = ROTL(rt ^ pt,amt); }
+#endif
+
+__device__ __forceinline__
+void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
+{
+	uint4 x;
+
+	b ^= bx;
+	x = b;
+
+	// Enter in "column" mode (t0 has 0, 4,  8, 12)
+	//                        (t1 has 1, 5,  9, 13)
+	//                        (t2 has 2, 6, 10, 14)
+	//                        (t3 has 3, 7, 11, 15)
+
+#pragma unroll 4
+	for (int j = 0; j < 4; j++) {
+
+		// Column Mixing phase of chacha
+		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
+		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
+		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
+		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
+
+		x.y = __shfl((int)x.y, x1);
+		x.z = __shfl((int)x.z, x2);
+		x.w = __shfl((int)x.w, x3);
+
+		// Diagonal Mixing phase of chacha
+		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
+		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
+		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
+		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
+
+		x.y = __shfl((int)x.y, x3);
+		x.z = __shfl((int)x.z, x2);
+		x.w = __shfl((int)x.w, x1);
+	}
+
+	b += x;
+	// The next two lines are the beginning of the BX-centric loop iteration
+	bx ^= b;
+	x = bx;
+
+	#pragma unroll
+	for (int j = 0; j < 4; j++)
+	{
+
+		// Column Mixing phase of chacha
+		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
+		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
+		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
+		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
+
+		x.y = __shfl((int)x.y, x1);
+		x.z = __shfl((int)x.z, x2);
+		x.w = __shfl((int)x.w, x3);
+
+		// Diagonal Mixing phase of chacha
+		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
+		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
+		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
+		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
+
+		x.y = __shfl((int)x.y, x3);
+		x.z = __shfl((int)x.z, x2);
+		x.w = __shfl((int)x.w, x1);
+	}
+
+#undef CHACHA_PRIMITIVE
+
+	bx += x;
+}
+
+
+template <int ALGO> __device__ __forceinline__
+void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
+{
+	switch(ALGO) {
+	case A_SCRYPT:      salsa_xor_core(b, bx, x1, x2, x3); break;
+	case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break;
+	}
+}
+
+
+/*
+ * The hasher_gen_kernel operates on a group of 1024-bit input keys
+ * in B, stored as:
+ * B = { k1B k1Bx k2B k2Bx ... }
+ * and fills up the scratchpad with the iterative hashes derived from
+ * those keys:
+ * scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... }
+ * scratch is 1024 times larger than the input keys B.
+ * It is extremely important to stream writes effectively into scratch;
+ * less important to coalesce the reads from B.
+ *
+ * Key ordering note: Keys are input from B in "original" order:
+ * K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 }
+ * After inputting into kernel_gen, each component k and kx of the
+ * key is transmuted into a permuted internal order to make processing faster:
+ * K = k, kx with:
+ * k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11
+ * and similarly for kx.
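+ * (This is the per-thread "primary order" used by salsa_xor_core:
+ * t0 holds 0, 4, 8, 12; t1 holds 5, 9, 13, 1; and so on. It is what
+ * primary_order_shuffle produces on the salsa/scrypt path.)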
+ */
+
+template <int ALGO, int SCHEME> __global__
+void titan_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end)
+{
+	uint4 b, bx;
+
+	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
+	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
+	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
+
+	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;
+
+	int i=begin;
+
+	if (i == 0) {
+		load_key<ALGO>(d_idata, b, bx);
+		write_keys_direct<SCHEME>(b, bx, start);
+		++i;
+	} else read_keys_direct<SCHEME>(b, bx, start+32*(i-1));
+
+	while (i < end) {
+		block_mixer<ALGO>(b, bx, x1, x2, x3);
+		write_keys_direct<SCHEME>(b, bx, start+32*i);
+		++i;
+	}
+}
+
+template <int ALGO, int SCHEME> __global__
+void titan_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP)
+{
+	uint4 b, bx;
+
+	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
+	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
+	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
+
+	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;
+
+	int i=begin;
+
+	if (i == 0) {
+		load_key<ALGO>(d_idata, b, bx);
+		write_keys_direct<SCHEME>(b, bx, start);
+		++i;
+	} else {
+		int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP;
+		read_keys_direct<SCHEME>(b, bx, start+32*pos);
+		while(loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
+	}
+
+	while (i < end) {
+		block_mixer<ALGO>(b, bx, x1, x2, x3);
+		if (i % LOOKUP_GAP == 0)
+			write_keys_direct<SCHEME>(b, bx, start+32*(i/LOOKUP_GAP));
+		++i;
+	}
+}
+
+
+/*
+ * hasher_hash_kernel runs the second phase of scrypt after the scratch
+ * buffer is filled with the iterative hashes: It bounces through
+ * the scratch buffer in pseudorandom order, mixing the key as it goes.
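+ *
+ * The pseudorandom index j is scrypt's "integerify" step: each 4-thread
+ * group broadcasts bx.x from its first lane (word 16 of the 32-word state)
+ * with __shfl and masks it with N-1, N being a power of two.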
+ */
+
+template <int ALGO, int SCHEME> __global__
+void titan_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
+{
+	uint4 b, bx;
+
+	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	int start = ((scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;
+
+	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
+	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
+	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
+
+	if (begin == 0) {
+		read_keys_direct<SCHEME>(b, bx, start+32*c_N_1);
+		block_mixer<ALGO>(b, bx, x1, x2, x3);
+	} else load_key<ALGO>(d_odata, b, bx);
+
+	for (int i = begin; i < end; i++) {
+		int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+		uint4 t, tx; read_keys_direct<SCHEME>(t, tx, start+32*j);
+		b ^= t; bx ^= tx;
+		block_mixer<ALGO>(b, bx, x1, x2, x3);
+	}
+
+	store_key<ALGO>(d_odata, b, bx);
+}
+
+template <int ALGO, int SCHEME> __global__
+void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP)
+{
+	uint4 b, bx;
+
+	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	int start = ((scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;
+
+	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
+	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
+	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
+
+	if (begin == 0) {
+		int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP);
+		read_keys_direct<SCHEME>(b, bx, start+32*pos);
+		while(loop--)
+			block_mixer<ALGO>(b, bx, x1, x2, x3);
+	}
+	else
+		load_key<ALGO>(d_odata, b, bx);
+
+	if (SCHEME == SIMPLE)
+	{
+		// better divergent thread handling submitted by nVidia engineers, but
+		// supposedly this does not run with the ANDERSEN memory access scheme
+		int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+		int pos = j/LOOKUP_GAP;
+		int loop = -1;
+		uint4 t, tx;
+
+		int i = begin;
+		while(i < end)
+		{
+			if (loop == -1) {
+				j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+				pos = j/LOOKUP_GAP;
+				loop = j-pos*LOOKUP_GAP;
+				read_keys_direct<SCHEME>(t, tx, start+32*pos);
+			}
+			if (loop == 0) {
+				b ^= t; bx ^= tx;
+				t=b;tx=bx;
+			}
+
+			block_mixer<ALGO>(t, tx, x1, x2, x3);
+			if (loop == 0) {
+				b=t;bx=tx;
+				i++;
+			}
+			loop--;
+		}
+	}
+	else
+	{
+		// this is my original implementation, now used with the ANDERSEN
+		// memory access scheme only.
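+		// Only every LOOKUP_GAP-th entry was stored by kernelA_LG, so the
+		// state for index j is rebuilt from the stored entry j/LOOKUP_GAP
+		// by re-applying j%LOOKUP_GAP block_mixer rounds before the xor.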
+		for (int i = begin; i < end; i++) {
+			int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+			int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
+			uint4 t, tx; read_keys_direct<SCHEME>(t, tx, start+32*pos);
+			while (loop--)
+				block_mixer<ALGO>(t, tx, x1, x2, x3);
+			b ^= t; bx ^= tx;
+			block_mixer<ALGO>(b, bx, x1, x2, x3);
+		}
+	}
+
+	store_key<ALGO>(d_odata, b, bx);
+}
+
+
+TitanKernel::TitanKernel() : KernelInterface()
+{
+}
+
+void TitanKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
+{
+	checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
+}
+
+bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream,
+	uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
+{
+	bool success = true;
+
+	// make some constants available to kernel, update only initially and when changing
+	static int prev_N[MAX_DEVICES] = {0};
+	if (N != prev_N[thr_id]) {
+		uint32_t h_N = N;
+		uint32_t h_N_1 = N-1;
+		uint32_t h_SCRATCH = SCRATCH;
+		uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
+		uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;
+
+		cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+
+		prev_N[thr_id] = N;
+	}
+
+	// First phase: Sequential writes to scratchpad.
+
+	int batch = device_batchsize[thr_id];
+
+	unsigned int pos = 0;
+	do {
+		if (LOOKUP_GAP == 1) {
+			if (IS_SCRYPT())      titan_scrypt_core_kernelA<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+		} else {
+			if (IS_SCRYPT())      titan_scrypt_core_kernelA_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+		}
+		pos += batch;
+
+	} while (pos < N);
+
+	// Second phase: Random read access from scratchpad.
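+	// As in the first phase, the N-iteration loop is split into slices of
+	// 'batch' iterations, so a single kernel launch never runs the whole pass.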
+
+	pos = 0;
+	do {
+		if (LOOKUP_GAP == 1) {
+			if (IS_SCRYPT())      titan_scrypt_core_kernelB<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+		} else {
+			if (IS_SCRYPT())      titan_scrypt_core_kernelB_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+		}
+		pos += batch;
+
+	} while (pos < N);
+
+	return success;
+}
diff --git a/scrypt/titan_kernel.h b/scrypt/titan_kernel.h
new file mode 100644
index 0000000..720b9a3
--- /dev/null
+++ b/scrypt/titan_kernel.h
@@ -0,0 +1,26 @@
+#ifndef TITAN_KERNEL_H
+#define TITAN_KERNEL_H
+
+#include "salsa_kernel.h"
+
+class TitanKernel : public KernelInterface
+{
+public:
+	TitanKernel();
+
+	virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
+	virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);
+
+	virtual char get_identifier() { return 't'; }
+	virtual int get_major_version() { return 3; }
+	virtual int get_minor_version() { return 5; }
+
+	virtual int max_warps_per_block() { return 32; }
+	virtual int get_texel_width() { return 4; }
+	virtual bool no_textures() { return true; }
+	virtual int threads_per_wu() { return 4; }
+	virtual bool support_lookup_gap() { return true; }
+	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
+};
+
+#endif // #ifndef TITAN_KERNEL_H
diff --git a/util.cpp b/util.cpp
index 366cc3e..d349774 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1788,6 +1788,9 @@ void print_hash_tests(void)
 	qubithash(&hash[0], &buf[0]);
 	printpfx("qubit", hash);
 
+	scrypthash(&hash[0], &buf[0]);
+	printpfx("scrypt", hash);
+
 	skeincoinhash(&hash[0], &buf[0]);
 	printpfx("skein", hash);
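
For reference, the two kernel phases launched above implement scrypt's ROMix with an optional lookup gap. The host-side sketch below shows the same control flow in plain C++; it is illustrative only and not part of the patch: romix_lookup_gap, Block and BlockMixer are made-up names, and the mixer callback is a placeholder for the real Salsa20/8 (scrypt) or ChaCha20/8 (scrypt-jane) core.

#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

typedef std::vector<uint32_t> Block;            // one 32-word (1024-bit) scrypt state
typedef std::function<void(Block&)> BlockMixer; // placeholder for the Salsa/ChaCha xor core

// Phase 1 (kernelA_LG): run the mixer N times, keeping only every gap-th
// intermediate state, so the scratchpad shrinks by roughly a factor of gap.
// Phase 2 (kernelB_LG): for each pseudorandom index j = X[16] & (N-1), reload
// the nearest stored state at j/gap and re-apply j%gap mixer rounds before
// xoring it into the running state.
static void romix_lookup_gap(Block &X, uint32_t N, uint32_t gap, const BlockMixer &mix)
{
	std::vector<Block> V((N + gap - 1) / gap);

	for (uint32_t i = 0; i < N; i++) {              // sequential write phase
		if (i % gap == 0) V[i / gap] = X;
		mix(X);
	}
	for (uint32_t i = 0; i < N; i++) {              // random read phase
		uint32_t j = X[16] & (N - 1);               // "integerify"; N is a power of two
		Block T = V[j / gap];
		for (uint32_t k = 0; k < j % gap; k++) mix(T);  // recompute the skipped steps
		for (size_t w = 0; w < X.size(); w++) X[w] ^= T[w];
		mix(X);
	}
}

int main()
{
	// Toy mixer: NOT Salsa/ChaCha, just enough to exercise the control flow.
	BlockMixer toy = [](Block &B) { for (size_t i = 0; i < B.size(); i++) B[i] = B[i]*2654435761u + 1; };
	Block X(32, 1);
	romix_lookup_gap(X, 1024, 2, toy);              // N=1024 (Litecoin-style scrypt), lookup gap 2
	printf("%08x\n", X[0]);
	return 0;
}

With a lookup gap of G the scratchpad is roughly G times smaller, while the second phase performs on average (G-1)/2 extra mixer calls per lookup; that is the time/memory tradeoff the *_LG kernels expose through device_lookup_gap.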