mirror of https://github.com/GOSTSec/ccminer
Tanguy Pruvot
9 years ago
38 changed files with 12675 additions and 773 deletions
@@ -0,0 +1,626 @@
/*
	scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane

	Public Domain or MIT License, whichever is easier
*/

#include "miner.h"

#include "scrypt/scrypt-jane.h"
#include "scrypt/code/scrypt-jane-portable.h"
#include "scrypt/code/scrypt-jane-romix.h"
#include "scrypt/keccak.h"

#include "scrypt/salsa_kernel.h"

#define scrypt_maxN 30            /* (1 << (30 + 1)) = ~2 billion */
#define scrypt_r_32kb 8           /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = max of 32kb in a chunk */
#define scrypt_maxr scrypt_r_32kb /* 32kb */
#define scrypt_maxp 25            /* (1 << 25) = ~33 million */

// ---------------------------- BEGIN keccak functions ------------------------------------

#define SCRYPT_HASH "Keccak-512"
#define SCRYPT_HASH_DIGEST_SIZE 64
#define SCRYPT_KECCAK_F 1600
#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */
#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */
#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8)

typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];

typedef struct scrypt_hash_state_t {
	uint64_t state[SCRYPT_KECCAK_F / 64];
	uint32_t leftover;
	uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
} scrypt_hash_state;

static const uint64_t keccak_round_constants[24] = {
	0x0000000000000001ull, 0x0000000000008082ull,
	0x800000000000808aull, 0x8000000080008000ull,
	0x000000000000808bull, 0x0000000080000001ull,
	0x8000000080008081ull, 0x8000000000008009ull,
	0x000000000000008aull, 0x0000000000000088ull,
	0x0000000080008009ull, 0x000000008000000aull,
	0x000000008000808bull, 0x800000000000008bull,
	0x8000000000008089ull, 0x8000000000008003ull,
	0x8000000000008002ull, 0x8000000000000080ull,
	0x000000000000800aull, 0x800000008000000aull,
	0x8000000080008081ull, 0x8000000000008080ull,
	0x0000000080000001ull, 0x8000000080008008ull
};

static void
keccak_block(scrypt_hash_state *S, const uint8_t *in) {
	size_t i;
	uint64_t *s = S->state, t[5], u[5], v, w;

	/* absorb input */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8)
		s[i] ^= U8TO64_LE(in);

	for (i = 0; i < 24; i++) {
		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];

		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
		u[0] = t[4] ^ ROTL64(t[1], 1);
		u[1] = t[0] ^ ROTL64(t[2], 1);
		u[2] = t[1] ^ ROTL64(t[3], 1);
		u[3] = t[2] ^ ROTL64(t[4], 1);
		u[4] = t[3] ^ ROTL64(t[0], 1);

		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];

		/* rho pi: b[..] = rotl(a[..], ..) */
		v = s[ 1];
		s[ 1] = ROTL64(s[ 6], 44);
		s[ 6] = ROTL64(s[ 9], 20);
		s[ 9] = ROTL64(s[22], 61);
		s[22] = ROTL64(s[14], 39);
		s[14] = ROTL64(s[20], 18);
		s[20] = ROTL64(s[ 2], 62);
		s[ 2] = ROTL64(s[12], 43);
		s[12] = ROTL64(s[13], 25);
		s[13] = ROTL64(s[19],  8);
		s[19] = ROTL64(s[23], 56);
		s[23] = ROTL64(s[15], 41);
		s[15] = ROTL64(s[ 4], 27);
		s[ 4] = ROTL64(s[24], 14);
		s[24] = ROTL64(s[21],  2);
		s[21] = ROTL64(s[ 8], 55);
		s[ 8] = ROTL64(s[16], 45);
		s[16] = ROTL64(s[ 5], 36);
		s[ 5] = ROTL64(s[ 3], 28);
		s[ 3] = ROTL64(s[18], 21);
		s[18] = ROTL64(s[17], 15);
		s[17] = ROTL64(s[11], 10);
		s[11] = ROTL64(s[ 7],  6);
		s[ 7] = ROTL64(s[10],  3);
		s[10] = ROTL64(   v,  1);

		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
		v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
		v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;

		/* iota: a[0,0] ^= round constant */
		s[0] ^= keccak_round_constants[i];
	}
}

static void
scrypt_hash_init(scrypt_hash_state *S) {
	memset(S, 0, sizeof(*S));
}

static void
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
	size_t want;

	/* handle the previous data */
	if (S->leftover) {
		want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
		want = (want < inlen) ? want : inlen;
		memcpy(S->buffer + S->leftover, in, want);
		S->leftover += (uint32_t)want;
		if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
			return;
		in += want;
		inlen -= want;
		keccak_block(S, S->buffer);
	}

	/* handle the current data */
	while (inlen >= SCRYPT_HASH_BLOCK_SIZE) {
		keccak_block(S, in);
		in += SCRYPT_HASH_BLOCK_SIZE;
		inlen -= SCRYPT_HASH_BLOCK_SIZE;
	}

	/* handle leftover data */
	S->leftover = (uint32_t)inlen;
	if (S->leftover)
		memcpy(S->buffer, in, S->leftover);
}

static void
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
	size_t i;

	S->buffer[S->leftover] = 0x01;
	memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1));
	S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80;
	keccak_block(S, S->buffer);

	for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) {
		U64TO8_LE(&hash[i], S->state[i / 8]);
	}
}

// ---------------------------- END keccak functions ------------------------------------
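
/*
 * Added note (not part of the original file): a minimal self-check sketch,
 * showing that feeding a message in two pieces through scrypt_hash_update()
 * matches a single one-shot update -- handy when auditing the leftover-buffer
 * handling above. The function name is illustrative only.
 */
#if 0
static int keccak512_split_matches(const uint8_t *m, size_t mlen, size_t split) {
	scrypt_hash_state a, b;
	scrypt_hash_digest da, db;
	if (split > mlen) split = mlen;
	/* one-shot */
	scrypt_hash_init(&a);
	scrypt_hash_update(&a, m, mlen);
	scrypt_hash_finish(&a, da);
	/* two pieces crossing the leftover buffer */
	scrypt_hash_init(&b);
	scrypt_hash_update(&b, m, split);
	scrypt_hash_update(&b, m + split, mlen - split);
	scrypt_hash_finish(&b, db);
	return memcmp(da, db, SCRYPT_HASH_DIGEST_SIZE) == 0;
}
#endif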

// ---------------------------- BEGIN PBKDF2 functions ------------------------------------

typedef struct scrypt_hmac_state_t {
	scrypt_hash_state inner, outer;
} scrypt_hmac_state;

static void
scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
	scrypt_hash_state st;
	scrypt_hash_init(&st);
	scrypt_hash_update(&st, m, mlen);
	scrypt_hash_finish(&st, hash);
}

/* hmac */
static void
scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
	uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
	size_t i;

	scrypt_hash_init(&st->inner);
	scrypt_hash_init(&st->outer);

	if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
		/* use the key directly if it's <= blocksize bytes */
		memcpy(pad, key, keylen);
	} else {
		/* if it's > blocksize bytes, hash it */
		scrypt_hash(pad, key, keylen);
	}

	/* inner = (key ^ 0x36) */
	/* h(inner || ...) */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
		pad[i] ^= 0x36;
	scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);

	/* outer = (key ^ 0x5c) */
	/* h(outer || ...) */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
		pad[i] ^= (0x5c ^ 0x36);
	scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
}

static void
scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
	/* h(inner || m...) */
	scrypt_hash_update(&st->inner, m, mlen);
}

static void
scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
	/* h(inner || m) */
	scrypt_hash_digest innerhash;
	scrypt_hash_finish(&st->inner, innerhash);

	/* h(outer || h(inner || m)) */
	scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
	scrypt_hash_finish(&st->outer, mac);
}

/*
 * Special version where N = 1
 *  - mikaelh
 */
static void
scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) {
	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
	scrypt_hash_digest ti, u;
	uint8_t be[4];
	uint32_t i, blocks;

	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */

	/* hmac(password, ...) */
	scrypt_hmac_init(&hmac_pw, password, password_len);

	/* hmac(password, salt...) */
	hmac_pw_salt = hmac_pw;
	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);

	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
	for (i = 1; i <= blocks; i++) {
		/* U1 = hmac(password, salt || be(i)) */
		U32TO8_BE(be, i);
		work = hmac_pw_salt;
		scrypt_hmac_update(&work, be, 4);
		scrypt_hmac_finish(&work, ti);
		memcpy(u, ti, sizeof(u));

		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
		out += SCRYPT_HASH_DIGEST_SIZE;
		bytes -= SCRYPT_HASH_DIGEST_SIZE;
	}
}

// ---------------------------- END PBKDF2 functions ------------------------------------
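
/*
 * Added sketch (not part of the original file): how the N=1 PBKDF2 above is
 * driven by the scanhash loop further down -- the 80-byte block header acts
 * as both password and salt to derive the 128-byte scrypt input block X, and
 * later as password again to compress the mixed X into a 32-byte hash. The
 * function name is illustrative only.
 */
#if 0
static void example_scrypt_jane_endpoints(const uint8_t header[80], uint8_t X[128], uint8_t hash32[32]) {
	scrypt_pbkdf2_1(header, 80, header, 80, X, 128);   /* entry: header -> X */
	/* ... X is mixed by ROMix (on the GPU) in between ... */
	scrypt_pbkdf2_1(header, 80, X, 128, hash32, 32);   /* exit: header, X -> hash */
}
#endif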

static void
scrypt_fatal_error_default(const char *msg) {
	fprintf(stderr, "%s\n", msg);
	exit(1);
}

static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;

void
scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
	scrypt_fatal_error = fn;
}

typedef struct scrypt_aligned_alloc_t {
	uint8_t *mem, *ptr;
} scrypt_aligned_alloc;

#if defined(SCRYPT_TEST_SPEED)
static uint8_t *mem_base = (uint8_t *)0;
static size_t mem_bump = 0;

/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
static scrypt_aligned_alloc
scrypt_alloc(uint64_t size) {
	scrypt_aligned_alloc aa;
	if (!mem_base) {
		mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
		if (!mem_base)
			scrypt_fatal_error("scrypt: out of memory");
		mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
	}
	aa.mem = mem_base + mem_bump;
	aa.ptr = aa.mem;
	mem_bump += (size_t)size;
	return aa;
}

static void
scrypt_free(scrypt_aligned_alloc *aa) {
	mem_bump = 0;
}
#else
static scrypt_aligned_alloc
scrypt_alloc(uint64_t size) {
	static const size_t max_alloc = (size_t)-1;
	scrypt_aligned_alloc aa;
	size += (SCRYPT_BLOCK_BYTES - 1);
	if (size > max_alloc)
		scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
	aa.mem = (uint8_t *)malloc((size_t)size);
	if (!aa.mem)
		scrypt_fatal_error("scrypt: out of memory");
	aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
	return aa;
}

static void
scrypt_free(scrypt_aligned_alloc *aa) {
	free(aa->mem);
}
#endif


// yacoin: increasing Nfactor gradually
unsigned char GetNfactor(unsigned int nTimestamp) {
	int l = 0;

	unsigned int Nfactor = 0;

	// Yacoin defaults
	unsigned int Ntimestamp = 1367991200;
	unsigned int minN = 4;
	unsigned int maxN = 30;

	if (strlen(jane_params) > 0) {
		if (!strcmp(jane_params, "YAC") || !strcasecmp(jane_params, "Yacoin")) {
			// no-op: keep the Yacoin defaults above
		}
		//
		// NO WARRANTY FOR CORRECTNESS. Look for the int64 nChainStartTime constant
		// in the src/main.cpp file of the official wallet clients as well as the
		// const unsigned char minNfactor and const unsigned char maxNfactor
		//
		else if (!strcmp(jane_params, "YBC") || !strcasecmp(jane_params, "YBCoin")) {
			// YBCoin: 1372386273, minN: 4, maxN: 30
			Ntimestamp = 1372386273; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "ZZC") || !strcasecmp(jane_params, "ZZCoin")) {
			// ZZCoin: 1375817223, minN: 12, maxN: 30
			Ntimestamp = 1375817223; minN = 12; maxN = 30;
		} else if (!strcmp(jane_params, "FEC") || !strcasecmp(jane_params, "FreeCoin")) {
			// FreeCoin: 1375801200, minN: 6, maxN: 32
			Ntimestamp = 1375801200; minN = 6; maxN = 32;
		} else if (!strcmp(jane_params, "ONC") || !strcasecmp(jane_params, "OneCoin")) {
			// OneCoin: 1371119462, minN: 6, maxN: 30
			Ntimestamp = 1371119462; minN = 6; maxN = 30;
		} else if (!strcmp(jane_params, "QQC") || !strcasecmp(jane_params, "QQCoin")) {
			// QQCoin: 1387769316, minN: 4, maxN: 30
			Ntimestamp = 1387769316; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "GPL") || !strcasecmp(jane_params, "GoldPressedLatinum")) {
			// GoldPressedLatinum: 1377557832, minN: 4, maxN: 30
			Ntimestamp = 1377557832; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "MRC") || !strcasecmp(jane_params, "MicroCoin")) {
			// MicroCoin: 1389028879, minN: 4, maxN: 30
			Ntimestamp = 1389028879; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "APC") || !strcasecmp(jane_params, "AppleCoin")) {
			// AppleCoin: 1384720832, minN: 4, maxN: 30
			Ntimestamp = 1384720832; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "CPR") || !strcasecmp(jane_params, "Copperbars")) {
			// Copperbars: 1376184687, minN: 4, maxN: 30
			Ntimestamp = 1376184687; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "CACH") || !strcasecmp(jane_params, "CacheCoin")) {
			// CacheCoin: 1388949883, minN: 4, maxN: 30
			Ntimestamp = 1388949883; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "UTC") || !strcasecmp(jane_params, "UltraCoin")) {
			// UltraCoin: 1388361600, minN: 4, maxN: 30
			Ntimestamp = 1388361600; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "VEL") || !strcasecmp(jane_params, "VelocityCoin")) {
			// VelocityCoin: 1387769316, minN: 4, maxN: 30
			Ntimestamp = 1387769316; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "ITC") || !strcasecmp(jane_params, "InternetCoin")) {
			// InternetCoin: 1388385602, minN: 4, maxN: 30
			Ntimestamp = 1388385602; minN = 4; maxN = 30;
		} else if (!strcmp(jane_params, "RAD") || !strcasecmp(jane_params, "RadioactiveCoin")) {
			// RadioactiveCoin: 1389196388, minN: 4, maxN: 30
			Ntimestamp = 1389196388; minN = 4; maxN = 30;
		} else {
			if (sscanf(jane_params, "%u,%u,%u", &Ntimestamp, &minN, &maxN) != 3) {
				if (sscanf(jane_params, "%u", &Nfactor) == 1)
					return Nfactor; // skip bounding against minN, maxN
				else
					applog(LOG_INFO, "Unable to parse scrypt-jane parameters: '%s'. Defaulting to Yacoin.", jane_params);
			}
		}
	}

	// determination based on the constants determined above
	if (nTimestamp <= Ntimestamp)
		return minN;

	unsigned long int s = nTimestamp - Ntimestamp;
	while ((s >> 1) > 3) {
		l += 1;
		s >>= 1;
	}

	s &= 3;

	int n = (l * 170 + s * 25 - 2320) / 100;

	if (n < 0) n = 0;

	if (n > 255)
		printf("GetNfactor(%u) - something wrong (n == %d)\n", nTimestamp, n);

	Nfactor = n;
	if (Nfactor < minN) return minN;
	if (Nfactor > maxN) return maxN;
	return Nfactor;
}
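
/*
 * Added worked example (not in the original): with the Yacoin epoch, an
 * elapsed time of s = 2^20 seconds (~12 days) is halved 18 times before
 * (s >> 1) <= 3 holds (2^20 >> 18 == 4), leaving l = 18 and s & 3 == 0, so
 * n = (18*170 + 0*25 - 2320) / 100 = 740 / 100 = 7. After clamping against
 * minN/maxN, the scanhash below sets N = 1 << (7 + 1) = 256.
 */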

#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
                     | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))

static int s_Nfactor = 0;

int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf,
	uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
{
	const uint32_t Htarg = ptarget[7];

	if (s_Nfactor == 0 && strlen(jane_params) > 0)
		applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params);

	int Nfactor = GetNfactor(bswap_32x4(pdata[17]));
	if (Nfactor > scrypt_maxN) {
		scrypt_fatal_error("scrypt: N out of range");
	}

	if (Nfactor != s_Nfactor)
	{
		// all of this isn't very thread-safe...
		opt_nfactor = (1 << (Nfactor + 1));

		applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, opt_nfactor);

		if (s_Nfactor != 0) {
			// handle N-factor increase at runtime
			// by adjusting the lookup_gap by factor 2
			if (s_Nfactor == Nfactor-1)
				for (int i=0; i < 8; ++i)
					device_lookup_gap[i] *= 2;
		}
		s_Nfactor = Nfactor;
	}

	int throughput = cuda_throughput(thr_id);

	if (throughput == 0)
		return -1;

	gettimeofday(tv_start, NULL);

	uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] };
	uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) };

	uint32_t n = pdata[19];

	/* byte swap pdata into data[0]/[1] arrays */
	for (int k=0; k<2; ++k) {
		for (int z=0; z<20; z++) data[k][z] = bswap_32x4(pdata[z]);
		for (int i=1; i<throughput; ++i) memcpy(&data[k][20*i], &data[k][0], 20*sizeof(uint32_t));
	}
	if (parallel == 2) prepare_keccak512(thr_id, pdata);

	scrypt_aligned_alloc Xbuf[2] = { scrypt_alloc(128 * throughput), scrypt_alloc(128 * throughput) };
	scrypt_aligned_alloc Vbuf = scrypt_alloc((uint64_t)opt_nfactor * 128);
	scrypt_aligned_alloc Ybuf = scrypt_alloc(128);

	uint32_t nonce[2];
	uint32_t* cuda_X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) };

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
#endif

	int cur = 0, nxt = 1;
	int iteration = 0;
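
	/*
	 * Added note (not in the original): the loop below double-buffers work
	 * across two CUDA streams -- while the GPU mixes batch `nxt`, the CPU
	 * post-processes the already-transferred batch `cur`, and the two roles
	 * swap at the bottom of the loop.
	 */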
	do {
		nonce[nxt] = n;

		if (parallel < 2)
		{
			for (int i=0; i<throughput; ++i) {
				uint32_t tmp_nonce = n++;
				data[nxt][20*i + 19] = bswap_32x4(tmp_nonce);
			}

			for (int i=0; i<throughput; ++i)
				scrypt_pbkdf2_1((unsigned char *)&data[nxt][20*i], 80, (unsigned char *)&data[nxt][20*i], 80, Xbuf[nxt].ptr + 128 * i, 128);

			memcpy(cuda_X[nxt], Xbuf[nxt].ptr, 128 * throughput);
			cuda_scrypt_serialize(thr_id, nxt);
			cuda_scrypt_HtoD(thr_id, cuda_X[nxt], nxt);
			cuda_scrypt_core(thr_id, nxt, opt_nfactor);
			cuda_scrypt_done(thr_id, nxt);

			cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt, false);

			cuda_scrypt_flush(thr_id, nxt);

			if (!cuda_scrypt_sync(thr_id, cur))
			{
				return -1;
			}

			memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput);
			for (int i=0; i<throughput; ++i)
				scrypt_pbkdf2_1((unsigned char *)&data[cur][20*i], 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)(&hash[cur][8*i]), 32);

#define VERIFY_ALL 0
#if VERIFY_ALL
			{
				/* 2: X = ROMix(X) */
				for (int i=0; i<throughput; ++i)
					scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)Ybuf.ptr, (scrypt_mix_word_t *)Vbuf.ptr, opt_nfactor);

				unsigned int err = 0;
				for (int i=0; i<throughput; ++i) {
					unsigned char *ref = (Xbuf[cur].ptr + 128 * i);
					unsigned char *dat = (unsigned char*)(cuda_X[cur] + 32 * i);
					if (memcmp(ref, dat, 128) != 0)
					{
						err++;
#if 0
						uint32_t *ref32 = (uint32_t*) ref;
						uint32_t *dat32 = (uint32_t*) dat;
						for (int j=0; j<32; ++j) {
							if (ref32[j] != dat32[j])
								fprintf(stderr, "ref32[i=%d][j=%d] = $%08x / $%08x\n", i, j, ref32[j], dat32[j]);
						}
#endif
					}
				}
				if (err > 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput);
			}
#endif
		} else {
			n += throughput;

			cuda_scrypt_serialize(thr_id, nxt);
			pre_keccak512(thr_id, nxt, nonce[nxt], throughput);
			cuda_scrypt_core(thr_id, nxt, opt_nfactor);

			cuda_scrypt_flush(thr_id, nxt);

			post_keccak512(thr_id, nxt, nonce[nxt], throughput);
			cuda_scrypt_done(thr_id, nxt);

			cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);

			if (!cuda_scrypt_sync(thr_id, cur))
			{
				return -1;
			}
		}

		if (iteration > 0)
		{
			for (int i=0; i<throughput; ++i) {
				if (hash[cur][8*i+7] <= Htarg && fulltest(&hash[cur][8*i], ptarget))
				{
					uint32_t _ALIGN(64) thash[8], tdata[20];
					uint32_t tmp_nonce = nonce[cur] + i;

					for (int z=0; z<20; z++)
						tdata[z] = bswap_32x4(pdata[z]);
					tdata[19] = bswap_32x4(tmp_nonce);

					/* re-check the candidate on the CPU before accepting it */
					scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128);
					scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), opt_nfactor);
					scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32);

					if (memcmp(thash, &hash[cur][8*i], 32) == 0)
					{
						*hashes_done = n - pdata[19];
						pdata[19] = tmp_nonce;
						scrypt_free(&Vbuf);
						scrypt_free(&Ybuf);
						scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
						delete[] data[0]; delete[] data[1];
						gettimeofday(tv_end, NULL);
						return 1;
					} else {
						applog(LOG_INFO, "GPU #%d: %s result does not validate on CPU (i=%d, s=%d)!", device_map[thr_id], device_name[thr_id], i, cur);
					}
				}
			}
		}

		cur = (cur+1)&1;
		nxt = (nxt+1)&1;
		++iteration;
	} while (n <= max_nonce && !work_restart[thr_id].restart);

	scrypt_free(&Vbuf);
	scrypt_free(&Ybuf);
	scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
	delete[] data[0]; delete[] data[1];

	*hashes_done = n - pdata[19];
	pdata[19] = n;
	gettimeofday(tv_end, NULL);
	return 0;
}
@@ -1,756 +0,0 @@
/*
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

#include "cpuminer-config.h"
#include "miner.h"

#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

static const uint32_t keypad[12] = {
	0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
static const uint32_t innerpad[11] = {
	0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
static const uint32_t outerpad[8] = {
	0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
static const uint32_t finalblk[16] = {
	0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
static inline void HMAC_SHA256_80_init(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t ihash[8];
	uint32_t pad[16];
	int i;

	/* tstate is assumed to contain the midstate of key */
	memcpy(pad, key + 16, 16);
	memcpy(pad + 4, keypad, 48);
	sha256_transform(tstate, pad, 0);
	memcpy(ihash, tstate, 32);

	sha256_init(ostate);
	for (i = 0; i < 8; i++)
		pad[i] = ihash[i] ^ 0x5c5c5c5c;
	for (; i < 16; i++)
		pad[i] = 0x5c5c5c5c;
	sha256_transform(ostate, pad, 0);

	sha256_init(tstate);
	for (i = 0; i < 8; i++)
		pad[i] = ihash[i] ^ 0x36363636;
	for (; i < 16; i++)
		pad[i] = 0x36363636;
	sha256_transform(tstate, pad, 0);
}

static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t istate[8], ostate2[8];
	uint32_t ibuf[16], obuf[16];
	int i, j;

	memcpy(istate, tstate, 32);
	sha256_transform(istate, salt, 0);

	memcpy(ibuf, salt + 16, 16);
	memcpy(ibuf + 5, innerpad, 44);
	memcpy(obuf + 8, outerpad, 32);

	for (i = 0; i < 4; i++) {
		memcpy(obuf, istate, 32);
		ibuf[4] = i + 1;
		sha256_transform(obuf, ibuf, 0);

		memcpy(ostate2, ostate, 32);
		sha256_transform(ostate2, obuf, 0);
		for (j = 0; j < 8; j++)
			output[8 * i + j] = swab32(ostate2[j]);
	}
}

static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
	const uint32_t *salt, uint32_t *output)
{
	uint32_t buf[16];
	int i;

	sha256_transform(tstate, salt, 1);
	sha256_transform(tstate, salt + 16, 1);
	sha256_transform(tstate, finalblk, 0);
	memcpy(buf, tstate, 32);
	memcpy(buf + 8, outerpad, 32);

	sha256_transform(ostate, buf, 0);
	for (i = 0; i < 8; i++)
		output[i] = swab32(ostate[i]);
}
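
/*
 * Added note (not in the original): PBKDF2_SHA256_80_128 above needs exactly
 * four HMAC blocks because the derived key is 128 bytes and each SHA-256
 * block contributes 32 bytes -- 128 / 32 = 4 -- which is why its loop runs
 * four times, writing block indices 1..4 into ibuf[4].
 */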
#if HAVE_SHA256_4WAY

static const uint32_t keypad_4way[4 * 12] = {
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000280, 0x00000280, 0x00000280, 0x00000280
};
static const uint32_t innerpad_4way[4 * 11] = {
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
};
static const uint32_t outerpad_4way[4 * 8] = {
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000300, 0x00000300, 0x00000300, 0x00000300
};
static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = {
	0x00000001, 0x00000001, 0x00000001, 0x00000001,
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000620, 0x00000620, 0x00000620, 0x00000620
};

static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t ihash[4 * 8] __attribute__((aligned(16)));
	uint32_t pad[4 * 16] __attribute__((aligned(16)));
	int i;

	/* tstate is assumed to contain the midstate of key */
	memcpy(pad, key + 4 * 16, 4 * 16);
	memcpy(pad + 4 * 4, keypad_4way, 4 * 48);
	sha256_transform_4way(tstate, pad, 0);
	memcpy(ihash, tstate, 4 * 32);

	sha256_init_4way(ostate);
	for (i = 0; i < 4 * 8; i++)
		pad[i] = ihash[i] ^ 0x5c5c5c5c;
	for (; i < 4 * 16; i++)
		pad[i] = 0x5c5c5c5c;
	sha256_transform_4way(ostate, pad, 0);

	sha256_init_4way(tstate);
	for (i = 0; i < 4 * 8; i++)
		pad[i] = ihash[i] ^ 0x36363636;
	for (; i < 4 * 16; i++)
		pad[i] = 0x36363636;
	sha256_transform_4way(tstate, pad, 0);
}

static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t istate[4 * 8] __attribute__((aligned(16)));
	uint32_t ostate2[4 * 8] __attribute__((aligned(16)));
	uint32_t ibuf[4 * 16] __attribute__((aligned(16)));
	uint32_t obuf[4 * 16] __attribute__((aligned(16)));
	int i, j;

	memcpy(istate, tstate, 4 * 32);
	sha256_transform_4way(istate, salt, 0);

	memcpy(ibuf, salt + 4 * 16, 4 * 16);
	memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
	memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32);

	for (i = 0; i < 4; i++) {
		memcpy(obuf, istate, 4 * 32);
		ibuf[4 * 4 + 0] = i + 1;
		ibuf[4 * 4 + 1] = i + 1;
		ibuf[4 * 4 + 2] = i + 1;
		ibuf[4 * 4 + 3] = i + 1;
		sha256_transform_4way(obuf, ibuf, 0);

		memcpy(ostate2, ostate, 4 * 32);
		sha256_transform_4way(ostate2, obuf, 0);
		for (j = 0; j < 4 * 8; j++)
			output[4 * 8 * i + j] = swab32(ostate2[j]);
	}
}

static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
	uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t buf[4 * 16] __attribute__((aligned(16)));
	int i;

	sha256_transform_4way(tstate, salt, 1);
	sha256_transform_4way(tstate, salt + 4 * 16, 1);
	sha256_transform_4way(tstate, finalblk_4way, 0);
	memcpy(buf, tstate, 4 * 32);
	memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);

	sha256_transform_4way(ostate, buf, 0);
	for (i = 0; i < 4 * 8; i++)
		output[i] = swab32(ostate[i]);
}

#endif /* HAVE_SHA256_4WAY */

#if HAVE_SHA256_8WAY

static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = {
	0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620
};

static inline void HMAC_SHA256_80_init_8way(const uint32_t *key,
	uint32_t *tstate, uint32_t *ostate)
{
	uint32_t ihash[8 * 8] __attribute__((aligned(32)));
	uint32_t pad[8 * 16] __attribute__((aligned(32)));
	int i;

	/* tstate is assumed to contain the midstate of key */
	memcpy(pad, key + 8 * 16, 8 * 16);
	for (i = 0; i < 8; i++)
		pad[8 * 4 + i] = 0x80000000;
	memset(pad + 8 * 5, 0x00, 8 * 40);
	for (i = 0; i < 8; i++)
		pad[8 * 15 + i] = 0x00000280;
	sha256_transform_8way(tstate, pad, 0);
	memcpy(ihash, tstate, 8 * 32);

	sha256_init_8way(ostate);
	for (i = 0; i < 8 * 8; i++)
		pad[i] = ihash[i] ^ 0x5c5c5c5c;
	for (; i < 8 * 16; i++)
		pad[i] = 0x5c5c5c5c;
	sha256_transform_8way(ostate, pad, 0);

	sha256_init_8way(tstate);
	for (i = 0; i < 8 * 8; i++)
		pad[i] = ihash[i] ^ 0x36363636;
	for (; i < 8 * 16; i++)
		pad[i] = 0x36363636;
	sha256_transform_8way(tstate, pad, 0);
}

static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t istate[8 * 8] __attribute__((aligned(32)));
	uint32_t ostate2[8 * 8] __attribute__((aligned(32)));
	uint32_t ibuf[8 * 16] __attribute__((aligned(32)));
	uint32_t obuf[8 * 16] __attribute__((aligned(32)));
	int i, j;

	memcpy(istate, tstate, 8 * 32);
	sha256_transform_8way(istate, salt, 0);

	memcpy(ibuf, salt + 8 * 16, 8 * 16);
	for (i = 0; i < 8; i++)
		ibuf[8 * 5 + i] = 0x80000000;
	memset(ibuf + 8 * 6, 0x00, 8 * 36);
	for (i = 0; i < 8; i++)
		ibuf[8 * 15 + i] = 0x000004a0;

	for (i = 0; i < 8; i++)
		obuf[8 * 8 + i] = 0x80000000;
	memset(obuf + 8 * 9, 0x00, 8 * 24);
	for (i = 0; i < 8; i++)
		obuf[8 * 15 + i] = 0x00000300;

	for (i = 0; i < 4; i++) {
		memcpy(obuf, istate, 8 * 32);
		ibuf[8 * 4 + 0] = i + 1;
		ibuf[8 * 4 + 1] = i + 1;
		ibuf[8 * 4 + 2] = i + 1;
		ibuf[8 * 4 + 3] = i + 1;
		ibuf[8 * 4 + 4] = i + 1;
		ibuf[8 * 4 + 5] = i + 1;
		ibuf[8 * 4 + 6] = i + 1;
		ibuf[8 * 4 + 7] = i + 1;
		sha256_transform_8way(obuf, ibuf, 0);

		memcpy(ostate2, ostate, 8 * 32);
		sha256_transform_8way(ostate2, obuf, 0);
		for (j = 0; j < 8 * 8; j++)
			output[8 * 8 * i + j] = swab32(ostate2[j]);
	}
}

static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
	uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t buf[8 * 16] __attribute__((aligned(32)));
	int i;

	sha256_transform_8way(tstate, salt, 1);
	sha256_transform_8way(tstate, salt + 8 * 16, 1);
	sha256_transform_8way(tstate, finalblk_8way, 0);

	memcpy(buf, tstate, 8 * 32);
	for (i = 0; i < 8; i++)
		buf[8 * 8 + i] = 0x80000000;
	memset(buf + 8 * 9, 0x00, 8 * 24);
	for (i = 0; i < 8; i++)
		buf[8 * 15 + i] = 0x00000300;
	sha256_transform_8way(ostate, buf, 0);

	for (i = 0; i < 8 * 8; i++)
		output[i] = swab32(ostate[i]);
}

#endif /* HAVE_SHA256_8WAY */

#if defined(__x86_64__)

#define SCRYPT_MAX_WAYS 1
#define HAVE_SCRYPT_3WAY 0
#define scrypt_best_throughput() 1
static void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V);
#if defined(USE_AVX2)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 21
#define HAVE_SCRYPT_6WAY 0
void scrypt_core_6way(uint32_t *X, uint32_t *V);
#endif

#elif defined(__i386__)

#define SCRYPT_MAX_WAYS 1
#define scrypt_best_throughput() 1
static void scrypt_core(uint32_t *X, uint32_t *V);

#elif defined(__arm__) && defined(__APCS_32__)

static void scrypt_core(uint32_t *X, uint32_t *V);
#if defined(__ARM_NEON__)
#undef HAVE_SHA256_4WAY
#define SCRYPT_MAX_WAYS 1
#define HAVE_SCRYPT_3WAY 0
#define scrypt_best_throughput() 1
void scrypt_core_3way(uint32_t *X, uint32_t *V);
#endif

#endif
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
	int i;

	x00 = (B[ 0] ^= Bx[ 0]);
	x01 = (B[ 1] ^= Bx[ 1]);
	x02 = (B[ 2] ^= Bx[ 2]);
	x03 = (B[ 3] ^= Bx[ 3]);
	x04 = (B[ 4] ^= Bx[ 4]);
	x05 = (B[ 5] ^= Bx[ 5]);
	x06 = (B[ 6] ^= Bx[ 6]);
	x07 = (B[ 7] ^= Bx[ 7]);
	x08 = (B[ 8] ^= Bx[ 8]);
	x09 = (B[ 9] ^= Bx[ 9]);
	x10 = (B[10] ^= Bx[10]);
	x11 = (B[11] ^= Bx[11]);
	x12 = (B[12] ^= Bx[12]);
	x13 = (B[13] ^= Bx[13]);
	x14 = (B[14] ^= Bx[14]);
	x15 = (B[15] ^= Bx[15]);
	for (i = 0; i < 8; i += 2) {
#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
		/* Operate on columns. */
		x04 ^= R(x00+x12, 7);	x09 ^= R(x05+x01, 7);
		x14 ^= R(x10+x06, 7);	x03 ^= R(x15+x11, 7);

		x08 ^= R(x04+x00, 9);	x13 ^= R(x09+x05, 9);
		x02 ^= R(x14+x10, 9);	x07 ^= R(x03+x15, 9);

		x12 ^= R(x08+x04,13);	x01 ^= R(x13+x09,13);
		x06 ^= R(x02+x14,13);	x11 ^= R(x07+x03,13);

		x00 ^= R(x12+x08,18);	x05 ^= R(x01+x13,18);
		x10 ^= R(x06+x02,18);	x15 ^= R(x11+x07,18);

		/* Operate on rows. */
		x01 ^= R(x00+x03, 7);	x06 ^= R(x05+x04, 7);
		x11 ^= R(x10+x09, 7);	x12 ^= R(x15+x14, 7);

		x02 ^= R(x01+x00, 9);	x07 ^= R(x06+x05, 9);
		x08 ^= R(x11+x10, 9);	x13 ^= R(x12+x15, 9);

		x03 ^= R(x02+x01,13);	x04 ^= R(x07+x06,13);
		x09 ^= R(x08+x11,13);	x14 ^= R(x13+x12,13);

		x00 ^= R(x03+x02,18);	x05 ^= R(x04+x07,18);
		x10 ^= R(x09+x08,18);	x15 ^= R(x14+x13,18);
#undef R
	}
	B[ 0] += x00;
	B[ 1] += x01;
	B[ 2] += x02;
	B[ 3] += x03;
	B[ 4] += x04;
	B[ 5] += x05;
	B[ 6] += x06;
	B[ 7] += x07;
	B[ 8] += x08;
	B[ 9] += x09;
	B[10] += x10;
	B[11] += x11;
	B[12] += x12;
	B[13] += x13;
	B[14] += x14;
	B[15] += x15;
}

static inline void scrypt_core(uint32_t *X, uint32_t *V)
{
	uint32_t i, j, k;

	for (i = 0; i < 1024; i++) {
		memcpy(&V[i * 32], X, 128);
		xor_salsa8(&X[0], &X[16]);
		xor_salsa8(&X[16], &X[0]);
	}
	for (i = 0; i < 1024; i++) {
		j = 32 * (X[16] & 1023);
		for (k = 0; k < 32; k++)
			X[k] ^= V[j + k];
		xor_salsa8(&X[0], &X[16]);
		xor_salsa8(&X[16], &X[0]);
	}
}
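
/*
 * Added note (not in the original): scrypt_core above fixes N = 1024, so the
 * scratchpad V holds 1024 entries of 32 uint32_t words = 1024 * 128 bytes =
 * 131072 bytes per way; the extra 63 bytes in SCRYPT_BUFFER_SIZE below allow
 * rounding V up to a 64-byte boundary.
 */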
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#define scrypt_best_throughput() 1
#endif

#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)

unsigned char *scrypt_buffer_alloc()
{
	return (unsigned char *)malloc(SCRYPT_BUFFER_SIZE);
}

static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
	uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[8], ostate[8];
	uint32_t X[32];
	uint32_t *V;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));

	memcpy(tstate, midstate, 32);
	HMAC_SHA256_80_init(input, tstate, ostate);
	PBKDF2_SHA256_80_128(tstate, ostate, input, X);

	scrypt_core(X, V);

	PBKDF2_SHA256_128_32(tstate, ostate, X, output);
}

#if HAVE_SHA256_4WAY
static void scrypt_1024_1_1_256_4way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[4 * 8] __attribute__((aligned(128)));
	uint32_t ostate[4 * 8] __attribute__((aligned(128)));
	uint32_t W[4 * 32] __attribute__((aligned(128)));
	uint32_t X[4 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int i, k;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));

	for (i = 0; i < 20; i++)
		for (k = 0; k < 4; k++)
			W[4 * i + k] = input[k * 20 + i];
	for (i = 0; i < 8; i++)
		for (k = 0; k < 4; k++)
			tstate[4 * i + k] = midstate[i];
	HMAC_SHA256_80_init_4way(W, tstate, ostate);
	PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
	for (i = 0; i < 32; i++)
		for (k = 0; k < 4; k++)
			X[k * 32 + i] = W[4 * i + k];
	scrypt_core(X + 0 * 32, V);
	scrypt_core(X + 1 * 32, V);
	scrypt_core(X + 2 * 32, V);
	scrypt_core(X + 3 * 32, V);
	for (i = 0; i < 32; i++)
		for (k = 0; k < 4; k++)
			W[4 * i + k] = X[k * 32 + i];
	PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
	for (i = 0; i < 8; i++)
		for (k = 0; k < 4; k++)
			output[k * 8 + i] = W[4 * i + k];
}
#endif /* HAVE_SHA256_4WAY */

#if HAVE_SCRYPT_3WAY

static void scrypt_1024_1_1_256_3way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[3 * 8], ostate[3 * 8];
	uint32_t X[3 * 32] __attribute__((aligned(64)));
	uint32_t *V;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));

	memcpy(tstate +  0, midstate, 32);
	memcpy(tstate +  8, midstate, 32);
	memcpy(tstate + 16, midstate, 32);
	HMAC_SHA256_80_init(input +  0, tstate +  0, ostate +  0);
	HMAC_SHA256_80_init(input + 20, tstate +  8, ostate +  8);
	HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16);
	PBKDF2_SHA256_80_128(tstate +  0, ostate +  0, input +  0, X +  0);
	PBKDF2_SHA256_80_128(tstate +  8, ostate +  8, input + 20, X + 32);
	PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64);

	scrypt_core_3way(X, V);

	PBKDF2_SHA256_128_32(tstate +  0, ostate +  0, X +  0, output +  0);
	PBKDF2_SHA256_128_32(tstate +  8, ostate +  8, X + 32, output +  8);
	PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16);
}

#if HAVE_SHA256_4WAY
static void scrypt_1024_1_1_256_12way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[12 * 8] __attribute__((aligned(128)));
	uint32_t ostate[12 * 8] __attribute__((aligned(128)));
	uint32_t W[12 * 32] __attribute__((aligned(128)));
	uint32_t X[12 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int i, j, k;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));

	for (j = 0; j < 3; j++)
		for (i = 0; i < 20; i++)
			for (k = 0; k < 4; k++)
				W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i];
	for (j = 0; j < 3; j++)
		for (i = 0; i < 8; i++)
			for (k = 0; k < 4; k++)
				tstate[32 * j + 4 * i + k] = midstate[i];
	HMAC_SHA256_80_init_4way(W +   0, tstate +  0, ostate +  0);
	HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32);
	HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64);
	PBKDF2_SHA256_80_128_4way(tstate +  0, ostate +  0, W +   0, W +   0);
	PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128);
	PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 32; i++)
			for (k = 0; k < 4; k++)
				X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
	scrypt_core_3way(X + 0 * 96, V);
	scrypt_core_3way(X + 1 * 96, V);
	scrypt_core_3way(X + 2 * 96, V);
	scrypt_core_3way(X + 3 * 96, V);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 32; i++)
			for (k = 0; k < 4; k++)
				W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i];
	PBKDF2_SHA256_128_32_4way(tstate +  0, ostate +  0, W +   0, W +   0);
	PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128);
	PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 8; i++)
			for (k = 0; k < 4; k++)
				output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k];
}
#endif /* HAVE_SHA256_4WAY */

#endif /* HAVE_SCRYPT_3WAY */

#if HAVE_SCRYPT_6WAY
static void scrypt_1024_1_1_256_24way(const uint32_t *input,
	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
	uint32_t tstate[24 * 8] __attribute__((aligned(128)));
	uint32_t ostate[24 * 8] __attribute__((aligned(128)));
	uint32_t W[24 * 32] __attribute__((aligned(128)));
	uint32_t X[24 * 32] __attribute__((aligned(128)));
	uint32_t *V;
	int i, j, k;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~(uintptr_t)(63));

	for (j = 0; j < 3; j++)
		for (i = 0; i < 20; i++)
			for (k = 0; k < 8; k++)
				W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i];
	for (j = 0; j < 3; j++)
		for (i = 0; i < 8; i++)
			for (k = 0; k < 8; k++)
				tstate[8 * 8 * j + 8 * i + k] = midstate[i];
	HMAC_SHA256_80_init_8way(W +   0, tstate +   0, ostate +   0);
	HMAC_SHA256_80_init_8way(W + 256, tstate +  64, ostate +  64);
	HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128);
	PBKDF2_SHA256_80_128_8way(tstate +   0, ostate +   0, W +   0, W +   0);
	PBKDF2_SHA256_80_128_8way(tstate +  64, ostate +  64, W + 256, W + 256);
	PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 32; i++)
			for (k = 0; k < 8; k++)
				X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
	scrypt_core_6way(X +  0 * 32, V);
	scrypt_core_6way(X +  6 * 32, V);
	scrypt_core_6way(X + 12 * 32, V);
	scrypt_core_6way(X + 18 * 32, V);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 32; i++)
			for (k = 0; k < 8; k++)
				W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i];
	PBKDF2_SHA256_128_32_8way(tstate +   0, ostate +   0, W +   0, W +   0);
	PBKDF2_SHA256_128_32_8way(tstate +  64, ostate +  64, W + 256, W + 256);
	PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512);
	for (j = 0; j < 3; j++)
		for (i = 0; i < 8; i++)
			for (k = 0; k < 8; k++)
				output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k];
}
#endif /* HAVE_SCRYPT_6WAY */

int scanhash_scrypt(int thr_id, uint32_t *pdata,
	unsigned char *scratchbuf, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
	uint32_t midstate[8];
	uint32_t n = pdata[19] - 1;
	const uint32_t Htarg = ptarget[7];
	uint32_t throughput = scrypt_best_throughput();
	uint32_t i;

#if HAVE_SHA256_4WAY
	if (sha256_use_4way())
		throughput *= 4;
#endif

	for (i = 0; i < throughput; i++)
		memcpy(data + i * 20, pdata, 80);

	sha256_init(midstate);
	sha256_transform(midstate, data, 0);

	do {
		for (i = 0; i < throughput; i++)
			data[i * 20 + 19] = ++n;

#if HAVE_SHA256_4WAY
		if (throughput == 4)
			scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf);
		else
#endif
#if HAVE_SCRYPT_3WAY && HAVE_SHA256_4WAY
		if (throughput == 12)
			scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
		else
#endif
#if HAVE_SCRYPT_6WAY
		if (throughput == 24)
			scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
		else
#endif
#if HAVE_SCRYPT_3WAY
		if (throughput == 3)
			scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
		else
#endif
		scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);

		for (i = 0; i < throughput; i++) {
			if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {
				*hashes_done = n - pdata[19] + 1;
				pdata[19] = data[i * 20 + 19];
				return 1;
			}
		}
	} while (n < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = n - pdata[19] + 1;
	pdata[19] = n;
	return 0;
}
@@ -0,0 +1,454 @@

//
// =============== BLAKE part on nVidia GPU ======================
//
// This is the generic "default" implementation when no architecture
// specific implementation is available in the kernel.
//
// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
//
// TODO: CUDA porting work remains to be done.
//
#include <map>
#include <stdint.h>

#include "cuda_runtime.h"
#include "salsa_kernel.h"
#include "miner.h"

typedef uint32_t sph_u32;
#define SPH_C32(x) ((sph_u32)(x))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))

__constant__ uint64_t ptarget64[4];
__constant__ uint32_t pdata[20];

// define some error checking macros
#undef checkCudaErrors

#if WIN32
#define DELIMITER '\\'
#else
#define DELIMITER '/'
#endif
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )

#define checkCudaErrors(x) \
{ \
	cudaGetLastError(); \
	x; \
	cudaError_t err = cudaGetLastError(); \
	if (err != cudaSuccess) \
		applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \
}
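
/*
 * Added usage note (not in the original): the macro clears any sticky error
 * left by earlier calls, runs the wrapped statement, then reports the first
 * error it raised, e.g.
 *
 *   checkCudaErrors(cudaMemcpyToSymbol(pdata, h_pdata, 80));
 *
 * where h_pdata is an illustrative host-side buffer holding the 80-byte
 * block header; a variable named thr_id must be in scope at the call site.
 */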

// from salsa_kernel.cu
extern std::map<int, uint32_t *> context_idata[2];
extern std::map<int, uint32_t *> context_odata[2];
extern std::map<int, cudaStream_t> context_streams[2];
extern std::map<int, uint32_t *> context_hash[2];

#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif

static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x)
{
	return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
	      | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
}

/**
 * Encode a 32-bit value into the provided buffer (big endian convention).
 *
 * @param dst   the destination buffer
 * @param val   the 32-bit value to encode
 */
static __device__ void
cuda_sph_enc32be(void *dst, sph_u32 val)
{
	*(sph_u32 *)dst = cuda_sph_bswap32(val);
}
#define Z00   0
#define Z01   1
#define Z02   2
#define Z03   3
#define Z04   4
#define Z05   5
#define Z06   6
#define Z07   7
#define Z08   8
#define Z09   9
#define Z0A   A
#define Z0B   B
#define Z0C   C
#define Z0D   D
#define Z0E   E
#define Z0F   F

#define Z10   E
#define Z11   A
#define Z12   4
#define Z13   8
#define Z14   9
#define Z15   F
#define Z16   D
#define Z17   6
#define Z18   1
#define Z19   C
#define Z1A   0
#define Z1B   2
#define Z1C   B
#define Z1D   7
#define Z1E   5
#define Z1F   3

#define Z20   B
#define Z21   8
#define Z22   C
#define Z23   0
#define Z24   5
#define Z25   2
#define Z26   F
#define Z27   D
#define Z28   A
#define Z29   E
#define Z2A   3
#define Z2B   6
#define Z2C   7
#define Z2D   1
#define Z2E   9
#define Z2F   4

#define Z30   7
#define Z31   9
#define Z32   3
#define Z33   1
#define Z34   D
#define Z35   C
#define Z36   B
#define Z37   E
#define Z38   2
#define Z39   6
#define Z3A   5
#define Z3B   A
#define Z3C   4
#define Z3D   0
#define Z3E   F
#define Z3F   8

#define Z40   9
#define Z41   0
#define Z42   5
#define Z43   7
#define Z44   2
#define Z45   4
#define Z46   A
#define Z47   F
#define Z48   E
#define Z49   1
#define Z4A   B
#define Z4B   C
#define Z4C   6
#define Z4D   8
#define Z4E   3
#define Z4F   D

#define Z50   2
#define Z51   C
#define Z52   6
#define Z53   A
#define Z54   0
#define Z55   B
#define Z56   8
#define Z57   3
#define Z58   4
#define Z59   D
#define Z5A   7
#define Z5B   5
#define Z5C   F
#define Z5D   E
#define Z5E   1
#define Z5F   9

#define Z60   C
#define Z61   5
#define Z62   1
#define Z63   F
#define Z64   E
#define Z65   D
#define Z66   4
#define Z67   A
#define Z68   0
#define Z69   7
#define Z6A   6
#define Z6B   3
#define Z6C   9
#define Z6D   2
#define Z6E   8
#define Z6F   B

#define Z70   D
#define Z71   B
#define Z72   7
#define Z73   E
#define Z74   C
#define Z75   1
#define Z76   3
#define Z77   9
#define Z78   5
#define Z79   0
#define Z7A   F
#define Z7B   4
#define Z7C   8
#define Z7D   6
#define Z7E   2
#define Z7F   A

#define Z80   6
#define Z81   F
#define Z82   E
#define Z83   9
#define Z84   B
#define Z85   3
#define Z86   0
#define Z87   8
#define Z88   C
#define Z89   2
#define Z8A   D
#define Z8B   7
#define Z8C   1
#define Z8D   4
#define Z8E   A
#define Z8F   5

#define Z90   A
#define Z91   2
#define Z92   8
#define Z93   4
#define Z94   7
#define Z95   6
#define Z96   1
#define Z97   5
#define Z98   F
#define Z99   B
#define Z9A   9
#define Z9B   E
#define Z9C   3
#define Z9D   C
#define Z9E   D
#define Z9F   0
#define Mx(r, i) Mx_(Z ## r ## i) |
||||
#define Mx_(n) Mx__(n) |
||||
#define Mx__(n) M ## n |
||||
|
||||
#define CSx(r, i) CSx_(Z ## r ## i) |
||||
#define CSx_(n) CSx__(n) |
||||
#define CSx__(n) CS ## n |
||||
|
||||
#define CS0 SPH_C32(0x243F6A88) |
||||
#define CS1 SPH_C32(0x85A308D3) |
||||
#define CS2 SPH_C32(0x13198A2E) |
||||
#define CS3 SPH_C32(0x03707344) |
||||
#define CS4 SPH_C32(0xA4093822) |
||||
#define CS5 SPH_C32(0x299F31D0) |
||||
#define CS6 SPH_C32(0x082EFA98) |
||||
#define CS7 SPH_C32(0xEC4E6C89) |
||||
#define CS8 SPH_C32(0x452821E6) |
||||
#define CS9 SPH_C32(0x38D01377) |
||||
#define CSA SPH_C32(0xBE5466CF) |
||||
#define CSB SPH_C32(0x34E90C6C) |
||||
#define CSC SPH_C32(0xC0AC29B7) |
||||
#define CSD SPH_C32(0xC97C50DD) |
||||
#define CSE SPH_C32(0x3F84D5B5) |
||||
#define CSF SPH_C32(0xB5470917) |
||||
|
||||
#define GS(m0, m1, c0, c1, a, b, c, d) do { \ |
||||
a = SPH_T32(a + b + (m0 ^ c1)); \ |
||||
d = SPH_ROTR32(d ^ a, 16); \ |
||||
c = SPH_T32(c + d); \ |
||||
b = SPH_ROTR32(b ^ c, 12); \ |
||||
a = SPH_T32(a + b + (m1 ^ c0)); \ |
||||
d = SPH_ROTR32(d ^ a, 8); \ |
||||
c = SPH_T32(c + d); \ |
||||
b = SPH_ROTR32(b ^ c, 7); \ |
||||
} while (0) |
||||
|
||||
#define ROUND_S(r) do { \ |
||||
GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ |
||||
GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ |
||||
GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ |
||||
GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ |
||||
GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ |
||||
GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ |
||||
GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ |
||||
GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ |
||||
} while (0) |
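/*
 * Illustrative note (not part of the original file): for round 0 the Zxx
 * permutation above is the identity, so the first call in ROUND_S(0)
 * expands to
 *   GS(M0, M1, CS0, CS1, V0, V4, V8, VC);
 * i.e. message words 0 and 1 plus constants CS0/CS1 are mixed into the
 * first column (V0, V4, V8, VC) of the 4x4 BLAKE-256 state.
 */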
#define COMPRESS32 do { \
	sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
	sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
	sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
	sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
	V0 = H0; \
	V1 = H1; \
	V2 = H2; \
	V3 = H3; \
	V4 = H4; \
	V5 = H5; \
	V6 = H6; \
	V7 = H7; \
	V8 = S0 ^ CS0; \
	V9 = S1 ^ CS1; \
	VA = S2 ^ CS2; \
	VB = S3 ^ CS3; \
	VC = T0 ^ CS4; \
	VD = T0 ^ CS5; \
	VE = T1 ^ CS6; \
	VF = T1 ^ CS7; \
	M0 = input[0]; \
	M1 = input[1]; \
	M2 = input[2]; \
	M3 = input[3]; \
	M4 = input[4]; \
	M5 = input[5]; \
	M6 = input[6]; \
	M7 = input[7]; \
	M8 = input[8]; \
	M9 = input[9]; \
	MA = input[10]; \
	MB = input[11]; \
	MC = input[12]; \
	MD = input[13]; \
	ME = input[14]; \
	MF = input[15]; \
	ROUND_S(0); \
	ROUND_S(1); \
	ROUND_S(2); \
	ROUND_S(3); \
	ROUND_S(4); \
	ROUND_S(5); \
	ROUND_S(6); \
	ROUND_S(7); \
	H0 ^= S0 ^ V0 ^ V8; \
	H1 ^= S1 ^ V1 ^ V9; \
	H2 ^= S2 ^ V2 ^ VA; \
	H3 ^= S3 ^ V3 ^ VB; \
	H4 ^= S0 ^ V4 ^ VC; \
	H5 ^= S1 ^ V5 ^ VD; \
	H6 ^= S2 ^ V6 ^ VE; \
	H7 ^= S3 ^ V7 ^ VF; \
} while (0)

__global__ void cuda_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate )
{
	uint32_t input[16];
	uint64_t output[4];

#pragma unroll 16
	for (int i=0; i < 16; ++i) input[i] = pdata[i];

	sph_u32 H0 = 0x6A09E667;
	sph_u32 H1 = 0xBB67AE85;
	sph_u32 H2 = 0x3C6EF372;
	sph_u32 H3 = 0xA54FF53A;
	sph_u32 H4 = 0x510E527F;
	sph_u32 H5 = 0x9B05688C;
	sph_u32 H6 = 0x1F83D9AB;
	sph_u32 H7 = 0x5BE0CD19;
	sph_u32 S0 = 0;
	sph_u32 S1 = 0;
	sph_u32 S2 = 0;
	sph_u32 S3 = 0;
	sph_u32 T0 = 0;
	sph_u32 T1 = 0;
	T0 = SPH_T32(T0 + 512);
	COMPRESS32;

#pragma unroll 3
	for (int i=0; i < 3; ++i) input[i] = pdata[16+i];
	input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
	input[4] = 0x80000000;
#pragma unroll 8
	for (int i=5; i < 13; ++i) input[i] = 0;
	input[13] = 0x00000001;
	input[14] = T1;
	input[15] = T0 + 128;

	T0 = SPH_T32(T0 + 128);
	COMPRESS32;

	cuda_sph_enc32be((unsigned char*)output + 4*6, H6);
	cuda_sph_enc32be((unsigned char*)output + 4*7, H7);
	if (validate || output[3] <= ptarget64[3])
	{
		// this data is only needed when we actually need to save the hashes
		cuda_sph_enc32be((unsigned char*)output + 4*0, H0);
		cuda_sph_enc32be((unsigned char*)output + 4*1, H1);
		cuda_sph_enc32be((unsigned char*)output + 4*2, H2);
		cuda_sph_enc32be((unsigned char*)output + 4*3, H3);
		cuda_sph_enc32be((unsigned char*)output + 4*4, H4);
		cuda_sph_enc32be((unsigned char*)output + 4*5, H5);
	}

	if (validate)
	{
		g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
#pragma unroll 4
		for (int i=0; i < 4; ++i) g_out[i] = output[i];
	}

	if (output[3] <= ptarget64[3]) {
		uint64_t *g_good64 = (uint64_t*)g_good;
		if (output[3] < g_good64[3]) {
			g_good64[3] = output[3];
			g_good64[2] = output[2];
			g_good64[1] = output[1];
			g_good64[0] = output[0];
			g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
		}
	}
}

static bool init[MAX_GPUS] = { 0 };
static std::map<int, uint32_t *> context_good[2];

bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
{
	if (!init[thr_id])
	{
		// allocate device memory for the "good" hash / winning nonce buffers (cudaMalloc, not pinned host memory)
		uint32_t *tmp;
		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;

		init[thr_id] = true;
	}
	checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 80, 0, cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 32, 0, cudaMemcpyHostToDevice));

	return context_good[0][thr_id] && context_good[1][thr_id];
}

void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
{
	checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));

	cuda_blake256_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);

	// copy hashes from device memory to host (ALL hashes, lots of data...)
	if (do_d2h && hash != NULL) {
		size_t mem_size = throughput * sizeof(uint32_t) * 8;
		checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
		                cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
	}
	else if (hash != NULL) {
		// asynchronous copy of winning nonce (just 4 bytes...)
		checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
		                cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
	}
}
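/*
 * Illustrative sketch (not part of the original file): a minimal host-side
 * sequence driving the two helpers above. The launch geometry, throughput
 * value and the final synchronisation are assumptions for illustration.
 */
#if 0
static void example_blake256_scan(int thr_id, const uint32_t host_pdata[20],
                                  const uint32_t host_ptarget[8], uint32_t start_nonce)
{
	uint32_t winning_nonce = 0xffffffff;
	dim3 grid(1024), threads(256);                  // assumed geometry
	default_prepare_blake256(thr_id, host_pdata, host_ptarget);
	// stream 0, do_d2h=false: only the winning nonce is copied back
	default_do_blake256(grid, threads, thr_id, 0, &winning_nonce,
	                    start_nonce, 1024 * 256, false);
	cudaStreamSynchronize(context_streams[0][thr_id]);
}
#endif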
@@ -0,0 +1,28 @@

/*
	pick the best algo at runtime or compile time?
	----------------------------------------------
	SCRYPT_CHOOSE_COMPILETIME (gcc only!)
	SCRYPT_CHOOSE_RUNTIME
*/
#define SCRYPT_CHOOSE_RUNTIME


/*
	hash function to use
	-------------------------------
	SCRYPT_BLAKE256
	SCRYPT_BLAKE512
	SCRYPT_SHA256
	SCRYPT_SHA512
	SCRYPT_SKEIN512
*/
//#define SCRYPT_SHA256


/*
	block mixer to use
	-----------------------------
	SCRYPT_CHACHA
	SCRYPT_SALSA
*/
//#define SCRYPT_SALSA
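/*
 * Illustrative sketch (not part of the original file): a compile-time
 * configuration for a BLAKE-256 + ChaCha20/8 scrypt-jane coin would use the
 * options listed above like this (gcc only, per the note on
 * SCRYPT_CHOOSE_COMPILETIME):
 *
 *   #define SCRYPT_CHOOSE_COMPILETIME
 *   #define SCRYPT_BLAKE256
 *   #define SCRYPT_CHACHA
 */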
@@ -0,0 +1,58 @@

#define SCRYPT_MIX_BASE "ChaCha20/8"

typedef uint32_t scrypt_mix_word_t;

#define SCRYPT_WORDTO8_LE U32TO8_LE
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP

#define SCRYPT_BLOCK_BYTES 64
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))

/* must have these here in case block bytes is ever != 64 */
#include "scrypt-jane-romix-basic.h"

#include "scrypt-jane-mix_chacha.h"

/* cpu agnostic */
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
#define SCRYPT_MIX_FN chacha_core_basic
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
#include "scrypt-jane-romix-template.h"

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
static scrypt_ROMixfn
scrypt_getROMix() {
	size_t cpuflags = detect_cpu();
	(void)cpuflags; /* SIMD detection is stripped in this port; the basic implementation is always used */

	return scrypt_ROMix_basic;
}
#endif


#if defined(SCRYPT_TEST_SPEED)
static size_t
available_implementations() {
	size_t cpuflags = detect_cpu();
	size_t flags = 0;
	(void)cpuflags;

	return flags;
}
#endif

static int
scrypt_test_mix() {
	static const uint8_t expected[16] = {
		0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a,
	};

	int ret = 1;
	size_t cpuflags = detect_cpu();
	(void)cpuflags;

#if defined(SCRYPT_CHACHA_BASIC)
	ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
#endif

	return ret;
}
@@ -0,0 +1,69 @@

#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)

#undef SCRYPT_MIX
#define SCRYPT_MIX "ChaCha20/8 Ref"

#undef SCRYPT_CHACHA_INCLUDED
#define SCRYPT_CHACHA_INCLUDED
#define SCRYPT_CHACHA_BASIC

static void
chacha_core_basic(uint32_t state[16]) {
	size_t rounds = 8;
	uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t;

	x0 = state[0];
	x1 = state[1];
	x2 = state[2];
	x3 = state[3];
	x4 = state[4];
	x5 = state[5];
	x6 = state[6];
	x7 = state[7];
	x8 = state[8];
	x9 = state[9];
	x10 = state[10];
	x11 = state[11];
	x12 = state[12];
	x13 = state[13];
	x14 = state[14];
	x15 = state[15];

	#define quarter(a,b,c,d) \
		a += b; t = d^a; d = ROTL32(t,16); \
		c += d; t = b^c; b = ROTL32(t,12); \
		a += b; t = d^a; d = ROTL32(t, 8); \
		c += d; t = b^c; b = ROTL32(t, 7);

	for (; rounds; rounds -= 2) {
		quarter( x0, x4, x8,x12)
		quarter( x1, x5, x9,x13)
		quarter( x2, x6,x10,x14)
		quarter( x3, x7,x11,x15)
		quarter( x0, x5,x10,x15)
		quarter( x1, x6,x11,x12)
		quarter( x2, x7, x8,x13)
		quarter( x3, x4, x9,x14)
	}

	state[0] += x0;
	state[1] += x1;
	state[2] += x2;
	state[3] += x3;
	state[4] += x4;
	state[5] += x5;
	state[6] += x6;
	state[7] += x7;
	state[8] += x8;
	state[9] += x9;
	state[10] += x10;
	state[11] += x11;
	state[12] += x12;
	state[13] += x13;
	state[14] += x14;
	state[15] += x15;

	#undef quarter
}

#endif
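/*
 * Illustrative sketch (not part of the original file): chacha_core_basic
 * permutes a 64-byte block in place; in this port the caller (ChunkMix in
 * scrypt-jane-romix-template.h) feeds it the XORed working block rather
 * than the usual ChaCha key/counter/nonce layout.
 */
#if 0
static void example_chacha_core(void) {
	uint32_t block[16] = { 0 };   /* arbitrary 64-byte input */
	chacha_core_basic(block);     /* block now holds input + 8-round core output */
}
#endif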
@@ -0,0 +1,32 @@


typedef enum cpu_flags_x86_t {
	cpu_flags_basic = 0 /* placeholder enumerator: feature detection is stripped in this port, and an empty enum is not valid C */
} cpu_flags_x86;

typedef enum cpu_vendors_x86_t {
	cpu_nobody,
	cpu_intel,
	cpu_amd
} cpu_vendors_x86;

typedef struct x86_regs_t {
	uint32_t eax, ebx, ecx, edx;
} x86_regs;


#if defined(SCRYPT_TEST_SPEED)
size_t cpu_detect_mask = (size_t)-1;
#endif

static size_t
detect_cpu(void) {
	size_t cpu_flags = 0;
	return cpu_flags;
}

#if defined(SCRYPT_TEST_SPEED)
static const char *
get_top_cpuflag_desc(size_t flag) {
	return "Basic";
}
#endif

#define asm_calling_convention
@@ -0,0 +1,284 @@

/* determine os */
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
	#include <windows.h>
	#include <wincrypt.h>
	#define OS_WINDOWS
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
	#include <sys/mman.h>
	#include <sys/time.h>
	#include <fcntl.h>

	#define OS_SOLARIS
#else
	#include <sys/mman.h>
	#include <sys/time.h>
	#include <sys/param.h> /* need this to define BSD */
	#include <unistd.h>
	#include <fcntl.h>

	#define OS_NIX
	#if defined(__linux__)
		#include <endian.h>
		#define OS_LINUX
	#elif defined(BSD)
		#define OS_BSD

		#if defined(MACOS_X) || (defined(__APPLE__) && defined(__MACH__))
			#define OS_OSX
		#elif defined(macintosh) || defined(Macintosh)
			#define OS_MAC
		#elif defined(__OpenBSD__)
			#define OS_OPENBSD
		#endif
	#endif
#endif


/* determine compiler */
#if defined(_MSC_VER)
	#define COMPILER_MSVC _MSC_VER
	#if ((COMPILER_MSVC > 1200) || defined(_mm_free))
		#define COMPILER_MSVC6PP_AND_LATER
	#endif
	#if (COMPILER_MSVC >= 1500)
		#define COMPILER_HAS_TMMINTRIN
	#endif

	#pragma warning(disable : 4127) /* conditional expression is constant */
	#pragma warning(disable : 4100) /* unreferenced formal parameter */

	#ifndef _CRT_SECURE_NO_WARNINGS
		#define _CRT_SECURE_NO_WARNINGS
	#endif

	#include <float.h>
	#include <stdlib.h> /* _rotl */
	#include <intrin.h>

	typedef unsigned char uint8_t;
	typedef unsigned short uint16_t;
	typedef unsigned int uint32_t;
	typedef signed int int32_t;
	typedef unsigned __int64 uint64_t;
	typedef signed __int64 int64_t;

	#define ROTL32(a,b) _rotl(a,b)
	#define ROTR32(a,b) _rotr(a,b)
	#define ROTL64(a,b) _rotl64(a,b)
	#define ROTR64(a,b) _rotr64(a,b)
	#undef NOINLINE
	#define NOINLINE __declspec(noinline)
	#undef INLINE
	#define INLINE __forceinline
	#undef FASTCALL
	#define FASTCALL __fastcall
	#undef CDECL
	#define CDECL __cdecl
	#undef STDCALL
	#define STDCALL __stdcall
	#undef NAKED
	#define NAKED __declspec(naked)
	#define MM16 __declspec(align(16))
#endif
#if defined(__ICC)
	#define COMPILER_INTEL
#endif
#if defined(__GNUC__)
	#if (__GNUC__ >= 3)
		#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
	#else
		#define COMPILER_GCC_PATCHLEVEL 0
	#endif
	#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
	/* parenthesize the shift amount so expressions like ROTL32(x, n+1) expand correctly */
	#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
	#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
	#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - (b))))
	#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - (b))))
	#undef NOINLINE
	#if (COMPILER_GCC >= 30000)
		#define NOINLINE __attribute__((noinline))
	#else
		#define NOINLINE
	#endif
	#undef INLINE
	#if (COMPILER_GCC >= 30000)
		#define INLINE __attribute__((always_inline))
	#else
		#define INLINE inline
	#endif
	#undef FASTCALL
	#if (COMPILER_GCC >= 30400)
		#define FASTCALL __attribute__((fastcall))
	#else
		#define FASTCALL
	#endif
	#undef CDECL
	#define CDECL __attribute__((cdecl))
	#undef STDCALL
	#define STDCALL __attribute__((stdcall))
	#define MM16 __attribute__((aligned(16)))
	#include <stdint.h>
#endif
#if defined(__MINGW32__) || defined(__MINGW64__)
	#define COMPILER_MINGW
#endif
#if defined(__PATHCC__)
	#define COMPILER_PATHCC
#endif

#define OPTIONAL_INLINE
#if defined(OPTIONAL_INLINE)
	#undef OPTIONAL_INLINE
	#define OPTIONAL_INLINE INLINE
#else
	#define OPTIONAL_INLINE
#endif

#define CRYPTO_FN NOINLINE STDCALL

/* determine cpu */
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
	#define CPU_X86_64
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
	#define CPU_X86 500
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
	#define CPU_X86 400
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
	#define CPU_X86 300
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
	#define CPU_IA64
#endif

#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
	#define CPU_SPARC
	#if defined(__sparcv9)
		#define CPU_SPARC64
	#endif
#endif

#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
	#define CPU_64BITS
	#undef FASTCALL
	#define FASTCALL
	#undef CDECL
	#define CDECL
	#undef STDCALL
	#define STDCALL
#endif

#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
	#define CPU_PPC
	#if defined(_ARCH_PWR7)
		#define CPU_POWER7
	#elif defined(__64BIT__)
		#define CPU_PPC64
	#else
		#define CPU_PPC32
	#endif
#endif

#if defined(__hppa__) || defined(__hppa)
	#define CPU_HPPA
#endif

#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
	#define CPU_ALPHA
#endif

/* endian */

#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
	(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
	(defined(CPU_X86) || defined(CPU_X86_64)) || \
	(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
#define CPU_LE
#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
	(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
	(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
#define CPU_BE
#else
	/* unknown endian! */
#endif


#define U8TO32_BE(p) \
	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
	 ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))

#define U8TO32_LE(p) \
	(((uint32_t)((p)[0])      ) | ((uint32_t)((p)[1]) <<  8) | \
	 ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))

#define U32TO8_BE(p, v) \
	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );

#define U32TO8_LE(p, v) \
	(p)[0] = (uint8_t)((v)      ); (p)[1] = (uint8_t)((v) >>  8); \
	(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);

#define U8TO64_BE(p) \
	(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))

#define U8TO64_LE(p) \
	(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))

#define U64TO8_BE(p, v) \
	U32TO8_BE((p), (uint32_t)((v) >> 32)); \
	U32TO8_BE((p) + 4, (uint32_t)((v)      ));

#define U64TO8_LE(p, v) \
	U32TO8_LE((p), (uint32_t)((v)      )); \
	U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));

#define U32_SWAP(v) { \
	(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \
	(v) = ((v) << 16) | ((v) >> 16); \
}

#define U64_SWAP(v) { \
	(v) = (((v) <<  8) & 0xFF00FF00FF00FF00ull ) | (((v) >>  8) & 0x00FF00FF00FF00FFull ); \
	(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \
	(v) = ((v) << 32) | ((v) >> 32); \
}

/* constant-time comparison: differentbits stays 0 only if every byte matches,
   so the expression below yields 1 when x == y and 0 otherwise */
static int
scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
	uint32_t differentbits = 0;
	while (len--)
		differentbits |= (*x++ ^ *y++);
	return (1 & ((differentbits - 1) >> 8));
}
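/*
 * Illustrative sketch (not part of the original file): scrypt_verify runs in
 * time dependent only on len, never on where the first mismatch occurs.
 */
#if 0
static void example_verify(void) {
	const uint8_t a[4] = { 1, 2, 3, 4 };
	const uint8_t b[4] = { 1, 2, 3, 5 };
	int same = scrypt_verify(a, b, 4);  /* 0: buffers differ (in the last byte) */
	(void)same;
}
#endif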
void
scrypt_ensure_zero(void *p, size_t len) {
#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
	__stosb((unsigned char *)p, 0, len);
#elif (defined(CPU_X86) && defined(COMPILER_GCC))
	__asm__ __volatile__(
		"pushl %%edi;\n"
		"pushl %%ecx;\n"
		"rep stosb;\n"
		"popl %%ecx;\n"
		"popl %%edi;\n"
		:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
	);
#elif (defined(CPU_X86_64) && defined(COMPILER_GCC))
	__asm__ __volatile__(
		"pushq %%rdi;\n"
		"pushq %%rcx;\n"
		"rep stosb;\n"
		"popq %%rcx;\n"
		"popq %%rdi;\n"
		:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
	);
#else
	/* volatile writes keep the compiler from optimizing the wipe away */
	volatile uint8_t *b = (volatile uint8_t *)p;
	size_t i;
	for (i = 0; i < len; i++)
		b[i] = 0;
#endif
}

#include "scrypt-jane-portable-x86.h"
@@ -0,0 +1,67 @@

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
/* function type returned by scrypt_getROMix, used with cpu detection */
typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
#endif

/* romix pre/post nop function */
static void asm_calling_convention
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
}

/* romix pre/post endian conversion function */
static void asm_calling_convention
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
#if !defined(CPU_LE)
	static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
	size_t i;
	if (endian_test.w == 0x100) {
		nblocks *= SCRYPT_BLOCK_WORDS;
		for (i = 0; i < nblocks; i++) {
			SCRYPT_WORD_ENDIAN_SWAP(blocks[i]);
		}
	}
#endif
}

/* chunkmix test function */
typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);

static int
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
	/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
	const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
	scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
	uint8_t final[16];
	size_t i;

	for (i = 0; i < words; i++) {
		v = (scrypt_mix_word_t)i;
		v = (v << 8) | v;
		v = (v << 16) | v;
		chunk[0][i] = v;
	}

	prefn(chunk[0], blocks);
	mixfn(chunk[1], chunk[0], NULL, r);
	postfn(chunk[1], blocks);

	/* grab the last 16 bytes of the final block */
	for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) {
		SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
	}

	return scrypt_verify(expected, final, 16);
}

/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */
static scrypt_mix_word_t *
scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
	return base + (i * len);
}

/* returns a pointer to block i */
static scrypt_mix_word_t *
scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
	return base + (i * SCRYPT_BLOCK_WORDS);
}
@@ -0,0 +1,179 @@

#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)

#if defined(SCRYPT_CHOOSE_COMPILETIME)
#undef SCRYPT_ROMIX_FN
#define SCRYPT_ROMIX_FN scrypt_ROMix
#endif

#undef SCRYPT_HAVE_ROMIX
#define SCRYPT_HAVE_ROMIX

#if !defined(SCRYPT_CHUNKMIX_FN)

#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic

/*
	Bout = ChunkMix(Bin)

	2*r: number of blocks in the chunk
*/
static void asm_calling_convention
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
	scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block;
	uint32_t i, j, blocksPerChunk = r * 2, half = 0;

	/* 1: X = B_{2r - 1} */
	block = scrypt_block(Bin, blocksPerChunk - 1);
	for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
		X[i] = block[i];

	if (Bxor) {
		block = scrypt_block(Bxor, blocksPerChunk - 1);
		for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
			X[i] ^= block[i];
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		block = scrypt_block(Bin, i);
		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
			X[j] ^= block[j];

		if (Bxor) {
			block = scrypt_block(Bxor, i);
			for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
				X[j] ^= block[j];
		}
		SCRYPT_MIX_FN(X);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		block = scrypt_block(Bout, (i / 2) + half);
		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
			block[j] = X[j];
	}
}
#endif

/*
	X = ROMix(X)

	X: chunk to mix
	Y: scratch chunk
	N: number of rounds
	V[N]: array of chunks to randomly index in to
	2*r: number of blocks in a chunk
*/
static void NOINLINE FASTCALL
SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
	uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
	scrypt_mix_word_t *block = V;

	SCRYPT_ROMIX_TANGLE_FN(X, r * 2);

	/* 1: X = B */
	/* implicit */

	/* 2: for i = 0 to N - 1 do */
	memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
	for (i = 0; i < N - 1; i++, block += chunkWords) {
		/* 3: V_i = X */
		/* 4: X = H(X) */
		SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
	}
	SCRYPT_CHUNKMIX_FN(X, block, NULL, r);

	/* 6: for i = 0 to N - 1 do (unrolled by two) */
	for (i = 0; i < N; i += 2) {
		/* 7: j = Integerify(X) % N */
		j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);

		/* 8: Y = H(X ^ V_j) */
		SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);

		/* 7: j = Integerify(Y) % N */
		j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);

		/* 8: X = H(Y ^ V_j) */
		SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
	}

	/* 10: B' = X */
	/* implicit */

	SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
}

/*
 * Special version with hard-coded r = 1
 * - mikaelh
 */
static void NOINLINE FASTCALL
scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) {
	const uint32_t r = 1;
	uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
	scrypt_mix_word_t *block = V;

	SCRYPT_ROMIX_TANGLE_FN(X, r * 2);

	/* 1: X = B */
	/* implicit */

	/* 2: for i = 0 to N - 1 do */
	memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
	for (i = 0; i < N - 1; i++, block += chunkWords) {
		/* 3: V_i = X */
		/* 4: X = H(X) */
#ifdef SCRYPT_CHUNKMIX_1_FN
		SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block);
#else
		SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
#endif
	}
#ifdef SCRYPT_CHUNKMIX_1_FN
	SCRYPT_CHUNKMIX_1_FN(X, block);
#else
	SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
#endif

	/* 6: for i = 0 to N - 1 do (unrolled by two) */
	for (i = 0; i < N; i += 2) {
		/* 7: j = Integerify(X) % N */
		j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);

		/* 8: Y = H(X ^ V_j) */
#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
		SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords));
#else
		SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
#endif

		/* 7: j = Integerify(Y) % N */
		j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);

		/* 8: X = H(Y ^ V_j) */
#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
		SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords));
#else
		SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
#endif
	}

	/* 10: B' = X */
	/* implicit */

	SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
}

#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */


#undef SCRYPT_CHUNKMIX_FN
#undef SCRYPT_ROMIX_FN
#undef SCRYPT_MIX_FN
#undef SCRYPT_ROMIX_TANGLE_FN
#undef SCRYPT_ROMIX_UNTANGLE_FN
@@ -0,0 +1 @@

#include "scrypt-jane-chacha.h" /* this port hard-wires the ChaCha20/8 block mixer */
@@ -0,0 +1,907 @@

//
// Kernel that runs best on Fermi devices
//
// - shared memory use reduced by nearly a factor of 2 over the legacy kernel
//   by transferring only half work units (16 x uint32_t) at once.
// - uses ulonglong2/uint4 based memory transfers (each thread moves 16 bytes),
//   allowing for shorter unrolled loops. This relies on Fermi's better
//   memory controllers to get high memory throughput.
//
// NOTE: compile this .cu module for compute_20,sm_20 with --maxrregcount=63
//
// TODO: batch-size support for this kernel
//
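// Illustrative note (not part of the original file): the NOTE above
// corresponds to an nvcc invocation along these lines (any flags beyond the
// two named in the comment are project-specific assumptions):
//
//   nvcc -gencode arch=compute_20,code=sm_20 --maxrregcount=63 -c fermi_kernel.cu
//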

#include <map>

#include "cuda_runtime.h"
#include "miner.h"

#include "salsa_kernel.h"
#include "fermi_kernel.h"

#define THREADS_PER_WU 1  // single thread per hash

#define TEXWIDTH 32768

// forward references
template <int ALGO> __global__ void fermi_scrypt_core_kernelA(uint32_t *g_idata, unsigned int N);
template <int ALGO> __global__ void fermi_scrypt_core_kernelB(uint32_t *g_odata, unsigned int N);
template <int ALGO, int TEX_DIM> __global__ void fermi_scrypt_core_kernelB_tex(uint32_t *g_odata, unsigned int N);
template <int ALGO> __global__ void fermi_scrypt_core_kernelA_LG(uint32_t *g_idata, unsigned int N, unsigned int LOOKUP_GAP);
template <int ALGO> __global__ void fermi_scrypt_core_kernelB_LG(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP);
template <int ALGO, int TEX_DIM> __global__ void fermi_scrypt_core_kernelB_LG_tex(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP);

// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes)
__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT];

// using texture references for the "tex" variants of the B kernels
texture<uint4, 1, cudaReadModeElementType> texRef1D_4_V;
texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V;

FermiKernel::FermiKernel() : KernelInterface()
{
}

bool FermiKernel::bindtexture_1D(uint32_t *d_V, size_t size)
{
	cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
	texRef1D_4_V.normalized = 0;
	texRef1D_4_V.filterMode = cudaFilterModePoint;
	texRef1D_4_V.addressMode[0] = cudaAddressModeClamp;
	checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size));
	return true;
}

bool FermiKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch)
{
	cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
	texRef2D_4_V.normalized = 0;
	texRef2D_4_V.filterMode = cudaFilterModePoint;
	texRef2D_4_V.addressMode[0] = cudaAddressModeClamp;
	texRef2D_4_V.addressMode[1] = cudaAddressModeClamp;
	// maintain texture width of TEXWIDTH (max. limit is 65000)
	while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; }
	while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; }
	// fprintf(stderr, "total size: %u, %u bytes\n", pitch * height, width * sizeof(uint32_t) * 4 * height);
	// fprintf(stderr, "binding width width=%d, height=%d, pitch=%d\n", width, height, pitch);
	checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch));
	return true;
}

bool FermiKernel::unbindtexture_1D()
{
	checkCudaErrors(cudaUnbindTexture(texRef1D_4_V));
	return true;
}

bool FermiKernel::unbindtexture_2D()
{
	checkCudaErrors(cudaUnbindTexture(texRef2D_4_V));
	return true;
}

void FermiKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
{
	checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
}

bool FermiKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
{
	bool success = true;

	// one half work unit (16 words) plus 4 words of padding per hash
	int shared = WARPS_PER_BLOCK * WU_PER_WARP * (16+4) * sizeof(uint32_t);

	// First phase: Sequential writes to scratchpad.

	if (LOOKUP_GAP == 1) {
		if (IS_SCRYPT())      fermi_scrypt_core_kernelA<A_SCRYPT><<< grid, threads, shared, stream >>>(d_idata, N);
		if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelA<A_SCRYPT_JANE><<< grid, threads, shared, stream >>>(d_idata, N);
	} else {
		if (IS_SCRYPT())      fermi_scrypt_core_kernelA_LG<A_SCRYPT><<< grid, threads, shared, stream >>>(d_idata, N, LOOKUP_GAP);
		if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelA_LG<A_SCRYPT_JANE><<< grid, threads, shared, stream >>>(d_idata, N, LOOKUP_GAP);
	}

	// Second phase: Random read access from scratchpad.

	if (LOOKUP_GAP == 1) {
		if (texture_cache) {
			if (texture_cache == 1) {
				if (IS_SCRYPT())      fermi_scrypt_core_kernelB_tex<A_SCRYPT,1><<< grid, threads, shared, stream >>>(d_odata, N);
				if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_tex<A_SCRYPT_JANE,1><<< grid, threads, shared, stream >>>(d_odata, N);
			} else if (texture_cache == 2) {
				if (IS_SCRYPT())      fermi_scrypt_core_kernelB_tex<A_SCRYPT,2><<< grid, threads, shared, stream >>>(d_odata, N);
				if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_tex<A_SCRYPT_JANE,2><<< grid, threads, shared, stream >>>(d_odata, N);
			}
			else success = false;
		} else {
			if (IS_SCRYPT())      fermi_scrypt_core_kernelB<A_SCRYPT><<< grid, threads, shared, stream >>>(d_odata, N);
			if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB<A_SCRYPT_JANE><<< grid, threads, shared, stream >>>(d_odata, N);
		}
	} else {
		if (texture_cache) {
			if (texture_cache == 1) {
				if (IS_SCRYPT())      fermi_scrypt_core_kernelB_LG_tex<A_SCRYPT,1><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
				if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG_tex<A_SCRYPT_JANE,1><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
			} else if (texture_cache == 2) {
				if (IS_SCRYPT())      fermi_scrypt_core_kernelB_LG_tex<A_SCRYPT,2><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
				if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG_tex<A_SCRYPT_JANE,2><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
			}
			else success = false;
		} else {
			if (IS_SCRYPT())      fermi_scrypt_core_kernelB_LG<A_SCRYPT><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
			if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG<A_SCRYPT_JANE><<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP);
		}
	}

	return success;
}

#if 0

#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

#define QUARTER(a,b,c,d) \
	a += b; d ^= a; d = ROTL(d,16); \
	c += d; b ^= c; b = ROTL(b,12); \
	a += b; d ^= a; d = ROTL(d,8); \
	c += d; b ^= c; b = ROTL(b,7);

static __device__ void xor_chacha8(uint4 *B, uint4 *C)
{
	uint32_t x[16];
	x[0]=(B[0].x ^= C[0].x);
	x[1]=(B[0].y ^= C[0].y);
	x[2]=(B[0].z ^= C[0].z);
	x[3]=(B[0].w ^= C[0].w);
	x[4]=(B[1].x ^= C[1].x);
	x[5]=(B[1].y ^= C[1].y);
	x[6]=(B[1].z ^= C[1].z);
	x[7]=(B[1].w ^= C[1].w);
	x[8]=(B[2].x ^= C[2].x);
	x[9]=(B[2].y ^= C[2].y);
	x[10]=(B[2].z ^= C[2].z);
	x[11]=(B[2].w ^= C[2].w);
	x[12]=(B[3].x ^= C[3].x);
	x[13]=(B[3].y ^= C[3].y);
	x[14]=(B[3].z ^= C[3].z);
	x[15]=(B[3].w ^= C[3].w);

	/* Operate on columns. */
	QUARTER( x[0], x[4], x[ 8], x[12] )
	QUARTER( x[1], x[5], x[ 9], x[13] )
	QUARTER( x[2], x[6], x[10], x[14] )
	QUARTER( x[3], x[7], x[11], x[15] )

	/* Operate on diagonals */
	QUARTER( x[0], x[5], x[10], x[15] )
	QUARTER( x[1], x[6], x[11], x[12] )
	QUARTER( x[2], x[7], x[ 8], x[13] )
	QUARTER( x[3], x[4], x[ 9], x[14] )

	/* Operate on columns. */
	QUARTER( x[0], x[4], x[ 8], x[12] )
	QUARTER( x[1], x[5], x[ 9], x[13] )
	QUARTER( x[2], x[6], x[10], x[14] )
	QUARTER( x[3], x[7], x[11], x[15] )

	/* Operate on diagonals */
	QUARTER( x[0], x[5], x[10], x[15] )
	QUARTER( x[1], x[6], x[11], x[12] )
	QUARTER( x[2], x[7], x[ 8], x[13] )
	QUARTER( x[3], x[4], x[ 9], x[14] )

	/* Operate on columns. */
	QUARTER( x[0], x[4], x[ 8], x[12] )
	QUARTER( x[1], x[5], x[ 9], x[13] )
	QUARTER( x[2], x[6], x[10], x[14] )
	QUARTER( x[3], x[7], x[11], x[15] )

	/* Operate on diagonals */
	QUARTER( x[0], x[5], x[10], x[15] )
	QUARTER( x[1], x[6], x[11], x[12] )
	QUARTER( x[2], x[7], x[ 8], x[13] )
	QUARTER( x[3], x[4], x[ 9], x[14] )

	/* Operate on columns. */
	QUARTER( x[0], x[4], x[ 8], x[12] )
	QUARTER( x[1], x[5], x[ 9], x[13] )
	QUARTER( x[2], x[6], x[10], x[14] )
	QUARTER( x[3], x[7], x[11], x[15] )

	/* Operate on diagonals */
	QUARTER( x[0], x[5], x[10], x[15] )
	QUARTER( x[1], x[6], x[11], x[12] )
	QUARTER( x[2], x[7], x[ 8], x[13] )
	QUARTER( x[3], x[4], x[ 9], x[14] )

	B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7];
	B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15];
}

#else

#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

#define ADD4(d1,d2,d3,d4,s1,s2,s3,s4) \
	d1 += s1; d2 += s2; d3 += s3; d4 += s4;

#define XOR4(d1,d2,d3,d4,s1,s2,s3,s4) \
	d1 ^= s1; d2 ^= s2; d3 ^= s3; d4 ^= s4;

#define ROTL4(d1,d2,d3,d4,amt) \
	d1 = ROTL(d1, amt); d2 = ROTL(d2, amt); d3 = ROTL(d3, amt); d4 = ROTL(d4, amt);

#define QROUND(a1,a2,a3,a4, b1,b2,b3,b4, c1,c2,c3,c4, amt) \
	ADD4 (a1,a2,a3,a4, c1,c2,c3,c4) \
	XOR4 (b1,b2,b3,b4, a1,a2,a3,a4) \
	ROTL4(b1,b2,b3,b4, amt)

static __device__ void xor_chacha8(uint4 *B, uint4 *C)
{
	uint32_t x[16];
	x[0]=(B[0].x ^= C[0].x);
	x[1]=(B[0].y ^= C[0].y);
	x[2]=(B[0].z ^= C[0].z);
	x[3]=(B[0].w ^= C[0].w);
	x[4]=(B[1].x ^= C[1].x);
	x[5]=(B[1].y ^= C[1].y);
	x[6]=(B[1].z ^= C[1].z);
	x[7]=(B[1].w ^= C[1].w);
	x[8]=(B[2].x ^= C[2].x);
	x[9]=(B[2].y ^= C[2].y);
	x[10]=(B[2].z ^= C[2].z);
	x[11]=(B[2].w ^= C[2].w);
	x[12]=(B[3].x ^= C[3].x);
	x[13]=(B[3].y ^= C[3].y);
	x[14]=(B[3].z ^= C[3].z);
	x[15]=(B[3].w ^= C[3].w);

	/* Operate on columns. */
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16);
	QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12);
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8);
	QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7);

	/* Operate on diagonals */
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16);
	QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12);
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8);
	QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7);

	/* Operate on columns. */
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16);
	QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12);
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8);
	QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7);

	/* Operate on diagonals */
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16);
	QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12);
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8);
	QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7);

	/* Operate on columns. */
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16);
	QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12);
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8);
	QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7);

	/* Operate on diagonals */
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16);
	QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12);
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8);
	QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7);

	/* Operate on columns. */
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16);
	QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12);
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8);
	QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7);

	/* Operate on diagonals */
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16);
	QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12);
	QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8);
	QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7);

	B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7];
	B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15];
}

#endif

#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30){\
	a0^=(((a00)<<7) | ((a00)>>25) );\
	a1^=(((a10)<<7) | ((a10)>>25) );\
	a2^=(((a20)<<7) | ((a20)>>25) );\
	a3^=(((a30)<<7) | ((a30)>>25) );\
}

#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30){\
	a0^=(((a00)<<9) | ((a00)>>23) );\
	a1^=(((a10)<<9) | ((a10)>>23) );\
	a2^=(((a20)<<9) | ((a20)>>23) );\
	a3^=(((a30)<<9) | ((a30)>>23) );\
}

#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30){\
	a0^=(((a00)<<13) | ((a00)>>19) );\
	a1^=(((a10)<<13) | ((a10)>>19) );\
	a2^=(((a20)<<13) | ((a20)>>19) );\
	a3^=(((a30)<<13) | ((a30)>>19) );\
}

#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30){\
	a0^=(((a00)<<18) | ((a00)>>14) );\
	a1^=(((a10)<<18) | ((a10)>>14) );\
	a2^=(((a20)<<18) | ((a20)>>14) );\
	a3^=(((a30)<<18) | ((a30)>>14) );\
}

static __device__ void xor_salsa8(uint4 *B, uint4 *C)
{
	uint32_t x[16];
	x[0]=(B[0].x ^= C[0].x);
	x[1]=(B[0].y ^= C[0].y);
	x[2]=(B[0].z ^= C[0].z);
	x[3]=(B[0].w ^= C[0].w);
	x[4]=(B[1].x ^= C[1].x);
	x[5]=(B[1].y ^= C[1].y);
	x[6]=(B[1].z ^= C[1].z);
	x[7]=(B[1].w ^= C[1].w);
	x[8]=(B[2].x ^= C[2].x);
	x[9]=(B[2].y ^= C[2].y);
	x[10]=(B[2].z ^= C[2].z);
	x[11]=(B[2].w ^= C[2].w);
	x[12]=(B[3].x ^= C[3].x);
	x[13]=(B[3].y ^= C[3].y);
	x[14]=(B[3].z ^= C[3].z);
	x[15]=(B[3].w ^= C[3].w);

	/* Operate on columns. */
	ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]);
	ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]);
	ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]);
	ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]);

	/* Operate on rows. */
	ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]);
	ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]);
	ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]);
	ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]);

	/* Operate on columns. */
	ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]);
	ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]);
	ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]);
	ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]);

	/* Operate on rows. */
	ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]);
	ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]);
	ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]);
	ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]);

	/* Operate on columns. */
	ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]);
	ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]);
	ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]);
	ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]);

	/* Operate on rows. */
	ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]);
	ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]);
	ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]);
	ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]);

	/* Operate on columns. */
	ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]);
	ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]);
	ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]);
	ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]);

	/* Operate on rows. */
	ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]);
	ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]);
	ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]);
	ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]);

	B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7];
	B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15];
}

static __device__ __forceinline__ uint4& operator^=(uint4& left, const uint4& right)
{
	left.x ^= right.x;
	left.y ^= right.y;
	left.z ^= right.z;
	left.w ^= right.w;
	return left;
}

////////////////////////////////////////////////////////////////////////////////
//! Scrypt core kernel for Fermi class devices.
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
template <int ALGO> __global__
void fermi_scrypt_core_kernelA(uint32_t *g_idata, unsigned int N)
{
	extern __shared__ unsigned char x[];
	uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;

	int warpIdx    = threadIdx.x / warpSize;
	int warpThread = threadIdx.x % warpSize;
	const unsigned int LOOKUP_GAP = 1;

	// variables supporting the large memory transaction magic
	unsigned int Y = warpThread/4;
	unsigned int Z = 4*(warpThread%4);

	// add block specific offsets
	int WARPS_PER_BLOCK = blockDim.x / 32;
	int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
	g_idata += 32 * offset;
	uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z;

	// registers to store an entire work unit
	uint4 B[4], C[4];

	uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
	uint32_t *XX = X[warpIdx][warpThread];

#pragma unroll 4
	for (int wu=0; wu < 32; wu+=8)
		*((ulonglong2*)(&V[SCRATCH*wu])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+Z]));
#pragma unroll 4
	for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);

#pragma unroll 4
	for (int wu=0; wu < 32; wu+=8)
		*((ulonglong2*)(&V[SCRATCH*wu+16])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+16+Z]));
#pragma unroll 4
	for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);

	for (int i = 1; i < N; i++) {

		switch(ALGO) {
			case A_SCRYPT:      xor_salsa8(B, C); xor_salsa8(C, B); break;
			case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
		}

#pragma unroll 4
		for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx];
#pragma unroll 4
		for (int wu=0; wu < 32; wu+=8)
			*((ulonglong2*)(&V[SCRATCH*wu + i*32])) = *((ulonglong2*)XB[wu]);

#pragma unroll 4
		for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx];
#pragma unroll 4
		for (int wu=0; wu < 32; wu+=8)
			*((ulonglong2*)(&V[SCRATCH*wu + i*32 + 16])) = *((ulonglong2*)XB[wu]);
	}
}

template <int ALGO> __global__
void fermi_scrypt_core_kernelB(uint32_t *g_odata, unsigned int N)
{
	extern __shared__ unsigned char x[];
	uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;

	int warpIdx    = threadIdx.x / warpSize;
	int warpThread = threadIdx.x % warpSize;
	const unsigned int LOOKUP_GAP = 1;

	// variables supporting the large memory transaction magic
	unsigned int Y = warpThread/4;
	unsigned int Z = 4*(warpThread%4);

	// add block specific offsets
	int WARPS_PER_BLOCK = blockDim.x / 32;
	int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
	g_odata += 32 * offset;
	uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z;

	// registers to store an entire work unit
	uint4 B[4], C[4];

	uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
	uint32_t *XX = X[warpIdx][warpThread];

#pragma unroll 4
	for (int wu=0; wu < 32; wu+=8)
		*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + (N-1)*32]));
#pragma unroll 4
	for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);

#pragma unroll 4
	for (int wu=0; wu < 32; wu+=8)
		*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + (N-1)*32 + 16]));
#pragma unroll 4
	for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);

	switch(ALGO) {
		case A_SCRYPT:      xor_salsa8(B, C); xor_salsa8(C, B); break;
		case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
	}

	for (int i = 0; i < N; i++) {

		XX[16] = 32 * (C[0].x & (N-1));

#pragma unroll 4
		for (int wu=0; wu < 32; wu+=8)
			*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z]]));
#pragma unroll 4
		for (int idx=0; idx < 4; idx++) B[idx] ^= *((uint4*)&XX[4*idx]);

#pragma unroll 4
		for (int wu=0; wu < 32; wu+=8)
			*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z] + 16]));
#pragma unroll 4
		for (int idx=0; idx < 4; idx++) C[idx] ^= *((uint4*)&XX[4*idx]);

		switch(ALGO) {
			case A_SCRYPT:      xor_salsa8(B, C); xor_salsa8(C, B); break;
			case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
		}
	}

#pragma unroll 4
	for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx];
#pragma unroll 4
	for (int wu=0; wu < 32; wu+=8)
		*((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]);

#pragma unroll 4
	for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx];
#pragma unroll 4
	for (int wu=0; wu < 32; wu+=8)
		*((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]);

}

template <int ALGO, int TEX_DIM> __global__ void
fermi_scrypt_core_kernelB_tex(uint32_t *g_odata, unsigned int N)
{
	extern __shared__ unsigned char x[];
	uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x;

	int warpIdx    = threadIdx.x / warpSize;
	int warpThread = threadIdx.x % warpSize;
	const unsigned int LOOKUP_GAP = 1;

	// variables supporting the large memory transaction magic
	unsigned int Y = warpThread/4;
	unsigned int Z = 4*(warpThread%4);

	// add block specific offsets
	int WARPS_PER_BLOCK = blockDim.x / 32;
	int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP;
	g_odata += 32 * offset;

	// registers to store an entire work unit
	uint4 B[4], C[4];

	uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z];
	uint32_t *XX = X[warpIdx][warpThread];

#pragma unroll 4
	for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + (N-1)*32 + Z)/4;
		*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
		            tex1Dfetch(texRef1D_4_V, loc) :
		            tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
	for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]);

#pragma unroll 4
	for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + (N-1)*32 + 16+Z)/4;
		*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
		            tex1Dfetch(texRef1D_4_V, loc) :
		            tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
	for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]);

	switch(ALGO) {
		case A_SCRYPT:      xor_salsa8(B, C); xor_salsa8(C, B); break;
		case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
	}

	for (int i = 0; i < N; i++) {

		XX[16] = 32 * (C[0].x & (N-1));

#pragma unroll 4
		for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + Z)/4;
			*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
			            tex1Dfetch(texRef1D_4_V, loc) :
			            tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
		for (int idx=0; idx < 4; idx++) B[idx] ^= *((uint4*)&XX[4*idx]);

#pragma unroll 4
		for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + 16+Z)/4;
			*((uint4*)XB[wu]) = ((TEX_DIM == 1) ?
			            tex1Dfetch(texRef1D_4_V, loc) :
			            tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); }
#pragma unroll 4
		for (int idx=0; idx < 4; idx++) C[idx] ^= *((uint4*)&XX[4*idx]);

		switch(ALGO) {
			case A_SCRYPT:      xor_salsa8(B, C); xor_salsa8(C, B); break;
			case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break;
||||
} |
||||
} |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); |
||||
} |
||||
|
||||
// |
||||
// Lookup-Gap variations of the above functions |
||||
// |
||||
|
||||
template <int ALGO> __global__ void |
||||
fermi_scrypt_core_kernelA_LG(uint32_t *g_idata, unsigned int N, unsigned int LOOKUP_GAP) |
||||
{ |
||||
extern __shared__ unsigned char x[]; |
||||
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; |
||||
|
||||
int warpIdx = threadIdx.x / warpSize; |
||||
int warpThread = threadIdx.x % warpSize; |
||||
|
||||
// variables supporting the large memory transaction magic |
||||
unsigned int Y = warpThread/4; |
||||
unsigned int Z = 4*(warpThread%4); |
||||
|
||||
// add block specific offsets |
||||
int WARPS_PER_BLOCK = blockDim.x / 32; |
||||
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; |
||||
g_idata += 32 * offset; |
||||
uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; |
||||
|
||||
// registers to store an entire work unit |
||||
uint4 B[4], C[4]; |
||||
|
||||
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; |
||||
uint32_t *XX = X[warpIdx][warpThread]; |
||||
|
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&V[SCRATCH*wu])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+Z])); |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&V[SCRATCH*wu+16])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+16+Z])); |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
for (int i = 1; i < N; i++) { |
||||
|
||||
switch(ALGO) { |
||||
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; |
||||
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; |
||||
} |
||||
|
||||
if (i % LOOKUP_GAP == 0) { |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&V[SCRATCH*wu + (i/LOOKUP_GAP)*32])) = *((ulonglong2*)XB[wu]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&V[SCRATCH*wu + (i/LOOKUP_GAP)*32 + 16])) = *((ulonglong2*)XB[wu]); |
||||
} |
||||
} |
||||
} |
||||
|
||||
template <int ALGO> __global__ void |
||||
fermi_scrypt_core_kernelB_LG(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP) |
||||
{ |
||||
extern __shared__ unsigned char x[]; |
||||
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; |
||||
|
||||
int warpIdx = threadIdx.x / warpSize; |
||||
int warpThread = threadIdx.x % warpSize; |
||||
|
||||
// variables supporting the large memory transaction magic |
||||
unsigned int Y = warpThread/4; |
||||
unsigned int Z = 4*(warpThread%4); |
||||
|
||||
// add block specific offsets |
||||
int WARPS_PER_BLOCK = blockDim.x / 32; |
||||
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; |
||||
g_odata += 32 * offset; |
||||
uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; |
||||
|
||||
// registers to store an entire work unit |
||||
uint4 B[4], C[4]; |
||||
|
||||
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; |
||||
uint32_t *XX = X[warpIdx][warpThread]; |
||||
|
||||
uint32_t pos = (N-1)/LOOKUP_GAP; uint32_t loop = 1 + (N-1)-pos*LOOKUP_GAP; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + pos*32])); |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + pos*32 + 16])); |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
while (loop--) |
||||
switch(ALGO) { |
||||
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; |
||||
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; |
||||
} |
||||
|
||||
for (int i = 0; i < N; i++) { |
||||
|
||||
uint32_t j = C[0].x & (N-1); |
||||
uint32_t pos = j / LOOKUP_GAP; uint32_t loop = j - pos*LOOKUP_GAP; |
||||
XX[16] = 32 * pos; |
||||
|
||||
uint4 b[4], c[4]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z]])); |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) b[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z] + 16])); |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) c[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
while (loop--) |
||||
switch(ALGO) { |
||||
case A_SCRYPT: xor_salsa8(b, c); xor_salsa8(c, b); break; |
||||
case A_SCRYPT_JANE: xor_chacha8(b, c); xor_chacha8(c, b); break; |
||||
} |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) B[idx] ^= b[idx]; |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) C[idx] ^= c[idx]; |
||||
|
||||
switch(ALGO) { |
||||
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; |
||||
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; |
||||
} |
||||
} |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); |
||||
|
||||
} |
||||
|
||||
template <int ALGO, int TEX_DIM> __global__ void |
||||
fermi_scrypt_core_kernelB_LG_tex(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP) |
||||
{ |
||||
extern __shared__ unsigned char x[]; |
||||
uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; |
||||
|
||||
int warpIdx = threadIdx.x / warpSize; |
||||
int warpThread = threadIdx.x % warpSize; |
||||
|
||||
// variables supporting the large memory transaction magic |
||||
unsigned int Y = warpThread/4; |
||||
unsigned int Z = 4*(warpThread%4); |
||||
|
||||
// add block specific offsets |
||||
int WARPS_PER_BLOCK = blockDim.x / 32; |
||||
int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; |
||||
g_odata += 32 * offset; |
||||
|
||||
// registers to store an entire work unit |
||||
uint4 B[4], C[4]; |
||||
|
||||
uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; |
||||
uint32_t *XX = X[warpIdx][warpThread]; |
||||
|
||||
uint32_t pos = (N-1)/LOOKUP_GAP; uint32_t loop = 1 + (N-1)-pos*LOOKUP_GAP; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + pos*32 + Z)/4; |
||||
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ? |
||||
tex1Dfetch(texRef1D_4_V, loc) : |
||||
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + pos*32 + 16+Z)/4; |
||||
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ? |
||||
tex1Dfetch(texRef1D_4_V, loc) : |
||||
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
while (loop--) |
||||
switch(ALGO) { |
||||
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; |
||||
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; |
||||
} |
||||
|
||||
for (int i = 0; i < N; i++) { |
||||
|
||||
uint32_t j = C[0].x & (N-1); |
||||
uint32_t pos = j / LOOKUP_GAP; uint32_t loop = j - pos*LOOKUP_GAP; |
||||
XX[16] = 32 * pos; |
||||
|
||||
uint4 b[4], c[4]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + Z)/4; |
||||
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ? |
||||
tex1Dfetch(texRef1D_4_V, loc) : |
||||
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) b[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + 16+Z)/4; |
||||
*((uint4*)XB[wu]) = ((TEX_DIM == 1) ? |
||||
tex1Dfetch(texRef1D_4_V, loc) : |
||||
tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) c[idx] = *((uint4*)&XX[4*idx]); |
||||
|
||||
while (loop--) |
||||
switch(ALGO) { |
||||
case A_SCRYPT: xor_salsa8(b, c); xor_salsa8(c, b); break; |
||||
case A_SCRYPT_JANE: xor_chacha8(b, c); xor_chacha8(c, b); break; |
||||
} |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) B[idx] ^= b[idx]; |
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) C[idx] ^= c[idx]; |
||||
|
||||
switch(ALGO) { |
||||
case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; |
||||
case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; |
||||
} |
||||
} |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); |
||||
|
||||
#pragma unroll 4 |
||||
for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; |
||||
#pragma unroll 4 |
||||
for (int wu=0; wu < 32; wu+=8) |
||||
*((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); |
||||
} |
@@ -0,0 +1,28 @@

#ifndef FERMI_KERNEL_H
#define FERMI_KERNEL_H

#include "salsa_kernel.h"

class FermiKernel : public KernelInterface
{
public:
	FermiKernel();

	virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
	virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);
	virtual bool bindtexture_1D(uint32_t *d_V, size_t size);
	virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch);
	virtual bool unbindtexture_1D();
	virtual bool unbindtexture_2D();

	virtual char get_identifier() { return 'F'; }
	virtual int get_major_version() { return 1; }
	virtual int get_minor_version() { return 0; }
	virtual int max_warps_per_block() { return 16; }
	virtual int get_texel_width() { return 4; }
	virtual bool support_lookup_gap() { return true; }
	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferShared; }
};

#endif // #ifndef FERMI_KERNEL_H
@@ -0,0 +1,837 @@

//
// =============== KECCAK part on nVidia GPU ======================
//
// The keccak512 (SHA-3) is used in the PBKDF2 for scrypt-jane coins
// in place of the SHA2 based PBKDF2 used in scrypt coins.
//
// The keccak256 is used exclusively in Maxcoin and clones. This module
// holds the generic "default" implementation when no architecture
// specific implementation is available in the kernel.
//
// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
//

#include <map>
#include <stdint.h>

#include "salsa_kernel.h"
#include "cuda_runtime.h"
#include "miner.h"

#include "keccak.h"

||||
// define some error checking macros
#undef checkCudaErrors

#if WIN32
#define DELIMITER '\\' /* backslash path separator on Windows */
#else
#define DELIMITER '/'
#endif
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )

#define checkCudaErrors(x) \
{ \
	cudaGetLastError(); \
	x; \
	cudaError_t err = cudaGetLastError(); \
	if (err != cudaSuccess) \
		applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \
}
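
// Usage example (editor's illustration, not original code): the macro clears any
// prior sticky error, executes the statement, then logs failures through applog.
// It relies on 'thr_id' being in scope at the call site, as it is in the host
// wrappers further down.
//
//   checkCudaErrors(cudaMemsetAsync(d_buf, 0, size, stream)); // d_buf, size, stream are placeholders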
||||
|
||||
// from salsa_kernel.cu
extern std::map<int, uint32_t *> context_idata[2];
extern std::map<int, uint32_t *> context_odata[2];
extern std::map<int, cudaStream_t> context_streams[2];
extern std::map<int, uint32_t *> context_hash[2];

#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))

// CB
#define U32TO64_LE(p) \
	(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

#define U64TO32_LE(p, v) \
	*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
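
// Worked example (added for clarity): with p[0] = 0x33221100 and p[1] = 0x77665544,
// U32TO64_LE(p) yields 0x7766554433221100, i.e. two little-endian 32-bit words
// packed into one 64-bit Keccak lane; U64TO32_LE performs the inverse split.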
||||
static __device__ void mycpy64(uint32_t *d, const uint32_t *s) {
#pragma unroll 16
	for (int k=0; k < 16; ++k) d[k] = s[k];
}

static __device__ void mycpy56(uint32_t *d, const uint32_t *s) {
#pragma unroll 14
	for (int k=0; k < 14; ++k) d[k] = s[k];
}

static __device__ void mycpy32(uint32_t *d, const uint32_t *s) {
#pragma unroll 8
	for (int k=0; k < 8; ++k) d[k] = s[k];
}

static __device__ void mycpy8(uint32_t *d, const uint32_t *s) {
#pragma unroll 2
	for (int k=0; k < 2; ++k) d[k] = s[k];
}

static __device__ void mycpy4(uint32_t *d, const uint32_t *s) {
	*d = *s;
}
||||
|
||||
// ---------------------------- BEGIN keccak functions ------------------------------------ |
||||
|
||||
#define KECCAK_HASH "Keccak-512" |
||||
|
||||
typedef struct keccak_hash_state_t { |
||||
uint64_t state[25]; // 25*2 |
||||
uint32_t buffer[72/4]; // 72 |
||||
} keccak_hash_state; |
||||
|
||||
__device__ void statecopy0(keccak_hash_state *d, keccak_hash_state *s) |
||||
{ |
||||
#pragma unroll 25 |
||||
for (int i=0; i < 25; ++i) |
||||
d->state[i] = s->state[i]; |
||||
} |
||||
|
||||
__device__ void statecopy8(keccak_hash_state *d, keccak_hash_state *s) |
||||
{ |
||||
#pragma unroll 25 |
||||
for (int i=0; i < 25; ++i) |
||||
d->state[i] = s->state[i]; |
||||
#pragma unroll 2 |
||||
for (int i=0; i < 2; ++i) |
||||
d->buffer[i] = s->buffer[i]; |
||||
} |
||||
|
||||
static const uint64_t host_keccak_round_constants[24] = { |
||||
0x0000000000000001ull, 0x0000000000008082ull, |
||||
0x800000000000808aull, 0x8000000080008000ull, |
||||
0x000000000000808bull, 0x0000000080000001ull, |
||||
0x8000000080008081ull, 0x8000000000008009ull, |
||||
0x000000000000008aull, 0x0000000000000088ull, |
||||
0x0000000080008009ull, 0x000000008000000aull, |
||||
0x000000008000808bull, 0x800000000000008bull, |
||||
0x8000000000008089ull, 0x8000000000008003ull, |
||||
0x8000000000008002ull, 0x8000000000000080ull, |
||||
0x000000000000800aull, 0x800000008000000aull, |
||||
0x8000000080008081ull, 0x8000000000008080ull, |
||||
0x0000000080000001ull, 0x8000000080008008ull |
||||
}; |
||||
|
||||
__constant__ uint64_t c_keccak_round_constants[24]; |
||||
__constant__ uint32_t pdata[20]; |
||||
|
||||
__device__ |
||||
void keccak_block(keccak_hash_state *S, const uint32_t *in) { |
||||
size_t i; |
||||
uint64_t *s = S->state, t[5], u[5], v, w; |
||||
|
||||
/* absorb input */ |
||||
#pragma unroll 9 |
||||
for (i = 0; i < 72 / 8; i++, in += 2) |
||||
s[i] ^= U32TO64_LE(in); |
||||
|
||||
for (i = 0; i < 24; i++) { |
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ |
||||
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; |
||||
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; |
||||
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; |
||||
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; |
||||
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; |
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ |
||||
u[0] = t[4] ^ ROTL64(t[1], 1); |
||||
u[1] = t[0] ^ ROTL64(t[2], 1); |
||||
u[2] = t[1] ^ ROTL64(t[3], 1); |
||||
u[3] = t[2] ^ ROTL64(t[4], 1); |
||||
u[4] = t[3] ^ ROTL64(t[0], 1); |
||||
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ |
||||
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; |
||||
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; |
||||
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; |
||||
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; |
||||
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; |
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */ |
||||
v = s[ 1]; |
||||
s[ 1] = ROTL64(s[ 6], 44); |
||||
s[ 6] = ROTL64(s[ 9], 20); |
||||
s[ 9] = ROTL64(s[22], 61); |
||||
s[22] = ROTL64(s[14], 39); |
||||
s[14] = ROTL64(s[20], 18); |
||||
s[20] = ROTL64(s[ 2], 62); |
||||
s[ 2] = ROTL64(s[12], 43); |
||||
s[12] = ROTL64(s[13], 25); |
||||
s[13] = ROTL64(s[19], 8); |
||||
s[19] = ROTL64(s[23], 56); |
||||
s[23] = ROTL64(s[15], 41); |
||||
s[15] = ROTL64(s[ 4], 27); |
||||
s[ 4] = ROTL64(s[24], 14); |
||||
s[24] = ROTL64(s[21], 2); |
||||
s[21] = ROTL64(s[ 8], 55); |
||||
s[ 8] = ROTL64(s[16], 45); |
||||
s[16] = ROTL64(s[ 5], 36); |
||||
s[ 5] = ROTL64(s[ 3], 28); |
||||
s[ 3] = ROTL64(s[18], 21); |
||||
s[18] = ROTL64(s[17], 15); |
||||
s[17] = ROTL64(s[11], 10); |
||||
s[11] = ROTL64(s[ 7], 6); |
||||
s[ 7] = ROTL64(s[10], 3); |
||||
s[10] = ROTL64( v, 1); |
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ |
||||
v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; |
||||
v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; |
||||
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; |
||||
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; |
||||
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; |
||||
|
||||
/* iota: a[0,0] ^= round constant */ |
||||
s[0] ^= c_keccak_round_constants[i]; |
||||
} |
||||
} |
||||
|
||||
__device__ |
||||
void keccak_hash_init(keccak_hash_state *S) { |
||||
#pragma unroll 25 |
||||
for (int i=0; i<25; ++i) |
||||
S->state[i] = 0ULL; |
||||
} |
||||
|
||||
// assuming there is no leftover data and exactly 72 bytes are incoming |
||||
// we can directly call into the block hashing function |
||||
__device__ void keccak_hash_update72(keccak_hash_state *S, const uint32_t *in) { |
||||
keccak_block(S, in); |
||||
} |
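
// Note (added): Keccak-512 has capacity 2*512 = 1024 bits, so the sponge rate is
// 1600 - 1024 = 576 bits = 72 bytes, which is why full blocks arrive here in
// 72-byte chunks.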
||||
|
||||
__device__ void keccak_hash_update8(keccak_hash_state *S, const uint32_t *in) { |
||||
mycpy8(S->buffer, in); |
||||
} |
||||
|
||||
__device__ void keccak_hash_update4_8(keccak_hash_state *S, const uint32_t *in) { |
||||
mycpy4(S->buffer+8/4, in); |
||||
} |
||||
|
||||
__device__ void keccak_hash_update4_56(keccak_hash_state *S, const uint32_t *in) { |
||||
mycpy4(S->buffer+56/4, in); |
||||
} |
||||
|
||||
__device__ void keccak_hash_update56(keccak_hash_state *S, const uint32_t *in) { |
||||
mycpy56(S->buffer, in); |
||||
} |
||||
|
||||
__device__ void keccak_hash_update64(keccak_hash_state *S, const uint32_t *in) { |
||||
mycpy64(S->buffer, in); |
||||
} |
||||
|
||||
__device__ void keccak_hash_finish8(keccak_hash_state *S, uint32_t *hash) { |
||||
S->buffer[8/4] = 0x01; |
||||
#pragma unroll 15 |
||||
for (int i=8/4+1; i < 72/4; ++i) S->buffer[i] = 0; |
||||
S->buffer[72/4 - 1] |= 0x80000000; |
||||
keccak_block(S, (const uint32_t*)S->buffer); |
||||
#pragma unroll 8 |
||||
for (size_t i = 0; i < 64; i += 8) { |
||||
U64TO32_LE((&hash[i/4]), S->state[i / 8]); |
||||
} |
||||
} |
||||
|
||||
__device__ void keccak_hash_finish12(keccak_hash_state *S, uint32_t *hash) { |
||||
S->buffer[12/4] = 0x01; |
||||
#pragma unroll 14 |
||||
for (int i=12/4+1; i < 72/4; ++i) S->buffer[i] = 0; |
||||
S->buffer[72/4 - 1] |= 0x80000000; |
||||
keccak_block(S, (const uint32_t*)S->buffer); |
||||
#pragma unroll 8 |
||||
for (size_t i = 0; i < 64; i += 8) { |
||||
U64TO32_LE((&hash[i/4]), S->state[i / 8]); |
||||
} |
||||
} |
||||
|
||||
__device__ void keccak_hash_finish60(keccak_hash_state *S, uint32_t *hash) { |
||||
S->buffer[60/4] = 0x01; |
||||
#pragma unroll 2 |
||||
for (int i=60/4+1; i < 72/4; ++i) S->buffer[i] = 0; |
||||
S->buffer[72/4 - 1] |= 0x80000000; |
||||
keccak_block(S, (const uint32_t*)S->buffer); |
||||
#pragma unroll 8 |
||||
for (size_t i = 0; i < 64; i += 8) { |
||||
U64TO32_LE((&hash[i/4]), S->state[i / 8]); |
||||
} |
||||
} |
||||
|
||||
__device__ void keccak_hash_finish64(keccak_hash_state *S, uint32_t *hash) { |
||||
S->buffer[64/4] = 0x01; |
||||
#pragma unroll 1 |
||||
for (int i=64/4+1; i < 72/4; ++i) S->buffer[i] = 0; |
||||
S->buffer[72/4 - 1] |= 0x80000000; |
||||
keccak_block(S, (const uint32_t*)S->buffer); |
||||
#pragma unroll 8 |
||||
for (size_t i = 0; i < 64; i += 8) { |
||||
U64TO32_LE((&hash[i/4]), S->state[i / 8]); |
||||
} |
||||
} |
||||
|
||||
// ---------------------------- END keccak functions ------------------------------------ |
||||
|
||||
// ---------------------------- BEGIN PBKDF2 functions ------------------------------------ |
||||
|
||||
typedef struct pbkdf2_hmac_state_t { |
||||
keccak_hash_state inner, outer; |
||||
} pbkdf2_hmac_state; |
||||
|
||||
|
||||
__device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m) { |
||||
keccak_hash_state st; |
||||
keccak_hash_init(&st); |
||||
keccak_hash_update72(&st, m); |
||||
keccak_hash_update8(&st, m+72/4); |
||||
keccak_hash_finish8(&st, hash); |
||||
} |
||||
|
||||
/* hmac */ |
||||
__device__ void pbkdf2_hmac_init80(pbkdf2_hmac_state *st, const uint32_t *key) { |
||||
uint32_t pad[72/4]; |
||||
size_t i; |
||||
|
||||
keccak_hash_init(&st->inner); |
||||
keccak_hash_init(&st->outer); |
||||
|
||||
#pragma unroll 18 |
||||
for (i = 0; i < 72/4; i++) |
||||
pad[i] = 0; |
||||
|
||||
/* key > blocksize bytes, hash it */ |
||||
pbkdf2_hash(pad, key); |
||||
|
||||
/* inner = (key ^ 0x36) */ |
||||
/* h(inner || ...) */ |
||||
#pragma unroll 18 |
||||
for (i = 0; i < 72/4; i++) |
||||
pad[i] ^= 0x36363636; |
||||
keccak_hash_update72(&st->inner, pad); |
||||
|
||||
/* outer = (key ^ 0x5c) */ |
||||
/* h(outer || ...) */ |
||||
#pragma unroll 18 |
||||
for (i = 0; i < 72/4; i++) |
||||
pad[i] ^= 0x6a6a6a6a; |
||||
keccak_hash_update72(&st->outer, pad); |
||||
} |
||||
|
||||
// assuming there is no leftover data and exactly 72 bytes are incoming |
||||
// we can directly call into the block hashing function |
||||
__device__ void pbkdf2_hmac_update72(pbkdf2_hmac_state *st, const uint32_t *m) { |
||||
/* h(inner || m...) */ |
||||
keccak_hash_update72(&st->inner, m); |
||||
} |
||||
|
||||
__device__ void pbkdf2_hmac_update8(pbkdf2_hmac_state *st, const uint32_t *m) { |
||||
/* h(inner || m...) */ |
||||
keccak_hash_update8(&st->inner, m); |
||||
} |
||||
|
||||
__device__ void pbkdf2_hmac_update4_8(pbkdf2_hmac_state *st, const uint32_t *m) { |
||||
/* h(inner || m...) */ |
||||
keccak_hash_update4_8(&st->inner, m); |
||||
} |
||||
|
||||
__device__ void pbkdf2_hmac_update4_56(pbkdf2_hmac_state *st, const uint32_t *m) { |
||||
/* h(inner || m...) */ |
||||
keccak_hash_update4_56(&st->inner, m); |
||||
} |
||||
|
||||
__device__ void pbkdf2_hmac_update56(pbkdf2_hmac_state *st, const uint32_t *m) { |
||||
/* h(inner || m...) */ |
||||
keccak_hash_update56(&st->inner, m); |
||||
} |
||||
|
||||
__device__ void pbkdf2_hmac_finish12(pbkdf2_hmac_state *st, uint32_t *mac) { |
||||
/* h(inner || m) */ |
||||
uint32_t innerhash[16]; |
||||
keccak_hash_finish12(&st->inner, innerhash); |
||||
|
||||
/* h(outer || h(inner || m)) */ |
||||
keccak_hash_update64(&st->outer, innerhash); |
||||
keccak_hash_finish64(&st->outer, mac); |
||||
} |
||||
|
||||
__device__ void pbkdf2_hmac_finish60(pbkdf2_hmac_state *st, uint32_t *mac) { |
||||
/* h(inner || m) */ |
||||
uint32_t innerhash[16]; |
||||
keccak_hash_finish60(&st->inner, innerhash); |
||||
|
||||
/* h(outer || h(inner || m)) */ |
||||
keccak_hash_update64(&st->outer, innerhash); |
||||
keccak_hash_finish64(&st->outer, mac); |
||||
} |
||||
|
||||
__device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s) { |
||||
statecopy8(&d->inner, &s->inner); |
||||
statecopy0(&d->outer, &s->outer); |
||||
} |
||||
|
||||
// ---------------------------- END PBKDF2 functions ------------------------------------ |
||||
|
||||
static __device__ uint32_t cuda_swab32(uint32_t x) { |
||||
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) |
||||
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); |
||||
} |
||||
|
||||
__global__ __launch_bounds__(128)
void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
{
	nonce   += (blockIdx.x * blockDim.x) + threadIdx.x;
	g_idata += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x);

	uint32_t data[20];

#pragma unroll 19
	for (int i=0; i < 19; ++i)
		data[i] = cuda_swab32(pdata[i]);
	data[19] = cuda_swab32(nonce);

	// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)data, 80, (uint8_t*)g_idata, 128);

	pbkdf2_hmac_state hmac_pw, work;
	uint32_t ti[16];
	uint32_t be;

	/* hmac(password, ...) */
	pbkdf2_hmac_init80(&hmac_pw, data);

	/* hmac(password, salt...) */
	pbkdf2_hmac_update72(&hmac_pw, data);
	pbkdf2_hmac_update8(&hmac_pw, data+72/4);

	/* U1 = hmac(password, salt || be(1)) */
	be = cuda_swab32(1);
	pbkdf2_statecopy8(&work, &hmac_pw);
	pbkdf2_hmac_update4_8(&work, &be);
	pbkdf2_hmac_finish12(&work, ti);
	mycpy64(g_idata, ti);

	/* U2 = hmac(password, salt || be(2)) */
	be = cuda_swab32(2);
	pbkdf2_statecopy8(&work, &hmac_pw);
	pbkdf2_hmac_update4_8(&work, &be);
	pbkdf2_hmac_finish12(&work, ti);
	mycpy64(g_idata+16, ti);
}
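
// Note (added): PBKDF2 with one iteration and 128 bytes of output needs exactly
// two HMAC blocks from the 64-byte digest, U1 = HMAC(pwd, salt || be32(1)) and
// U2 = HMAC(pwd, salt || be32(2)), hence the two statecopy/finish rounds above.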
||||
|
||||
|
||||
__global__ __launch_bounds__(128)
void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce)
{
	nonce   += (blockIdx.x * blockDim.x) + threadIdx.x;
	g_odata += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x);
	g_hash  +=  8 * ((blockIdx.x * blockDim.x) + threadIdx.x);

	uint32_t data[20];

#pragma unroll 19
	for (int i=0; i < 19; ++i)
		data[i] = cuda_swab32(pdata[i]);
	data[19] = cuda_swab32(nonce);

	// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)g_odata, 128, (uint8_t*)g_hash, 32);

	pbkdf2_hmac_state hmac_pw;
	uint32_t ti[16];
	uint32_t be;

	/* hmac(password, ...) */
	pbkdf2_hmac_init80(&hmac_pw, data);

	/* hmac(password, salt...) */
	pbkdf2_hmac_update72(&hmac_pw, g_odata);
	pbkdf2_hmac_update56(&hmac_pw, g_odata+72/4);

	/* U1 = hmac(password, salt || be(1)) */
	be = cuda_swab32(1);
	pbkdf2_hmac_update4_56(&hmac_pw, &be);
	pbkdf2_hmac_finish60(&hmac_pw, ti);
	mycpy32(g_hash, ti);
}
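
// Note (added): here only 32 bytes of output are required, so a single block
// U1 = HMAC(pwd, salt || be32(1)) suffices and mycpy32 keeps its first half.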
||||
|
||||
//
// callable host code to initialize constants and to call kernels
//

static bool init[MAX_GPUS] = { 0 };

extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20])
{
	if (!init[thr_id])
	{
		checkCudaErrors(cudaMemcpyToSymbol(c_keccak_round_constants, host_keccak_round_constants, sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
		init[thr_id] = true;
	}
	checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
}

extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput)
{
	dim3 block(128);
	dim3 grid((throughput+127)/128);

	cuda_pre_keccak512<<<grid, block, 0, context_streams[stream][thr_id]>>>(context_idata[stream][thr_id], nonce);
}

extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throughput)
{
	dim3 block(128);
	dim3 grid((throughput+127)/128);

	cuda_post_keccak512<<<grid, block, 0, context_streams[stream][thr_id]>>>(context_odata[stream][thr_id], context_hash[stream][thr_id], nonce);
}
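
// Illustrative call sequence (editor's sketch with assumed helper names, not
// code from this file): a scanhash loop for a scrypt-jane coin would drive the
// wrappers above once per stream, roughly like this. 'run_scrypt_core' stands
// in for the salsa_kernel dispatch and is a hypothetical name here.
//
//   prepare_keccak512(thr_id, pdata);                  // once per work item
//   pre_keccak512(thr_id, stream, nonce, throughput);  // PBKDF2 front half -> idata
//   run_scrypt_core(thr_id, stream);                   // scrypt core (assumed helper)
//   post_keccak512(thr_id, stream, nonce, throughput); // PBKDF2 back half -> hashes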
||||
|
||||
|
||||
// |
||||
// Maxcoin related Keccak implementation (Keccak256) |
||||
// |
||||
|
||||
#include <stdint.h> |
||||
|
||||
#include <map> |
||||
extern std::map<int, int> context_blocks; |
||||
extern std::map<int, int> context_wpb; |
||||
extern std::map<int, KernelInterface *> context_kernel; |
||||
|
||||
__constant__ uint64_t ptarget64[4]; |
||||
|
||||
#define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64)))) |
||||
#define ROL_mult8(a, offset) ROL(a, offset) |
||||
|
||||
__constant__ uint64_t KeccakF_RoundConstants[24]; |
||||
|
||||
static uint64_t host_KeccakF_RoundConstants[24] = { |
||||
(uint64_t)0x0000000000000001ULL, |
||||
(uint64_t)0x0000000000008082ULL, |
||||
(uint64_t)0x800000000000808aULL, |
||||
(uint64_t)0x8000000080008000ULL, |
||||
(uint64_t)0x000000000000808bULL, |
||||
(uint64_t)0x0000000080000001ULL, |
||||
(uint64_t)0x8000000080008081ULL, |
||||
(uint64_t)0x8000000000008009ULL, |
||||
(uint64_t)0x000000000000008aULL, |
||||
(uint64_t)0x0000000000000088ULL, |
||||
(uint64_t)0x0000000080008009ULL, |
||||
(uint64_t)0x000000008000000aULL, |
||||
(uint64_t)0x000000008000808bULL, |
||||
(uint64_t)0x800000000000008bULL, |
||||
(uint64_t)0x8000000000008089ULL, |
||||
(uint64_t)0x8000000000008003ULL, |
||||
(uint64_t)0x8000000000008002ULL, |
||||
(uint64_t)0x8000000000000080ULL, |
||||
(uint64_t)0x000000000000800aULL, |
||||
(uint64_t)0x800000008000000aULL, |
||||
(uint64_t)0x8000000080008081ULL, |
||||
(uint64_t)0x8000000000008080ULL, |
||||
(uint64_t)0x0000000080000001ULL, |
||||
(uint64_t)0x8000000080008008ULL |
||||
}; |
||||
|
||||
__constant__ uint64_t pdata64[10]; |
||||
|
||||
__global__ |
||||
void crypto_hash(uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate) |
||||
{ |
||||
uint64_t Aba, Abe, Abi, Abo, Abu; |
||||
uint64_t Aga, Age, Agi, Ago, Agu; |
||||
uint64_t Aka, Ake, Aki, Ako, Aku; |
||||
uint64_t Ama, Ame, Ami, Amo, Amu; |
||||
uint64_t Asa, Ase, Asi, Aso, Asu; |
||||
uint64_t BCa, BCe, BCi, BCo, BCu; |
||||
uint64_t Da, De, Di, Do, Du; |
||||
uint64_t Eba, Ebe, Ebi, Ebo, Ebu; |
||||
uint64_t Ega, Ege, Egi, Ego, Egu; |
||||
uint64_t Eka, Eke, Eki, Eko, Eku; |
||||
uint64_t Ema, Eme, Emi, Emo, Emu; |
||||
uint64_t Esa, Ese, Esi, Eso, Esu; |
||||
|
||||
//copyFromState(A, state) |
||||
Aba = pdata64[0]; |
||||
Abe = pdata64[1]; |
||||
Abi = pdata64[2]; |
||||
Abo = pdata64[3]; |
||||
Abu = pdata64[4]; |
||||
Aga = pdata64[5]; |
||||
Age = pdata64[6]; |
||||
Agi = pdata64[7]; |
||||
Ago = pdata64[8]; |
||||
Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32); |
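	// (added comment) lane 9 covers header bytes 72..79: the low word keeps the
	// last four fixed header bytes, while the high word is replaced by this
	// thread's byte-swapped nonce (base nonce + global thread index).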
||||
Aka = 0x0000000000000001ULL; |
||||
Ake = 0; |
||||
Aki = 0; |
||||
Ako = 0; |
||||
Aku = 0; |
||||
Ama = 0; |
||||
Ame = 0x8000000000000000ULL; |
||||
Ami = 0; |
||||
Amo = 0; |
||||
Amu = 0; |
||||
Asa = 0; |
||||
Ase = 0; |
||||
Asi = 0; |
||||
Aso = 0; |
||||
Asu = 0; |
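	// (added comment) this is a one-block absorb of the 80-byte header at the
	// Keccak-256 rate of 1088 bits (136 bytes): pad byte 0x01 lands at offset 80
	// (lane Aka) and the final 0x80 at offset 135 (top byte of lane Ame); the
	// remaining rate words and the capacity lanes start out zero.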
||||
|
||||
#pragma unroll 12 |
||||
for( int laneCount = 0; laneCount < 24; laneCount += 2 ) |
||||
{ |
||||
// prepareTheta |
||||
BCa = Aba^Aga^Aka^Ama^Asa; |
||||
BCe = Abe^Age^Ake^Ame^Ase; |
||||
BCi = Abi^Agi^Aki^Ami^Asi; |
||||
BCo = Abo^Ago^Ako^Amo^Aso; |
||||
BCu = Abu^Agu^Aku^Amu^Asu; |
||||
|
||||
//thetaRhoPiChiIotaPrepareTheta(round , A, E) |
||||
Da = BCu^ROL(BCe, 1); |
||||
De = BCa^ROL(BCi, 1); |
||||
Di = BCe^ROL(BCo, 1); |
||||
Do = BCi^ROL(BCu, 1); |
||||
Du = BCo^ROL(BCa, 1); |
||||
|
||||
Aba ^= Da; |
||||
BCa = Aba; |
||||
Age ^= De; |
||||
BCe = ROL(Age, 44); |
||||
Aki ^= Di; |
||||
BCi = ROL(Aki, 43); |
||||
Amo ^= Do; |
||||
BCo = ROL(Amo, 21); |
||||
Asu ^= Du; |
||||
BCu = ROL(Asu, 14); |
||||
Eba = BCa ^((~BCe)& BCi ); |
||||
Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount]; |
||||
Ebe = BCe ^((~BCi)& BCo ); |
||||
Ebi = BCi ^((~BCo)& BCu ); |
||||
Ebo = BCo ^((~BCu)& BCa ); |
||||
Ebu = BCu ^((~BCa)& BCe ); |
||||
|
||||
Abo ^= Do; |
||||
BCa = ROL(Abo, 28); |
||||
Agu ^= Du; |
||||
BCe = ROL(Agu, 20); |
||||
Aka ^= Da; |
||||
BCi = ROL(Aka, 3); |
||||
Ame ^= De; |
||||
BCo = ROL(Ame, 45); |
||||
Asi ^= Di; |
||||
BCu = ROL(Asi, 61); |
||||
Ega = BCa ^((~BCe)& BCi ); |
||||
Ege = BCe ^((~BCi)& BCo ); |
||||
Egi = BCi ^((~BCo)& BCu ); |
||||
Ego = BCo ^((~BCu)& BCa ); |
||||
Egu = BCu ^((~BCa)& BCe ); |
||||
|
||||
Abe ^= De; |
||||
BCa = ROL(Abe, 1); |
||||
Agi ^= Di; |
||||
BCe = ROL(Agi, 6); |
||||
Ako ^= Do; |
||||
BCi = ROL(Ako, 25); |
||||
Amu ^= Du; |
||||
BCo = ROL_mult8(Amu, 8); |
||||
Asa ^= Da; |
||||
BCu = ROL(Asa, 18); |
||||
Eka = BCa ^((~BCe)& BCi ); |
||||
Eke = BCe ^((~BCi)& BCo ); |
||||
Eki = BCi ^((~BCo)& BCu ); |
||||
Eko = BCo ^((~BCu)& BCa ); |
||||
Eku = BCu ^((~BCa)& BCe ); |
||||
|
||||
Abu ^= Du; |
||||
BCa = ROL(Abu, 27); |
||||
Aga ^= Da; |
||||
BCe = ROL(Aga, 36); |
||||
Ake ^= De; |
||||
BCi = ROL(Ake, 10); |
||||
Ami ^= Di; |
||||
BCo = ROL(Ami, 15); |
||||
Aso ^= Do; |
||||
BCu = ROL_mult8(Aso, 56); |
||||
Ema = BCa ^((~BCe)& BCi ); |
||||
Eme = BCe ^((~BCi)& BCo ); |
||||
Emi = BCi ^((~BCo)& BCu ); |
||||
Emo = BCo ^((~BCu)& BCa ); |
||||
Emu = BCu ^((~BCa)& BCe ); |
||||
|
||||
Abi ^= Di; |
||||
BCa = ROL(Abi, 62); |
||||
Ago ^= Do; |
||||
BCe = ROL(Ago, 55); |
||||
Aku ^= Du; |
||||
BCi = ROL(Aku, 39); |
||||
Ama ^= Da; |
||||
BCo = ROL(Ama, 41); |
||||
Ase ^= De; |
||||
BCu = ROL(Ase, 2); |
||||
Esa = BCa ^((~BCe)& BCi ); |
||||
Ese = BCe ^((~BCi)& BCo ); |
||||
Esi = BCi ^((~BCo)& BCu ); |
||||
Eso = BCo ^((~BCu)& BCa ); |
||||
Esu = BCu ^((~BCa)& BCe ); |
||||
|
||||
// prepareTheta |
||||
BCa = Eba^Ega^Eka^Ema^Esa; |
||||
BCe = Ebe^Ege^Eke^Eme^Ese; |
||||
BCi = Ebi^Egi^Eki^Emi^Esi; |
||||
BCo = Ebo^Ego^Eko^Emo^Eso; |
||||
BCu = Ebu^Egu^Eku^Emu^Esu; |
||||
|
||||
//thetaRhoPiChiIotaPrepareTheta(round+1, E, A) |
||||
Da = BCu^ROL(BCe, 1); |
||||
De = BCa^ROL(BCi, 1); |
||||
Di = BCe^ROL(BCo, 1); |
||||
Do = BCi^ROL(BCu, 1); |
||||
Du = BCo^ROL(BCa, 1); |
||||
|
||||
Eba ^= Da; |
||||
BCa = Eba; |
||||
Ege ^= De; |
||||
BCe = ROL(Ege, 44); |
||||
Eki ^= Di; |
||||
BCi = ROL(Eki, 43); |
||||
Emo ^= Do; |
||||
BCo = ROL(Emo, 21); |
||||
Esu ^= Du; |
||||
BCu = ROL(Esu, 14); |
||||
Aba = BCa ^((~BCe)& BCi ); |
||||
Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1]; |
||||
Abe = BCe ^((~BCi)& BCo ); |
||||
Abi = BCi ^((~BCo)& BCu ); |
||||
Abo = BCo ^((~BCu)& BCa ); |
||||
Abu = BCu ^((~BCa)& BCe ); |
||||
|
||||
Ebo ^= Do; |
||||
BCa = ROL(Ebo, 28); |
||||
Egu ^= Du; |
||||
BCe = ROL(Egu, 20); |
||||
Eka ^= Da; |
||||
BCi = ROL(Eka, 3); |
||||
Eme ^= De; |
||||
BCo = ROL(Eme, 45); |
||||
Esi ^= Di; |
||||
BCu = ROL(Esi, 61); |
||||
Aga = BCa ^((~BCe)& BCi ); |
||||
Age = BCe ^((~BCi)& BCo ); |
||||
Agi = BCi ^((~BCo)& BCu ); |
||||
Ago = BCo ^((~BCu)& BCa ); |
||||
Agu = BCu ^((~BCa)& BCe ); |
||||
|
||||
Ebe ^= De; |
||||
BCa = ROL(Ebe, 1); |
||||
Egi ^= Di; |
||||
BCe = ROL(Egi, 6); |
||||
Eko ^= Do; |
||||
BCi = ROL(Eko, 25); |
||||
Emu ^= Du; |
||||
BCo = ROL_mult8(Emu, 8); |
||||
Esa ^= Da; |
||||
BCu = ROL(Esa, 18); |
||||
Aka = BCa ^((~BCe)& BCi ); |
||||
Ake = BCe ^((~BCi)& BCo ); |
||||
Aki = BCi ^((~BCo)& BCu ); |
||||
Ako = BCo ^((~BCu)& BCa ); |
||||
Aku = BCu ^((~BCa)& BCe ); |
||||
|
||||
Ebu ^= Du; |
||||
BCa = ROL(Ebu, 27); |
||||
Ega ^= Da; |
||||
BCe = ROL(Ega, 36); |
||||
Eke ^= De; |
||||
BCi = ROL(Eke, 10); |
||||
Emi ^= Di; |
||||
BCo = ROL(Emi, 15); |
||||
Eso ^= Do; |
||||
BCu = ROL_mult8(Eso, 56); |
||||
Ama = BCa ^((~BCe)& BCi ); |
||||
Ame = BCe ^((~BCi)& BCo ); |
||||
Ami = BCi ^((~BCo)& BCu ); |
||||
Amo = BCo ^((~BCu)& BCa ); |
||||
Amu = BCu ^((~BCa)& BCe ); |
||||
|
||||
Ebi ^= Di; |
||||
BCa = ROL(Ebi, 62); |
||||
Ego ^= Do; |
||||
BCe = ROL(Ego, 55); |
||||
Eku ^= Du; |
||||
BCi = ROL(Eku, 39); |
||||
Ema ^= Da; |
||||
BCo = ROL(Ema, 41); |
||||
Ese ^= De; |
||||
BCu = ROL(Ese, 2); |
||||
Asa = BCa ^((~BCe)& BCi ); |
||||
Ase = BCe ^((~BCi)& BCo ); |
||||
Asi = BCi ^((~BCo)& BCu ); |
||||
Aso = BCo ^((~BCu)& BCa ); |
||||
Asu = BCu ^((~BCa)& BCe ); |
||||
} |
||||
|
||||
	if (validate) {
		g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
		g_out[3] = Abo;
		g_out[2] = Abi;
		g_out[1] = Abe;
		g_out[0] = Aba;
	}

	// the likelihood of meeting the hashing target is so low that we do not guard
	// this write with atomics, locks or similar...
	uint64_t *g_good64 = (uint64_t*)g_good;
	if (Abo <= ptarget64[3]) {
		if (Abo < g_good64[3]) {
			g_good64[3] = Abo;
			g_good64[2] = Abi;
			g_good64[1] = Abe;
			g_good64[0] = Aba;
			g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
		}
	}
}
||||
|
||||
static std::map<int, uint32_t *> context_good[2];

// prepare the Keccak256 round constants, result buffers, block header and target
bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
{
	static bool init[MAX_DEVICES] = {false};
	if (!init[thr_id])
	{
		checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));

		// allocate device memory for the best ("good") hash candidate and its nonce
		uint32_t *tmp;
		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;

		init[thr_id] = true;
	}
	checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));

	return context_good[0][thr_id] && context_good[1][thr_id];
}

void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
{
	checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));

	crypto_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);

	// copy hashes from device memory to host (ALL hashes, lots of data...)
	if (do_d2h && hash != NULL) {
		size_t mem_size = throughput * sizeof(uint32_t) * 8;
		checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
		                cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
	}
	else if (hash != NULL) {
		// asynchronous copy of winning nonce (just 4 bytes...)
		checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
		                cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
	}
}
@@ -0,0 +1,8 @@

#ifndef KECCAK_H
#define KECCAK_H

#include <stdint.h>

extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20]);
extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput);
extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throughput);

#endif // #ifndef KECCAK_H
@@ -0,0 +1,781 @@

/* Copyright (C) 2013 David G. Andersen. All rights reserved. |
||||
* with modifications by Christian Buchner |
||||
* |
||||
* Use of this code is covered under the Apache 2.0 license, which |
||||
* can be found in the file "LICENSE" |
||||
*/ |
||||
|
||||
// TODO: attempt V.Volkov style ILP (factor 4) |
||||
|
||||
#include <map> |
||||
|
||||
#include "cuda_runtime.h" |
||||
#include "miner.h" |
||||
|
||||
#include "salsa_kernel.h" |
||||
#include "kepler_kernel.h" |
||||
|
||||
#define TEXWIDTH 32768 |
||||
#define THREADS_PER_WU 4 // four threads per hash |
||||
|
||||
typedef enum |
||||
{ |
||||
ANDERSEN, |
||||
SIMPLE |
||||
} MemoryAccess; |
||||
|
||||
// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) |
||||
__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; |
||||
|
||||
// iteration count N |
||||
__constant__ uint32_t c_N; |
||||
__constant__ uint32_t c_N_1; // N-1 |
||||
// scratch buffer size SCRATCH |
||||
__constant__ uint32_t c_SCRATCH; |
||||
__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP) |
||||
__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP) - 1 |
||||
|
||||
// using texture references for the "tex" variants of the B kernels |
||||
texture<uint4, 1, cudaReadModeElementType> texRef1D_4_V; |
||||
texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V; |
||||
|
||||
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); |
||||
|
||||
static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) { |
||||
left.x ^= right.x; |
||||
left.y ^= right.y; |
||||
left.z ^= right.z; |
||||
left.w ^= right.w; |
||||
return left; |
||||
} |
||||
|
||||
static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) { |
||||
left.x += right.x; |
||||
left.y += right.y; |
||||
left.z += right.z; |
||||
left.w += right.w; |
||||
return left; |
||||
} |
||||
|
||||
static __device__ uint4 __shfl(const uint4 bx, int target_thread) { |
||||
return make_uint4( |
||||
__shfl((int)bx.x, target_thread), |
||||
__shfl((int)bx.y, target_thread), |
||||
__shfl((int)bx.z, target_thread), |
||||
__shfl((int)bx.w, target_thread) |
||||
); |
||||
} |
||||
|
||||
/* write_keys writes the 8 keys being processed by a warp to the global |
||||
* scratchpad. To effectively use memory bandwidth, it performs the writes |
||||
* (and reads, for read_keys) 128 bytes at a time per memory location |
||||
* by __shfl'ing the 4 entries in bx to the threads in the next-up |
||||
* thread group. It then has eight threads together perform uint4 |
||||
* (128 bit) writes to the destination region. This seems to make |
||||
* quite effective use of memory bandwidth. An approach that spread |
||||
* uint32s across more threads was slower because of the increased |
||||
* computation it required. |
||||
* |
||||
* "start" is the loop iteration producing the write - the offset within |
||||
* the block's memory. |
||||
* |
||||
* Internally, this algorithm first __shfl's the 4 bx entries to |
||||
* the next up thread group, and then uses a conditional move to |
||||
* ensure that odd-numbered thread groups exchange the b/bx ordering |
||||
* so that the right parts are written together. |
||||
* |
||||
* Thanks to Babu for helping design the 128-bit-per-write version. |
||||
* |
||||
* _direct lets the caller specify the absolute start location instead of |
||||
* the relative start location, as an attempt to reduce some recomputation. |
||||
*/ |
||||
|
||||
template <MemoryAccess SCHEME> __device__ __forceinline__ |
||||
void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) |
||||
{ |
||||
uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; |
||||
|
||||
if (SCHEME == ANDERSEN) { |
||||
int target_thread = (threadIdx.x + 4)%32; |
||||
uint4 t=b, t2=__shfl(bx, target_thread); |
||||
int t2_start = __shfl((int)start, target_thread) + 4; |
||||
bool c = (threadIdx.x & 0x4); |
||||
*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); |
||||
*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2); |
||||
} else if (SCHEME == SIMPLE) { |
||||
*((uint4 *)(&scratch[start ])) = b; |
||||
*((uint4 *)(&scratch[start+16])) = bx; |
||||
} |
||||
} |
||||
|
||||
template <MemoryAccess SCHEME, int TEX_DIM> __device__ __forceinline__ |
||||
void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) |
||||
{ |
||||
uint32_t *scratch; |
||||
|
||||
if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; |
||||
if (SCHEME == ANDERSEN) { |
||||
int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4; |
||||
if (TEX_DIM > 0) { start /= 4; t2_start /= 4; } |
||||
bool c = (threadIdx.x & 0x4); |
||||
if (TEX_DIM == 0) { |
||||
b = *((uint4 *)(&scratch[c ? t2_start : start])); |
||||
bx = *((uint4 *)(&scratch[c ? start : t2_start])); |
||||
} else if (TEX_DIM == 1) { |
||||
b = tex1Dfetch(texRef1D_4_V, c ? t2_start : start); |
||||
bx = tex1Dfetch(texRef1D_4_V, c ? start : t2_start); |
||||
} else if (TEX_DIM == 2) { |
||||
b = tex2D(texRef2D_4_V, 0.5f + ((c ? t2_start : start)%TEXWIDTH), 0.5f + ((c ? t2_start : start)/TEXWIDTH)); |
||||
bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH)); |
||||
} |
||||
uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx); |
||||
bx = __shfl(bx, (threadIdx.x + 28)%32); |
||||
} else { |
||||
if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start])); |
||||
else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4); |
||||
else if (TEX_DIM == 2) b = tex2D(texRef2D_4_V, 0.5f + ((start/4)%TEXWIDTH), 0.5f + ((start/4)/TEXWIDTH)); |
||||
if (TEX_DIM == 0) bx = *((uint4 *)(&scratch[start+16])); |
||||
else if (TEX_DIM == 1) bx = tex1Dfetch(texRef1D_4_V, (start+16)/4); |
||||
else if (TEX_DIM == 2) bx = tex2D(texRef2D_4_V, 0.5f + (((start+16)/4)%TEXWIDTH), 0.5f + (((start+16)/4)/TEXWIDTH)); |
||||
} |
||||
} |
||||
|
||||
|
||||
__device__ __forceinline__ |
||||
void primary_order_shuffle(uint4 &b, uint4 &bx) |
||||
{ |
||||
/* Inner loop shuffle targets */ |
||||
int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); |
||||
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); |
||||
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); |
||||
|
||||
b.w = __shfl((int)b.w, x1); |
||||
b.z = __shfl((int)b.z, x2); |
||||
b.y = __shfl((int)b.y, x3); |
||||
uint32_t tmp = b.y; b.y = b.w; b.w = tmp; |
||||
|
||||
bx.w = __shfl((int)bx.w, x1); |
||||
bx.z = __shfl((int)bx.z, x2); |
||||
bx.y = __shfl((int)bx.y, x3); |
||||
tmp = bx.y; bx.y = bx.w; bx.w = tmp; |
||||
} |
||||
|
||||
/* |
||||
* load_key loads a 32*32bit key from a contiguous region of memory in B. |
||||
* The input keys are in external order (i.e., 0, 1, 2, 3, ...). |
||||
* After loading, each thread has its four b and four bx keys stored |
||||
* in internal processing order. |
||||
*/ |
||||
|
||||
__device__ __forceinline__ |
||||
void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx) |
||||
{ |
||||
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; |
||||
int key_offset = scrypt_block * 32; |
||||
uint32_t thread_in_block = threadIdx.x % 4; |
||||
|
||||
// Read in permuted order. Key loads are not our bottleneck right now. |
||||
b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4]; |
||||
b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4]; |
||||
b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4]; |
||||
b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4]; |
||||
bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16]; |
||||
bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16]; |
||||
bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16]; |
||||
bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16]; |
||||
|
||||
primary_order_shuffle(b, bx); |
||||
} |
||||
|
||||
/* |
||||
* store_key performs the opposite transform as load_key, taking |
||||
* internally-ordered b and bx and storing them into a contiguous |
||||
* region of B in external order. |
||||
*/ |
||||
|
||||
__device__ __forceinline__ |
||||
void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx) |
||||
{ |
||||
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; |
||||
int key_offset = scrypt_block * 32; |
||||
uint32_t thread_in_block = threadIdx.x % 4; |
||||
|
||||
primary_order_shuffle(b, bx); |
||||
|
||||
B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x; |
||||
B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y; |
||||
B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z; |
||||
B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w; |
||||
B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x; |
||||
B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y; |
||||
B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z; |
||||
B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w; |
||||
} |
||||
|
||||
|
||||
/* |
||||
* load_key loads a 32*32bit key from a contiguous region of memory in B. |
||||
* The input keys are in external order (i.e., 0, 1, 2, 3, ...). |
||||
* After loading, each thread has its four b and four bx keys stored |
||||
* in internal processing order. |
||||
*/ |
||||
|
||||
__device__ __forceinline__ |
||||
void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx) |
||||
{ |
||||
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; |
||||
int key_offset = scrypt_block * 32; |
||||
uint32_t thread_in_block = threadIdx.x % 4; |
||||
|
||||
// Read in permuted order. Key loads are not our bottleneck right now. |
||||
b.x = B[key_offset + 4*0 + thread_in_block%4]; |
||||
b.y = B[key_offset + 4*1 + thread_in_block%4]; |
||||
b.z = B[key_offset + 4*2 + thread_in_block%4]; |
||||
b.w = B[key_offset + 4*3 + thread_in_block%4]; |
||||
bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16]; |
||||
bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16]; |
||||
bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16]; |
||||
bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16]; |
||||
} |
||||
|
||||
/* |
||||
* store_key performs the opposite transform as load_key, taking |
||||
* internally-ordered b and bx and storing them into a contiguous |
||||
* region of B in external order. |
||||
*/ |
||||
|
||||
__device__ __forceinline__ |
||||
void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx) |
||||
{ |
||||
int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; |
||||
int key_offset = scrypt_block * 32; |
||||
uint32_t thread_in_block = threadIdx.x % 4; |
||||
|
||||
B[key_offset + 4*0 + thread_in_block%4] = b.x; |
||||
B[key_offset + 4*1 + thread_in_block%4] = b.y; |
||||
B[key_offset + 4*2 + thread_in_block%4] = b.z; |
||||
B[key_offset + 4*3 + thread_in_block%4] = b.w; |
||||
B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x; |
||||
B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y; |
||||
B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z; |
||||
B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w; |
||||
} |
||||
|
||||
|
||||
template <int ALGO> __device__ __forceinline__ |
||||
void load_key(const uint32_t *B, uint4 &b, uint4 &bx) |
||||
{ |
||||
switch(ALGO) { |
||||
case A_SCRYPT: load_key_salsa(B, b, bx); break; |
||||
case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break; |
||||
} |
||||
} |
||||
|
||||
template <int ALGO> __device__ __forceinline__ |
||||
void store_key(uint32_t *B, uint4 &b, uint4 &bx) |
||||
{ |
||||
switch(ALGO) { |
||||
case A_SCRYPT: store_key_salsa(B, b, bx); break; |
||||
case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break; |
||||
} |
||||
} |
||||
|
||||
|
||||
/* |
||||
* salsa_xor_core (Salsa20/8 cipher)
||||
* The original scrypt called: |
||||
* xor_salsa8(&X[0], &X[16]); <-- the "b" loop |
||||
* xor_salsa8(&X[16], &X[0]); <-- the "bx" loop |
||||
* This version is unrolled to handle both of these loops in a single |
||||
* call to avoid unnecessary data movement. |
||||
*/ |
||||
|
||||
#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<<amt)|(tmp>>(32-amt))); } |
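
// For reference (added comment): XOR_ROTATE_ADD(dst, s1, s2, amt) is the standard
// Salsa20 primitive dst ^= ROTL32(s1 + s2, amt), so each group of four calls below
// is one Salsa20 quarter-round over the uint4 lanes:
//
//   x.y ^= ROTL32(x.x + x.w, 7);
//   x.z ^= ROTL32(x.y + x.x, 9);
//   x.w ^= ROTL32(x.z + x.y, 13);
//   x.x ^= ROTL32(x.w + x.z, 18);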
||||
|
||||
__device__ __forceinline__ |
||||
void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) |
||||
{ |
||||
uint4 x; |
||||
|
||||
b ^= bx; |
||||
x = b; |
||||
|
||||
// Enter in "primary order" (t0 has 0, 4, 8, 12) |
||||
// (t1 has 5, 9, 13, 1) |
||||
// (t2 has 10, 14, 2, 6) |
||||
// (t3 has 15, 3, 7, 11) |
||||
|
||||
#pragma unroll |
||||
for (int j = 0; j < 4; j++) { |
||||
|
||||
// Mixing phase of salsa |
||||
XOR_ROTATE_ADD(x.y, x.x, x.w, 7); |
||||
XOR_ROTATE_ADD(x.z, x.y, x.x, 9); |
||||
XOR_ROTATE_ADD(x.w, x.z, x.y, 13); |
||||
XOR_ROTATE_ADD(x.x, x.w, x.z, 18); |
||||
|
||||
/* Transpose rows and columns. */ |
||||
/* Unclear if this optimization is needed: These are ordered based |
||||
* upon the dependencies needed in the later xors. Compiler should be |
||||
* able to figure this out, but might as well give it a hand. */ |
||||
x.y = __shfl((int)x.y, x3); |
||||
x.w = __shfl((int)x.w, x1); |
||||
x.z = __shfl((int)x.z, x2); |
||||
|
||||
/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first, |
||||
* but the register targets are rewritten here to swap x[1] and x[3] so that |
||||
* they can be directly shuffled to and from our peer threads without |
||||
* reassignment. The reverse shuffle then puts them back in the right place. |
||||
*/ |
||||
|
||||
XOR_ROTATE_ADD(x.w, x.x, x.y, 7); |
||||
XOR_ROTATE_ADD(x.z, x.w, x.x, 9); |
||||
XOR_ROTATE_ADD(x.y, x.z, x.w, 13); |
||||
XOR_ROTATE_ADD(x.x, x.y, x.z, 18); |
||||
|
||||
x.w = __shfl((int)x.w, x3); |
||||
x.y = __shfl((int)x.y, x1); |
||||
x.z = __shfl((int)x.z, x2); |
||||
} |
||||
|
||||
b += x; |
||||
// The next two lines are the beginning of the BX-centric loop iteration |
||||
bx ^= b; |
||||
x = bx; |
||||
|
||||
// This is a copy of the same loop above, identical but stripped of comments. |
||||
// Duplicated so that we can complete a bx-based loop with fewer register moves. |
||||
#pragma unroll |
||||
for (int j = 0; j < 4; j++) { |
||||
XOR_ROTATE_ADD(x.y, x.x, x.w, 7); |
||||
XOR_ROTATE_ADD(x.z, x.y, x.x, 9); |
||||
XOR_ROTATE_ADD(x.w, x.z, x.y, 13); |
||||
XOR_ROTATE_ADD(x.x, x.w, x.z, 18); |
||||
|
||||
x.y = __shfl((int)x.y, x3); |
||||
x.w = __shfl((int)x.w, x1); |
||||
x.z = __shfl((int)x.z, x2); |
||||
|
||||
XOR_ROTATE_ADD(x.w, x.x, x.y, 7); |
||||
XOR_ROTATE_ADD(x.z, x.w, x.x, 9); |
||||
XOR_ROTATE_ADD(x.y, x.z, x.w, 13); |
||||
XOR_ROTATE_ADD(x.x, x.y, x.z, 18); |
||||
|
||||
x.w = __shfl((int)x.w, x3); |
||||
x.y = __shfl((int)x.y, x1); |
||||
x.z = __shfl((int)x.z, x2); |
||||
} |
||||
|
||||
// At the end of these iterations, the data is in primary order again. |
||||
#undef XOR_ROTATE_ADD |
||||
|
||||
bx += x; |
||||
} |
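
/*
 * For reference: XOR_ROTATE_ADD(dst, s1, s2, amt) above is exactly one Salsa20
 * quarter-round operation, dst ^= ROTL32(s1 + s2, amt). A minimal scalar
 * sketch of the full quarter-round built from it (rotl32/salsa_quarter are
 * illustrative names, not part of this file):
 */
#if 0
static inline uint32_t rotl32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }
static void salsa_quarter(uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d)
{
    b ^= rotl32(a + d, 7);
    c ^= rotl32(b + a, 9);
    d ^= rotl32(c + b, 13);
    a ^= rotl32(d + c, 18);
}
#endif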


/*
 * chacha_xor_core (ChaCha20/8 cipher)
 * Like salsa_xor_core, this version is unrolled to handle both the "b" and
 * the "bx" loop in a single call to avoid unnecessary data movement.
 *
 * load_key and store_key must not use primary order when
 * using ChaCha20/8, but rather the basic transposed order
 * (referred to as "column mode" below)
 */

#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<<amt)|(tmp>>(32-amt))); }

__device__ __forceinline__
void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
    uint4 x;

    b ^= bx;
    x = b;

    // Enter in "column" mode (t0 has 0, 4,  8, 12)
    //                        (t1 has 1, 5,  9, 13)
    //                        (t2 has 2, 6, 10, 14)
    //                        (t3 has 3, 7, 11, 15)

#pragma unroll
    for (int j = 0; j < 4; j++) {

        // Column Mixing phase of chacha
        CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
        CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
        CHACHA_PRIMITIVE(x.x, x.w, x.y,  8)
        CHACHA_PRIMITIVE(x.z, x.y, x.w,  7)

        x.y = __shfl((int)x.y, x1);
        x.z = __shfl((int)x.z, x2);
        x.w = __shfl((int)x.w, x3);

        // Diagonal Mixing phase of chacha
        CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
        CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
        CHACHA_PRIMITIVE(x.x, x.w, x.y,  8)
        CHACHA_PRIMITIVE(x.z, x.y, x.w,  7)

        x.y = __shfl((int)x.y, x3);
        x.z = __shfl((int)x.z, x2);
        x.w = __shfl((int)x.w, x1);
    }

    b += x;
    // The next two lines are the beginning of the BX-centric loop iteration
    bx ^= b;
    x = bx;

#pragma unroll
    for (int j = 0; j < 4; j++) {

        // Column Mixing phase of chacha
        CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
        CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
        CHACHA_PRIMITIVE(x.x, x.w, x.y,  8)
        CHACHA_PRIMITIVE(x.z, x.y, x.w,  7)

        x.y = __shfl((int)x.y, x1);
        x.z = __shfl((int)x.z, x2);
        x.w = __shfl((int)x.w, x3);

        // Diagonal Mixing phase of chacha
        CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
        CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
        CHACHA_PRIMITIVE(x.x, x.w, x.y,  8)
        CHACHA_PRIMITIVE(x.z, x.y, x.w,  7)

        x.y = __shfl((int)x.y, x3);
        x.z = __shfl((int)x.z, x2);
        x.w = __shfl((int)x.w, x1);
    }

#undef CHACHA_PRIMITIVE

    bx += x;
}
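
/*
 * For reference: CHACHA_PRIMITIVE(pt, rt, ps, amt) above is one ChaCha
 * quarter-round step, pt += ps; rt = ROTL32(rt ^ pt, amt); four of them with
 * the 16/12/8/7 rotations form a full quarter-round (illustrative names only):
 */
#if 0
static inline uint32_t rotl32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }
static void chacha_quarter(uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d)
{
    a += b; d = rotl32(d ^ a, 16);
    c += d; b = rotl32(b ^ c, 12);
    a += b; d = rotl32(d ^ a, 8);
    c += d; b = rotl32(b ^ c, 7);
}
#endif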


template <int ALGO> __device__ __forceinline__
void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
    switch(ALGO) {
    case A_SCRYPT:      salsa_xor_core(b, bx, x1, x2, x3); break;
    case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break;
    }
}


/*
 * kepler_scrypt_core_kernelA (the hash generation kernel) operates on a
 * group of 1024-bit input keys in B, stored as:
 *     B = { k1B k1Bx k2B k2Bx ... }
 * and fills up the scratchpad with the iterative hashes derived from
 * those keys:
 *     scratch = { k1h1B k1h1Bx k1h2B k1h2Bx ... k2h1B k2h1Bx k2h2B k2h2Bx ... }
 * scratch is 1024 times larger than the input keys B.
 * It is extremely important to stream writes effectively into scratch;
 * less important to coalesce the reads from B.
 *
 * Key ordering note: Keys are input from B in "original" order:
 *     K = { k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 }
 * After inputting into the kernel, each component k and kx of the
 * key is transmuted into a permuted internal order to make processing faster:
 *     K = k, kx with:
 *     k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11
 * and similarly for kx.
 */

template <int ALGO, MemoryAccess SCHEME> __global__
void kepler_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end)
{
    uint4 b, bx;

    int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
    int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
    int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

    int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

    int i = begin;

    if (i == 0) {
        load_key<ALGO>(d_idata, b, bx);
        write_keys_direct<SCHEME>(b, bx, start);
        ++i;
    } else read_keys_direct<SCHEME,0>(b, bx, start+32*(i-1));

    while (i < end) {
        block_mixer<ALGO>(b, bx, x1, x2, x3);
        write_keys_direct<SCHEME>(b, bx, start+32*i);
        ++i;
    }
}
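
/*
 * A small host-side sketch (illustrative only, not used by the miner) that
 * generates the permuted internal order quoted in the comment above: thread t
 * ends up holding external words 4*((t+k)%4) + t for k = 0..3.
 */
#if 0
static void print_internal_key_order()
{
    for (int t = 0; t < 4; t++)     // the four threads of one hash
        for (int k = 0; k < 4; k++) // the four b (or bx) words per thread
            printf("%d ", 4*((t+k)%4) + t);
    // prints: 0 4 8 12 5 9 13 1 10 14 2 6 15 3 7 11
}
#endif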

template <int ALGO, MemoryAccess SCHEME> __global__
void kepler_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP)
{
    uint4 b, bx;

    int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
    int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
    int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

    int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

    int i = begin;

    if (i == 0) {
        load_key<ALGO>(d_idata, b, bx);
        write_keys_direct<SCHEME>(b, bx, start);
        ++i;
    } else {
        int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP;
        read_keys_direct<SCHEME,0>(b, bx, start+32*pos);
        while(loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
    }

    while (i < end) {
        block_mixer<ALGO>(b, bx, x1, x2, x3);
        if (i % LOOKUP_GAP == 0)
            write_keys_direct<SCHEME>(b, bx, start+32*(i/LOOKUP_GAP));
        ++i;
    }
}
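
/*
 * The lookup gap trades memory for recomputation: the kernel above stores only
 * every LOOKUP_GAP-th iteration, so element i is recovered later by reading
 * slot i/LOOKUP_GAP and re-mixing the remainder. A sketch of that index
 * arithmetic (locate/gap_pos are illustrative names):
 */
#if 0
struct gap_pos { int slot; int remix; };
static gap_pos locate(int i, int lookup_gap)
{
    gap_pos p;
    p.slot  = i / lookup_gap;          // scratchpad entry that was actually written
    p.remix = i - p.slot * lookup_gap; // block_mixer calls needed to reach element i
    return p;
}
// e.g. LOOKUP_GAP = 2, i = 7 -> slot 3, remix 1: read entry 3, mix once
#endif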


/*
 * kepler_scrypt_core_kernelB (the hash mixing kernel) runs the second phase
 * of scrypt after the scratch buffer is filled with the iterative hashes:
 * It bounces through the scratch buffer in pseudorandom order, mixing the
 * key as it goes.
 */

template <int ALGO, MemoryAccess SCHEME, int TEX_DIM> __global__
void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
{
    uint4 b, bx;

    int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4);
    if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP;

    int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
    int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
    int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

    if (begin == 0) {
        read_keys_direct<SCHEME, TEX_DIM>(b, bx, start+32*c_N_1);
        block_mixer<ALGO>(b, bx, x1, x2, x3);
    } else load_key<ALGO>(d_odata, b, bx);

    for (int i = begin; i < end; i++) {
        int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
        uint4 t, tx; read_keys_direct<SCHEME, TEX_DIM>(t, tx, start+32*j);
        b ^= t; bx ^= tx;
        block_mixer<ALGO>(b, bx, x1, x2, x3);
    }

    store_key<ALGO>(d_odata, b, bx);
}
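
/*
 * Note on the pseudorandom index above: j is broadcast from lane 0 of each
 * 4-thread group (__shfl with threadIdx.x & 0x1c) and wrapped with "& c_N_1".
 * The mask is only equivalent to "% N" because scrypt's N is a power of two:
 */
#if 0
static unsigned wrap_index(unsigned x, unsigned N)
{
    return x & (N - 1); // N = 1 << k, so N-1 is a contiguous mask and x & (N-1) == x % N
}
#endif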

template <int ALGO, MemoryAccess SCHEME, int TEX_DIM> __global__
void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP)
{
    uint4 b, bx;

    int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
    int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4);
    if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP;

    int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
    int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
    int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

    if (begin == 0) {
        int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP);
        read_keys_direct<SCHEME,TEX_DIM>(b, bx, start+32*pos);
        while(loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
    } else load_key<ALGO>(d_odata, b, bx);

    if (SCHEME == SIMPLE)
    {
        // better divergent thread handling submitted by nVidia engineers, but
        // supposedly this does not run with the ANDERSEN memory access scheme
        int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
        int pos = j/LOOKUP_GAP;
        int loop = -1;
        uint4 t, tx;

        int i = begin;
        while(i < end) {
            if (loop == -1) {
                j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
                pos = j/LOOKUP_GAP;
                loop = j-pos*LOOKUP_GAP;
                read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
            }
            if (loop == 0) {
                b ^= t; bx ^= tx;
                t = b; tx = bx;
            }
            block_mixer<ALGO>(t, tx, x1, x2, x3);
            if (loop == 0) {
                b = t; bx = tx;
                i++;
            }
            loop--;
        }
    }
    else
    {
        // this is my original implementation, now used with the ANDERSEN
        // memory access scheme only.
        for (int i = begin; i < end; i++) {
            int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
            int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
            uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
            while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
            b ^= t; bx ^= tx;
            block_mixer<ALGO>(b, bx, x1, x2, x3);
        }
    }
    store_key<ALGO>(d_odata, b, bx);
}

KeplerKernel::KeplerKernel() : KernelInterface()
{
}

bool KeplerKernel::bindtexture_1D(uint32_t *d_V, size_t size)
{
    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
    texRef1D_4_V.normalized = 0;
    texRef1D_4_V.filterMode = cudaFilterModePoint;
    texRef1D_4_V.addressMode[0] = cudaAddressModeClamp;
    checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size));
    return true;
}

bool KeplerKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch)
{
    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
    texRef2D_4_V.normalized = 0;
    texRef2D_4_V.filterMode = cudaFilterModePoint;
    texRef2D_4_V.addressMode[0] = cudaAddressModeClamp;
    texRef2D_4_V.addressMode[1] = cudaAddressModeClamp;
    // maintain texture width of TEXWIDTH (max. limit is 65000)
    while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; }
    while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; }
    checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch));
    return true;
}

bool KeplerKernel::unbindtexture_1D()
{
    checkCudaErrors(cudaUnbindTexture(texRef1D_4_V));
    return true;
}

bool KeplerKernel::unbindtexture_2D()
{
    checkCudaErrors(cudaUnbindTexture(texRef2D_4_V));
    return true;
}

void KeplerKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
{
    checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
}

bool KeplerKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream,
    uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
{
    bool success = true;

    // make some constants available to kernel, update only initially and when changing
    static int prev_N[MAX_DEVICES] = {0};
    if (N != prev_N[thr_id]) {
        uint32_t h_N = N;
        uint32_t h_N_1 = N-1;
        uint32_t h_SCRATCH = SCRATCH;
        uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
        uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;

        cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
        cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
        cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
        cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
        cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);

        prev_N[thr_id] = N;
    }

    // First phase: Sequential writes to scratchpad.

    int batch = device_batchsize[thr_id];
    //int num_sleeps = 2* ((N + (batch-1)) / batch);
    //int sleeptime = 100;

    unsigned int pos = 0;
    do
    {
        if (LOOKUP_GAP == 1) {
            if (IS_SCRYPT())      kepler_scrypt_core_kernelA<A_SCRYPT,      ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
            if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE>   <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
        } else {
            if (IS_SCRYPT())      kepler_scrypt_core_kernelA_LG<A_SCRYPT,      ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
            if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE>   <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
        }
        pos += batch;
    } while (pos < N);

    // Second phase: Random read access from scratchpad.

    pos = 0;
    do
    {
        if (LOOKUP_GAP == 1) {

            if (texture_cache == 0) {
                if (IS_SCRYPT())      kepler_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 0><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
                if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   0><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
            } else if (texture_cache == 1) {
                if (IS_SCRYPT())      kepler_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 1><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
                if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   1><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
            } else if (texture_cache == 2) {
                if (IS_SCRYPT())      kepler_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 2><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
                if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   2><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
            }

        } else {

            if (texture_cache == 0) {
                if (IS_SCRYPT())      kepler_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 0><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
                if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   0><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
            } else if (texture_cache == 1) {
                if (IS_SCRYPT())      kepler_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 1><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
                if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   1><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
            } else if (texture_cache == 2) {
                if (IS_SCRYPT())      kepler_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 2><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
                if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   2><<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
            }
        }

        pos += batch;
    } while (pos < N);

    return success;
}
@@ -0,0 +1,29 @@

#ifndef KEPLER_KERNEL_H
#define KEPLER_KERNEL_H

#include "salsa_kernel.h"

class KeplerKernel : public KernelInterface
{
public:
    KeplerKernel();

    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);
    virtual bool bindtexture_1D(uint32_t *d_V, size_t size);
    virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch);
    virtual bool unbindtexture_1D();
    virtual bool unbindtexture_2D();

    virtual char get_identifier() { return 'k'; }
    virtual int get_major_version() { return 3; }
    virtual int get_minor_version() { return 0; }

    virtual int max_warps_per_block() { return 32; }
    virtual int get_texel_width() { return 4; }
    virtual int threads_per_wu() { return 4; }
    virtual bool support_lookup_gap() { return true; }
    virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
};

#endif // #ifndef KEPLER_KERNEL_H

@@ -0,0 +1,36 @@

#ifndef NV_KERNEL_H
#define NV_KERNEL_H

#include "salsa_kernel.h"

class NVKernel : public KernelInterface
{
public:
    NVKernel();

    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);

    virtual bool bindtexture_1D(uint32_t *d_V, size_t size);
    virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch);
    virtual bool unbindtexture_1D();
    virtual bool unbindtexture_2D();

    virtual char get_identifier() { return 'K'; }
    virtual int get_major_version() { return 3; }
    virtual int get_minor_version() { return 0; }

    virtual int max_warps_per_block() { return 32; }
    virtual int get_texel_width() { return 4; }
    virtual bool support_lookup_gap() { return true; }
    virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
    virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }

    virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
    virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);

    virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]);
    virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
};

#endif // #ifndef NV_KERNEL_H

@@ -0,0 +1,36 @@

#ifndef NV2_KERNEL_H
#define NV2_KERNEL_H

#include "miner.h"
#include <cuda_runtime.h>

#include "salsa_kernel.h"

class NV2Kernel : public KernelInterface
{
public:
    NV2Kernel();

    virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
    virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);

    virtual char get_identifier() { return 'T'; }
    virtual int get_major_version() { return 3; }
    virtual int get_minor_version() { return 5; }

    virtual int max_warps_per_block() { return 24; }
    virtual int get_texel_width() { return 4; }
    virtual bool no_textures() { return true; }
    virtual bool support_lookup_gap() { return true; }

    virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
    virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }

    virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
    virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);

    virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]);
    virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
};

#endif // #ifndef NV2_KERNEL_H

@@ -0,0 +1,939 @@


//
// Contains the autotuning logic and some utility functions.
// Note that all CUDA kernels have been moved to other .cu files
//
// NOTE: compile this .cu module for compute_20,sm_21 with --maxrregcount=64
//

#include <stdio.h>
#include <map>
#include <algorithm>
#include <unistd.h> // usleep
#include <ctype.h>  // tolower
#include "cuda_helper.h"

#include "salsa_kernel.h"

#include "titan_kernel.h"
#include "fermi_kernel.h"
#include "test_kernel.h"
#include "nv_kernel.h"
#include "nv_kernel2.h"
#include "kepler_kernel.h"

#include "miner.h"

#if WIN32
#ifdef _WIN64
#define _64BIT 1
#endif
#else
#if __x86_64__
#define _64BIT 1
#endif
#endif

#if _64BIT
#define MAXMEM 0x300000000ULL // 12 GB (the largest Kepler)
#else
#define MAXMEM 0xFFFFFFFFULL  // nearly 4 GB (32 bit limitations)
#endif

// require CUDA 5.5 driver API
#define DMAJ 5
#define DMIN 5

// define some error checking macros
#undef checkCudaErrors

// path delimiter used to shorten __FILE__ (both branches of the original
// WIN32 conditional defined the same '/', so a single define suffices)
#define DELIMITER '/'
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )

#define checkCudaErrors(x) \
{ \
    cudaGetLastError(); \
    x; \
    cudaError_t err = cudaGetLastError(); \
    if (err != cudaSuccess) \
        applog(LOG_ERR, "GPU #%d: Err %d: %s (%s:%d)", device_map[thr_id], err, cudaGetErrorString(err), __FILENAME__, __LINE__); \
}
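
/*
 * Note the unusual contract of this checkCudaErrors: it is a statement (not an
 * expression), it expects thr_id and device_map[] to be in scope at the call
 * site, and it only logs failures instead of aborting. A minimal usage sketch
 * (ptr and bytes are illustrative):
 */
#if 0
uint32_t *ptr; size_t bytes = 1024;
checkCudaErrors(cudaMalloc((void**)&ptr, bytes)); // logs GPU#, error, file:line on failure
#endif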

// some globals containing pointers to device memory (for chunked allocation)
// [MAX_DEVICES] indexes up to MAX_DEVICES threads (0...MAX_DEVICES-1)
int MAXWARPS[MAX_GPUS];
uint32_t* h_V[MAX_GPUS][TOTAL_WARP_LIMIT*64];      // NOTE: the *64 prevents buffer overflow for --keccak
uint32_t h_V_extra[MAX_GPUS][TOTAL_WARP_LIMIT*64]; // with really large kernel launch configurations

KernelInterface *Best_Kernel_Heuristics(cudaDeviceProp *props)
{
    KernelInterface *kernel = NULL;
    uint32_t N = (1UL << (opt_nfactor+1)); // not sure

    if (IS_SCRYPT() || (IS_SCRYPT_JANE() && N <= 8192))
    {
        // high register count kernels (scrypt, low N-factor scrypt-jane)
        if (props->major > 3 || (props->major == 3 && props->minor >= 5))
            kernel = new NV2Kernel(); // we don't want this for Keccak though
        else if (props->major == 3 && props->minor == 0)
            kernel = new NVKernel();
        else if (props->major == 2 || props->major == 1)
            kernel = new FermiKernel();
    }
    else
    {
        // low register count kernels (high N-factor scrypt-jane)
        if (props->major > 3 || (props->major == 3 && props->minor >= 5))
            kernel = new TitanKernel();
        else if (props->major == 3 && props->minor == 0)
            kernel = new KeplerKernel();
        else if (props->major == 2 || props->major == 1)
            kernel = new TestKernel();
    }
    return kernel;
}
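
/*
 * For example, with scrypt (or scrypt-jane at N <= 8192): SM 3.5 and newer
 * devices get NV2Kernel ('T'), SM 3.0 gets NVKernel ('K'), and SM 1.x/2.x get
 * FermiKernel; for high N-factor scrypt-jane the low register count variants
 * (TitanKernel, KeplerKernel, TestKernel) are chosen instead.
 */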


bool validate_config(char *config, int &b, int &w, KernelInterface **kernel = NULL, cudaDeviceProp *props = NULL)
{
    bool success = false;
    char kernelid = ' ';
    if (config != NULL)
    {
        if (config[0] == 'T' || config[0] == 'K' || config[0] == 'F' || config[0] == 'L' ||
            config[0] == 't' || config[0] == 'k' || config[0] == 'f' ||
            config[0] == 'Z' || config[0] == 'Y' || config[0] == 'X') {
            kernelid = config[0];
            config++;
        }

        if (config[0] >= '0' && config[0] <= '9')
            if (sscanf(config, "%dx%d", &b, &w) == 2)
                success = true;

        if (success && kernel != NULL)
        {
            switch (kernelid)
            {
                case 'T': case 'Z': *kernel = new NV2Kernel(); break;
                case 't':           *kernel = new TitanKernel(); break;
                case 'K': case 'Y': *kernel = new NVKernel(); break;
                case 'k':           *kernel = new KeplerKernel(); break;
                case 'F': case 'L': *kernel = new FermiKernel(); break;
                case 'f': case 'X': *kernel = new TestKernel(); break;
                case ' ': // choose based on device architecture
                    *kernel = Best_Kernel_Heuristics(props);
                    break;
            }
        }
    }
    return success;
}
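
/*
 * Usage sketch: a launch config string is an optional kernel letter followed by
 * "<blocks>x<warps>"; e.g. "K120x2" selects NVKernel with 120 blocks of 2 warps
 * (the numbers here are illustrative, not a recommendation):
 */
#if 0
int blocks, warps;
KernelInterface *k = NULL;
char cfg[] = "K120x2";
if (validate_config(cfg, blocks, warps, &k))
    applog(LOG_INFO, "kernel '%c', %dx%d", k->get_identifier(), blocks, warps);
#endif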

std::map<int, int> context_blocks;
std::map<int, int> context_wpb;
std::map<int, bool> context_concurrent;
std::map<int, KernelInterface *> context_kernel;
std::map<int, uint32_t *> context_idata[2];
std::map<int, uint32_t *> context_odata[2];
std::map<int, cudaStream_t> context_streams[2];
std::map<int, uint32_t *> context_X[2];
std::map<int, uint32_t *> context_H[2];
std::map<int, cudaEvent_t> context_serialize[2];

// for SHA256 hashing on GPU
std::map<int, uint32_t *> context_tstate[2];
std::map<int, uint32_t *> context_ostate[2];
std::map<int, uint32_t *> context_hash[2];

int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &wpb);

void cuda_shutdown(int thr_id)
{
    cudaDeviceSynchronize();
    cudaDeviceReset();
    cudaThreadExit();
}

int cuda_throughput(int thr_id)
{
    int GRID_BLOCKS, WARPS_PER_BLOCK;
    if (context_blocks.find(thr_id) == context_blocks.end())
    {
#if 0
        CUcontext ctx;
        cuCtxCreate( &ctx, CU_CTX_SCHED_YIELD, device_map[thr_id] );
        cuCtxSetCurrent(ctx);
#else
        checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield));
        checkCudaErrors(cudaSetDevice(device_map[thr_id]));
        checkCudaErrors(cudaFree(0));
#endif

        KernelInterface *kernel;
        bool concurrent;
        GRID_BLOCKS = find_optimal_blockcount(thr_id, kernel, concurrent, WARPS_PER_BLOCK);

        if (GRID_BLOCKS == 0)
            return 0;

        unsigned int THREADS_PER_WU = kernel->threads_per_wu();
        unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * 32;
        unsigned int state_size = WU_PER_LAUNCH * sizeof(uint32_t) * 8;

        // allocate device memory for scrypt_core inputs and outputs
        uint32_t *tmp;
        checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_idata[0][thr_id] = tmp;
        checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_idata[1][thr_id] = tmp;
        checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_odata[0][thr_id] = tmp;
        checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_odata[1][thr_id] = tmp;

        // allocate pinned host memory for scrypt hashes
        checkCudaErrors(cudaHostAlloc((void **) &tmp, state_size, cudaHostAllocDefault)); context_H[0][thr_id] = tmp;
        checkCudaErrors(cudaHostAlloc((void **) &tmp, state_size, cudaHostAllocDefault)); context_H[1][thr_id] = tmp;

        if (IS_SCRYPT())
        {
            if (parallel < 2)
            {
                // allocate pinned host memory for scrypt_core input/output
                checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp;
                checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[1][thr_id] = tmp;
            }
            else
            {
                // allocate tstate, ostate, scrypt hash device memory
                checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_tstate[0][thr_id] = tmp;
                checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_tstate[1][thr_id] = tmp;
                checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_ostate[0][thr_id] = tmp;
                checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_ostate[1][thr_id] = tmp;
                checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[0][thr_id] = tmp;
                checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp;
            }
        }
        else if (IS_SCRYPT_JANE())
        {
            // allocate pinned host memory for scrypt_core input/output
            checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp;
            checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[1][thr_id] = tmp;

            checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[0][thr_id] = tmp;
            checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp;
        }

        // create two CUDA streams
        cudaStream_t tmp2;
        checkCudaErrors( cudaStreamCreate(&tmp2) ); context_streams[0][thr_id] = tmp2;
        checkCudaErrors( cudaStreamCreate(&tmp2) ); context_streams[1][thr_id] = tmp2;

        // events used to serialize the kernel launches (we don't want any overlapping of kernels)
        cudaEvent_t tmp4;
        checkCudaErrors(cudaEventCreateWithFlags(&tmp4, cudaEventDisableTiming)); context_serialize[0][thr_id] = tmp4;
        checkCudaErrors(cudaEventCreateWithFlags(&tmp4, cudaEventDisableTiming)); context_serialize[1][thr_id] = tmp4;
        checkCudaErrors(cudaEventRecord(context_serialize[1][thr_id]));

        context_kernel[thr_id] = kernel;
        context_concurrent[thr_id] = concurrent;
        context_blocks[thr_id] = GRID_BLOCKS;
        context_wpb[thr_id] = WARPS_PER_BLOCK;
    }

    GRID_BLOCKS = context_blocks[thr_id];
    WARPS_PER_BLOCK = context_wpb[thr_id];
    unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
    return WU_PER_LAUNCH;
}

// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        { 0x50, 128 }, // Maxwell Generation (SM 5.0) GTX750/750Ti
        { 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs
        {   -1,  -1 },
    };

    int index = 0;
    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
            return nGpuArchCoresPerSM[index].Cores;
        }
        index++;
    }

    // If we don't find the value, default to the newest known generation's core count
    applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128);
    return 128;
}
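
/*
 * Worked example: a GTX 660 Ti reports SM 3.0 with 7 multiprocessors, so
 * _ConvertSMVer2Cores(3, 0) yields 192 and the launch heuristics below start
 * from 7 * 192 = 1344 device cores.
 */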

#ifdef WIN32
#include <windows.h>
static int console_width() {
    CONSOLE_SCREEN_BUFFER_INFO csbi;
    GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi);
    return csbi.srWindow.Right - csbi.srWindow.Left + 1;
}
#else
static inline int console_width() {
    return 999;
}
#endif

int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &WARPS_PER_BLOCK)
{
    int cw = console_width();
    int optimal_blocks = 0;

    cudaDeviceProp props;
    checkCudaErrors(cudaGetDeviceProperties(&props, device_map[thr_id]));
    concurrent = (props.concurrentKernels > 0);

    device_name[thr_id] = strdup(props.name);
    applog(LOG_INFO, "GPU #%d: %s with SM %d.%d", device_map[thr_id], props.name, props.major, props.minor);

    WARPS_PER_BLOCK = -1;

    // if not specified, use interactive mode for devices that have the watchdog timer enabled
    if (device_interactive[thr_id] == -1)
        device_interactive[thr_id] = props.kernelExecTimeoutEnabled;

    // turn off texture cache if not otherwise specified
    if (device_texturecache[thr_id] == -1)
        device_texturecache[thr_id] = 0;

    // if not otherwise specified or required, turn single memory allocations off as they reduce
    // the amount of memory that we can allocate on Windows Vista, 7 and 8 (WDDM driver model issue)
    if (device_singlememory[thr_id] == -1) device_singlememory[thr_id] = 0;

    // figure out which kernel implementation to use
    if (!validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK, &kernel, &props)) {
        kernel = NULL;
        if (device_config[thr_id] != NULL) {
            if (device_config[thr_id][0] == 'T' || device_config[thr_id][0] == 'Z')
                kernel = new NV2Kernel();
            else if (device_config[thr_id][0] == 't')
                kernel = new TitanKernel();
            else if (device_config[thr_id][0] == 'K' || device_config[thr_id][0] == 'Y')
                kernel = new NVKernel();
            else if (device_config[thr_id][0] == 'k')
                kernel = new KeplerKernel();
            else if (device_config[thr_id][0] == 'F' || device_config[thr_id][0] == 'L')
                kernel = new FermiKernel();
            else if (device_config[thr_id][0] == 'f' || device_config[thr_id][0] == 'X')
                kernel = new TestKernel();
        }
        if (kernel == NULL) kernel = Best_Kernel_Heuristics(&props);
    }

    if (kernel->get_major_version() > props.major || (kernel->get_major_version() == props.major && kernel->get_minor_version() > props.minor))
    {
        applog(LOG_ERR, "GPU #%d: FATAL: the '%c' kernel requires %d.%d capability!", device_map[thr_id], kernel->get_identifier(), kernel->get_major_version(), kernel->get_minor_version());
        return 0;
    }

    // set whatever cache configuration and shared memory bank mode the kernel prefers
    checkCudaErrors(cudaDeviceSetCacheConfig(kernel->cache_config()));
    checkCudaErrors(cudaDeviceSetSharedMemConfig(kernel->shared_mem_config()));

    // some kernels (e.g. Titan) do not support the texture cache
    if (kernel->no_textures() && device_texturecache[thr_id]) {
        applog(LOG_WARNING, "GPU #%d: the '%c' kernel ignores the texture cache argument", device_map[thr_id], kernel->get_identifier());
        device_texturecache[thr_id] = 0;
    }

    // Texture caching only works with single memory allocation
    if (device_texturecache[thr_id]) device_singlememory[thr_id] = 1;

    if (kernel->single_memory() && !device_singlememory[thr_id]) {
        applog(LOG_WARNING, "GPU #%d: the '%c' kernel requires single memory allocation", device_map[thr_id], kernel->get_identifier());
        device_singlememory[thr_id] = 1;
    }

    if (device_lookup_gap[thr_id] == 0) device_lookup_gap[thr_id] = 1;
    if (!kernel->support_lookup_gap() && device_lookup_gap[thr_id] > 1)
    {
        applog(LOG_WARNING, "GPU #%d: the '%c' kernel does not support a lookup gap", device_map[thr_id], kernel->get_identifier());
        device_lookup_gap[thr_id] = 1;
    }

    applog(LOG_INFO, "GPU #%d: interactive: %d, tex-cache: %d%s, single-alloc: %d", device_map[thr_id],
           (device_interactive[thr_id] != 0) ? 1 : 0,
           (device_texturecache[thr_id] != 0) ? device_texturecache[thr_id] : 0, (device_texturecache[thr_id] != 0) ? "D" : "",
           (device_singlememory[thr_id] != 0) ? 1 : 0 );

    // number of threads collaborating on one work unit (hash)
    unsigned int THREADS_PER_WU = kernel->threads_per_wu();
    unsigned int LOOKUP_GAP = device_lookup_gap[thr_id];
    unsigned int BACKOFF = device_backoff[thr_id];
    unsigned int N = (1 << (opt_nfactor+1));
    double szPerWarp = (double)(SCRATCH * WU_PER_WARP * sizeof(uint32_t));
    //applog(LOG_INFO, "WU_PER_WARP=%u, THREADS_PER_WU=%u, LOOKUP_GAP=%u, BACKOFF=%u, SCRATCH=%u", WU_PER_WARP, THREADS_PER_WU, LOOKUP_GAP, BACKOFF, SCRATCH);
    applog(LOG_INFO, "GPU #%d: %d hashes / %.1f MB per warp.", device_map[thr_id], WU_PER_WARP, szPerWarp / (1024.0 * 1024.0));

    // compute highest MAXWARPS numbers for kernels allowing cudaBindTexture to succeed
    int MW_1D_4 = 134217728 / (SCRATCH * WU_PER_WARP / 4); // for uint4 textures
    int MW_1D_2 = 134217728 / (SCRATCH * WU_PER_WARP / 2); // for uint2 textures
    int MW_1D = kernel->get_texel_width() == 2 ? MW_1D_2 : MW_1D_4;

    uint32_t *d_V = NULL;
    if (device_singlememory[thr_id])
    {
        // if no launch config was specified, we simply
        // allocate the single largest memory chunk on the device that we can get
        if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) {
            MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK;
        }
        else {
            // compute no. of warps to allocate the largest number producing a single memory block
            // PROBLEM: on some devices, ALL allocations will fail if the first one failed. This sucks.
            size_t MEM_LIMIT = (size_t)min((unsigned long long)MAXMEM, (unsigned long long)props.totalGlobalMem);
            int warpmax = (int)min((unsigned long long)TOTAL_WARP_LIMIT, (unsigned long long)(MEM_LIMIT / szPerWarp));

            // run a bisection algorithm for memory allocation (way more reliable than the previous approach)
            int best = 0;
            int warp = (warpmax+1)/2;
            int interval = (warpmax+1)/2;
            while (interval > 0)
            {
                cudaGetLastError(); // clear the error state
                cudaMalloc((void **)&d_V, (size_t)(szPerWarp * warp));
                if (cudaGetLastError() == cudaSuccess) {
                    checkCudaErrors(cudaFree(d_V)); d_V = NULL;
                    if (warp > best) best = warp;
                    if (warp == warpmax) break;
                    interval = (interval+1)/2;
                    warp += interval;
                    if (warp > warpmax) warp = warpmax;
                }
                else
                {
                    interval = interval/2;
                    warp -= interval;
                    if (warp < 1) warp = 1;
                }
            }
            // back off a bit from the largest possible allocation size
            MAXWARPS[thr_id] = ((100-BACKOFF)*best+50)/100;
        }
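
        /*
         * Worked example of the bisection and the back-off rounding: with
         * warpmax = 80 the successful probe sequence is 40, 60, 70, 75, 78, 80
         * (after any failed cudaMalloc the step halves and moves downwards
         * instead), and with BACKOFF = 10 and best = 80 the final setting
         * becomes ((100-10)*80+50)/100 = 72 warps.
         */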

        // now allocate a buffer for determined MAXWARPS setting
        cudaGetLastError(); // clear the error state
        cudaMalloc((void **)&d_V, (size_t)SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t));
        if (cudaGetLastError() == cudaSuccess) {
            for (int i=0; i < MAXWARPS[thr_id]; ++i)
                h_V[thr_id][i] = d_V + SCRATCH * WU_PER_WARP * i;

            if (device_texturecache[thr_id] == 1)
            {
                if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK))
                {
                    if ( optimal_blocks * WARPS_PER_BLOCK > MW_1D ) {
                        applog(LOG_ERR, "GPU #%d: '%s' exceeds limits for 1D cache. Using 2D cache instead.", device_map[thr_id], device_config[thr_id]);
                        device_texturecache[thr_id] = 2;
                    }
                }
                // bind linear memory to a 1D texture reference
                if (kernel->get_texel_width() == 2)
                    kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * min(MAXWARPS[thr_id],MW_1D_2) * sizeof(uint32_t));
                else
                    kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * min(MAXWARPS[thr_id],MW_1D_4) * sizeof(uint32_t));
            }
            else if (device_texturecache[thr_id] == 2)
            {
                // bind pitch linear memory to a 2D texture reference
                if (kernel->get_texel_width() == 2)
                    kernel->bindtexture_2D(d_V, SCRATCH/2, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t));
                else
                    kernel->bindtexture_2D(d_V, SCRATCH/4, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t));
            }
        }
        else
        {
            applog(LOG_ERR, "GPU #%d: FATAL: Launch config '%s' requires too much memory!", device_map[thr_id], device_config[thr_id]);
            return 0;
        }
    }
    else
    {
        if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK))
            MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK;
        else
            MAXWARPS[thr_id] = TOTAL_WARP_LIMIT;

        // chunked memory allocation up to device limits
        int warp;
        for (warp = 0; warp < MAXWARPS[thr_id]; ++warp) {
            // work around partition camping problems by adding a random start address offset to each allocation
            h_V_extra[thr_id][warp] = (props.major == 1) ? (16 * (rand()%(16384/16))) : 0;
            cudaGetLastError(); // clear the error state
            cudaMalloc((void **) &h_V[thr_id][warp], (SCRATCH * WU_PER_WARP + h_V_extra[thr_id][warp])*sizeof(uint32_t));
            if (cudaGetLastError() == cudaSuccess) h_V[thr_id][warp] += h_V_extra[thr_id][warp];
            else {
                h_V_extra[thr_id][warp] = 0;

                // back off by several warp allocations to have some breathing room
                int remove = (BACKOFF*warp+50)/100;
                for (int i=0; warp > 0 && i < remove; ++i) {
                    warp--;
                    checkCudaErrors(cudaFree(h_V[thr_id][warp]-h_V_extra[thr_id][warp]));
                    h_V[thr_id][warp] = NULL; h_V_extra[thr_id][warp] = 0;
                }

                break;
            }
        }
        MAXWARPS[thr_id] = warp;
    }
    if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
        kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]);
    }

    if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK))
    {
        if (optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id])
        {
            applog(LOG_ERR, "GPU #%d: FATAL: Given launch config '%s' requires too much memory.", device_map[thr_id], device_config[thr_id]);
            return 0;
        }

        if (WARPS_PER_BLOCK > kernel->max_warps_per_block())
        {
            applog(LOG_ERR, "GPU #%d: FATAL: Given launch config '%s' exceeds warp limit for '%c' kernel.", device_map[thr_id], device_config[thr_id], kernel->get_identifier());
            return 0;
        }
    }
    else
    {
        if (device_config[thr_id] != NULL && strcasecmp("auto", device_config[thr_id]))
            applog(LOG_WARNING, "GPU #%d: Given launch config '%s' does not validate.", device_map[thr_id], device_config[thr_id]);

        if (autotune)
        {
            applog(LOG_INFO, "GPU #%d: Performing auto-tuning, please wait 2 minutes...", device_map[thr_id]);

            // allocate device memory
            uint32_t *d_idata = NULL, *d_odata = NULL;
            if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
                unsigned int mem_size = MAXWARPS[thr_id] * WU_PER_WARP * sizeof(uint32_t) * 32;
                checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
                checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));

                // pre-initialize some device memory
                uint32_t *h_idata = (uint32_t*)malloc(mem_size);
                for (unsigned int i=0; i < mem_size/sizeof(uint32_t); ++i) h_idata[i] = i*2654435761UL; // knuth's method
                checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
                free(h_idata);
            }
#if 0
            else if (opt_algo == ALGO_KECCAK) {
                uint32_t pdata[20] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
                uint32_t ptarget[8] = {0,0,0,0,0,0,0,0};
                kernel->prepare_keccak256(thr_id, pdata, ptarget);
            } else if (opt_algo == ALGO_BLAKE) {
                uint32_t pdata[20] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
                uint32_t ptarget[8] = {0,0,0,0,0,0,0,0};
                kernel->prepare_blake256(thr_id, pdata, ptarget);
            }
#endif
            double best_hash_sec = 0.0;
            int best_wpb = 0;

            // auto-tuning loop
            {
                // we want to have enough total warps for half the multiprocessors at least
                // compute highest MAXWARPS number that we can support based on texture cache mode
                int MINTW = props.multiProcessorCount / 2;
                int MAXTW = (device_texturecache[thr_id] == 1) ? min(MAXWARPS[thr_id],MW_1D) : MAXWARPS[thr_id];

                // we want to have blocks for half the multiprocessors at least
                int MINB = props.multiProcessorCount / 2;
                int MAXB = MAXTW;

                double tmin = 0.05;

                applog(LOG_INFO, "GPU #%d: maximum total warps (BxW): %d", (int) device_map[thr_id], MAXTW);

                for (int GRID_BLOCKS = MINB; !abort_flag && GRID_BLOCKS <= MAXB; ++GRID_BLOCKS)
                {
                    double Hash[32+1] = { 0 };
                    for (WARPS_PER_BLOCK = 1; !abort_flag && WARPS_PER_BLOCK <= kernel->max_warps_per_block(); ++WARPS_PER_BLOCK)
                    {
                        double hash_sec = 0;
                        if (GRID_BLOCKS * WARPS_PER_BLOCK >= MINTW &&
                            GRID_BLOCKS * WARPS_PER_BLOCK <= MAXTW)
                        {
                            // setup execution parameters
                            dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
                            dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);

                            struct timeval tv_start, tv_end;
                            double tdelta = 0;

                            checkCudaErrors(cudaDeviceSynchronize());
                            gettimeofday(&tv_start, NULL);
                            int repeat = 0;
                            do // average several measurements for better exactness
                            {
                                if (IS_SCRYPT() || IS_SCRYPT_JANE())
                                    kernel->run_kernel(
                                        grid, threads, WARPS_PER_BLOCK, thr_id, NULL,
                                        d_idata, d_odata, N, LOOKUP_GAP, device_interactive[thr_id], true, device_texturecache[thr_id]
                                    );
                                if (cudaDeviceSynchronize() != cudaSuccess)
                                    break;
                                ++repeat;
                                gettimeofday(&tv_end, NULL);
                                // for a better result averaging, measure for at least 50ms (10ms for Keccak)
                            } while ((tdelta=(1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec))) < tmin);
                            if (cudaGetLastError() != cudaSuccess) continue;

                            tdelta /= repeat; // BUGFIX: this averaging over multiple measurements was missing

                            // for scrypt: in interactive mode only find launch configs where kernel launch times are short enough
                            // TODO: instead we could reduce the batchsize parameter to meet the launch time requirement.
                            if (IS_SCRYPT() && device_interactive[thr_id] && GRID_BLOCKS > 2*props.multiProcessorCount && tdelta > 1.0/30)
                                if (WARPS_PER_BLOCK == 1) goto skip; else goto skip2;

                            hash_sec = (double)WU_PER_LAUNCH / tdelta;
                            Hash[WARPS_PER_BLOCK] = hash_sec;
                            if (hash_sec > best_hash_sec) {
                                optimal_blocks = GRID_BLOCKS;
                                best_hash_sec = hash_sec;
                                best_wpb = WARPS_PER_BLOCK;
                            }
                        }
                    }
skip2: ;
                    if (opt_debug) {
                        if (GRID_BLOCKS == MINB) {
                            char line[512] = " ";
                            for (int i=1; i<=kernel->max_warps_per_block(); ++i) {
                                char tmp[16]; sprintf(tmp, i < 10 ? " x%-2d" : " x%-2d ", i);
                                strcat(line, tmp);
                                if (cw == 80 && (i % 8 == 0 && i != kernel->max_warps_per_block()))
                                    strcat(line, "\n ");
                            }
                            applog(LOG_DEBUG, line);
                        }

                        char kMGT = ' '; bool flag;
                        for (int j=0; j < 4; ++j) {
                            flag=false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 1000, i++);
                            if (flag) for (int i=1; i<=kernel->max_warps_per_block(); Hash[i] /= 1000, i++);
                            else break;
                            if (kMGT == ' ') kMGT = 'k';
                            else if (kMGT == 'k') kMGT = 'M';
                            else if (kMGT == 'M') kMGT = 'G';
                            else if (kMGT == 'G') kMGT = 'T';
                        }
                        const char *format = "%5.4f%c";
                        flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 1, i++); if (flag) format = "%5.3f%c";
                        flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 10, i++); if (flag) format = "%5.2f%c";
                        flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 100, i++); if (flag) format = "%5.1f%c";

                        char line[512]; sprintf(line, "%3d:", GRID_BLOCKS);
                        for (int i=1; i<=kernel->max_warps_per_block(); ++i) {
                            char tmp[16];
                            if (Hash[i]>0)
                                sprintf(tmp, format, Hash[i], (i<kernel->max_warps_per_block())?'|':' ');
                            else
                                sprintf(tmp, " %c", (i<kernel->max_warps_per_block())?'|':' ');
                            strcat(line, tmp);
                            if (cw == 80 && (i % 8 == 0 && i != kernel->max_warps_per_block()))
                                strcat(line, "\n ");
                        }
                        int n = strlen(line)-1; line[n++] = '|'; line[n++] = ' '; line[n++] = kMGT; line[n++] = '\0';
                        strcat(line, "H/s");
                        applog(LOG_DEBUG, line);
                    }
                }
skip: ;
            }

            if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
                checkCudaErrors(cudaFree(d_odata));
                checkCudaErrors(cudaFree(d_idata));
            }

            WARPS_PER_BLOCK = best_wpb;
            applog(LOG_INFO, "GPU #%d: %7.2f hash/s with configuration %c%dx%d", device_map[thr_id], best_hash_sec, kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK);
        }
        else
        {
            // Heuristics to find a good kernel launch configuration

            // base the initial block estimate on the number of multiprocessors
            int device_cores = props.multiProcessorCount * _ConvertSMVer2Cores(props.major, props.minor);

            // defaults, in case nothing else is chosen below
            optimal_blocks = 4 * device_cores / WU_PER_WARP;
            WARPS_PER_BLOCK = 2;

            // Based on compute capability, pick a known good block x warp configuration.
            if (props.major >= 3)
            {
                if (props.major == 3 && props.minor == 5) // GK110 (Tesla K20X, K20, GeForce GTX TITAN)
                {
                    // TODO: what to do with Titan and Tesla K20(X)?
                    // for now, do the same as for GTX 660Ti (2GB)
                    optimal_blocks = (int)(optimal_blocks * 0.8809524);
                    WARPS_PER_BLOCK = 2;
                }
                else // GK104, GK106, GK107 ...
                {
                    if (MAXWARPS[thr_id] > (int)(optimal_blocks * 1.7261905) * 2)
                    {
                        // this results in 290x2 configuration on GTX 660Ti (3GB)
                        // but it requires 3GB memory on the card!
                        optimal_blocks = (int)(optimal_blocks * 1.7261905);
                        WARPS_PER_BLOCK = 2;
                    }
                    else
                    {
                        // this results in 148x2 configuration on GTX 660Ti (2GB)
                        optimal_blocks = (int)(optimal_blocks * 0.8809524);
                        WARPS_PER_BLOCK = 2;
                    }
                }
            }
            // 1st generation Fermi (compute 2.0) GF100, GF110
            else if (props.major == 2 && props.minor == 0)
            {
                // this results in a 60x4 configuration on GTX 570
                optimal_blocks = 4 * device_cores / WU_PER_WARP;
                WARPS_PER_BLOCK = 4;
            }
            // 2nd generation Fermi (compute 2.1) GF104,106,108,114,116
            else if (props.major == 2 && props.minor == 1)
            {
                // this results in a 56x2 configuration on GTX 460
                optimal_blocks = props.multiProcessorCount * 8;
                WARPS_PER_BLOCK = 2;
            }

            // in case we run out of memory with the automatically chosen configuration,
            // first back off with WARPS_PER_BLOCK, then reduce optimal_blocks.
            if (WARPS_PER_BLOCK == 3 && optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id])
                WARPS_PER_BLOCK = 2;
            while (optimal_blocks > 0 && optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id])
                optimal_blocks--;
        }
    }

    applog(LOG_INFO, "GPU #%d: using launch configuration %c%dx%d", device_map[thr_id], kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK);

    if (device_singlememory[thr_id])
    {
        if (MAXWARPS[thr_id] != optimal_blocks * WARPS_PER_BLOCK)
        {
            MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK;
            if (device_texturecache[thr_id] == 1)
                kernel->unbindtexture_1D();
            else if (device_texturecache[thr_id] == 2)
                kernel->unbindtexture_2D();
            checkCudaErrors(cudaFree(d_V)); d_V = NULL;

            cudaGetLastError(); // clear the error state
            cudaMalloc((void **)&d_V, (size_t)SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t));
            if (cudaGetLastError() == cudaSuccess) {
                for (int i=0; i < MAXWARPS[thr_id]; ++i)
                    h_V[thr_id][i] = d_V + SCRATCH * WU_PER_WARP * i;

                if (device_texturecache[thr_id] == 1)
                {
                    // bind linear memory to a 1D texture reference
// the total size in bytes is the same for either texel width, so bind unconditionally |
kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); |
||||
} |
||||
else if (device_texturecache[thr_id] == 2) |
||||
{ |
||||
// bind pitch linear memory to a 2D texture reference |
||||
if (kernel->get_texel_width() == 2) |
||||
kernel->bindtexture_2D(d_V, SCRATCH/2, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); |
||||
else |
||||
kernel->bindtexture_2D(d_V, SCRATCH/4, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); |
||||
} |
||||
|
||||
// update pointers to scratch buffer in constant memory after reallocation |
||||
if (IS_SCRYPT() || IS_SCRYPT_JANE()) { |
||||
kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
applog(LOG_ERR, "GPU #%d: Unable to allocate enough memory for launch config '%s'.", device_map[thr_id], device_config[thr_id]); |
||||
} |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
// back off unnecessary memory allocations to have some breathing room |
||||
while (MAXWARPS[thr_id] > 0 && MAXWARPS[thr_id] > optimal_blocks * WARPS_PER_BLOCK) { |
||||
(MAXWARPS[thr_id])--; |
||||
checkCudaErrors(cudaFree(h_V[thr_id][MAXWARPS[thr_id]]-h_V_extra[thr_id][MAXWARPS[thr_id]])); |
||||
h_V[thr_id][MAXWARPS[thr_id]] = NULL; h_V_extra[thr_id][MAXWARPS[thr_id]] = 0; |
||||
} |
||||
} |
||||
|
||||
return optimal_blocks; |
||||
} |
||||
|
||||
void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream) |
||||
{ |
||||
unsigned int GRID_BLOCKS = context_blocks[thr_id]; |
||||
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; |
||||
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); |
||||
unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * 32; |
||||
|
||||
// copy host memory to device |
||||
cudaMemcpyAsync(context_idata[stream][thr_id], X, mem_size, cudaMemcpyHostToDevice, context_streams[stream][thr_id]); |
||||
} |
||||
|
||||
void cuda_scrypt_serialize(int thr_id, int stream) |
||||
{ |
||||
// if the device can concurrently execute multiple kernels, then we must |
||||
// wait for the serialization event recorded by the other stream |
||||
//if (context_concurrent[thr_id] || device_interactive[thr_id]) |
||||
cudaStreamWaitEvent(context_streams[stream][thr_id], context_serialize[(stream+1)&1][thr_id], 0); |
||||
} |
||||
|
||||
void cuda_scrypt_done(int thr_id, int stream) |
||||
{ |
||||
// record the serialization event in the current stream |
||||
cudaEventRecord(context_serialize[stream][thr_id], context_streams[stream][thr_id]); |
||||
} |
||||
|
||||
void cuda_scrypt_flush(int thr_id, int stream) |
||||
{ |
||||
// flush the work queue (required for WDDM drivers) |
||||
cudaStreamSynchronize(context_streams[stream][thr_id]); |
||||
} |
||||
|
||||
void cuda_scrypt_core(int thr_id, int stream, unsigned int N) |
||||
{ |
||||
unsigned int GRID_BLOCKS = context_blocks[thr_id]; |
||||
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; |
||||
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); |
||||
unsigned int LOOKUP_GAP = device_lookup_gap[thr_id]; |
||||
|
||||
// setup execution parameters |
||||
dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); |
||||
dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); |
||||
|
||||
context_kernel[thr_id]->run_kernel(grid, threads, WARPS_PER_BLOCK, thr_id, context_streams[stream][thr_id], context_idata[stream][thr_id], context_odata[stream][thr_id], N, LOOKUP_GAP, device_interactive[thr_id], opt_benchmark, device_texturecache[thr_id]); |
||||
} |
||||
|
||||
bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) |
||||
{ |
||||
return context_kernel[thr_id]->prepare_keccak256(thr_id, host_pdata, ptarget); |
||||
} |
||||
|
||||
void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) |
||||
{ |
||||
unsigned int GRID_BLOCKS = context_blocks[thr_id]; |
||||
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; |
||||
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); |
||||
|
||||
// setup execution parameters |
||||
dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); |
||||
dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); |
||||
|
||||
context_kernel[thr_id]->do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); |
||||
} |
||||
|
||||
bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) |
||||
{ |
||||
return context_kernel[thr_id]->prepare_blake256(thr_id, host_pdata, ptarget); |
||||
} |
||||
|
||||
void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) |
||||
{ |
||||
unsigned int GRID_BLOCKS = context_blocks[thr_id]; |
||||
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; |
||||
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); |
||||
|
||||
// setup execution parameters |
||||
dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); |
||||
dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); |
||||
|
||||
context_kernel[thr_id]->do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); |
||||
} |
||||
|
||||
void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA) |
||||
{ |
||||
unsigned int GRID_BLOCKS = context_blocks[thr_id]; |
||||
unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; |
||||
unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); |
||||
unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * (postSHA ? 8 : 32); |
||||
|
||||
// copy result from device to host (asynchronously) |
||||
checkCudaErrors(cudaMemcpyAsync(X, postSHA ? context_hash[stream][thr_id] : context_odata[stream][thr_id], mem_size, cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); |
||||
} |
||||
|
||||
bool cuda_scrypt_sync(int thr_id, int stream) |
||||
{ |
||||
cudaError_t err; |
||||
|
||||
if(device_interactive[thr_id] && !opt_benchmark) |
||||
{ |
||||
// For devices that also handle desktop rendering or compositing, we want to |
// free up some GPU time slices. That means pausing work submission whenever |
// there is no active task on the GPU, which a device synchronize ensures. |
||||
|
||||
// this call was replaced by the loop below to work around the high CPU usage issue |
||||
//err = cudaDeviceSynchronize(); |
||||
|
||||
while((err = cudaStreamQuery(context_streams[0][thr_id])) == cudaErrorNotReady || |
||||
(err == cudaSuccess && (err = cudaStreamQuery(context_streams[1][thr_id])) == cudaErrorNotReady)) |
||||
usleep(1000); |
||||
|
||||
usleep(1000); |
||||
} |
||||
else |
||||
{ |
||||
// this call was replaced by the loop below to work around the high CPU usage issue |
||||
//err = cudaStreamSynchronize(context_streams[stream][thr_id]); |
||||
|
||||
while((err = cudaStreamQuery(context_streams[stream][thr_id])) == cudaErrorNotReady) |
||||
usleep(1000); |
||||
} |
||||
|
||||
if(err != cudaSuccess) |
||||
{ |
||||
applog(LOG_ERR, "GPU #%d: CUDA error `%s` while executing the kernel.", device_map[thr_id], cudaGetErrorString(err)); |
||||
return false; |
||||
} |
||||
|
||||
return true; |
||||
} |
||||
|
||||
uint32_t* cuda_transferbuffer(int thr_id, int stream) |
||||
{ |
||||
return context_X[stream][thr_id]; |
||||
} |
||||
|
||||
uint32_t* cuda_hashbuffer(int thr_id, int stream) |
||||
{ |
||||
return context_H[stream][thr_id]; |
||||
} |
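|
/* Illustrative driver loop (a sketch, not code from this file): how the |
   two-stream API above is typically chained for one batch of hashes per |
   stream. The fill step and batch sizing are elided. */ |
static void example_scrypt_batch(int thr_id, unsigned int N) |
{ |
	for (int stream = 0; stream < 2; ++stream) { |
		uint32_t *X = cuda_transferbuffer(thr_id, stream); |
		// ... fill X with the input state for this batch ... |
		cuda_scrypt_serialize(thr_id, stream);       // wait on the other stream's event |
		cuda_scrypt_HtoD(thr_id, X, stream);         // upload input |
		cuda_scrypt_core(thr_id, stream, N);         // run the scrypt core kernel |
		cuda_scrypt_done(thr_id, stream);            // record this stream's event |
		cuda_scrypt_DtoH(thr_id, X, stream, false);  // download the result |
		cuda_scrypt_flush(thr_id, stream);           // kick the WDDM work queue |
	} |
	cuda_scrypt_sync(thr_id, 0);  // poll (politely) until stream 0 finishes |
} |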
@@ -0,0 +1,135 @@
|
||||
#ifndef SALSA_KERNEL_H |
||||
#define SALSA_KERNEL_H |
||||
|
||||
#include <stdio.h> |
||||
#include <stdbool.h> |
||||
#include <malloc.h> |
||||
#include <string.h> |
||||
#include <cuda_runtime.h> |
||||
|
||||
#include "miner.h" |
||||
|
||||
#define MAX_DEVICES MAX_GPUS |
||||
|
||||
#define A_SCRYPT 0 |
||||
#define A_SCRYPT_JANE 1 |
||||
|
||||
// from ccminer.cpp |
||||
extern short device_map[MAX_GPUS]; |
||||
extern int device_interactive[MAX_GPUS]; |
||||
extern int device_batchsize[MAX_GPUS]; |
||||
extern int device_backoff[MAX_GPUS]; |
||||
extern int device_lookup_gap[MAX_GPUS]; |
||||
extern int device_texturecache[MAX_GPUS]; |
||||
extern int device_singlememory[MAX_GPUS]; |
||||
extern char *device_config[MAX_GPUS]; |
||||
extern char *device_name[MAX_GPUS]; |
||||
extern bool autotune; |
||||
|
||||
extern int opt_nfactor; |
||||
extern char *jane_params; |
||||
extern bool abort_flag; |
||||
extern int parallel; |
||||
|
||||
extern void get_currentalgo(char* buf, int sz); |
||||
|
||||
typedef unsigned int uint32_t; // define this as a 32-bit type derived from unsigned int |
||||
|
||||
static char algo[64] = { 0 }; |
||||
static __inline bool IS_SCRYPT() { if (algo[0] == '\0') get_currentalgo(algo, 64); return !strcmp(algo,"scrypt"); } |
||||
static __inline bool IS_SCRYPT_JANE() { if (algo[0] == '\0') get_currentalgo(algo, 64); return !strcmp(algo,"scrypt-jane"); } |
||||
|
||||
// CUDA externals |
||||
extern int cuda_num_devices(); |
||||
extern void cuda_shutdown(int thr_id); |
||||
extern int cuda_throughput(int thr_id); |
||||
|
||||
extern uint32_t *cuda_transferbuffer(int thr_id, int stream); |
||||
extern uint32_t *cuda_hashbuffer(int thr_id, int stream); |
||||
|
||||
extern void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream); |
||||
extern void cuda_scrypt_serialize(int thr_id, int stream); |
||||
extern void cuda_scrypt_core(int thr_id, int stream, unsigned int N); |
||||
extern void cuda_scrypt_done(int thr_id, int stream); |
||||
extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA); |
||||
extern bool cuda_scrypt_sync(int thr_id, int stream); |
||||
extern void cuda_scrypt_flush(int thr_id, int stream); |
||||
|
||||
extern bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); |
||||
extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); |
||||
|
||||
extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); |
||||
extern void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); |
||||
|
||||
extern void computeGold(uint32_t *idata, uint32_t *reference, uchar *scratchpad); |
||||
|
||||
extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); |
||||
extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]); |
||||
|
||||
#ifdef __NVCC__ |
||||
extern void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); |
||||
extern void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h); |
||||
#endif |
||||
|
||||
// If we're in C++ mode, we're either compiling .cu files or scrypt.cpp |
||||
|
||||
#ifdef __NVCC__ |
||||
|
||||
/** |
||||
* A pure virtual interface for a CUDA kernel implementation. |
||||
* TODO: encapsulate the kernel launch parameters in some kind of wrapper. |
||||
*/ |
||||
class KernelInterface |
||||
{ |
||||
public: |
||||
virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) = 0; |
||||
virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) = 0; |
||||
virtual bool bindtexture_1D(uint32_t *d_V, size_t size) { return true; } |
||||
virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) { return true; } |
||||
virtual bool unbindtexture_1D() { return true; } |
||||
virtual bool unbindtexture_2D() { return true; } |
||||
|
||||
virtual char get_identifier() = 0; |
||||
virtual int get_major_version() { return 1; } |
||||
virtual int get_minor_version() { return 0; } |
||||
virtual int max_warps_per_block() = 0; |
||||
virtual int get_texel_width() = 0; |
||||
virtual bool no_textures() { return false; } |
virtual bool single_memory() { return false; } |
||||
virtual int threads_per_wu() { return 1; } |
||||
virtual bool support_lookup_gap() { return false; } |
||||
virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; } |
||||
virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; } |
||||
|
||||
virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) { |
||||
return default_prepare_keccak256(thr_id, host_pdata, ptarget); |
||||
} |
||||
virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) { |
||||
default_do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); |
||||
} |
||||
|
||||
virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) { |
||||
return default_prepare_blake256(thr_id, host_pdata, ptarget); |
||||
} |
||||
virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) { |
||||
default_do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h); |
||||
} |
||||
}; |
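|
/* A minimal sketch of a concrete kernel implementation (class name and |
   return values hypothetical): only the pure virtual members need bodies; |
   the texture and tuning hooks keep their defaults from KernelInterface. */ |
class ExampleKernel : public KernelInterface |
{ |
public: |
	void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) { |
		// typically cudaMemcpyToSymbol of h_V into a __constant__ pointer array |
	} |
	bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, |
	                cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, |
	                unsigned int N, unsigned int LOOKUP_GAP, bool interactive, |
	                bool benchmark, int texture_cache) { |
		return false; // launch the device kernel here; return true on success |
	} |
	char get_identifier()      { return 'E'; } // the letter in configs like "E148x2" |
	int  max_warps_per_block() { return 16; } |
	int  get_texel_width()     { return 4; } |
}; |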
||||
|
||||
// Not performing error checking is actually bad, but... |
||||
#define checkCudaErrors(x) x |
||||
#define getLastCudaError(x) |
||||
|
||||
#endif // #ifdef __NVCC__ |
||||
|
||||
// Define work unit size |
||||
#define TOTAL_WARP_LIMIT 4096 |
||||
#define WU_PER_WARP (32 / THREADS_PER_WU) |
||||
#define WU_PER_BLOCK (WU_PER_WARP*WARPS_PER_BLOCK) |
||||
#define WU_PER_LAUNCH (GRID_BLOCKS*WU_PER_BLOCK) |
||||
|
||||
// make scratchpad size dependent on N and LOOKUP_GAP |
||||
#define SCRATCH (((N+LOOKUP_GAP-1)/LOOKUP_GAP)*32) |
||||
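// Example: scrypt with N=1024 and LOOKUP_GAP=2 gives SCRATCH = 512*32 |
// = 16384 uint32_t (64 KiB) of scratchpad per work unit (hash). |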
|
||||
#endif // #ifndef SALSA_KERNEL_H |
@@ -0,0 +1,29 @@
|
||||
#ifndef SCRYPT_JANE_H |
||||
#define SCRYPT_JANE_H |
||||
|
||||
/* |
||||
Nfactor: Increases CPU & Memory Hardness |
||||
N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used |
||||
|
||||
rfactor: Increases Memory Hardness |
||||
r = (1 << rfactor): How large a chunk is |
||||
|
||||
pfactor: Increases CPU Hardness |
||||
p = (1 << pfactor): Number of times to mix the main chunk |
||||
|
||||
A block is the basic mixing unit (salsa/chacha block = 64 bytes) |
||||
A chunk is (2 * r) blocks |
||||
|
||||
~Memory used = (N + 2) * ((2 * r) * block size) |
||||
*/ |
||||
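/* Example: Nfactor=13, rfactor=0, pfactor=0 gives N = 1 << 14 = 16384, |
   r = 1, p = 1, so ~memory = (16384 + 2) * (2 * 64) bytes ~= 2 MB per hash. */ |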
|
||||
#include <stdlib.h> |
||||
#include <stdint.h> |
||||
#include <memory.h> |
||||
|
||||
typedef void (*scrypt_fatal_errorfn)(const char *msg); |
||||
void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); |
||||
|
||||
void scrypt_N_1_1(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, uint32_t N, unsigned char *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V); |
||||
|
||||
#endif /* SCRYPT_JANE_H */ |
@@ -0,0 +1,638 @@
|
||||
/* |
||||
* Copyright 2011 ArtForz |
||||
* Copyright 2011-2013 pooler |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify it |
||||
* under the terms of the GNU General Public License as published by the Free |
||||
* Software Foundation; either version 2 of the License, or (at your option) |
||||
* any later version. See COPYING for more details. |
||||
*/ |
||||
|
||||
#include "cpuminer-config.h" |
||||
#include "miner.h" |
||||
|
||||
#include <string.h> |
||||
#include <stdint.h> |
||||
|
||||
#ifdef WIN32 |
||||
#define __attribute__(x) |
||||
#endif |
||||
|
||||
#if defined(__arm__) && defined(__APCS_32__) |
||||
#define EXTERN_SHA256 |
||||
#endif |
||||
|
||||
static const uint32_t sha256_h[8] = { |
||||
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, |
||||
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 |
||||
}; |
||||
|
||||
static const uint32_t sha256_k[64] = { |
||||
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
||||
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
||||
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
||||
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, |
||||
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, |
||||
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, |
||||
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, |
||||
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, |
||||
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, |
||||
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, |
||||
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, |
||||
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, |
||||
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, |
||||
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, |
||||
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, |
||||
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 |
||||
}; |
||||
|
||||
void sha256_init(uint32_t *state) |
||||
{ |
||||
memcpy(state, sha256_h, 32); |
||||
} |
||||
|
||||
/* Elementary functions used by SHA256 */ |
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z) |
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z)) |
||||
#define ROTR(x, n) ((x >> n) | (x << (32 - n))) |
||||
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) |
||||
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) |
||||
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) |
||||
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) |
||||
|
||||
/* SHA256 round function */ |
||||
#define RND(a, b, c, d, e, f, g, h, k) \ |
||||
do { \ |
||||
t0 = h + S1(e) + Ch(e, f, g) + k; \ |
||||
t1 = S0(a) + Maj(a, b, c); \ |
||||
d += t0; \ |
||||
h = t0 + t1; \ |
||||
} while (0) |
||||
|
||||
/* Adjusted round function for rotating state */ |
||||
#define RNDr(S, W, i) \ |
||||
RND(S[(64 - i) % 8], S[(65 - i) % 8], \ |
||||
S[(66 - i) % 8], S[(67 - i) % 8], \ |
||||
S[(68 - i) % 8], S[(69 - i) % 8], \ |
||||
S[(70 - i) % 8], S[(71 - i) % 8], \ |
||||
W[i] + sha256_k[i]) |
||||
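/* The rotation trick above: at round i the roles of (a, b, ..., h) are played |
   by S[(64-i)%8], S[(65-i)%8], ...; round 0 maps to S[0..7], round 1 to |
   S[7],S[0],...,S[6] -- so no register-to-register copies between rounds. */ |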
|
||||
#ifndef EXTERN_SHA256 |
||||
|
||||
/* |
||||
* SHA256 block compression function. The 256-bit state is transformed via |
||||
* the 512-bit input block to produce a new state. |
||||
*/ |
||||
void sha256_transform(uint32_t *state, const uint32_t *block, int swap) |
||||
{ |
||||
uint32_t W[64]; |
||||
uint32_t S[8]; |
||||
uint32_t t0, t1; |
||||
int i; |
||||
|
||||
/* 1. Prepare message schedule W. */ |
||||
if (swap) { |
||||
for (i = 0; i < 16; i++) |
||||
W[i] = swab32(block[i]); |
||||
} else |
||||
memcpy(W, block, 64); |
||||
for (i = 16; i < 64; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; |
||||
} |
||||
|
||||
/* 2. Initialize working variables. */ |
||||
memcpy(S, state, 32); |
||||
|
||||
/* 3. Mix. */ |
||||
RNDr(S, W, 0); |
||||
RNDr(S, W, 1); |
||||
RNDr(S, W, 2); |
||||
RNDr(S, W, 3); |
||||
RNDr(S, W, 4); |
||||
RNDr(S, W, 5); |
||||
RNDr(S, W, 6); |
||||
RNDr(S, W, 7); |
||||
RNDr(S, W, 8); |
||||
RNDr(S, W, 9); |
||||
RNDr(S, W, 10); |
||||
RNDr(S, W, 11); |
||||
RNDr(S, W, 12); |
||||
RNDr(S, W, 13); |
||||
RNDr(S, W, 14); |
||||
RNDr(S, W, 15); |
||||
RNDr(S, W, 16); |
||||
RNDr(S, W, 17); |
||||
RNDr(S, W, 18); |
||||
RNDr(S, W, 19); |
||||
RNDr(S, W, 20); |
||||
RNDr(S, W, 21); |
||||
RNDr(S, W, 22); |
||||
RNDr(S, W, 23); |
||||
RNDr(S, W, 24); |
||||
RNDr(S, W, 25); |
||||
RNDr(S, W, 26); |
||||
RNDr(S, W, 27); |
||||
RNDr(S, W, 28); |
||||
RNDr(S, W, 29); |
||||
RNDr(S, W, 30); |
||||
RNDr(S, W, 31); |
||||
RNDr(S, W, 32); |
||||
RNDr(S, W, 33); |
||||
RNDr(S, W, 34); |
||||
RNDr(S, W, 35); |
||||
RNDr(S, W, 36); |
||||
RNDr(S, W, 37); |
||||
RNDr(S, W, 38); |
||||
RNDr(S, W, 39); |
||||
RNDr(S, W, 40); |
||||
RNDr(S, W, 41); |
||||
RNDr(S, W, 42); |
||||
RNDr(S, W, 43); |
||||
RNDr(S, W, 44); |
||||
RNDr(S, W, 45); |
||||
RNDr(S, W, 46); |
||||
RNDr(S, W, 47); |
||||
RNDr(S, W, 48); |
||||
RNDr(S, W, 49); |
||||
RNDr(S, W, 50); |
||||
RNDr(S, W, 51); |
||||
RNDr(S, W, 52); |
||||
RNDr(S, W, 53); |
||||
RNDr(S, W, 54); |
||||
RNDr(S, W, 55); |
||||
RNDr(S, W, 56); |
||||
RNDr(S, W, 57); |
||||
RNDr(S, W, 58); |
||||
RNDr(S, W, 59); |
||||
RNDr(S, W, 60); |
||||
RNDr(S, W, 61); |
||||
RNDr(S, W, 62); |
||||
RNDr(S, W, 63); |
||||
|
||||
/* 4. Mix local working variables into global state */ |
||||
for (i = 0; i < 8; i++) |
||||
state[i] += S[i]; |
||||
} |
||||
|
||||
#endif /* EXTERN_SHA256 */ |
||||
|
||||
|
||||
static const uint32_t sha256d_hash1[16] = { |
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000, |
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000, |
||||
0x80000000, 0x00000000, 0x00000000, 0x00000000, |
||||
0x00000000, 0x00000000, 0x00000000, 0x00000100 |
||||
}; |
||||
|
||||
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) |
||||
{ |
||||
uint32_t S[16]; |
||||
int i; |
||||
|
||||
sha256_init(S); |
||||
sha256_transform(S, data, 0); |
||||
sha256_transform(S, data + 16, 0); |
||||
memcpy(S + 8, sha256d_hash1 + 8, 32); |
||||
sha256_init(hash); |
||||
sha256_transform(hash, S, 0); |
||||
for (i = 0; i < 8; i++) |
||||
hash[i] = swab32(hash[i]); |
||||
} |
||||
|
||||
void sha256d(unsigned char *hash, const unsigned char *data, int len) |
||||
{ |
||||
uint32_t S[16], T[16]; |
||||
int i, r; |
||||
|
||||
sha256_init(S); |
||||
for (r = len; r > -9; r -= 64) { |
||||
if (r < 64) |
||||
memset(T, 0, 64); |
||||
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r)); |
||||
if (r >= 0 && r < 64) |
||||
((unsigned char *)T)[r] = 0x80; |
||||
for (i = 0; i < 16; i++) |
||||
T[i] = be32dec(T + i); |
||||
if (r < 56) |
||||
T[15] = 8 * len; |
||||
sha256_transform(S, T, 0); |
||||
} |
||||
memcpy(S + 8, sha256d_hash1 + 8, 32); |
||||
sha256_init(T); |
||||
sha256_transform(T, S, 0); |
||||
for (i = 0; i < 8; i++) |
||||
be32enc((uint32_t *)hash + i, T[i]); |
||||
} |
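|
/* Usage sketch (buffer names hypothetical): double-SHA256 of an 80-byte |
   block header: |
       unsigned char header[80], hash[32]; |
       sha256d(hash, header, 80); */ |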
||||
|
||||
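/* Precompute the message-schedule words W[16..31], omitting every term that |
   depends on the nonce in W[3]; sha256d_ms() adds those per-nonce terms. */ |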
static inline void sha256d_preextend(uint32_t *W) |
||||
{ |
||||
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; |
||||
W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; |
||||
W[18] = s1(W[16]) + W[11] + W[ 2]; |
||||
W[19] = s1(W[17]) + W[12] + s0(W[ 4]); |
||||
W[20] = W[13] + s0(W[ 5]) + W[ 4]; |
||||
W[21] = W[14] + s0(W[ 6]) + W[ 5]; |
||||
W[22] = W[15] + s0(W[ 7]) + W[ 6]; |
||||
W[23] = W[16] + s0(W[ 8]) + W[ 7]; |
||||
W[24] = W[17] + s0(W[ 9]) + W[ 8]; |
||||
W[25] = s0(W[10]) + W[ 9]; |
||||
W[26] = s0(W[11]) + W[10]; |
||||
W[27] = s0(W[12]) + W[11]; |
||||
W[28] = s0(W[13]) + W[12]; |
||||
W[29] = s0(W[14]) + W[13]; |
||||
W[30] = s0(W[15]) + W[14]; |
||||
W[31] = s0(W[16]) + W[15]; |
||||
} |
||||
|
||||
static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) |
||||
{ |
||||
uint32_t t0, t1; |
||||
RNDr(S, W, 0); |
||||
RNDr(S, W, 1); |
||||
RNDr(S, W, 2); |
||||
} |
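|
/* Rounds 0..2 consume only W[0..2]; since the nonce lives in W[3], these |
   three rounds are identical for every nonce and are done once per work |
   item -- sha256d_ms() then resumes at RNDr(S, W, 3). */ |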
||||
|
||||
#ifdef EXTERN_SHA256 |
||||
|
||||
void sha256d_ms(uint32_t *hash, uint32_t *W, |
||||
const uint32_t *midstate, const uint32_t *prehash); |
||||
|
||||
#else |
||||
|
||||
static inline void sha256d_ms(uint32_t *hash, uint32_t *W, |
||||
const uint32_t *midstate, const uint32_t *prehash) |
||||
{ |
||||
uint32_t S[64]; |
||||
uint32_t t0, t1; |
||||
int i; |
||||
|
||||
S[18] = W[18]; |
||||
S[19] = W[19]; |
||||
S[20] = W[20]; |
||||
S[22] = W[22]; |
||||
S[23] = W[23]; |
||||
S[24] = W[24]; |
||||
S[30] = W[30]; |
||||
S[31] = W[31]; |
||||
|
||||
W[18] += s0(W[3]); |
||||
W[19] += W[3]; |
||||
W[20] += s1(W[18]); |
||||
W[21] = s1(W[19]); |
||||
W[22] += s1(W[20]); |
||||
W[23] += s1(W[21]); |
||||
W[24] += s1(W[22]); |
||||
W[25] = s1(W[23]) + W[18]; |
||||
W[26] = s1(W[24]) + W[19]; |
||||
W[27] = s1(W[25]) + W[20]; |
||||
W[28] = s1(W[26]) + W[21]; |
||||
W[29] = s1(W[27]) + W[22]; |
||||
W[30] += s1(W[28]) + W[23]; |
||||
W[31] += s1(W[29]) + W[24]; |
||||
for (i = 32; i < 64; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; |
||||
} |
||||
|
||||
memcpy(S, prehash, 32); |
||||
|
||||
RNDr(S, W, 3); |
||||
RNDr(S, W, 4); |
||||
RNDr(S, W, 5); |
||||
RNDr(S, W, 6); |
||||
RNDr(S, W, 7); |
||||
RNDr(S, W, 8); |
||||
RNDr(S, W, 9); |
||||
RNDr(S, W, 10); |
||||
RNDr(S, W, 11); |
||||
RNDr(S, W, 12); |
||||
RNDr(S, W, 13); |
||||
RNDr(S, W, 14); |
||||
RNDr(S, W, 15); |
||||
RNDr(S, W, 16); |
||||
RNDr(S, W, 17); |
||||
RNDr(S, W, 18); |
||||
RNDr(S, W, 19); |
||||
RNDr(S, W, 20); |
||||
RNDr(S, W, 21); |
||||
RNDr(S, W, 22); |
||||
RNDr(S, W, 23); |
||||
RNDr(S, W, 24); |
||||
RNDr(S, W, 25); |
||||
RNDr(S, W, 26); |
||||
RNDr(S, W, 27); |
||||
RNDr(S, W, 28); |
||||
RNDr(S, W, 29); |
||||
RNDr(S, W, 30); |
||||
RNDr(S, W, 31); |
||||
RNDr(S, W, 32); |
||||
RNDr(S, W, 33); |
||||
RNDr(S, W, 34); |
||||
RNDr(S, W, 35); |
||||
RNDr(S, W, 36); |
||||
RNDr(S, W, 37); |
||||
RNDr(S, W, 38); |
||||
RNDr(S, W, 39); |
||||
RNDr(S, W, 40); |
||||
RNDr(S, W, 41); |
||||
RNDr(S, W, 42); |
||||
RNDr(S, W, 43); |
||||
RNDr(S, W, 44); |
||||
RNDr(S, W, 45); |
||||
RNDr(S, W, 46); |
||||
RNDr(S, W, 47); |
||||
RNDr(S, W, 48); |
||||
RNDr(S, W, 49); |
||||
RNDr(S, W, 50); |
||||
RNDr(S, W, 51); |
||||
RNDr(S, W, 52); |
||||
RNDr(S, W, 53); |
||||
RNDr(S, W, 54); |
||||
RNDr(S, W, 55); |
||||
RNDr(S, W, 56); |
||||
RNDr(S, W, 57); |
||||
RNDr(S, W, 58); |
||||
RNDr(S, W, 59); |
||||
RNDr(S, W, 60); |
||||
RNDr(S, W, 61); |
||||
RNDr(S, W, 62); |
||||
RNDr(S, W, 63); |
||||
|
||||
for (i = 0; i < 8; i++) |
||||
S[i] += midstate[i]; |
||||
|
||||
W[18] = S[18]; |
||||
W[19] = S[19]; |
||||
W[20] = S[20]; |
||||
W[22] = S[22]; |
||||
W[23] = S[23]; |
||||
W[24] = S[24]; |
||||
W[30] = S[30]; |
||||
W[31] = S[31]; |
||||
|
||||
memcpy(S + 8, sha256d_hash1 + 8, 32); |
||||
S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; |
||||
S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1]; |
||||
S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; |
||||
S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; |
||||
S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; |
||||
S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; |
||||
S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; |
||||
S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; |
||||
S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; |
||||
S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; |
||||
S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; |
||||
S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; |
||||
S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; |
||||
S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; |
||||
S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; |
||||
S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; |
||||
for (i = 32; i < 60; i += 2) { |
||||
S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; |
||||
S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; |
||||
} |
||||
S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; |
||||
|
||||
sha256_init(hash); |
||||
|
||||
RNDr(hash, S, 0); |
||||
RNDr(hash, S, 1); |
||||
RNDr(hash, S, 2); |
||||
RNDr(hash, S, 3); |
||||
RNDr(hash, S, 4); |
||||
RNDr(hash, S, 5); |
||||
RNDr(hash, S, 6); |
||||
RNDr(hash, S, 7); |
||||
RNDr(hash, S, 8); |
||||
RNDr(hash, S, 9); |
||||
RNDr(hash, S, 10); |
||||
RNDr(hash, S, 11); |
||||
RNDr(hash, S, 12); |
||||
RNDr(hash, S, 13); |
||||
RNDr(hash, S, 14); |
||||
RNDr(hash, S, 15); |
||||
RNDr(hash, S, 16); |
||||
RNDr(hash, S, 17); |
||||
RNDr(hash, S, 18); |
||||
RNDr(hash, S, 19); |
||||
RNDr(hash, S, 20); |
||||
RNDr(hash, S, 21); |
||||
RNDr(hash, S, 22); |
||||
RNDr(hash, S, 23); |
||||
RNDr(hash, S, 24); |
||||
RNDr(hash, S, 25); |
||||
RNDr(hash, S, 26); |
||||
RNDr(hash, S, 27); |
||||
RNDr(hash, S, 28); |
||||
RNDr(hash, S, 29); |
||||
RNDr(hash, S, 30); |
||||
RNDr(hash, S, 31); |
||||
RNDr(hash, S, 32); |
||||
RNDr(hash, S, 33); |
||||
RNDr(hash, S, 34); |
||||
RNDr(hash, S, 35); |
||||
RNDr(hash, S, 36); |
||||
RNDr(hash, S, 37); |
||||
RNDr(hash, S, 38); |
||||
RNDr(hash, S, 39); |
||||
RNDr(hash, S, 40); |
||||
RNDr(hash, S, 41); |
||||
RNDr(hash, S, 42); |
||||
RNDr(hash, S, 43); |
||||
RNDr(hash, S, 44); |
||||
RNDr(hash, S, 45); |
||||
RNDr(hash, S, 46); |
||||
RNDr(hash, S, 47); |
||||
RNDr(hash, S, 48); |
||||
RNDr(hash, S, 49); |
||||
RNDr(hash, S, 50); |
||||
RNDr(hash, S, 51); |
||||
RNDr(hash, S, 52); |
||||
RNDr(hash, S, 53); |
||||
RNDr(hash, S, 54); |
||||
RNDr(hash, S, 55); |
||||
RNDr(hash, S, 56); |
||||
|
||||
hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) |
||||
+ S[57] + sha256_k[57]; |
||||
hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) |
||||
+ S[58] + sha256_k[58]; |
||||
hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) |
||||
+ S[59] + sha256_k[59]; |
||||
hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) |
||||
+ S[60] + sha256_k[60] |
||||
+ sha256_h[7]; |
||||
} |
||||
|
||||
#endif /* EXTERN_SHA256 */ |
||||
|
||||
#ifdef HAVE_SHA256_4WAY |
||||
|
||||
void sha256d_ms_4way(uint32_t *hash, uint32_t *data, |
||||
const uint32_t *midstate, const uint32_t *prehash); |
||||
|
||||
static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, |
||||
const uint32_t *ptarget, uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done) |
||||
{ |
||||
gettimeofday(tv_start, NULL); |
||||
|
||||
uint32_t data[4 * 64] __attribute__((aligned(128))); |
||||
uint32_t hash[4 * 8] __attribute__((aligned(32))); |
||||
uint32_t midstate[4 * 8] __attribute__((aligned(32))); |
||||
uint32_t prehash[4 * 8] __attribute__((aligned(32))); |
||||
uint32_t n = pdata[19] - 1; |
||||
const uint32_t first_nonce = pdata[19]; |
||||
const uint32_t Htarg = ptarget[7]; |
||||
int i, j; |
||||
|
||||
memcpy(data, pdata + 16, 64); |
||||
sha256d_preextend(data); |
||||
for (i = 31; i >= 0; i--) |
||||
for (j = 0; j < 4; j++) |
||||
data[i * 4 + j] = data[i]; |
||||
|
||||
sha256_init(midstate); |
||||
sha256_transform(midstate, pdata, 0); |
||||
memcpy(prehash, midstate, 32); |
||||
sha256d_prehash(prehash, pdata + 16); |
||||
for (i = 7; i >= 0; i--) { |
||||
for (j = 0; j < 4; j++) { |
||||
midstate[i * 4 + j] = midstate[i]; |
||||
prehash[i * 4 + j] = prehash[i]; |
||||
} |
||||
} |
||||
|
||||
do { |
||||
for (i = 0; i < 4; i++) |
||||
data[4 * 3 + i] = ++n; |
||||
|
||||
sha256d_ms_4way(hash, data, midstate, prehash); |
||||
|
||||
for (i = 0; i < 4; i++) { |
||||
if (swab32(hash[4 * 7 + i]) <= Htarg) { |
||||
pdata[19] = data[4 * 3 + i]; |
||||
sha256d_80_swap(hash, pdata); |
||||
if (fulltest(hash, ptarget)) { |
||||
*hashes_done = n - first_nonce + 1; |
||||
gettimeofday(tv_end, NULL); |
||||
return 1; |
||||
} |
||||
} |
||||
} |
||||
} while (n < max_nonce && !work_restart[thr_id].restart); |
||||
|
||||
*hashes_done = n - first_nonce + 1; |
||||
pdata[19] = n; |
||||
gettimeofday(tv_end, NULL); |
||||
return 0; |
||||
} |
||||
|
||||
#endif /* HAVE_SHA256_4WAY */ |
||||
|
||||
#ifdef HAVE_SHA256_8WAY |
||||
|
||||
void sha256d_ms_8way(uint32_t *hash, uint32_t *data, |
||||
const uint32_t *midstate, const uint32_t *prehash); |
||||
|
||||
static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, |
||||
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) |
||||
{ |
||||
uint32_t data[8 * 64] __attribute__((aligned(128))); |
||||
uint32_t hash[8 * 8] __attribute__((aligned(32))); |
||||
uint32_t midstate[8 * 8] __attribute__((aligned(32))); |
||||
uint32_t prehash[8 * 8] __attribute__((aligned(32))); |
||||
uint32_t n = pdata[19] - 1; |
||||
const uint32_t first_nonce = pdata[19]; |
||||
const uint32_t Htarg = ptarget[7]; |
||||
int i, j; |
||||
|
||||
memcpy(data, pdata + 16, 64); |
||||
sha256d_preextend(data); |
||||
for (i = 31; i >= 0; i--) |
||||
for (j = 0; j < 8; j++) |
||||
data[i * 8 + j] = data[i]; |
||||
|
||||
sha256_init(midstate); |
||||
sha256_transform(midstate, pdata, 0); |
||||
memcpy(prehash, midstate, 32); |
||||
sha256d_prehash(prehash, pdata + 16); |
||||
for (i = 7; i >= 0; i--) { |
||||
for (j = 0; j < 8; j++) { |
||||
midstate[i * 8 + j] = midstate[i]; |
||||
prehash[i * 8 + j] = prehash[i]; |
||||
} |
||||
} |
||||
|
||||
do { |
||||
for (i = 0; i < 8; i++) |
||||
data[8 * 3 + i] = ++n; |
||||
|
||||
sha256d_ms_8way(hash, data, midstate, prehash); |
||||
|
||||
for (i = 0; i < 8; i++) { |
||||
if (swab32(hash[8 * 7 + i]) <= Htarg) { |
||||
pdata[19] = data[8 * 3 + i]; |
||||
sha256d_80_swap(hash, pdata); |
||||
if (fulltest(hash, ptarget)) { |
||||
*hashes_done = n - first_nonce + 1; |
||||
return 1; |
||||
} |
||||
} |
||||
} |
||||
} while (n < max_nonce && !work_restart[thr_id].restart); |
||||
|
||||
*hashes_done = n - first_nonce + 1; |
||||
pdata[19] = n; |
||||
return 0; |
||||
} |
||||
|
||||
#endif /* HAVE_SHA256_8WAY */ |
||||
|
||||
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, |
||||
uint32_t max_nonce, struct timeval *tv_start, struct timeval *tv_end, unsigned long *hashes_done) |
||||
{ |
||||
uint32_t data[64] __attribute__((aligned(128))); |
||||
uint32_t hash[8] __attribute__((aligned(32))); |
||||
uint32_t midstate[8] __attribute__((aligned(32))); |
||||
uint32_t prehash[8] __attribute__((aligned(32))); |
||||
uint32_t n = pdata[19] - 1; |
||||
const uint32_t first_nonce = pdata[19]; |
||||
const uint32_t Htarg = ptarget[7]; |
||||
|
||||
#ifdef HAVE_SHA256_8WAY |
||||
if (sha256_use_8way()) |
||||
return scanhash_sha256d_8way(thr_id, pdata, ptarget, |
||||
max_nonce, hashes_done); |
||||
#endif |
||||
#ifdef HAVE_SHA256_4WAY |
||||
if (sha256_use_4way()) |
||||
return scanhash_sha256d_4way(thr_id, pdata, ptarget, |
||||
max_nonce, tv_start, tv_end, hashes_done); |
||||
#endif |
||||
|
||||
memcpy(data, pdata + 16, 64); |
||||
sha256d_preextend(data); |
||||
|
||||
sha256_init(midstate); |
||||
sha256_transform(midstate, pdata, 0); |
||||
memcpy(prehash, midstate, 32); |
||||
sha256d_prehash(prehash, pdata + 16); |
||||
|
||||
do { |
||||
data[3] = ++n; |
||||
sha256d_ms(hash, data, midstate, prehash); |
||||
if (swab32(hash[7]) <= Htarg) { |
||||
pdata[19] = data[3]; |
||||
sha256d_80_swap(hash, pdata); |
||||
if (fulltest(hash, ptarget)) { |
||||
*hashes_done = n - first_nonce + 1; |
||||
return 1; |
||||
} |
||||
} |
||||
} while (n < max_nonce && !work_restart[thr_id].restart); |
||||
|
||||
*hashes_done = n - first_nonce + 1; |
||||
pdata[19] = n; |
||||
return 0; |
||||
} |
@@ -0,0 +1,441 @@
|
||||
// |
||||
// =============== SHA256 part on nVidia GPU ====================== |
||||
// |
||||
// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64 |
||||
// |
||||
|
||||
#include <map> |
||||
|
||||
#include "cuda_runtime.h" |
||||
#include "miner.h" |
||||
|
||||
#include "salsa_kernel.h" |
||||
|
||||
#include "sha256.h" |
||||
|
||||
// define some error checking macros |
||||
#undef checkCudaErrors |
||||
|
||||
#if WIN32 |
#define DELIMITER '\\' /* MSVC's __FILE__ uses backslashes */ |
#else |
#define DELIMITER '/' |
#endif |
||||
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) |
||||
|
||||
#define checkCudaErrors(x) do { \ |
cudaGetLastError(); \ |
x; \ |
cudaError_t err = cudaGetLastError(); \ |
if (err != cudaSuccess) \ |
applog(LOG_ERR, "GPU #%d: cudaError %d (%s) calling '%s' (%s line %d)\n", (int) device_map[thr_id], err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \ |
} while (0) |
||||
|
||||
// from salsa_kernel.cu |
||||
extern std::map<int, uint32_t *> context_idata[2]; |
||||
extern std::map<int, uint32_t *> context_odata[2]; |
||||
extern std::map<int, cudaStream_t> context_streams[2]; |
||||
extern std::map<int, uint32_t *> context_tstate[2]; |
||||
extern std::map<int, uint32_t *> context_ostate[2]; |
||||
extern std::map<int, uint32_t *> context_hash[2]; |
||||
|
||||
static const uint32_t host_sha256_h[8] = { |
||||
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, |
||||
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 |
||||
}; |
||||
|
||||
static const uint32_t host_sha256_k[64] = { |
||||
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
||||
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
||||
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
||||
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, |
||||
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, |
||||
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, |
||||
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, |
||||
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, |
||||
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, |
||||
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, |
||||
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, |
||||
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, |
||||
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, |
||||
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, |
||||
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, |
||||
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 |
||||
}; |
||||
|
||||
/* Elementary functions used by SHA256 */ |
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z) |
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z)) |
||||
#define ROTR(x, n) ((x >> n) | (x << (32 - n))) |
||||
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) |
||||
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) |
||||
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) |
||||
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) |
||||
|
||||
/* SHA256 round function */ |
||||
#define RND(a, b, c, d, e, f, g, h, k) \ |
||||
do { \ |
||||
t0 = h + S1(e) + Ch(e, f, g) + k; \ |
||||
t1 = S0(a) + Maj(a, b, c); \ |
||||
d += t0; \ |
||||
h = t0 + t1; \ |
||||
} while (0) |
||||
|
||||
/* Adjusted round function for rotating state */ |
||||
#define RNDr(S, W, i) \ |
||||
RND(S[(64 - i) % 8], S[(65 - i) % 8], \ |
||||
S[(66 - i) % 8], S[(67 - i) % 8], \ |
||||
S[(68 - i) % 8], S[(69 - i) % 8], \ |
||||
S[(70 - i) % 8], S[(71 - i) % 8], \ |
||||
W[i] + sha256_k[i]) |
||||
|
||||
static const uint32_t host_keypad[12] = { |
||||
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 |
||||
}; |
||||
|
||||
static const uint32_t host_innerpad[11] = { |
||||
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 |
||||
}; |
||||
|
||||
static const uint32_t host_outerpad[8] = { |
||||
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 |
||||
}; |
||||
|
||||
static const uint32_t host_finalblk[16] = { |
||||
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 |
||||
}; |
||||
|
||||
// |
||||
// CUDA code |
||||
// |
||||
|
||||
__constant__ uint32_t sha256_h[8]; |
||||
__constant__ uint32_t sha256_k[64]; |
||||
__constant__ uint32_t keypad[12]; |
||||
__constant__ uint32_t innerpad[11]; |
||||
__constant__ uint32_t outerpad[8]; |
||||
__constant__ uint32_t finalblk[16]; |
||||
__constant__ uint32_t pdata[20]; |
||||
__constant__ uint32_t midstate[8]; |
||||
|
||||
__device__ void mycpy12(uint32_t *d, const uint32_t *s) { |
||||
#pragma unroll 3 |
||||
for (int k=0; k < 3; k++) d[k] = s[k]; |
||||
} |
||||
|
||||
__device__ void mycpy16(uint32_t *d, const uint32_t *s) { |
||||
#pragma unroll 4 |
||||
for (int k=0; k < 4; k++) d[k] = s[k]; |
||||
} |
||||
|
||||
__device__ void mycpy32(uint32_t *d, const uint32_t *s) { |
||||
#pragma unroll 8 |
||||
for (int k=0; k < 8; k++) d[k] = s[k]; |
||||
} |
||||
|
||||
__device__ void mycpy44(uint32_t *d, const uint32_t *s) { |
||||
#pragma unroll 11 |
||||
for (int k=0; k < 11; k++) d[k] = s[k]; |
||||
} |
||||
|
||||
__device__ void mycpy48(uint32_t *d, const uint32_t *s) { |
||||
#pragma unroll 12 |
||||
for (int k=0; k < 12; k++) d[k] = s[k]; |
||||
} |
||||
|
||||
__device__ void mycpy64(uint32_t *d, const uint32_t *s) { |
||||
#pragma unroll 16 |
||||
for (int k=0; k < 16; k++) d[k] = s[k]; |
||||
} |
||||
|
||||
__device__ uint32_t cuda_swab32(uint32_t x) |
||||
{ |
||||
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) |
||||
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); |
||||
} |
||||
|
||||
__device__ void mycpy32_swab32(uint32_t *d, const uint32_t *s) { |
||||
#pragma unroll 8 |
||||
for (int k=0; k < 8; k++) d[k] = cuda_swab32(s[k]); |
||||
} |
||||
|
||||
__device__ void mycpy64_swab32(uint32_t *d, const uint32_t *s) { |
||||
#pragma unroll 16 |
||||
for (int k=0; k < 16; k++) d[k] = cuda_swab32(s[k]); |
||||
} |
||||
|
||||
__device__ void cuda_sha256_init(uint32_t *state) |
||||
{ |
||||
mycpy32(state, sha256_h); |
||||
} |
||||
|
||||
/* |
||||
* SHA256 block compression function. The 256-bit state is transformed via |
||||
* the 512-bit input block to produce a new state. Modified for lower register use. |
||||
*/ |
||||
__device__ void cuda_sha256_transform(uint32_t *state, const uint32_t *block) |
||||
{ |
||||
uint32_t W[64]; // only 4 of these are accessed during each partial Mix |
||||
uint32_t S[8]; |
||||
uint32_t t0, t1; |
||||
int i; |
||||
|
||||
/* 1. Initialize working variables. */ |
||||
mycpy32(S, state); |
||||
|
||||
/* 2. Prepare message schedule W and Mix. */ |
||||
mycpy16(W, block); |
||||
RNDr(S, W, 0); RNDr(S, W, 1); RNDr(S, W, 2); RNDr(S, W, 3); |
||||
|
||||
mycpy16(W+4, block+4); |
||||
RNDr(S, W, 4); RNDr(S, W, 5); RNDr(S, W, 6); RNDr(S, W, 7); |
||||
|
||||
mycpy16(W+8, block+8); |
||||
RNDr(S, W, 8); RNDr(S, W, 9); RNDr(S, W, 10); RNDr(S, W, 11); |
||||
|
||||
mycpy16(W+12, block+12); |
||||
RNDr(S, W, 12); RNDr(S, W, 13); RNDr(S, W, 14); RNDr(S, W, 15); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 16; i < 20; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 16); RNDr(S, W, 17); RNDr(S, W, 18); RNDr(S, W, 19); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 20; i < 24; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 20); RNDr(S, W, 21); RNDr(S, W, 22); RNDr(S, W, 23); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 24; i < 28; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 24); RNDr(S, W, 25); RNDr(S, W, 26); RNDr(S, W, 27); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 28; i < 32; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 28); RNDr(S, W, 29); RNDr(S, W, 30); RNDr(S, W, 31); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 32; i < 36; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 32); RNDr(S, W, 33); RNDr(S, W, 34); RNDr(S, W, 35); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 36; i < 40; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 36); RNDr(S, W, 37); RNDr(S, W, 38); RNDr(S, W, 39); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 40; i < 44; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 40); RNDr(S, W, 41); RNDr(S, W, 42); RNDr(S, W, 43); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 44; i < 48; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 44); RNDr(S, W, 45); RNDr(S, W, 46); RNDr(S, W, 47); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 48; i < 52; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 48); RNDr(S, W, 49); RNDr(S, W, 50); RNDr(S, W, 51); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 52; i < 56; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 52); RNDr(S, W, 53); RNDr(S, W, 54); RNDr(S, W, 55); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 56; i < 60; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 56); RNDr(S, W, 57); RNDr(S, W, 58); RNDr(S, W, 59); |
||||
|
||||
#pragma unroll 2 |
||||
for (i = 60; i < 64; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } |
||||
RNDr(S, W, 60); RNDr(S, W, 61); RNDr(S, W, 62); RNDr(S, W, 63); |
||||
|
||||
/* 3. Mix local working variables into global state */ |
||||
#pragma unroll 8 |
||||
for (i = 0; i < 8; i++) |
||||
state[i] += S[i]; |
||||
} |
||||
|
||||
// |
||||
// HMAC SHA256 functions, modified to work with pdata and nonce directly |
||||
// |
||||
|
||||
__device__ void cuda_HMAC_SHA256_80_init(uint32_t *tstate, uint32_t *ostate, uint32_t nonce) |
||||
{ |
||||
uint32_t ihash[8]; |
||||
uint32_t pad[16]; |
||||
int i; |
||||
|
||||
/* tstate is assumed to contain the midstate of key */ |
||||
mycpy12(pad, pdata + 16); |
||||
pad[3] = nonce; |
||||
mycpy48(pad + 4, keypad); |
||||
cuda_sha256_transform(tstate, pad); |
||||
mycpy32(ihash, tstate); |
||||
|
||||
cuda_sha256_init(ostate); |
||||
#pragma unroll 8 |
||||
for (i = 0; i < 8; i++) |
||||
pad[i] = ihash[i] ^ 0x5c5c5c5c; |
||||
#pragma unroll 8 |
||||
for (i=8; i < 16; i++) |
||||
pad[i] = 0x5c5c5c5c; |
||||
cuda_sha256_transform(ostate, pad); |
||||
|
||||
cuda_sha256_init(tstate); |
||||
#pragma unroll 8 |
||||
for (i = 0; i < 8; i++) |
||||
pad[i] = ihash[i] ^ 0x36363636; |
||||
#pragma unroll 8 |
||||
for (i=8; i < 16; i++) |
||||
pad[i] = 0x36363636; |
||||
cuda_sha256_transform(tstate, pad); |
||||
} |
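|
/* Note: 0x36363636 / 0x5c5c5c5c above are the standard HMAC inner/outer |
   pad bytes (RFC 2104), replicated across each 32-bit word. */ |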
||||
|
||||
__device__ void cuda_PBKDF2_SHA256_80_128(const uint32_t *tstate, |
||||
const uint32_t *ostate, uint32_t *output, uint32_t nonce) |
||||
{ |
||||
uint32_t istate[8], ostate2[8]; |
||||
uint32_t ibuf[16], obuf[16]; |
||||
|
||||
mycpy32(istate, tstate); |
||||
cuda_sha256_transform(istate, pdata); |
||||
|
||||
mycpy12(ibuf, pdata + 16); |
||||
ibuf[3] = nonce; |
||||
ibuf[4] = 1; |
||||
mycpy44(ibuf + 5, innerpad); |
||||
|
||||
mycpy32(obuf, istate); |
||||
mycpy32(obuf + 8, outerpad); |
||||
cuda_sha256_transform(obuf, ibuf); |
||||
|
||||
mycpy32(ostate2, ostate); |
||||
cuda_sha256_transform(ostate2, obuf); |
||||
mycpy32_swab32(output, ostate2); // TODO: coalescing would be desired |
||||
|
||||
mycpy32(obuf, istate); |
||||
ibuf[4] = 2; |
||||
cuda_sha256_transform(obuf, ibuf); |
||||
|
||||
mycpy32(ostate2, ostate); |
||||
cuda_sha256_transform(ostate2, obuf); |
||||
mycpy32_swab32(output+8, ostate2); // TODO: coalescing would be desired |
||||
|
||||
mycpy32(obuf, istate); |
||||
ibuf[4] = 3; |
||||
cuda_sha256_transform(obuf, ibuf); |
||||
|
||||
mycpy32(ostate2, ostate); |
||||
cuda_sha256_transform(ostate2, obuf); |
||||
mycpy32_swab32(output+16, ostate2); // TODO: coalescing would be desired |
||||
|
||||
mycpy32(obuf, istate); |
||||
ibuf[4] = 4; |
||||
cuda_sha256_transform(obuf, ibuf); |
||||
|
||||
mycpy32(ostate2, ostate); |
||||
cuda_sha256_transform(ostate2, obuf); |
||||
mycpy32_swab32(output+24, ostate2); // TODO: coalescing would be desired |
||||
} |
||||
|
||||
__global__ void cuda_pre_sha256(uint32_t g_inp[32], uint32_t g_tstate_ext[8], uint32_t g_ostate_ext[8], uint32_t nonce) |
||||
{ |
||||
nonce += (blockIdx.x * blockDim.x) + threadIdx.x; |
||||
g_inp += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
g_tstate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
g_ostate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
|
||||
uint32_t tstate[8], ostate[8]; |
||||
mycpy32(tstate, midstate); |
||||
|
||||
cuda_HMAC_SHA256_80_init(tstate, ostate, nonce); |
||||
|
||||
mycpy32(g_tstate_ext, tstate); // TODO: coalescing would be desired |
||||
mycpy32(g_ostate_ext, ostate); // TODO: coalescing would be desired |
||||
|
||||
cuda_PBKDF2_SHA256_80_128(tstate, ostate, g_inp, nonce); |
||||
} |
||||
|
||||
__global__ void cuda_post_sha256(uint32_t g_output[8], uint32_t g_tstate_ext[8], uint32_t g_ostate_ext[8], uint32_t g_salt_ext[32]) |
||||
{ |
||||
g_output += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
g_tstate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
g_ostate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
g_salt_ext += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
|
||||
uint32_t tstate[16]; |
||||
mycpy32(tstate, g_tstate_ext); // TODO: coalescing would be desired |
||||
|
||||
uint32_t halfsalt[16]; |
||||
mycpy64_swab32(halfsalt, g_salt_ext); // TODO: coalescing would be desired |
||||
cuda_sha256_transform(tstate, halfsalt); |
||||
mycpy64_swab32(halfsalt, g_salt_ext+16); // TODO: coalescing would be desired |
||||
cuda_sha256_transform(tstate, halfsalt); |
||||
cuda_sha256_transform(tstate, finalblk); |
||||
|
||||
uint32_t buf[16]; |
||||
mycpy32(buf, tstate); |
||||
mycpy32(buf + 8, outerpad); |
||||
|
||||
uint32_t ostate[16]; |
||||
mycpy32(ostate, g_ostate_ext); |
||||
|
||||
cuda_sha256_transform(ostate, buf); |
||||
mycpy32_swab32(g_output, ostate); // TODO: coalescing would be desired |
||||
} |
||||
|
||||
// |
||||
// callable host code to initialize constants and to call kernels |
||||
// |
||||
|
||||
void prepare_sha256(int thr_id, uint32_t host_pdata[20], uint32_t host_midstate[8]) |
||||
{ |
||||
static bool init[8] = {false, false, false, false, false, false, false, false}; |
||||
if (!init[thr_id]) |
||||
{ |
||||
checkCudaErrors(cudaMemcpyToSymbol(sha256_h, host_sha256_h, sizeof(host_sha256_h), 0, cudaMemcpyHostToDevice)); |
||||
checkCudaErrors(cudaMemcpyToSymbol(sha256_k, host_sha256_k, sizeof(host_sha256_k), 0, cudaMemcpyHostToDevice)); |
||||
checkCudaErrors(cudaMemcpyToSymbol(keypad, host_keypad, sizeof(host_keypad), 0, cudaMemcpyHostToDevice)); |
||||
checkCudaErrors(cudaMemcpyToSymbol(innerpad, host_innerpad, sizeof(host_innerpad), 0, cudaMemcpyHostToDevice)); |
||||
checkCudaErrors(cudaMemcpyToSymbol(outerpad, host_outerpad, sizeof(host_outerpad), 0, cudaMemcpyHostToDevice)); |
||||
checkCudaErrors(cudaMemcpyToSymbol(finalblk, host_finalblk, sizeof(host_finalblk), 0, cudaMemcpyHostToDevice)); |
||||
init[thr_id] = true; |
||||
} |
||||
checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); |
||||
checkCudaErrors(cudaMemcpyToSymbol(midstate, host_midstate, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); |
||||
} |
||||
|
||||
void pre_sha256(int thr_id, int stream, uint32_t nonce, int throughput) |
||||
{ |
||||
dim3 block(128); |
||||
dim3 grid((throughput+127)/128); |
||||
|
||||
cuda_pre_sha256<<<grid, block, 0, context_streams[stream][thr_id]>>>(context_idata[stream][thr_id], context_tstate[stream][thr_id], context_ostate[stream][thr_id], nonce); |
||||
} |
||||
|
||||
void post_sha256(int thr_id, int stream, int throughput) |
||||
{ |
||||
dim3 block(128); |
||||
dim3 grid((throughput+127)/128); |
||||
|
||||
cuda_post_sha256<<<grid, block, 0, context_streams[stream][thr_id]>>>(context_hash[stream][thr_id], context_tstate[stream][thr_id], context_ostate[stream][thr_id], context_odata[stream][thr_id]); |
||||
} |
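|
/* Illustrative call order for one batch (a sketch; the real driver lives in |
   scrypt.cpp): |
       prepare_sha256(thr_id, host_pdata, host_midstate); // once per work item |
       pre_sha256(thr_id, stream, nonce, throughput);      // HMAC/PBKDF2 front half |
       // ... the scrypt core kernel transforms idata -> odata ... |
       post_sha256(thr_id, stream, throughput);            // PBKDF2 finish -> hash */ |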
@@ -0,0 +1,10 @@
|
||||
#ifndef SHA256_H |
||||
#define SHA256_H |
||||
|
||||
#include <stdint.h> |
||||
|
||||
extern "C" void prepare_sha256(int thr_id, uint32_t cpu_pdata[20], uint32_t cpu_midstate[8]); |
||||
extern "C" void pre_sha256(int thr_id, int stream, uint32_t nonce, int throughput); |
||||
extern "C" void post_sha256(int thr_id, int stream, int throughput); |
||||
|
||||
#endif // #ifndef SHA256_H |
@@ -0,0 +1,781 @@
|
||||
/* Copyright (C) 2013 David G. Andersen. All rights reserved. |
||||
* with modifications by Christian Buchner |
||||
* |
||||
* Use of this code is covered under the Apache 2.0 license, which |
||||
* can be found in the file "LICENSE" |
||||
* |
||||
* The array notation for b[] and bx[] arrays was converted to uint4, |
||||
* in preparation for some experimental changes to memory access patterns. |
||||
* Also this kernel is going to be a testbed for adaptation to Fermi devices. |
||||
*/ |
||||
|
||||
// TODO: experiment with different memory access patterns in write/read_keys_direct functions |
||||
// TODO: attempt V.Volkov style ILP (factor 4) |
||||
|
||||
#include <map> |
||||
|
||||
#include "cuda_runtime.h" |
||||
#include "miner.h" |
||||
|
||||
#include "salsa_kernel.h" |
||||
#include "test_kernel.h" |
||||
|
||||
#define TEXWIDTH 32768 |
||||
#define THREADS_PER_WU 4 // four threads per hash |
||||
|
||||
typedef enum |
||||
{ |
||||
ANDERSEN, |
||||
SIMPLE |
||||
} MemoryAccess; |
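// ANDERSEN: Dave Andersen's interleaved layout, staging b/bx through shared |
// memory so that groups of threads issue wide contiguous stores together |
// (see write_keys_direct below); SIMPLE: each work unit stores its b and bx |
// uint4s directly at start and start+16. |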
||||
|
||||
|
||||
// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) |
||||
__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; |
||||
|
||||
// iteration count N |
||||
__constant__ uint32_t c_N; |
||||
__constant__ uint32_t c_N_1; // N-1 |
||||
// scratch buffer size SCRATCH |
||||
__constant__ uint32_t c_SCRATCH; |
||||
__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP) |
||||
__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP) - 1 |
||||
|
||||
// using texture references for the "tex" variants of the B kernels |
||||
texture<uint4, 1, cudaReadModeElementType> texRef1D_4_V; |
||||
texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V; |
||||
|
||||
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); |
||||
|
||||
static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) { |
||||
left.x ^= right.x; |
||||
left.y ^= right.y; |
||||
left.z ^= right.z; |
||||
left.w ^= right.w; |
||||
return left; |
||||
} |
||||
|
||||
static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) { |
||||
left.x += right.x; |
||||
left.y += right.y; |
||||
left.z += right.z; |
||||
left.w += right.w; |
||||
return left; |
||||
} |
||||
|
||||
|
||||
/* write_keys writes the 8 keys being processed by a warp to the global |
||||
* scratchpad. To effectively use memory bandwidth, it performs the writes |
||||
* (and reads, for read_keys) 128 bytes at a time per memory location |
||||
* by __shfl'ing the 4 entries in bx to the threads in the next-up |
||||
* thread group. It then has eight threads together perform uint4 |
||||
* (128 bit) writes to the destination region. This seems to make |
||||
* quite effective use of memory bandwidth. An approach that spread |
||||
* uint32s across more threads was slower because of the increased |
||||
* computation it required. |
||||
* |
||||
* "start" is the loop iteration producing the write - the offset within |
||||
* the block's memory. |
||||
* |
||||
* Internally, this algorithm first __shfl's the 4 bx entries to |
||||
* the next up thread group, and then uses a conditional move to |
||||
* ensure that odd-numbered thread groups exchange the b/bx ordering |
||||
* so that the right parts are written together. |
||||
* |
||||
* Thanks to Babu for helping design the 128-bit-per-write version. |
||||
* |
||||
* _direct lets the caller specify the absolute start location instead of |
||||
* the relative start location, as an attempt to reduce some recomputation. |
||||
*/ |
||||
|
||||
template <MemoryAccess SCHEME> __device__ __forceinline__
void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
{
	uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];

	if (SCHEME == ANDERSEN) {
		uint4 t=b, t2;
		// emulate __shfl() through a padded shared memory array: each lane
		// publishes one word in its own slot and reads the slot of lane+4
		extern __shared__ unsigned char shared[];
		uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
		uint32_t *s = &tmp[threadIdx.x/32][threadIdx.x%32];
		uint32_t *st = &tmp[threadIdx.x/32][(threadIdx.x + 4)%32];
		*s = bx.x; t2.x = *st;
		*s = bx.y; t2.y = *st;
		*s = bx.z; t2.z = *st;
		*s = bx.w; t2.w = *st;
		*s = start; int t2_start = *st + 4;
		bool c = (threadIdx.x & 0x4);
		// interleave the b and bx stores so each store instruction covers a
		// contiguous 128-byte region of the scratchpad
		*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
		*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);
	} else {
		*((uint4 *)(&scratch[start   ])) = b;
		*((uint4 *)(&scratch[start+16])) = bx;
	}
}
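
/* Worked example for the ANDERSEN path above (illustrative only, derived
 * from the index arithmetic; not part of the original code). Every lane
 * stores its own b at 'start' and its upper neighbor's bx at that
 * neighbor's start+4; the conditional swap only changes which of the two
 * stores is issued first. With start = block*SCRATCH + 8*(threadIdx.x%4),
 * the first store instruction of lanes 4..11 covers one block's words
 * 0..31 in eight uint4 pieces:
 *
 *   lanes 8..11 (c==0): words 0, 8, 16, 24   (their own b)
 *   lanes 4..7  (c==1): words 4, 12, 20, 28  (lanes 8..11's bx)
 *
 * i.e. a single contiguous 128-byte transaction per store instruction.
 */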

template <MemoryAccess SCHEME, int TEX_DIM> __device__ __forceinline__
void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
{
	uint32_t *scratch;

	if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
	if (SCHEME == ANDERSEN) {
		extern __shared__ unsigned char shared[];
		uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
		uint32_t *s = &tmp[threadIdx.x/32][threadIdx.x%32];
		*s = start; int t2_start = tmp[threadIdx.x/32][(threadIdx.x + 4)%32] + 4;
		if (TEX_DIM > 0) { start /= 4; t2_start /= 4; }
		bool c = (threadIdx.x & 0x4);
		if (TEX_DIM == 0) {
			b = *((uint4 *)(&scratch[c ? t2_start : start]));
			bx = *((uint4 *)(&scratch[c ? start : t2_start]));
		} else if (TEX_DIM == 1) {
			b = tex1Dfetch(texRef1D_4_V, c ? t2_start : start);
			bx = tex1Dfetch(texRef1D_4_V, c ? start : t2_start);
		} else if (TEX_DIM == 2) {
			b = tex2D(texRef2D_4_V, 0.5f + ((c ? t2_start : start)%TEXWIDTH), 0.5f + ((c ? t2_start : start)/TEXWIDTH));
			bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH));
		}
		// undo the b/bx swap and shuffle bx back down by four lanes
		uint4 temp = b; b = (c ? bx : b); bx = (c ? temp : bx);
		uint32_t *st = &tmp[threadIdx.x/32][(threadIdx.x + 28)%32];
		*s = bx.x; bx.x = *st;
		*s = bx.y; bx.y = *st;
		*s = bx.z; bx.z = *st;
		*s = bx.w; bx.w = *st;
	} else {
		if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start]));
		else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4);
		else if (TEX_DIM == 2) b = tex2D(texRef2D_4_V, 0.5f + ((start/4)%TEXWIDTH), 0.5f + ((start/4)/TEXWIDTH));
		if (TEX_DIM == 0) bx = *((uint4 *)(&scratch[start+16]));
		else if (TEX_DIM == 1) bx = tex1Dfetch(texRef1D_4_V, (start+16)/4);
		else if (TEX_DIM == 2) bx = tex2D(texRef2D_4_V, 0.5f + (((start+16)/4)%TEXWIDTH), 0.5f + (((start+16)/4)/TEXWIDTH));
	}
}
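
/* Note on the texture paths above (illustrative, not part of the original
 * code): the texture holds uint4 texels, i.e. four 32-bit words per element,
 * so a word offset is divided by 4 to obtain a texel index (word offset 64
 * -> texel 16). In the ANDERSEN branch, start and t2_start are divided once
 * up front; in the SIMPLE branch the division happens at each fetch.
 */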


__device__ __forceinline__
void primary_order_shuffle(uint4 &b, uint4 &bx)
{
	/* Inner loop shuffle targets */
	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	extern __shared__ unsigned char shared[];
	uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
	unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32;
	uint32_t *s = &tmp[wrp][lane];
	uint32_t *s1 = &tmp[wrp][x1];
	uint32_t *s2 = &tmp[wrp][x2];
	uint32_t *s3 = &tmp[wrp][x3];

	*s = b.w; b.w = *s1;
	*s = b.z; b.z = *s2;
	*s = b.y; b.y = *s3;
	uint32_t temp = b.y; b.y = b.w; b.w = temp;

	*s = bx.w; bx.w = *s1;
	*s = bx.z; bx.z = *s2;
	*s = bx.y; bx.y = *s3;
	temp = bx.y; bx.y = bx.w; bx.w = temp;
}
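
/* Worked example of the shuffle targets (illustrative, not part of the
 * original code): for lane 1, x1 = (1 & 0x1c) + ((1+1) & 0x3) = 2, x2 = 3,
 * x3 = 0, so each exchange rotates values among the four lanes of one
 * thread group, while the (threadIdx.x & 0x1c) term keeps the exchange
 * confined to that group.
 */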

/*
 * load_key loads a 32*32-bit key from a contiguous region of memory in B.
 * The input keys are in external order (i.e., 0, 1, 2, 3, ...).
 * After loading, each thread has its four b and four bx keys stored
 * in internal processing order.
 */

__device__ __forceinline__
void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx)
{
	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int key_offset = scrypt_block * 32;
	uint32_t thread_in_block = threadIdx.x % 4;

	// Read in permuted order. Key loads are not our bottleneck right now.
	b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4];
	b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4];
	b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4];
	b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4];
	bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16];
	bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16];
	bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16];
	bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16];

	primary_order_shuffle(b, bx);
}
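
/* Worked example (illustrative, not part of the original code): thread 1 of
 * a group reads words 5, 6, 7, 4 of its key into b.x..b.w, and
 * primary_order_shuffle() then exchanges components across the four lanes so
 * that thread 1 ends up holding words 5, 9, 13, 1 - the "primary order"
 * documented before the kernels below.
 */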

/*
 * store_key performs the opposite transform from load_key, taking
 * internally-ordered b and bx and storing them into a contiguous
 * region of B in external order.
 */

__device__ __forceinline__
void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx)
{
	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int key_offset = scrypt_block * 32;
	uint32_t thread_in_block = threadIdx.x % 4;

	primary_order_shuffle(b, bx);

	B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x;
	B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y;
	B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z;
	B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w;
	B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x;
	B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y;
	B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z;
	B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w;
}


/*
 * load_key loads a 32*32-bit key from a contiguous region of memory in B.
 * The input keys are in external order (i.e., 0, 1, 2, 3, ...).
 * After loading, each thread has its four b and four bx keys stored
 * in internal processing order.
 */

__device__ __forceinline__
void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx)
{
	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int key_offset = scrypt_block * 32;
	uint32_t thread_in_block = threadIdx.x % 4;

	// Read in permuted order. Key loads are not our bottleneck right now.
	b.x = B[key_offset + 4*0 + thread_in_block%4];
	b.y = B[key_offset + 4*1 + thread_in_block%4];
	b.z = B[key_offset + 4*2 + thread_in_block%4];
	b.w = B[key_offset + 4*3 + thread_in_block%4];
	bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16];
	bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16];
	bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16];
	bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16];
}

/*
 * store_key performs the opposite transform from load_key, taking
 * internally-ordered b and bx and storing them into a contiguous
 * region of B in external order.
 */

__device__ __forceinline__
void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx)
{
	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int key_offset = scrypt_block * 32;
	uint32_t thread_in_block = threadIdx.x % 4;

	B[key_offset + 4*0 + thread_in_block%4] = b.x;
	B[key_offset + 4*1 + thread_in_block%4] = b.y;
	B[key_offset + 4*2 + thread_in_block%4] = b.z;
	B[key_offset + 4*3 + thread_in_block%4] = b.w;
	B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x;
	B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y;
	B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z;
	B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w;
}


template <int ALGO> __device__ __forceinline__
void load_key(const uint32_t *B, uint4 &b, uint4 &bx)
{
	switch(ALGO) {
	case A_SCRYPT:      load_key_salsa(B, b, bx); break;
	case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break;
	}
}

template <int ALGO> __device__ __forceinline__
void store_key(uint32_t *B, uint4 &b, uint4 &bx)
{
	switch(ALGO) {
	case A_SCRYPT:      store_key_salsa(B, b, bx); break;
	case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break;
	}
}


/*
 * salsa_xor_core (Salsa20/8 cipher)
 * The original scrypt called:
 * xor_salsa8(&X[0], &X[16]); <-- the "b" loop
 * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop
 * This version is unrolled to handle both of these loops in a single
 * call to avoid unnecessary data movement.
 */

#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<<amt)|(tmp>>(32-amt))); }
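
/* A minimal reference for the macro above (illustrative sketch, not part of
 * the original code): it is the standard Salsa20 quarter-round step
 * d ^= rotl32(s1 + s2, amt), written as a macro so everything stays in
 * registers. The equivalent plain function, for comparison:
 */
static __host__ __device__ uint32_t xor_rotate_add_ref(uint32_t d, uint32_t s1, uint32_t s2, int amt)
{
	uint32_t tmp = s1 + s2;                          // add
	return d ^ ((tmp << amt) | (tmp >> (32 - amt))); // rotate left, then xor
}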

__device__ __forceinline__
void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
	extern __shared__ unsigned char shared[];
	uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
	unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32;
	uint32_t *s = &tmp[wrp][lane];
	uint32_t *s1 = &tmp[wrp][x1];
	uint32_t *s2 = &tmp[wrp][x2];
	uint32_t *s3 = &tmp[wrp][x3];

	uint4 x;

	b ^= bx;
	x = b;

	// Enter in "primary order" (t0 has 0,  4,  8, 12)
	//                          (t1 has 5,  9, 13,  1)
	//                          (t2 has 10, 14, 2,  6)
	//                          (t3 has 15, 3,  7, 11)

#pragma unroll
	for (int j = 0; j < 4; j++) {

		// Mixing phase of salsa
		XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
		XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);

		/* Transpose rows and columns. */
		/* Unclear if this optimization is needed: These are ordered based
		 * upon the dependencies needed in the later xors. Compiler should be
		 * able to figure this out, but might as well give it a hand. */
		*s = x.y; x.y = *s3;
		*s = x.w; x.w = *s1;
		*s = x.z; x.z = *s2;

		/* The next XOR_ROTATE_ADDs could be written to be a copy-paste of the first,
		 * but the register targets are rewritten here to swap x[1] and x[3] so that
		 * they can be directly shuffled to and from our peer threads without
		 * reassignment. The reverse shuffle then puts them back in the right place.
		 */

		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

		*s = x.w; x.w = *s3;
		*s = x.y; x.y = *s1;
		*s = x.z; x.z = *s2;
	}

	b += x;
	// The next two lines are the beginning of the BX-centric loop iteration
	bx ^= b;
	x = bx;

	// This is a copy of the same loop above, identical but stripped of comments.
	// Duplicated so that we can complete a bx-based loop with fewer register moves.
#pragma unroll
	for (int j = 0; j < 4; j++) {
		XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
		XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);

		*s = x.y; x.y = *s3;
		*s = x.w; x.w = *s1;
		*s = x.z; x.z = *s2;

		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

		*s = x.w; x.w = *s3;
		*s = x.y; x.y = *s1;
		*s = x.z; x.z = *s2;
	}

	// At the end of these iterations, the data is in primary order again.
#undef XOR_ROTATE_ADD

	bx += x;
}


/*
 * chacha_xor_core (ChaCha20/8 cipher)
 * This version is unrolled to handle both mixing loops in a single
 * call to avoid unnecessary data movement.
 *
 * load_key and store_key must not use primary order when
 * using ChaCha20/8, but rather the basic transposed order
 * (referred to as "column mode" below)
 */

#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<<amt)|(tmp>>(32-amt))); }

__device__ __forceinline__
void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
	extern __shared__ unsigned char shared[];
	uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
	unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32;
	uint32_t *s = &tmp[wrp][lane];
	uint32_t *s1 = &tmp[wrp][x1];
	uint32_t *s2 = &tmp[wrp][x2];
	uint32_t *s3 = &tmp[wrp][x3];

	uint4 x;

	b ^= bx;
	x = b;

	// Enter in "column" mode (t0 has 0, 4,  8, 12)
	//                        (t1 has 1, 5,  9, 13)
	//                        (t2 has 2, 6, 10, 14)
	//                        (t3 has 3, 7, 11, 15)

#pragma unroll
	for (int j = 0; j < 4; j++) {

		// Column Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

		*s = x.y; x.y = *s1;
		*s = x.z; x.z = *s2;
		*s = x.w; x.w = *s3;

		// Diagonal Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

		*s = x.y; x.y = *s3;
		*s = x.z; x.z = *s2;
		*s = x.w; x.w = *s1;
	}

	b += x;
	// The next two lines are the beginning of the BX-centric loop iteration
	bx ^= b;
	x = bx;

#pragma unroll
	for (int j = 0; j < 4; j++) {

		// Column Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

		*s = x.y; x.y = *s1;
		*s = x.z; x.z = *s2;
		*s = x.w; x.w = *s3;

		// Diagonal Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

		*s = x.y; x.y = *s3;
		*s = x.z; x.z = *s2;
		*s = x.w; x.w = *s1;
	}

#undef CHACHA_PRIMITIVE

	bx += x;
}

template <int ALGO> __device__ __forceinline__
void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
	switch(ALGO) {
	case A_SCRYPT:      salsa_xor_core(b, bx, x1, x2, x3); break;
	case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break;
	}
}


/*
 * The hasher_gen_kernel operates on a group of 1024-bit input keys
 * in B, stored as:
 *   B = { k1B k1Bx k2B k2Bx ... }
 * and fills up the scratchpad with the iterative hashes derived from
 * those keys:
 *   scratch = { k1h1B k1h1Bx k1h2B k1h2Bx ... k2h1B k2h1Bx k2h2B k2h2Bx ... }
 * scratch is 1024 times larger than the input keys B.
 * It is extremely important to stream writes effectively into scratch;
 * less important to coalesce the reads from B.
 *
 * Key ordering note: Keys are input from B in "original" order:
 *   K = { k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 }
 * After inputting into kernel_gen, each component k and kx of the
 * key is transmuted into a permuted internal order to make processing faster:
 *   K = k, kx with:
 *   k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11
 * and similarly for kx.
 */

template <int ALGO, MemoryAccess SCHEME> __global__
void test_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end)
{
	uint4 b, bx;

	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

	int i = begin;

	if (i == 0) {
		load_key<ALGO>(d_idata, b, bx);
		write_keys_direct<SCHEME>(b, bx, start);
		++i;
	} else read_keys_direct<SCHEME,0>(b, bx, start+32*(i-1));

	while (i < end) {
		block_mixer<ALGO>(b, bx, x1, x2, x3);
		write_keys_direct<SCHEME>(b, bx, start+32*i);
		++i;
	}
}
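
/* Illustrative walk-through (not part of the original code): iteration i
 * writes the warp's current (b, bx) state to scratch offset start + 32*i,
 * so once the batched calls have covered i = 0 .. N-1 the scratchpad holds
 * the full chain of N intermediate states that kernelB later revisits in
 * pseudorandom order.
 */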

template <int ALGO, MemoryAccess SCHEME> __global__
void test_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP)
{
	uint4 b, bx;

	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

	int i = begin;

	if (i == 0) {
		load_key<ALGO>(d_idata, b, bx);
		write_keys_direct<SCHEME>(b, bx, start);
		++i;
	} else {
		int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP;
		read_keys_direct<SCHEME,0>(b, bx, start+32*pos);
		while (loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
	}

	while (i < end) {
		block_mixer<ALGO>(b, bx, x1, x2, x3);
		if (i % LOOKUP_GAP == 0)
			write_keys_direct<SCHEME>(b, bx, start+32*(i/LOOKUP_GAP));
		++i;
	}
}
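
/* Worked example of the lookup-gap arithmetic above (illustrative, not part
 * of the original code): with LOOKUP_GAP = 2 only every second state is
 * written (iteration i goes to slot i/2 when i % 2 == 0). Resuming at
 * begin = 102: pos = 101/2 = 50 restores the state stored after iteration
 * 100, loop = 101 - 50*2 = 1 re-applies block_mixer once to reach the state
 * after iteration 101, and the main loop then continues from i = 102.
 * Scratchpad use drops by the gap factor at the cost of the replayed mixer
 * calls.
 */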


/*
 * hasher_hash_kernel runs the second phase of scrypt after the scratch
 * buffer is filled with the iterative hashes: It bounces through
 * the scratch buffer in pseudorandom order, mixing the key as it goes.
 */

template <int ALGO, MemoryAccess SCHEME, int TEX_DIM> __global__
void test_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
{
	extern __shared__ unsigned char shared[];
	uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);

	uint4 b, bx;

	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4);
	if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP;

	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	if (begin == 0) {
		read_keys_direct<SCHEME,TEX_DIM>(b, bx, start+32*c_N_1);
		block_mixer<ALGO>(b, bx, x1, x2, x3);
	} else load_key<ALGO>(d_odata, b, bx);

	for (int i = begin; i < end; i++) {
		tmp[threadIdx.x/32][threadIdx.x%32] = bx.x;
		int j = (tmp[threadIdx.x/32][(threadIdx.x & 0x1c)] & (c_N_1));
		uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*j);
		b ^= t; bx ^= tx;
		block_mixer<ALGO>(b, bx, x1, x2, x3);
	}

	store_key<ALGO>(d_odata, b, bx);
}

template <int ALGO, MemoryAccess SCHEME, int TEX_DIM> __global__
void test_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP)
{
	extern __shared__ unsigned char shared[];
	uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);

	uint4 b, bx;

	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4);
	if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP;

	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	if (begin == 0) {
		int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP);
		read_keys_direct<SCHEME,TEX_DIM>(b, bx, start+32*pos);
		while (loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
	} else load_key<ALGO>(d_odata, b, bx);

	for (int i = begin; i < end; i++) {
		tmp[threadIdx.x/32][threadIdx.x%32] = bx.x;
		int j = (tmp[threadIdx.x/32][(threadIdx.x & 0x1c)] & (c_N_1));
		int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
		uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
		while (loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
		b ^= t; bx ^= tx;
		block_mixer<ALGO>(b, bx, x1, x2, x3);
	}

	store_key<ALGO>(d_odata, b, bx);
}


TestKernel::TestKernel() : KernelInterface()
{
}

bool TestKernel::bindtexture_1D(uint32_t *d_V, size_t size)
{
	cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
	texRef1D_4_V.normalized = 0;
	texRef1D_4_V.filterMode = cudaFilterModePoint;
	texRef1D_4_V.addressMode[0] = cudaAddressModeClamp;
	checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size));
	return true;
}

bool TestKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch)
{
	cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
	texRef2D_4_V.normalized = 0;
	texRef2D_4_V.filterMode = cudaFilterModePoint;
	texRef2D_4_V.addressMode[0] = cudaAddressModeClamp;
	texRef2D_4_V.addressMode[1] = cudaAddressModeClamp;
	// reshape to a texture width of exactly TEXWIDTH texels (the width limit is 65000)
	while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; }
	while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; }
	checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch));
	return true;
}
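
// Illustrative numbers for the reshaping loops above (assuming, purely for
// the example, a TEXWIDTH of 16384 texels): a 65536 x 1 layout is halved
// twice to 16384 x 4 with the pitch halved in step, while a 4096 x 4 layout
// is doubled twice to 16384 x 1. The total texel count, and therefore the
// underlying allocation, is unchanged either way.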

bool TestKernel::unbindtexture_1D()
{
	checkCudaErrors(cudaUnbindTexture(texRef1D_4_V));
	return true;
}

bool TestKernel::unbindtexture_2D()
{
	checkCudaErrors(cudaUnbindTexture(texRef2D_4_V));
	return true;
}

void TestKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
{
	checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
}

bool TestKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
{
	bool success = true;

	// compute required shared memory per block for __shfl() emulation
	// (one padded 33-word row per warp; e.g. threads.x == 128 -> 4 * 33 * 4 = 528 bytes)
	size_t shared = ((threads.x + 31) / 32) * (32+1) * sizeof(uint32_t);

	// make some constants available to kernel, update only initially and when changing
	static int prev_N[MAX_DEVICES] = {0};
	if (N != prev_N[thr_id]) {
		uint32_t h_N = N;
		uint32_t h_N_1 = N-1;
		uint32_t h_SCRATCH = SCRATCH;
		uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
		uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;

		cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);

		prev_N[thr_id] = N;
	}

	// First phase: Sequential writes to scratchpad.

	int batch = device_batchsize[thr_id];

	unsigned int pos = 0;
	do {
		if (LOOKUP_GAP == 1) {
			if (IS_SCRYPT())      test_scrypt_core_kernelA<A_SCRYPT,      ANDERSEN> <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N));
			if (IS_SCRYPT_JANE()) test_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE>   <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N));
		} else {
			if (IS_SCRYPT())      test_scrypt_core_kernelA_LG<A_SCRYPT,      ANDERSEN> <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
			if (IS_SCRYPT_JANE()) test_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE>   <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
		}
		pos += batch;
	} while (pos < N);

	// Second phase: Random read access from scratchpad.
	pos = 0;
	do {
		if (LOOKUP_GAP == 1) {
			if (texture_cache == 0) {
				if (IS_SCRYPT())      test_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 0><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
				if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   0><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
			}
			else if (texture_cache == 1) {
				if (IS_SCRYPT())      test_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 1><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
				if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   1><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
			}
			else if (texture_cache == 2) {
				if (IS_SCRYPT())      test_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN, 2><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
				if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE,   2><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N));
			}
		} else {
			if (texture_cache == 0) {
				if (IS_SCRYPT())      test_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 0><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
				if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   0><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
			}
			else if (texture_cache == 1) {
				if (IS_SCRYPT())      test_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 1><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
				if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   1><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
			}
			else if (texture_cache == 2) {
				if (IS_SCRYPT())      test_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN, 2><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
				if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE,   2><<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
			}
		}

		pos += batch;
	} while (pos < N);

	return success;
}
@@ -0,0 +1,30 @@

#ifndef TEST_KERNEL_H
#define TEST_KERNEL_H

#include "salsa_kernel.h"

class TestKernel : public KernelInterface
{
public:
	TestKernel();

	virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
	virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);
	virtual bool bindtexture_1D(uint32_t *d_V, size_t size);
	virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch);
	virtual bool unbindtexture_1D();
	virtual bool unbindtexture_2D();

	virtual char get_identifier() { return 'f'; }
	virtual int get_major_version() { return 1; }
	virtual int get_minor_version() { return 0; }

	virtual int max_warps_per_block() { return 32; }
	virtual int get_texel_width() { return 4; }
	virtual int threads_per_wu() { return 4; }
	virtual bool support_lookup_gap() { return true; }
	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
};

#endif // #ifndef TEST_KERNEL_H
@@ -0,0 +1,731 @@

/* Copyright (C) 2013 David G. Andersen. All rights reserved.
 * with modifications by Christian Buchner
 *
 * Use of this code is covered under the Apache 2.0 license, which
 * can be found in the file "LICENSE"
 */

// attempt V.Volkov style ILP (factor 4)

#include <map>

#include "cuda_runtime.h"
#include "miner.h"

#include "salsa_kernel.h"
#include "titan_kernel.h"

#define THREADS_PER_WU 4 // four threads per hash

typedef enum
{
	ANDERSEN,
	SIMPLE
} MemoryAccess;

#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0): __ldg() (read-only cache load) is unavailable, fall back to a plain load
#define __ldg(x) (*(x))
#endif

// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes)
__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT];

// iteration count N
__constant__ uint32_t c_N;
__constant__ uint32_t c_N_1; // N-1
// scratch buffer size SCRATCH
__constant__ uint32_t c_SCRATCH;
__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP)
__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP)-1

template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3);

static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) {
	left.x ^= right.x;
	left.y ^= right.y;
	left.z ^= right.z;
	left.w ^= right.w;
	return left;
}

static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) {
	left.x += right.x;
	left.y += right.y;
	left.z += right.z;
	left.w += right.w;
	return left;
}

static __device__ uint4 __shfl(const uint4 bx, int target_thread) {
	return make_uint4(__shfl((int)bx.x, target_thread), __shfl((int)bx.y, target_thread), __shfl((int)bx.z, target_thread), __shfl((int)bx.w, target_thread));
}
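
/* Usage note for the overload above (illustrative, not part of the original
 * code): warp shuffles move 32 bits per lane, so a uint4 is broadcast with
 * four component shuffles, e.g.
 *   uint4 v = __shfl(bx, (threadIdx.x + 4) % 32);
 * which is exactly how write_keys_direct() below fetches the neighbor's bx.
 */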

/* write_keys writes the 8 keys being processed by a warp to the global
 * scratchpad. To effectively use memory bandwidth, it performs the writes
 * (and reads, for read_keys) 128 bytes at a time per memory location
 * by __shfl'ing the 4 entries in bx to the threads in the next-up
 * thread group. It then has eight threads together perform uint4
 * (128 bit) writes to the destination region. This seems to make
 * quite effective use of memory bandwidth. An approach that spread
 * uint32s across more threads was slower because of the increased
 * computation it required.
 *
 * "start" is the loop iteration producing the write - the offset within
 * the block's memory.
 *
 * Internally, this algorithm first __shfl's the 4 bx entries to
 * the next up thread group, and then uses a conditional move to
 * ensure that odd-numbered thread groups exchange the b/bx ordering
 * so that the right parts are written together.
 *
 * Thanks to Babu for helping design the 128-bit-per-write version.
 *
 * _direct lets the caller specify the absolute start location instead of
 * the relative start location, as an attempt to reduce some recomputation.
 */

template <MemoryAccess SCHEME> __device__ __forceinline__
void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
{
	uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
	if (SCHEME == ANDERSEN) {
		int target_thread = (threadIdx.x + 4)%32;
		uint4 t = b, t2 = __shfl(bx, target_thread);
		int t2_start = __shfl((int)start, target_thread) + 4;
		bool c = (threadIdx.x & 0x4);
		*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
		*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);
	} else {
		*((uint4 *)(&scratch[start   ])) = b;
		*((uint4 *)(&scratch[start+16])) = bx;
	}
}

template <MemoryAccess SCHEME> __device__ __forceinline__
void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
{
	uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
	if (SCHEME == ANDERSEN) {
		int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
		bool c = (threadIdx.x & 0x4);
		b = __ldg((uint4 *)(&scratch[c ? t2_start : start]));
		bx = __ldg((uint4 *)(&scratch[c ? start : t2_start]));
		uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
		bx = __shfl(bx, (threadIdx.x + 28)%32);
	} else {
		b = *((uint4 *)(&scratch[start]));
		bx = *((uint4 *)(&scratch[start+16]));
	}
}

__device__ __forceinline__
void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) {
	/* Inner loop shuffle targets */
	int x1 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+3)&0x3);

	b[3] = __shfl((int)b[3], x1);
	b[2] = __shfl((int)b[2], x2);
	b[1] = __shfl((int)b[1], x3);
	uint32_t tmp = b[1]; b[1] = b[3]; b[3] = tmp;

	bx[3] = __shfl((int)bx[3], x1);
	bx[2] = __shfl((int)bx[2], x2);
	bx[1] = __shfl((int)bx[1], x3);
	tmp = bx[1]; bx[1] = bx[3]; bx[3] = tmp;
}

__device__ __forceinline__
void primary_order_shuffle(uint4 &b, uint4 &bx) {
	/* Inner loop shuffle targets */
	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	b.w = __shfl((int)b.w, x1);
	b.z = __shfl((int)b.z, x2);
	b.y = __shfl((int)b.y, x3);
	uint32_t tmp = b.y; b.y = b.w; b.w = tmp;

	bx.w = __shfl((int)bx.w, x1);
	bx.z = __shfl((int)bx.z, x2);
	bx.y = __shfl((int)bx.y, x3);
	tmp = bx.y; bx.y = bx.w; bx.w = tmp;
}

/*
 * load_key loads a 32*32-bit key from a contiguous region of memory in B.
 * The input keys are in external order (i.e., 0, 1, 2, 3, ...).
 * After loading, each thread has its four b and four bx keys stored
 * in internal processing order.
 */

__device__ __forceinline__
void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx)
{
	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int key_offset = scrypt_block * 32;
	uint32_t thread_in_block = threadIdx.x % 4;

	// Read in permuted order. Key loads are not our bottleneck right now.
	b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4];
	b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4];
	b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4];
	b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4];
	bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16];
	bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16];
	bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16];
	bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16];

	primary_order_shuffle(b, bx);
}

/*
 * store_key performs the opposite transform from load_key, taking
 * internally-ordered b and bx and storing them into a contiguous
 * region of B in external order.
 */

__device__ __forceinline__
void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx)
{
	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int key_offset = scrypt_block * 32;
	uint32_t thread_in_block = threadIdx.x % 4;

	primary_order_shuffle(b, bx);

	B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x;
	B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y;
	B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z;
	B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w;
	B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x;
	B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y;
	B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z;
	B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w;
}


/*
 * load_key loads a 32*32-bit key from a contiguous region of memory in B.
 * The input keys are in external order (i.e., 0, 1, 2, 3, ...).
 * After loading, each thread has its four b and four bx keys stored
 * in internal processing order.
 */

__device__ __forceinline__
void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx)
{
	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int key_offset = scrypt_block * 32;
	uint32_t thread_in_block = threadIdx.x % 4;

	// Read in permuted order. Key loads are not our bottleneck right now.
	b.x = B[key_offset + 4*0 + thread_in_block%4];
	b.y = B[key_offset + 4*1 + thread_in_block%4];
	b.z = B[key_offset + 4*2 + thread_in_block%4];
	b.w = B[key_offset + 4*3 + thread_in_block%4];
	bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16];
	bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16];
	bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16];
	bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16];
}

/*
 * store_key performs the opposite transform from load_key, taking
 * internally-ordered b and bx and storing them into a contiguous
 * region of B in external order.
 */

__device__ __forceinline__
void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx)
{
	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int key_offset = scrypt_block * 32;
	uint32_t thread_in_block = threadIdx.x % 4;

	B[key_offset + 4*0 + thread_in_block%4] = b.x;
	B[key_offset + 4*1 + thread_in_block%4] = b.y;
	B[key_offset + 4*2 + thread_in_block%4] = b.z;
	B[key_offset + 4*3 + thread_in_block%4] = b.w;
	B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x;
	B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y;
	B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z;
	B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w;
}


template <int ALGO> __device__ __forceinline__
void load_key(const uint32_t *B, uint4 &b, uint4 &bx)
{
	switch(ALGO) {
	case A_SCRYPT:      load_key_salsa(B, b, bx); break;
	case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break;
	}
}

template <int ALGO> __device__ __forceinline__
void store_key(uint32_t *B, uint4 &b, uint4 &bx)
{
	switch(ALGO) {
	case A_SCRYPT:      store_key_salsa(B, b, bx); break;
	case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break;
	}
}


/*
 * salsa_xor_core (Salsa20/8 cipher)
 * The original scrypt called:
 * xor_salsa8(&X[0], &X[16]); <-- the "b" loop
 * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop
 * This version is unrolled to handle both of these loops in a single
 * call to avoid unnecessary data movement.
 */

#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<<amt)|(tmp>>(32-amt))); }
#else
// Kepler (Compute 3.5)
#define ROTL(a, b) __funnelshift_l( a, a, b )
#define XOR_ROTATE_ADD(dst, s1, s2, amt) dst ^= ROTL(s1+s2, amt);
#endif
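
/* Why the funnel shift works here (illustrative note, not part of the
 * original code): __funnelshift_l(a, a, amt) shifts the 64-bit concatenation
 * a:a left by amt and returns the high word, which for a duplicated input is
 * precisely rotl32(a, amt) - one instruction instead of two shifts plus an
 * OR on hardware without the funnel shifter.
 */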


__device__ __forceinline__
void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
	uint4 x;

	b ^= bx;
	x = b;

	// Enter in "primary order" (t0 has 0,  4,  8, 12)
	//                          (t1 has 5,  9, 13,  1)
	//                          (t2 has 10, 14, 2,  6)
	//                          (t3 has 15, 3,  7, 11)

#pragma unroll
	for (int j = 0; j < 4; j++)
	{
		// Mixing phase of salsa
		XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
		XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);

		/* Transpose rows and columns. */
		/* Unclear if this optimization is needed: These are ordered based
		 * upon the dependencies needed in the later xors. Compiler should be
		 * able to figure this out, but might as well give it a hand. */
		x.y = __shfl((int)x.y, x3);
		x.w = __shfl((int)x.w, x1);
		x.z = __shfl((int)x.z, x2);

		/* The next XOR_ROTATE_ADDs could be written to be a copy-paste of the first,
		 * but the register targets are rewritten here to swap x[1] and x[3] so that
		 * they can be directly shuffled to and from our peer threads without
		 * reassignment. The reverse shuffle then puts them back in the right place.
		 */

		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

		x.w = __shfl((int)x.w, x3);
		x.y = __shfl((int)x.y, x1);
		x.z = __shfl((int)x.z, x2);
	}

	b += x;
	// The next two lines are the beginning of the BX-centric loop iteration
	bx ^= b;
	x = bx;

	// This is a copy of the same loop above, identical but stripped of comments.
	// Duplicated so that we can complete a bx-based loop with fewer register moves.
#pragma unroll 4
	for (int j = 0; j < 4; j++)
	{
		XOR_ROTATE_ADD(x.y, x.x, x.w, 7);
		XOR_ROTATE_ADD(x.z, x.y, x.x, 9);
		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);

		x.y = __shfl((int)x.y, x3);
		x.w = __shfl((int)x.w, x1);
		x.z = __shfl((int)x.z, x2);

		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

		x.w = __shfl((int)x.w, x3);
		x.y = __shfl((int)x.y, x1);
		x.z = __shfl((int)x.z, x2);
	}

	// At the end of these iterations, the data is in primary order again.
#undef XOR_ROTATE_ADD

	bx += x;
}


/*
 * chacha_xor_core (ChaCha20/8 cipher)
 * This version is unrolled to handle both mixing loops in a single
 * call to avoid unnecessary data movement.
 *
 * load_key and store_key must not use primary order when
 * using ChaCha20/8, but rather the basic transposed order
 * (referred to as "column mode" below)
 */

#if __CUDA_ARCH__ < 320
// Kepler (Compute 3.0)
#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<<amt)|(tmp>>(32-amt))); }
#else
// Kepler (Compute 3.2 or later): use the funnel shifter
#define ROTL(a, b) __funnelshift_l( a, a, b )
#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { pt += ps; rt = ROTL(rt ^ pt,amt); }
#endif
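
/* Expansion of the macro above (illustrative note, not part of the original
 * code): CHACHA_PRIMITIVE(pt, rt, ps, amt) performs pt += ps followed by
 * rt = rotl32(rt ^ pt, amt) - one half-step of the ChaCha quarter-round,
 * with the rotate mapped onto the funnel shifter where available.
 */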

__device__ __forceinline__
void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
	uint4 x;

	b ^= bx;
	x = b;

	// Enter in "column" mode (t0 has 0, 4,  8, 12)
	//                        (t1 has 1, 5,  9, 13)
	//                        (t2 has 2, 6, 10, 14)
	//                        (t3 has 3, 7, 11, 15)

#pragma unroll 4
	for (int j = 0; j < 4; j++) {

		// Column Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

		x.y = __shfl((int)x.y, x1);
		x.z = __shfl((int)x.z, x2);
		x.w = __shfl((int)x.w, x3);

		// Diagonal Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

		x.y = __shfl((int)x.y, x3);
		x.z = __shfl((int)x.z, x2);
		x.w = __shfl((int)x.w, x1);
	}

	b += x;
	// The next two lines are the beginning of the BX-centric loop iteration
	bx ^= b;
	x = bx;

#pragma unroll
	for (int j = 0; j < 4; j++)
	{
		// Column Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

		x.y = __shfl((int)x.y, x1);
		x.z = __shfl((int)x.z, x2);
		x.w = __shfl((int)x.w, x3);

		// Diagonal Mixing phase of chacha
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 16)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 12)
		CHACHA_PRIMITIVE(x.x, x.w, x.y, 8)
		CHACHA_PRIMITIVE(x.z, x.y, x.w, 7)

		x.y = __shfl((int)x.y, x3);
		x.z = __shfl((int)x.z, x2);
		x.w = __shfl((int)x.w, x1);
	}

#undef CHACHA_PRIMITIVE

	bx += x;
}


template <int ALGO> __device__ __forceinline__
void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
{
	switch(ALGO) {
	case A_SCRYPT:      salsa_xor_core(b, bx, x1, x2, x3); break;
	case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break;
	}
}


/*
 * The hasher_gen_kernel operates on a group of 1024-bit input keys
 * in B, stored as:
 *   B = { k1B k1Bx k2B k2Bx ... }
 * and fills up the scratchpad with the iterative hashes derived from
 * those keys:
 *   scratch = { k1h1B k1h1Bx k1h2B k1h2Bx ... k2h1B k2h1Bx k2h2B k2h2Bx ... }
 * scratch is 1024 times larger than the input keys B.
 * It is extremely important to stream writes effectively into scratch;
 * less important to coalesce the reads from B.
 *
 * Key ordering note: Keys are input from B in "original" order:
 *   K = { k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 }
 * After inputting into kernel_gen, each component k and kx of the
 * key is transmuted into a permuted internal order to make processing faster:
 *   K = k, kx with:
 *   k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11
 * and similarly for kx.
 */

template <int ALGO, MemoryAccess SCHEME> __global__
void titan_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end)
{
	uint4 b, bx;

	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

	int i = begin;

	if (i == 0) {
		load_key<ALGO>(d_idata, b, bx);
		write_keys_direct<SCHEME>(b, bx, start);
		++i;
	} else read_keys_direct<SCHEME>(b, bx, start+32*(i-1));

	while (i < end) {
		block_mixer<ALGO>(b, bx, x1, x2, x3);
		write_keys_direct<SCHEME>(b, bx, start+32*i);
		++i;
	}
}

template <int ALGO, MemoryAccess SCHEME> __global__
void titan_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP)
{
	uint4 b, bx;

	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

	int i = begin;

	if (i == 0) {
		load_key<ALGO>(d_idata, b, bx);
		write_keys_direct<SCHEME>(b, bx, start);
		++i;
	} else {
		int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP;
		read_keys_direct<SCHEME>(b, bx, start+32*pos);
		while (loop--) block_mixer<ALGO>(b, bx, x1, x2, x3);
	}

	while (i < end) {
		block_mixer<ALGO>(b, bx, x1, x2, x3);
		if (i % LOOKUP_GAP == 0)
			write_keys_direct<SCHEME>(b, bx, start+32*(i/LOOKUP_GAP));
		++i;
	}
}


/*
 * hasher_hash_kernel runs the second phase of scrypt after the scratch
 * buffer is filled with the iterative hashes: It bounces through
 * the scratch buffer in pseudorandom order, mixing the key as it goes.
 */

template <int ALGO, MemoryAccess SCHEME> __global__
void titan_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
{
	uint4 b, bx;

	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int start = ((scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	if (begin == 0) {
		read_keys_direct<SCHEME>(b, bx, start+32*c_N_1);
		block_mixer<ALGO>(b, bx, x1, x2, x3);
	} else load_key<ALGO>(d_odata, b, bx);

	for (int i = begin; i < end; i++) {
		int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
		uint4 t, tx; read_keys_direct<SCHEME>(t, tx, start+32*j);
		b ^= t; bx ^= tx;
		block_mixer<ALGO>(b, bx, x1, x2, x3);
	}

	store_key<ALGO>(d_odata, b, bx);
}

template <int ALGO, MemoryAccess SCHEME> __global__
void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP)
{
	uint4 b, bx;

	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
	int start = ((scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP;

	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

	if (begin == 0) {
		int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP);
		read_keys_direct<SCHEME>(b, bx, start+32*pos);
		while (loop--)
			block_mixer<ALGO>(b, bx, x1, x2, x3);
	}
	else
		load_key<ALGO>(d_odata, b, bx);

	if (SCHEME == SIMPLE)
	{
		// better divergent thread handling submitted by nVidia engineers, but
		// supposedly this does not run with the ANDERSEN memory access scheme
		int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
		int pos = j/LOOKUP_GAP;
		int loop = -1; // -1 means: fetch the next stored state before mixing
		uint4 t, tx;

		int i = begin;
		while (i < end)
		{
			if (loop == -1) {
				j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
				pos = j/LOOKUP_GAP;
				loop = j-pos*LOOKUP_GAP;
				read_keys_direct<SCHEME>(t, tx, start+32*pos);
			}
			if (loop == 0) {
				// replay finished: fold the recovered state into (b, bx)
				b ^= t; bx ^= tx;
				t = b; tx = bx;
			}

			block_mixer<ALGO>(t, tx, x1, x2, x3);
			if (loop == 0) {
				b = t; bx = tx;
				i++;
			}
			loop--;
		}
	}
	else
	{
		// this is my original implementation, now used with the ANDERSEN
		// memory access scheme only.
		for (int i = begin; i < end; i++) {
			int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
			int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
			uint4 t, tx; read_keys_direct<SCHEME>(t, tx, start+32*pos);
			while (loop--)
				block_mixer<ALGO>(t, tx, x1, x2, x3);
			b ^= t; bx ^= tx;
			block_mixer<ALGO>(b, bx, x1, x2, x3);
		}
	}

	store_key<ALGO>(d_odata, b, bx);
}


TitanKernel::TitanKernel() : KernelInterface()
{
}

void TitanKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
{
	checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
}

bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream,
	uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
{
	bool success = true;

	// make some constants available to kernel, update only initially and when changing
	static int prev_N[MAX_DEVICES] = {0};
	if (N != prev_N[thr_id]) {
		uint32_t h_N = N;
		uint32_t h_N_1 = N-1;
		uint32_t h_SCRATCH = SCRATCH;
		uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
		uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;

		cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
		cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);

		prev_N[thr_id] = N;
	}

	// First phase: Sequential writes to scratchpad.

	int batch = device_batchsize[thr_id];

	unsigned int pos = 0;
	do {
		if (LOOKUP_GAP == 1) {
			if (IS_SCRYPT())      titan_scrypt_core_kernelA<A_SCRYPT,      ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE>   <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
		} else {
			if (IS_SCRYPT())      titan_scrypt_core_kernelA_LG<A_SCRYPT,      ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE>   <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
		}
		pos += batch;

	} while (pos < N);

	// Second phase: Random read access from scratchpad.

	pos = 0;
	do {
		if (LOOKUP_GAP == 1) {
			if (IS_SCRYPT())      titan_scrypt_core_kernelB<A_SCRYPT,      ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE>   <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
		} else {
			if (IS_SCRYPT())      titan_scrypt_core_kernelB_LG<A_SCRYPT,      ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE>   <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
		}
		pos += batch;

	} while (pos < N);

	return success;
}
@@ -0,0 +1,26 @@

#ifndef TITAN_KERNEL_H
#define TITAN_KERNEL_H

#include "salsa_kernel.h"

class TitanKernel : public KernelInterface
{
public:
	TitanKernel();

	virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
	virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);

	virtual char get_identifier() { return 't'; }
	virtual int get_major_version() { return 3; }
	virtual int get_minor_version() { return 5; }

	virtual int max_warps_per_block() { return 32; }
	virtual int get_texel_width() { return 4; }
	virtual bool no_textures() { return true; }
	virtual int threads_per_wu() { return 4; }
	virtual bool support_lookup_gap() { return true; }
	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
};

#endif // #ifndef TITAN_KERNEL_H