scrypt: reduce cpu usage (flush/sync)
and do some minimal keccak changes (for jane)
parent 9aace79718
commit b1bddb54d6
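
Summary of the change: the hunks below drop most explicit cuda_scrypt_flush() calls, synchronize on the stream that was just given work (nxt instead of cur), and poll with short 50 us sleeps instead of 1 ms ones. A minimal sketch of the resulting low-CPU wait pattern, simplified from cuda_scrypt_sync() (illustrative only, not a copy of the miner's code):

#include <cuda_runtime.h>
#include <unistd.h>

// Poll the stream and yield the CPU between polls instead of spin-waiting.
static bool wait_stream_low_cpu(cudaStream_t stream, unsigned sleep_us /* e.g. 50 */)
{
	cudaError_t err;
	while ((err = cudaStreamQuery(stream)) == cudaErrorNotReady)
		usleep(sleep_us);            // sleep instead of burning a core
	return (err == cudaSuccess);     // any other status is a real CUDA error
}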
@@ -514,10 +514,10 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 		cuda_scrypt_done(thr_id, nxt);
 
 		cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt, false);
-		cuda_scrypt_flush(thr_id, nxt);
 
-		if(!cuda_scrypt_sync(thr_id, cur)) {
-			return -1;
+		//cuda_scrypt_flush(thr_id, nxt);
+		if(!cuda_scrypt_sync(thr_id, nxt)) {
+			break;
 		}
 
 		memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput);
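
For readers unfamiliar with the cur/nxt pair used throughout this function: the scan loop double-buffers two CUDA streams so the CPU can post-process batch cur while batch nxt is still running on the GPU. A rough sketch of that ping-pong (illustrative only; the real loop lives in scanhash_scrypt_jane()):

// Illustrative only: two streams/buffers alternating roles each pass.
int nxt = 0;
do {
	int cur = nxt ^ 1;      // batch finished on the previous pass
	// queue keccak + scrypt core + async D2H copy on stream[nxt] ...
	// wait on stream[nxt] (the hunk above now syncs nxt, and breaks instead of
	// returning so the cleanup at the end of the function still runs) ...
	// ... then validate/submit the results of batch[cur] on the CPU
	(void)cur;
	nxt = (nxt + 1) & 1;    // swap roles
} while (0 /* real code: while n <= max_nonce && !work_restart */);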
@@ -562,51 +562,49 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 		cuda_scrypt_serialize(thr_id, nxt);
 		pre_keccak512(thr_id, nxt, nonce[nxt], throughput);
 		cuda_scrypt_core(thr_id, nxt, N);
-		cuda_scrypt_flush(thr_id, nxt); // required
+		//cuda_scrypt_flush(thr_id, nxt);
+		if (!cuda_scrypt_sync(thr_id, nxt)) {
+			break;
+		}
 
 		post_keccak512(thr_id, nxt, nonce[nxt], throughput);
 		cuda_scrypt_done(thr_id, nxt);
 
 		cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
-		cuda_scrypt_flush(thr_id, nxt); // seems required here
-
-		if (!cuda_scrypt_sync(thr_id, cur)) {
-			return -1;
+		//cuda_scrypt_flush(thr_id, nxt); // made by cuda_scrypt_sync
+		if (!cuda_scrypt_sync(thr_id, nxt)) {
+			break;
 		}
 		}
 	}
 
-	if(iteration > 0)
+	for (int i=0; iteration > 0 && i<throughput; i++)
 	{
-		for(int i=0;i<throughput;++i) {
-			volatile unsigned char *hashc = (unsigned char *)(&hash[cur][8*i]);
-
-			if (hash[cur][8*i+7] <= Htarg && fulltest(&hash[cur][8*i], ptarget))
-			{
-				uint32_t _ALIGN(64) thash[8], tdata[20];
-				uint32_t tmp_nonce = nonce[cur] + i;
-
-				for(int z=0;z<20;z++)
-					tdata[z] = bswap_32x4(pdata[z]);
-				tdata[19] = bswap_32x4(tmp_nonce);
-
-				scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128);
-				scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), N);
-				scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32);
-
-				if (memcmp(thash, &hash[cur][8*i], 32) == 0)
-				{
-					*hashes_done = n - pdata[19];
-					pdata[19] = tmp_nonce;
-					scrypt_free(&Vbuf);
-					scrypt_free(&Ybuf);
-					scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
-					delete[] data[0]; delete[] data[1];
-					gettimeofday(tv_end, NULL);
-					return 1;
-				} else {
-					applog(LOG_WARNING, "GPU #%d: %s result does not validate on CPU! (i=%d, s=%d)",
-						device_map[thr_id], device_name[thr_id], i, cur);
-				}
+		if (hash[cur][8*i+7] <= Htarg && fulltest(&hash[cur][8*i], ptarget))
+		{
+			uint32_t _ALIGN(64) thash[8], tdata[20];
+			uint32_t tmp_nonce = nonce[cur] + i;
+
+			for(int z=0;z<19;z++)
+				tdata[z] = bswap_32x4(pdata[z]);
+			tdata[19] = bswap_32x4(tmp_nonce);
+
+			scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128);
+			scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), N);
+			scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32);
+
+			if (memcmp(thash, &hash[cur][8*i], 32) == 0)
+			{
+				*hashes_done = n - pdata[19];
+				pdata[19] = tmp_nonce;
+				scrypt_free(&Vbuf);
+				scrypt_free(&Ybuf);
+				scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
+				delete[] data[0]; delete[] data[1];
+				gettimeofday(tv_end, NULL);
+				return 1;
+			} else {
+				applog(LOG_WARNING, "GPU #%d: %s result does not validate on CPU! (i=%d, s=%d)",
+					device_map[thr_id], device_name[thr_id], i, cur);
 			}
 		}
 	}
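
The CPU-side validation above first compares only the top 32-bit word of the GPU hash against Htarg before paying for fulltest() and the full scrypt-jane recomputation. A hedged illustration of that filter (names are local to this sketch, not the miner's API):

#include <stdint.h>
#include <stdbool.h>

// hash32 holds the 256-bit hash as 8 little-endian 32-bit words; word 7 is the
// most significant, so comparing it with Htarg (= ptarget[7]) rejects nearly all
// nonces cheaply. The full comparison against ptarget still happens afterwards.
static bool maybe_below_target(const uint32_t hash32[8], uint32_t Htarg)
{
	return hash32[7] <= Htarg;
}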
@@ -615,7 +613,7 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 		nxt = (nxt+1)&1;
 		++iteration;
 	} while (n <= max_nonce && !work_restart[thr_id].restart);
-
+out:
 	scrypt_free(&Vbuf);
 	scrypt_free(&Ybuf);
 	scrypt_free(&Xbuf[0]); scrypt_free(&Xbuf[1]);
scrypt.cpp (17 changed lines)
@@ -802,9 +802,8 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsign
 		cuda_scrypt_done(thr_id, nxt);
 
 		cuda_scrypt_DtoH(thr_id, X[nxt], nxt, false);
-		cuda_scrypt_flush(thr_id, nxt);
-
-		if(!cuda_scrypt_sync(thr_id, cur))
+		//cuda_scrypt_flush(thr_id, nxt);
+		if(!cuda_scrypt_sync(thr_id, nxt))
 		{
 			result = -1;
 			break;
@@ -858,15 +857,19 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsign
 		pre_sha256(thr_id, nxt, nonce[nxt], throughput);
 
 		cuda_scrypt_core(thr_id, nxt, N);
-		cuda_scrypt_flush(thr_id, nxt); // required here ?
+		// cuda_scrypt_flush(thr_id, nxt);
+		if (!cuda_scrypt_sync(thr_id, nxt)) {
+			printf("error\n");
+			result = -1;
+			break;
+		}
 
 		post_sha256(thr_id, nxt, throughput);
 		cuda_scrypt_done(thr_id, nxt);
 
 		cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
-		cuda_scrypt_flush(thr_id, nxt); // required here ?
-
-		if (!cuda_scrypt_sync(thr_id, cur)) {
+		// cuda_scrypt_flush(thr_id, nxt);
+		if (!cuda_scrypt_sync(thr_id, nxt)) {
 			printf("error\n");
 			result = -1;
 			break;
scrypt/keccak.cu (141 changed lines)
@@ -112,19 +112,19 @@ static const uint64_t host_keccak_round_constants[24] = {
 };
 
 __constant__ uint64_t c_keccak_round_constants[24];
-__constant__ uint32_t pdata[20];
+__constant__ uint32_t c_data[20];
 
 __device__
-void keccak_block(keccak_hash_state *S, const uint32_t *in) {
-	size_t i;
+void keccak_block(keccak_hash_state *S, const uint32_t *in)
+{
 	uint64_t *s = S->state, t[5], u[5], v, w;
 
 	/* absorb input */
 	#pragma unroll 9
-	for (i = 0; i < 72 / 8; i++, in += 2)
+	for (int i = 0; i < 72 / 8; i++, in += 2)
 		s[i] ^= U32TO64_LE(in);
 
-	for (i = 0; i < 24; i++) {
+	for (int i = 0; i < 24; i++) {
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
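
For context on the absorb loop above: U32TO64_LE is assumed to pack two consecutive little-endian 32-bit words into one 64-bit Keccak lane, which is why `in` advances by 2 per state word. Its definition is not part of this diff; a plausible shape, for illustration only:

#include <stdint.h>

/* Assumed shape of the helper used by keccak_block(); the real macro may differ. */
#define U32TO64_LE(p) ((uint64_t)(p)[0] | ((uint64_t)(p)[1] << 32))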
@@ -186,8 +186,9 @@ void keccak_block(keccak_hash_state *S, const uint32_t *in) {
 }
 
 __device__
-void keccak_hash_init(keccak_hash_state *S) {
+void keccak_hash_init(keccak_hash_state *S)
+{
 	#pragma unroll 25
 	for (int i=0; i<25; ++i)
 		S->state[i] = 0ULL;
 }
@@ -218,50 +219,62 @@ __device__ void keccak_hash_update64(keccak_hash_state *S, const uint32_t *in) {
 	mycpy64(S->buffer, in);
 }
 
-__device__ void keccak_hash_finish8(keccak_hash_state *S, uint32_t *hash) {
+__device__
+void keccak_hash_finish8(keccak_hash_state *S, uint32_t *hash)
+{
 	S->buffer[8/4] = 0x01;
 	#pragma unroll 15
 	for (int i=8/4+1; i < 72/4; ++i) S->buffer[i] = 0;
-	S->buffer[72/4 - 1] |= 0x80000000;
+	S->buffer[72/4 - 1] |= 0x80000000U;
 	keccak_block(S, (const uint32_t*)S->buffer);
-	#pragma unroll 8
-	for (size_t i = 0; i < 64; i += 8) {
+
+	#pragma unroll 8
+	for (int i = 0; i < 64; i += 8) {
 		U64TO32_LE((&hash[i/4]), S->state[i / 8]);
 	}
 }
 
-__device__ void keccak_hash_finish12(keccak_hash_state *S, uint32_t *hash) {
+__device__
+void keccak_hash_finish12(keccak_hash_state *S, uint32_t *hash)
+{
 	S->buffer[12/4] = 0x01;
 	#pragma unroll 14
 	for (int i=12/4+1; i < 72/4; ++i) S->buffer[i] = 0;
-	S->buffer[72/4 - 1] |= 0x80000000;
+	S->buffer[72/4 - 1] |= 0x80000000U;
 	keccak_block(S, (const uint32_t*)S->buffer);
-	#pragma unroll 8
-	for (size_t i = 0; i < 64; i += 8) {
+
+	#pragma unroll 8
+	for (int i = 0; i < 64; i += 8) {
 		U64TO32_LE((&hash[i/4]), S->state[i / 8]);
 	}
 }
 
-__device__ void keccak_hash_finish60(keccak_hash_state *S, uint32_t *hash) {
+__device__
+void keccak_hash_finish60(keccak_hash_state *S, uint32_t *hash)
+{
 	S->buffer[60/4] = 0x01;
-	#pragma unroll 2
+	#pragma unroll
 	for (int i=60/4+1; i < 72/4; ++i) S->buffer[i] = 0;
-	S->buffer[72/4 - 1] |= 0x80000000;
+	S->buffer[72/4 - 1] |= 0x80000000U;
 	keccak_block(S, (const uint32_t*)S->buffer);
-	#pragma unroll 8
-	for (size_t i = 0; i < 64; i += 8) {
+
+	#pragma unroll 8
+	for (int i = 0; i < 64; i += 8) {
 		U64TO32_LE((&hash[i/4]), S->state[i / 8]);
 	}
 }
 
-__device__ void keccak_hash_finish64(keccak_hash_state *S, uint32_t *hash) {
+__device__
+void keccak_hash_finish64(keccak_hash_state *S, uint32_t *hash)
+{
 	S->buffer[64/4] = 0x01;
-	#pragma unroll 1
+	#pragma unroll
 	for (int i=64/4+1; i < 72/4; ++i) S->buffer[i] = 0;
-	S->buffer[72/4 - 1] |= 0x80000000;
+	S->buffer[72/4 - 1] |= 0x80000000U;
 	keccak_block(S, (const uint32_t*)S->buffer);
-	#pragma unroll 8
-	for (size_t i = 0; i < 64; i += 8) {
+
+	#pragma unroll 8
+	for (int i = 0; i < 64; i += 8) {
 		U64TO32_LE((&hash[i/4]), S->state[i / 8]);
 	}
 }
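
All four keccak_hash_finishN() variants above share the same padding rule. A plain-C sketch of it for illustration (host-side stand-in, assuming N is a multiple of 4, as it is for 8/12/60/64):

#include <stdint.h>

// Pad a 72-byte Keccak rate buffer after msg_bytes of message: 0x01 right after
// the message, zeros up to the end, and the top bit of the last word (the
// 0x80000000U constant in the diff) as the closing '1' of the pad10*1 rule.
static void keccak_pad_rate(uint32_t buf[72 / 4], int msg_bytes)
{
	buf[msg_bytes / 4] = 0x01;
	for (int i = msg_bytes / 4 + 1; i < 72 / 4; i++)
		buf[i] = 0;
	buf[72 / 4 - 1] |= 0x80000000U;
}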
@@ -275,7 +288,8 @@ typedef struct pbkdf2_hmac_state_t {
 } pbkdf2_hmac_state;
 
 
-__device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m) {
+__device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m)
+{
 	keccak_hash_state st;
 	keccak_hash_init(&st);
 	keccak_hash_update72(&st, m);
@@ -284,32 +298,32 @@ __device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m) {
 }
 
 /* hmac */
-__device__ void pbkdf2_hmac_init80(pbkdf2_hmac_state *st, const uint32_t *key) {
-	uint32_t pad[72/4];
-	size_t i;
+__device__
+void pbkdf2_hmac_init80(pbkdf2_hmac_state *st, const uint32_t *key)
+{
+	uint32_t pad[72/4] = { 0 };
+	//#pragma unroll 18
+	//for (int i = 0; i < 72/4; i++)
+	//	pad[i] = 0;
 
 	keccak_hash_init(&st->inner);
 	keccak_hash_init(&st->outer);
 
-	#pragma unroll 18
-	for (i = 0; i < 72/4; i++)
-		pad[i] = 0;
-
 	/* key > blocksize bytes, hash it */
 	pbkdf2_hash(pad, key);
 
 	/* inner = (key ^ 0x36) */
 	/* h(inner || ...) */
 	#pragma unroll 18
-	for (i = 0; i < 72/4; i++)
-		pad[i] ^= 0x36363636;
+	for (int i = 0; i < 72/4; i++)
+		pad[i] ^= 0x36363636U;
 	keccak_hash_update72(&st->inner, pad);
 
 	/* outer = (key ^ 0x5c) */
 	/* h(outer || ...) */
 	#pragma unroll 18
-	for (i = 0; i < 72/4; i++)
-		pad[i] ^= 0x6a6a6a6a;
+	for (int i = 0; i < 72/4; i++)
+		pad[i] ^= 0x6a6a6a6aU;
 	keccak_hash_update72(&st->outer, pad);
 }
 
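
A note on the XOR constants above: pad[] is reused in place, so after the first pass turns it into key ^ 0x36 (the HMAC ipad), the second pass uses 0x6a rather than 0x5c because 0x36 ^ 0x6a == 0x5c; the array ends up as key ^ 0x5c (the opad), exactly as the comment says. A tiny standalone check:

#include <assert.h>

int main(void)
{
	unsigned char key = 0xAB;                       /* any byte */
	assert(((key ^ 0x36) ^ 0x6a) == (key ^ 0x5c));  /* ipad -> opad in place */
	return 0;
}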
@@ -370,21 +384,20 @@ __device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s) {
 __global__ __launch_bounds__(128)
 void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
 {
-	nonce += (blockIdx.x * blockDim.x) + threadIdx.x;
-	g_idata += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-
 	uint32_t data[20];
 
+	const uint32_t thread = (blockIdx.x * blockDim.x) + threadIdx.x;
+	nonce += thread;
+	g_idata += thread * 32;
+
 	#pragma unroll
-	for (int i=0; i <19; ++i)
-		data[i] = cuda_swab32(pdata[i]);
+	for (int i=0; i<19; i++)
+		data[i] = cuda_swab32(c_data[i]);
 	data[19] = cuda_swab32(nonce);
 
 	// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)data, 80, (uint8_t*)g_idata, 128);
 
-	pbkdf2_hmac_state hmac_pw, work;
-	uint32_t ti[16];
-	uint32_t be;
+	pbkdf2_hmac_state hmac_pw;
 
 	/* hmac(password, ...) */
 	pbkdf2_hmac_init80(&hmac_pw, data);
@@ -393,14 +406,17 @@ void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
 	pbkdf2_hmac_update72(&hmac_pw, data);
 	pbkdf2_hmac_update8(&hmac_pw, data+72/4);
 
+	pbkdf2_hmac_state work;
+	uint32_t ti[16];
+
 	/* U1 = hmac(password, salt || be(i)) */
-	be = cuda_swab32(1);
+	uint32_t be = 0x01000000U;//cuda_swab32(1);
 	pbkdf2_statecopy8(&work, &hmac_pw);
 	pbkdf2_hmac_update4_8(&work, &be);
 	pbkdf2_hmac_finish12(&work, ti);
 	mycpy64(g_idata, ti);
 
-	be = cuda_swab32(2);
+	be = 0x02000000U;//cuda_swab32(2);
 	pbkdf2_statecopy8(&work, &hmac_pw);
 	pbkdf2_hmac_update4_8(&work, &be);
 	pbkdf2_hmac_finish12(&work, ti);
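
The 0x01000000U / 0x02000000U constants above are simply the byte-swapped block indices that cuda_swab32(1) and cuda_swab32(2) produced before: PBKDF2 appends the block counter big-endian, which on a little-endian GPU is the swapped value. A quick host-side check of the equivalence (swab32 here is a local stand-in for cuda_swab32):

#include <assert.h>
#include <stdint.h>

static uint32_t swab32(uint32_t x)
{
	return (x << 24) | ((x & 0x0000ff00u) << 8) | ((x & 0x00ff0000u) >> 8) | (x >> 24);
}

int main(void)
{
	assert(swab32(1) == 0x01000000u);
	assert(swab32(2) == 0x02000000u);
	return 0;
}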
@@ -411,22 +427,21 @@ void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
 __global__ __launch_bounds__(128)
 void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce)
 {
-	nonce += (blockIdx.x * blockDim.x) + threadIdx.x;
-	g_odata += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-	g_hash += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-
 	uint32_t data[20];
 
-	#pragma unroll 19
-	for (int i=0; i <19; ++i)
-		data[i] = cuda_swab32(pdata[i]);
+	const uint32_t thread = (blockIdx.x * blockDim.x) + threadIdx.x;
+	g_hash += thread * 8;
+	g_odata += thread * 32;
+	nonce += thread;
+
+	#pragma unroll
+	for (int i=0; i<19; i++)
+		data[i] = cuda_swab32(c_data[i]);
 	data[19] = cuda_swab32(nonce);
 
 	// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)g_odata, 128, (uint8_t*)g_hash, 32);
 
 	pbkdf2_hmac_state hmac_pw;
-	uint32_t ti[16];
-	uint32_t be;
 
 	/* hmac(password, ...) */
 	pbkdf2_hmac_init80(&hmac_pw, data);
@@ -435,8 +450,10 @@ void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce)
 	pbkdf2_hmac_update72(&hmac_pw, g_odata);
 	pbkdf2_hmac_update56(&hmac_pw, g_odata+72/4);
 
+	uint32_t ti[16];
+
 	/* U1 = hmac(password, salt || be(i)) */
-	be = cuda_swab32(1);
+	uint32_t be = 0x01000000U;//cuda_swab32(1);
 	pbkdf2_hmac_update4_56(&hmac_pw, &be);
 	pbkdf2_hmac_finish60(&hmac_pw, ti);
 	mycpy32(g_hash, ti);
@@ -455,7 +472,7 @@ extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20])
 		checkCudaErrors(cudaMemcpyToSymbol(c_keccak_round_constants, host_keccak_round_constants, sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
 		init[thr_id] = true;
 	}
-	checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+	checkCudaErrors(cudaMemcpyToSymbol(c_data, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 }
 
 extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput)
@@ -54,6 +54,8 @@ void NV2Kernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
 bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
 {
 	bool success = true;
+	bool scrypt = IS_SCRYPT();
+	bool chacha = IS_SCRYPT_JANE();
 
 	// make some constants available to kernel, update only initially and when changing
 	static uint32_t prev_N[MAX_GPUS] = { 0 };
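
The two locals added above (and in the TitanKernel hunks further down) just hoist the algorithm check out of the per-batch launch loops. A sketch of the pattern with stand-in predicates (illustrative only, not the miner's code):

static bool IS_SCRYPT()      { return true;  }  // stand-ins for the real macros
static bool IS_SCRYPT_JANE() { return false; }

static void launch_all_batches(int batches)
{
	bool scrypt = IS_SCRYPT();        // evaluated once per call...
	bool chacha = IS_SCRYPT_JANE();
	for (int i = 0; i < batches; i++) {
		if (scrypt) { /* launch the scrypt kernel for this batch */ }
		if (chacha) { /* launch the scrypt-jane (chacha) kernel */ }
	}                                 // ...instead of once per launch
}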
@@ -77,11 +79,11 @@ bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr
 	do
 	{
 		if (LOOKUP_GAP == 1) {
-			if (IS_SCRYPT()) nv2_scrypt_core_kernelA<A_SCRYPT> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
-			if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelA<A_SCRYPT_JANE><<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (scrypt) nv2_scrypt_core_kernelA<A_SCRYPT> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (chacha) nv2_scrypt_core_kernelA<A_SCRYPT_JANE><<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
 		} else {
-			if (IS_SCRYPT()) nv2_scrypt_core_kernelA_LG<A_SCRYPT> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
-			if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelA_LG<A_SCRYPT_JANE><<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (scrypt) nv2_scrypt_core_kernelA_LG<A_SCRYPT> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (chacha) nv2_scrypt_core_kernelA_LG<A_SCRYPT_JANE><<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
 		}
 		pos += batch;
 	} while (pos < N);
@@ -91,11 +93,11 @@ bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr
 	do
 	{
 		if (LOOKUP_GAP == 1) {
-			if (IS_SCRYPT()) nv2_scrypt_core_kernelB<A_SCRYPT > <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
-			if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelB<A_SCRYPT_JANE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (scrypt) nv2_scrypt_core_kernelB<A_SCRYPT > <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (chacha) nv2_scrypt_core_kernelB<A_SCRYPT_JANE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
 		} else {
-			if (IS_SCRYPT()) nv2_scrypt_core_kernelB_LG<A_SCRYPT > <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
-			if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelB_LG<A_SCRYPT_JANE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (scrypt) nv2_scrypt_core_kernelB_LG<A_SCRYPT > <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (chacha) nv2_scrypt_core_kernelB_LG<A_SCRYPT_JANE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
 		}
 
 		pos += batch;
@@ -819,8 +819,9 @@ void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA)
 bool cuda_scrypt_sync(int thr_id, int stream)
 {
 	cudaError_t err;
+	uint32_t wait_us = 0;
 
-	if(device_interactive[thr_id] && !opt_benchmark)
+	if (device_interactive[thr_id] && !opt_benchmark)
 	{
 		// For devices that also do desktop rendering or compositing, we want to free up some time slots.
 		// That requires making a pause in work submission when there is no active task on the GPU,
|
|||||||
//err = cudaDeviceSynchronize();
|
//err = cudaDeviceSynchronize();
|
||||||
|
|
||||||
while((err = cudaStreamQuery(context_streams[0][thr_id])) == cudaErrorNotReady ||
|
while((err = cudaStreamQuery(context_streams[0][thr_id])) == cudaErrorNotReady ||
|
||||||
(err == cudaSuccess && (err = cudaStreamQuery(context_streams[1][thr_id])) == cudaErrorNotReady))
|
(err == cudaSuccess && (err = cudaStreamQuery(context_streams[1][thr_id])) == cudaErrorNotReady)) {
|
||||||
usleep(1000);
|
usleep(50); wait_us+=50;
|
||||||
|
}
|
||||||
|
|
||||||
usleep(1000);
|
usleep(50); wait_us+=50;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
// this call was replaced by the loop below to workaround the high CPU usage issue
|
// this call was replaced by the loop below to workaround the high CPU usage issue
|
||||||
//err = cudaStreamSynchronize(context_streams[stream][thr_id]);
|
//err = cudaStreamSynchronize(context_streams[stream][thr_id]);
|
||||||
|
|
||||||
while((err = cudaStreamQuery(context_streams[stream][thr_id])) == cudaErrorNotReady)
|
while((err = cudaStreamQuery(context_streams[stream][thr_id])) == cudaErrorNotReady) {
|
||||||
usleep(1000);
|
usleep(50); wait_us+=50;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(err != cudaSuccess)
|
if (err != cudaSuccess) {
|
||||||
{
|
|
||||||
if (!abort_flag)
|
if (!abort_flag)
|
||||||
applog(LOG_ERR, "GPU #%d: CUDA error `%s` while executing the kernel.", device_map[thr_id], cudaGetErrorString(err));
|
applog(LOG_ERR, "GPU #%d: CUDA error `%s` while waiting the kernel.", device_map[thr_id], cudaGetErrorString(err));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//if (opt_debug) {
|
||||||
|
// applog(LOG_DEBUG, "GPU #%d: %s %u us", device_map[thr_id], __FUNCTION__, wait_us);
|
||||||
|
//}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
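
Related aside (an alternative this commit does not take): the CUDA runtime can be told to block rather than spin when the host waits on the device, which also keeps CPU usage down at the cost of some wake-up latency. It must be set before the context is created on that device:

#include <cuda_runtime.h>

void configure_blocking_sync(int dev)
{
	cudaSetDevice(dev);
	// With this flag, cudaStreamSynchronize()/cudaDeviceSynchronize() yield the CPU
	// instead of spin-waiting. Call it before any context-creating CUDA call on dev.
	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}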
@@ -676,6 +676,8 @@ bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int t
 	uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
 {
 	bool success = true;
+	bool scrypt = IS_SCRYPT();
+	bool chacha = IS_SCRYPT_JANE();
 
 	// make some constants available to kernel, update only initially and when changing
 	static uint32_t prev_N[MAX_GPUS] = { 0 };
@@ -703,11 +705,11 @@ bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int t
 	unsigned int pos = 0;
 	do {
 		if (LOOKUP_GAP == 1) {
-			if (IS_SCRYPT()) titan_scrypt_core_kernelA<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
-			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (scrypt) titan_scrypt_core_kernelA<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (chacha) titan_scrypt_core_kernelA<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
 		} else {
-			if (IS_SCRYPT()) titan_scrypt_core_kernelA_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
-			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (scrypt) titan_scrypt_core_kernelA_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (chacha) titan_scrypt_core_kernelA_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
 		}
 		pos += batch;
 
@@ -718,11 +720,11 @@ bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int t
 	pos = 0;
 	do {
 		if (LOOKUP_GAP == 1) {
-			if (IS_SCRYPT()) titan_scrypt_core_kernelB<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
-			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (scrypt) titan_scrypt_core_kernelB<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (chacha) titan_scrypt_core_kernelB<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
 		} else {
-			if (IS_SCRYPT()) titan_scrypt_core_kernelB_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
-			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (scrypt) titan_scrypt_core_kernelB_LG<A_SCRYPT, ANDERSEN> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (chacha) titan_scrypt_core_kernelB_LG<A_SCRYPT_JANE, SIMPLE> <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
 		}
 		pos += batch;
 