Browse Source
Indent, link --intensity, and some cleanup. Tested speed on Linux ~= 160 kH/s on a 750 Ti (Black Edition). To be continued... (branch: master)
Tanguy Pruvot
10 years ago
11 changed files with 2852 additions and 2 deletions
@ -0,0 +1,83 @@
@@ -0,0 +1,83 @@
|
||||
|
||||
extern "C" { |
||||
#include "neoscrypt/neoscrypt.h" |
||||
} |
||||
|
||||
#include "cuda_helper.h" |
||||
#include "miner.h" |
||||
|
||||
static uint32_t *d_hash[MAX_GPUS] ; |
||||
extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget); |
||||
extern void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t* hash); |
||||
extern uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order); |
||||
|
||||
#define SHIFT 130 |
||||
|
||||
int scanhash_neoscrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) |
||||
{ |
||||
const uint32_t first_nonce = pdata[19]; |
||||
const int stratum = have_stratum; |
||||
|
||||
if (opt_benchmark) |
||||
((uint32_t*)ptarget)[7] = 0x0000ff; |
||||
|
||||
int intensity = is_windows() ? 18 : 19; |
||||
uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); |
||||
throughput = throughput / 32; /* set for max intensity ~= 20 */ |
||||
throughput = min(throughput, max_nonce - first_nonce + 1); |
||||
|
||||
static bool init[MAX_GPUS] = { 0 }; |
||||
if (!init[thr_id]) |
||||
{ |
||||
cudaSetDevice(device_map[thr_id]); |
||||
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); |
||||
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * SHIFT * sizeof(uint64_t) * throughput)); |
||||
neoscrypt_cpu_init(thr_id, throughput, d_hash[thr_id]); |
||||
|
||||
applog(LOG_INFO, "Using %d cuda threads", throughput); |
||||
|
||||
init[thr_id] = true; |
||||
} |
||||
|
||||
uint32_t endiandata[20]; |
||||
if (stratum) { |
||||
for (int k = 0; k < 20; k++) |
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); |
||||
} else { |
||||
for (int k = 0; k < 20; k++) |
||||
endiandata[k] = pdata[k]; |
||||
} |
||||
|
||||
neoscrypt_setBlockTarget(endiandata,ptarget); |
||||
|
||||
do { |
||||
uint32_t foundNonce = neoscrypt_cpu_hash_k4(stratum, thr_id, throughput, pdata[19], 0); |
||||
if (foundNonce != UINT32_MAX) |
||||
{ |
||||
uint32_t _ALIGN(64) vhash64[8]; |
||||
|
||||
*hashes_done = pdata[19] - first_nonce + 1; |
||||
|
||||
if (stratum) { |
||||
be32enc(&endiandata[19], foundNonce); |
||||
} else { |
||||
endiandata[19] = foundNonce; |
||||
} |
||||
neoscrypt((uchar*) endiandata, (uchar*)vhash64, 0x80000620); |
||||
|
||||
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { |
||||
pdata[19] = foundNonce; |
||||
return 1; |
||||
} else { |
||||
applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce); |
||||
} |
||||
} |
||||
|
||||
pdata[19] += throughput; |
||||
|
||||
} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); |
||||
|
||||
*hashes_done = pdata[19] - first_nonce + 1; |
||||
return 0; |
||||
} |
@ -0,0 +1,607 @@
@@ -0,0 +1,607 @@
|
||||
#include <stdio.h> |
||||
#include <memory.h> |
||||
|
||||
#include "cuda_helper.h" |
||||
#include "cuda_vectors.h" |
||||
|
||||
// Device-side base pointer of the scratch area; its value is uploaded by
// neoscrypt_cpu_init() with cudaMemcpyToSymbol.
__device__ uint4 * W;

// Per-GPU device buffers. d_NNonce holds the nonce reported by the k4 kernel;
// d_nnounce is not referenced in the visible part of this file.
uint32_t *d_NNonce[MAX_GPUS];
uint32_t *d_nnounce[MAX_GPUS];

// Constant-memory inputs uploaded by neoscrypt_setBlockTarget().
__constant__ uint32_t pTarget[8];
__constant__ uint32_t key_init[16];
__constant__ uint32_t input_init[16];
__constant__ uint32_t c_data[80];

#define SALSA_SMALL_UNROLL  1
#define CHACHA_SMALL_UNROLL 1

// Sizes in bytes.
#define BLAKE2S_BLOCK_SIZE  64U
#define BLAKE2S_OUT_SIZE    32U
#define BLAKE2S_KEY_SIZE    32U
#define BLOCK_SIZE          64U
#define FASTKDF_BUFFER_SIZE 256U
#define PASSWORD_LEN        80U

/// constants ///
||||
/// constants /// |
||||
|
||||
// Initial vector used by the device-side Blake2S().
static const __constant__ uint8 BLAKE2S_IV_Vec = {
	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

// Host copy of the same initial vector, used by Blake2Shost().
static const uint8 BLAKE2S_IV_Vechost = {
	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

// Message-word permutation, one row per round. The host table is copied into
// the __constant__ table below by neoscrypt_cpu_init().
static const uint32_t BLAKE2S_SIGMA_host[10][16] = {
	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};

__constant__ uint32_t BLAKE2S_SIGMA[10][16];
||||
|
||||
// Blake2S |
||||
|
||||
#define BLAKE2S_BLOCK_SIZE 64U |
||||
#define BLAKE2S_OUT_SIZE 32U |
||||
#define BLAKE2S_KEY_SIZE 32U |
||||
|
||||
// Architecture-specific forms of the two byte-granular rotations used by
// BLAKE_G: __byte_perm on SM 5.0+, the generic rotate helpers otherwise.
#if __CUDA_ARCH__ >= 500
#define BLAKE_ROT_A(v) __byte_perm(v,0, 0x1032)
#define BLAKE_ROT_B(v) __byte_perm(v,0, 0x0321)
#else
#define BLAKE_ROT_A(v) rotate(v,16)
#define BLAKE_ROT_B(v) rotateR(v,8)
#endif

// Device-side mixing step. Relies on a local `uint32_t idx` being declared in
// the calling scope; message words are picked through BLAKE2S_SIGMA.
#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \
	idx = BLAKE2S_SIGMA[idx0][idx1]; \
	a += key[idx]; \
	a += b; \
	d = BLAKE_ROT_A(d^a); \
	c += d; \
	b = rotateR(b^c, 12); \
	idx = BLAKE2S_SIGMA[idx0][idx1+1]; \
	a += key[idx]; \
	a += b; \
	d = BLAKE_ROT_B(d^a); \
	c += d; \
	b = rotateR(b^c, 7); \
}
||||
|
||||
// 32-bit rotations for host code. Both expansions are fully parenthesised so
// they can be embedded in larger expressions (ROTL32 previously was not, so
// e.g. `ROTL32(x, n) * 2` bound the multiplication to the right-hand shift only).
#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
||||
|
||||
// Host-side counterpart of BLAKE_G: same mixing step, but indexes the host
// sigma table and uses the ROTR32 macro. Needs a local `uint32_t idx` in the
// calling scope.
#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \
	idx = BLAKE2S_SIGMA_host[idx0][idx1]; \
	a += key[idx]; \
	a += b; \
	d = ROTR32(d^a,16); \
	c += d; \
	b = ROTR32(b^c, 12); \
	idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; \
	a += key[idx]; \
	a += b; \
	d = ROTR32(d^a,8); \
	c += d; \
	b = ROTR32(b^c, 7); \
}
||||
|
||||
|
||||
/**
 * Device-side two-block hash: first mixes the block at TheKey, then the block
 * at inout, and writes the resulting 8 words back to the start of inout.
 *
 * @param inout   message block read as 16 words; first 8 words are overwritten
 * @param TheKey  key block read as 16 words
 */
static __forceinline__ __device__ void Blake2S(uint32_t * inout, const uint32_t * TheKey)
{
	uint32_t idx;   // scratch index assigned inside BLAKE_G
	uint16 V;       // working state: lo = chaining half, hi = IV/counter half
	uint8 chain;    // chaining value saved before each compression

	// ---- first compression: key block ----
	V.lo = BLAKE2S_IV_Vec;
	V.lo.s0 ^= 0x01012020;
	chain = V.lo;

	V.hi = BLAKE2S_IV_Vec;
	V.hi.s4 ^= BLAKE2S_BLOCK_SIZE;

	for (int round = 0; round < 10; ++round)
	{
		// columns
		BLAKE_G(round, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey);
		BLAKE_G(round, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey);
		BLAKE_G(round, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey);
		BLAKE_G(round, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey);
		// diagonals
		BLAKE_G(round, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey);
		BLAKE_G(round, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey);
		BLAKE_G(round, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey);
		BLAKE_G(round, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey);
	}

	V.lo ^= V.hi;
	V.lo ^= chain;

	// ---- second compression: message block ----
	chain = V.lo;
	V.hi = BLAKE2S_IV_Vec;
	V.hi.s4 ^= 128;
	V.hi.s6 = ~V.hi.s6;

	for (int round = 0; round < 10; ++round)
	{
		// columns
		BLAKE_G(round, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout);
		BLAKE_G(round, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout);
		BLAKE_G(round, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout);
		BLAKE_G(round, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout);
		// diagonals
		BLAKE_G(round, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout);
		BLAKE_G(round, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout);
		BLAKE_G(round, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout);
		BLAKE_G(round, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout);
	}

	V.lo ^= V.hi ^ chain;

	// Store the 8-word result over the start of the input block.
	((uint8*)inout)[0]=V.lo;
}
||||
|
||||
/**
 * Host-side counterpart of Blake2S(): mixes the block at inkey, then the block
 * at inout, and writes the resulting 8 words back to the start of inout.
 *
 * @param inout  message block read as 16 words; first 8 words are overwritten
 * @param inkey  key block read as 16 words
 */
static __forceinline__ __host__ void Blake2Shost(uint32_t * inout, const uint32_t * inkey)
{
	uint16 V;
	uint32_t idx;   // scratch index assigned inside BLAKE_Ghost
	uint8 tmpblock;

	V.hi = BLAKE2S_IV_Vechost;
	V.lo = BLAKE2S_IV_Vechost;
	V.lo.s0 ^= 0x01012020;

	// Copy input block for later
	tmpblock = V.lo;

	V.hi.s4 ^= BLAKE2S_BLOCK_SIZE;

	for (int x = 0; x < 10; ++x)
	{
		BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey);
		BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey);
		BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey);
		BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey);
		BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey);
		BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey);
		BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey);
		BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey);
	}

	V.lo ^= V.hi;
	V.lo ^= tmpblock;

	V.hi = BLAKE2S_IV_Vechost;
	tmpblock = V.lo;

	V.hi.s4 ^= 128;
	V.hi.s6 = ~V.hi.s6;

	for (int x = 0; x < 10; ++x)
	{
		BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout);
		BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout);
		BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout);
		BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout);
		BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout);
		BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout);
		BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout);
		BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout);
	}

	V.lo ^= V.hi ^ tmpblock;

	// Store through memcpy rather than a (uint8*) cast: the caller passes a
	// plain uint32_t array, which is not guaranteed to satisfy the alignment
	// of the vector struct, so an aligned struct store on the host is unsafe.
	memcpy(inout, &V.lo, sizeof(V.lo));
}
||||
|
||||
/**
 * FastKDF stage producing FASTKDF_BUFFER_SIZE (256) bytes of output.
 *
 * The first Blake2S result is not computed here: it is read from the
 * input_init / key_init constants uploaded by the host.
 *
 * @param password  source words; the first 256 bytes seed both work buffers
 * @param output    destination, exactly 256 bytes are written
 *
 * NOTE(review): the vector casts assume uintx64 / uint816 / uint48 are 256 /
 * 64 / 32 bytes wide, which is what the indices used below imply - confirm
 * against cuda_vectors.h.
 */
static __forceinline__ __device__ void fastkdf256(const uint32_t* password, uint8_t* output)
{
	uint8_t bufidx = 0;
	uchar4 bufhelper;
	uint8_t A[320],B[288];

	// A = first 256 password bytes followed by a 64-byte mirror of its start.
	((uintx64*)A)[0] = ((uintx64*)password)[0];
	((uint816 *)A)[4] = ((uint816 *)password)[0];

	// B = first 256 password bytes followed by a 32-byte mirror of its start.
	((uintx64*)B)[0] = ((uintx64*)password)[0];
	((uint48 *)B)[8] = ((uint48 *)password)[0];

	uint32_t input[BLAKE2S_BLOCK_SIZE/4];
	uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 };

	((uint816*)input)[0] = ((uint816*)input_init)[0];
	((uint48*)key)[0] = ((uint48*)key_init)[0];

	for (int i = 0; i < 32; ++i)
	{
		// bufidx = byte-wise sum (mod 256) of the 32-byte hash held in input.
		bufhelper = ((uchar4*)input)[0];
		for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x)
			bufhelper += ((uchar4*)input)[x];
		bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;

		int qbuf = bufidx/4;
		int rbuf = bufidx&3;
		int bitbuf = rbuf << 3;
		uint32_t shifted[9];

		// XOR the hash into B at byte offset bufidx (word index + bit shift).
		shift256R2(shifted, ((uint8*)input)[0], bitbuf);

		for (int k = 0; k < 9; ++k) {
			((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
		}

		// Keep the 32-byte head and the mirror at offset 256 in sync.
		if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];}
		else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}

		if (i<31) {
			// Next message block / key are taken at byte offset bufidx.
			for (int k = 0; k <BLAKE2S_BLOCK_SIZE / 4; k++) {
				((uchar4*)(input))[k] = make_uchar4(
					(A + bufidx)[4 * k], (A + bufidx)[4 * k + 1],
					(A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]
				);
			}

			for (int k = 0; k <BLAKE2S_KEY_SIZE / 4; k++) {
				((uchar4*)(key))[k] = make_uchar4(
					(B + bufidx)[4 * k], (B + bufidx)[4 * k + 1],
					(B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]
				);
			}

			Blake2S((uint32_t*)input, key);
		}
	}

	// Output = (B rotated left by bufidx bytes) XOR A, over 256 bytes.
	const int out_len = (int)FASTKDF_BUFFER_SIZE;
	int left = out_len - bufidx;
	int qleft =left/4;
	int rleft =left&3;

	// Whole words that lie before the wrap point.
	for (int k = 0; k < qleft; ++k) {
		((uchar4*)output)[k] = make_uchar4(
			(B + bufidx)[4 * k], (B + bufidx)[4 * k + 1],
			(B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]
		) ^ ((uchar4*)A)[k];
	}
	// Bytes of the straddling word that are still before the wrap point.
	for (int i = 4*qleft; i < 4*qleft+rleft; ++i) {
		output[i] = (B + bufidx)[i] ^ A[i];
	}
	// Remaining bytes of the straddling word, taken from the start of B.
	// Bounded by out_len: with bufidx == 0 there is no straddling word and the
	// unbounded loop used to write output[256..259], past the end of output.
	const int straddle_end = ((qleft + 1) * 4 < out_len) ? (qleft + 1) * 4 : out_len;
	for (int i = qleft*4+rleft; i < straddle_end; ++i) {
		((uint8_t *)output)[i] = ((uint8_t *)B)[i - left] ^ ((uint8_t *)A)[i];
	}
	// Whole words after the wrap point.
	for (int i = qleft+1; i < out_len/4; ++i) {
		((uchar4 *)output)[i] = make_uchar4(B[4*i - left],B[4*i+1-left],
			B[4*i+2-left],B[4*i+3-left]) ^ ((uchar4 *)A)[i];
	}
}
||||
|
||||
/**
 * FastKDF stage used for the final hash check. Only word 7 of the output is
 * computed and stored, because that is the only word the caller compares
 * against the target.
 *
 * @param password  source words; the first 256 bytes (plus a 64-byte mirror)
 *                  form buffer A
 * @param salt      salt words; the first 256 bytes (plus a 32-byte mirror)
 *                  form buffer B
 * @param output    only output[7] is written
 *
 * NOTE(review): the vector casts assume uintx64 / uint816 / uint48 are 256 /
 * 64 / 32 bytes wide, which is what the indices used below imply - confirm
 * against cuda_vectors.h.
 */
static __forceinline__ __device__ void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * output)
{
	uint8_t bufidx = 0;
	uchar4 bufhelper;

	uint8_t A[320];
	uint8_t B[288];

	// Initialize the password buffer
	((uintx64*)A)[0] = ((uintx64*)password)[0];
	((uint816*)A)[4] = ((uint816*)password)[0];

	// Initialize the salt buffer: 256 bytes of salt plus a 32-byte mirror of
	// its start at offset 256. The mirror was previously written as a second
	// full 256-byte copy, which ran past the end of the 288-byte array.
	((uintx64*)B)[0] = ((uintx64*)salt)[0];
	((uint48*)B)[8] = ((uint48*)salt)[0];

	uint32_t input[BLAKE2S_BLOCK_SIZE/4];
	uint32_t key[BLAKE2S_BLOCK_SIZE/4] = { 0 };

	((uint816*)input)[0] = ((uint816*)password)[0];
	((uint48*)key)[0] = ((uint48*)salt)[0];

	for (int i = 0; i < 32; ++i)
	{
		Blake2S((uint32_t*)input, key);

		// bufidx = byte-wise sum (mod 256) of the 32-byte hash held in input.
		bufhelper = ((uchar4*)input)[0];
		for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
		bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;

		int qbuf = bufidx / 4;
		int rbuf = bufidx & 3;
		int bitbuf = rbuf << 3;
		uint32_t shifted[9];

		// XOR the hash into B at byte offset bufidx (word index + bit shift).
		shift256R2(shifted, ((uint8*)input)[0], bitbuf);

		for (int k = 0; k < 9; ++k) {
			((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
		}

		if (i<31){
			// Keep the 32-byte head and the mirror at offset 256 in sync.
			if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];}
			else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}

			// Next message block / key are taken at byte offset bufidx.
			for (uint8_t k = 0; k <BLAKE2S_BLOCK_SIZE/4 ; k++) {
				((uchar4*)(input))[k] =
					make_uchar4((A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]);
			}
			for (uint8_t k = 0; k <BLAKE2S_KEY_SIZE / 4; k++) {
				((uchar4*)(key))[k] =
					make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
			}
		}
	}

	// Gather the (possibly unaligned) bytes 28..31 past bufidx into one word.
	uchar4 last_word[1];
	last_word[0] = make_uchar4(B[28 + bufidx], B[29 + bufidx],B[30 + bufidx], B[31 + bufidx]);
	((uint32_t*)output)[7] = ((uint32_t*)last_word)[0] ^ ((uint32_t*)A)[7];
}
||||
|
||||
|
||||
// One Salsa quarter-round. Uses a scratch `uint32_t t` that must be declared
// in the calling scope.
#define SALSA(a,b,c,d) { \
	t = a+d; \
	b ^= rotate(t, 7); \
	t = b+a; \
	c ^= rotate(t, 9); \
	t = c+b; \
	d ^= rotate(t, 13); \
	t = d+c; \
	a ^= rotate(t, 18); \
}

// One Salsa double-round over a 16-word state: four quarter-rounds on the
// first index pattern, then four on the second.
#define SALSA_CORE(state) { \
	SALSA(state.s0, state.s4, state.s8, state.sc); \
	SALSA(state.s5, state.s9, state.sd, state.s1); \
	SALSA(state.sa, state.se, state.s2, state.s6); \
	SALSA(state.sf, state.s3, state.s7, state.sb); \
	\
	SALSA(state.s0, state.s1, state.s2, state.s3); \
	SALSA(state.s5, state.s6, state.s7, state.s4); \
	SALSA(state.sa, state.sb, state.s8, state.s9); \
	SALSA(state.sf, state.sc, state.sd, state.se); \
}
||||
|
||||
// Architecture-specific forms of the two byte-granular rotations used by
// CHACHA_STEP: __byte_perm on SM 5.0+, the generic rotate helper otherwise.
#if __CUDA_ARCH__ >=500
#define CHACHA_ROT_A(v) __byte_perm(v,0,0x1032)
#define CHACHA_ROT_B(v) __byte_perm(v,0,0x2103)
#else
#define CHACHA_ROT_A(v) rotate(v,16)
#define CHACHA_ROT_B(v) rotate(v,8)
#endif

// One ChaCha quarter-round.
#define CHACHA_STEP(a,b,c,d) { \
	a += b; \
	d = CHACHA_ROT_A(d^a); \
	c += d; \
	b = rotate(b^c, 12); \
	a += b; \
	d = CHACHA_ROT_B(d^a); \
	c += d; \
	b = rotate(b^c, 7); \
}

// One ChaCha double-round over a 16-word state split into lo/hi halves.
#define CHACHA_CORE_PARALLEL(state) { \
	CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \
	CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \
	CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \
	CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \
	\
	CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \
	CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \
	CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \
	CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \
}
||||
|
||||
|
||||
/**
 * Applies ten SALSA_CORE double-rounds to a copy of X and returns the
 * word-wise sum of X and the mixed copy.
 */
static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
{
	uint32_t t;          // scratch required by the SALSA macro
	uint16 state = X;

	for (int remaining = 10; remaining > 0; --remaining) {
		SALSA_CORE(state);
	}

	return(X + state);
}
||||
|
||||
/**
 * Applies ten CHACHA_CORE_PARALLEL double-rounds to a copy of X and returns
 * the word-wise sum of X and the mixed copy.
 */
static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X)
{
	uint16 mixed = X;

	for (int remaining = 10; remaining > 0; --remaining) {
		CHACHA_CORE_PARALLEL(mixed);
	}

	return(X + mixed);
}
||||
|
||||
/**
 * In-place block mix over four 16-word blocks using the ChaCha core.
 * On return blocks 1 and 2 are stored swapped relative to the order in which
 * they were produced (the second result ends up in XV[2], the third in XV[1]).
 */
static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV)
{
	uint16 second;   // result of the second core call, stored last

	XV[0] ^= XV[3];
	XV[0] = chacha_small_parallel_rnd(XV[0]);

	XV[1] ^= XV[0];
	second = chacha_small_parallel_rnd(XV[1]);

	XV[2] ^= second;
	XV[1] = chacha_small_parallel_rnd(XV[2]);

	XV[3] ^= XV[1];
	XV[3] = chacha_small_parallel_rnd(XV[3]);

	XV[2] = second;
}
||||
|
||||
/**
 * In-place block mix over four 16-word blocks using the Salsa core.
 * On return blocks 1 and 2 are stored swapped relative to the order in which
 * they were produced (the second result ends up in XV[2], the third in XV[1]).
 */
static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV)
{
	uint16 second;   // result of the second core call, stored last

	XV[0] ^= XV[3];
	XV[0] = salsa_small_scalar_rnd(XV[0]);

	XV[1] ^= XV[0];
	second = salsa_small_scalar_rnd(XV[1]);

	XV[2] ^= second;
	XV[1] = salsa_small_scalar_rnd(XV[2]);

	XV[3] ^= XV[1];
	XV[3] = salsa_small_scalar_rnd(XV[3]);

	XV[2] = second;
}
||||
|
||||
|
||||
#define SHIFT 130 |
||||
|
||||
/**
 * Stage 0: builds the per-nonce header and writes the 256-byte fastkdf256
 * result into slot 0 of this thread's region of W.
 *
 * Launch: 1D grid of 1D blocks (at most 128 threads per block); one thread per
 * nonce. Each thread owns SHIFT*16 uint4 elements of W starting at
 * thread * SHIFT * 16.
 *
 * @param stratum     non-zero to byte-swap the nonce before hashing
 * @param threads     number of nonces to process; surplus threads exit early
 * @param startNonce  nonce handled by thread 0
 */
__global__ __launch_bounds__(128, 1)
void neoscrypt_gpu_hash_k0(int stratum, uint32_t threads, uint32_t startNonce)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	// The grid is rounded up to whole blocks; threads past the requested count
	// must not touch W (their region may lie outside the allocation).
	if (thread >= threads)
		return;

	const size_t shift = (size_t)SHIFT * 16 * thread;
	const uint32_t nonce = startNonce + thread;
	uint32_t data[80];
	uint16 X[4];

	for (int i = 0; i<20; i++) {
		((uint4*)data)[i] = ((uint4 *)c_data)[i];
	} //ld.local.v4

	// The header is replicated inside data; patch the nonce in each copy used.
	data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
	data[39] = data[19];
	data[59] = data[19];

	fastkdf256(data, (uint8_t*)X);

	((uintx64 *)(W + shift))[0] = ((uintx64 *)X)[0];
}
||||
|
||||
/**
 * Stage 1 (ChaCha fill): starting from slot 0 of this thread's region of W,
 * runs 128 neoscrypt_chacha() mixes and stores the state after each one in
 * slots 1..128.
 *
 * Launch: 1D grid of 1D blocks (at most 128 threads per block); must follow
 * neoscrypt_gpu_hash_k0 on the same stream.
 *
 * @param threads     number of nonces to process; surplus threads exit early
 * @param startNonce  unused here, kept for a uniform kernel signature
 */
__global__ __launch_bounds__(128, 1)
void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	// Surplus threads of the rounded-up grid must not touch W.
	if (thread >= threads)
		return;

	const size_t shift = (size_t)SHIFT * 16 * thread;
	uint16 X[4];
	((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]);

	//#pragma unroll
	for (int i = 0; i < 128; ++i)
	{
		neoscrypt_chacha(X);
		((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)X)[0];
	}
}
||||
|
||||
/**
 * Stage 2 (ChaCha mix): starting from slot 128 of this thread's region of W,
 * performs 128 iterations that XOR in a data-dependent slot (0..127) and apply
 * neoscrypt_chacha(); the final state is stored in slot 129.
 *
 * Launch: 1D grid of 1D blocks (at most 128 threads per block); must follow
 * neoscrypt_gpu_hash_k01 on the same stream.
 *
 * @param threads     number of nonces to process; surplus threads exit early
 * @param startNonce  unused here, kept for a uniform kernel signature
 */
__global__ __launch_bounds__(128, 1)
void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	// Surplus threads of the rounded-up grid must not touch W.
	if (thread >= threads)
		return;

	const size_t shift = (size_t)SHIFT * 16 * thread;
	uint16 X[4];
	// uint4 index 2048 == slot 128 (16 uint4 per 256-byte slot).
	((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]);

	for (int t = 0; t < 128; t++)
	{
		int idx = X[3].lo.s0 & 0x7F;
		((uintx64 *)X)[0] ^= __ldg32(&(W + shift)[idx << 4]);
		neoscrypt_chacha(X);
	}
	((uintx64 *)(W + shift))[129] = ((uintx64*)X)[0]; // best checked
}
||||
|
||||
/**
 * Stage 3 (Salsa fill): starting again from slot 0 of this thread's region of
 * W, runs 128 neoscrypt_salsa() mixes and stores the state after each one in
 * slots 1..128 (overwriting the ChaCha values from stage 1; slot 129 written
 * by stage 2 is left untouched).
 *
 * Launch: 1D grid of 1D blocks (at most 128 threads per block); must follow
 * neoscrypt_gpu_hash_k2 on the same stream.
 *
 * @param threads     number of nonces to process; surplus threads exit early
 * @param startNonce  unused here, kept for a uniform kernel signature
 */
__global__ __launch_bounds__(128, 1)
void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	// Surplus threads of the rounded-up grid must not touch W.
	if (thread >= threads)
		return;

	const size_t shift = (size_t)SHIFT * 16 * thread;
	uint16 Z[4];

	((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]);

	//#pragma unroll
	for (int i = 0; i < 128; ++i) {
		neoscrypt_salsa(Z);
		((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)Z)[0];
	}
}
||||
|
||||
/**
 * Stage 4 (Salsa mix + final check): runs the 128-iteration data-dependent
 * Salsa mix from slot 128, XORs in the ChaCha result kept in slot 129, feeds
 * the result to fastkdf32() as salt and reports the nonce when word 7 of the
 * hash is <= pTarget[7].
 *
 * Launch: 1D grid of 1D blocks (at most 128 threads per block); must follow
 * neoscrypt_gpu_hash_k3 on the same stream.
 *
 * @param stratum      non-zero to byte-swap the nonce before hashing
 * @param threads      number of nonces to process; surplus threads exit early
 * @param startNonce   nonce handled by thread 0
 * @param nonceVector  nonceVector[0] receives a matching nonce (if several
 *                     threads match, whichever atomicExch lands last wins)
 */
__global__ __launch_bounds__(128, 1)
void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	// Surplus threads of the rounded-up grid must neither read W outside the
	// requested range nor report nonces beyond it.
	if (thread >= threads)
		return;

	const uint32_t nonce = startNonce + thread;
	const size_t shift = (size_t)SHIFT * 16 * thread;
	uint16 Z[4];
	uint32_t outbuf[8];
	uint32_t data[80];

	for (int i=0; i<20; i++) {
		((uint4*)data)[i] = ((uint4 *)c_data)[i];
	}

	data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
	data[39] = data[19];
	data[59] = data[19];

	// uint4 index 2048 == slot 128 (16 uint4 per 256-byte slot).
	((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]);
	for (int t = 0; t < 128; t++)
	{
		int idx = Z[3].lo.s0 & 0x7F;
		((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[idx << 4]);
		neoscrypt_salsa(Z);
	}
	// uint4 index 2064 == slot 129, written by neoscrypt_gpu_hash_k2.
	((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[2064]);

	fastkdf32(data, (uint32_t*)Z, outbuf);

	if (outbuf[7] <= pTarget[7]) {
		atomicExch(&nonceVector[0], nonce);
	}
}
||||
|
||||
/**
 * One-time per-GPU setup: uploads the sigma table, points the device symbol W
 * at the scratch buffer and allocates the one-word result buffer.
 *
 * @param thr_id   miner thread index (selects the d_NNonce slot)
 * @param threads  unused
 * @param hash     device pointer to the scratch buffer allocated by the caller
 */
void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t *hash)
{
	// Failures here would otherwise only show up later as kernel faults or a
	// garbage result pointer, so check every call.
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice));
	// Copies the pointer VALUE (sizeof(hash) bytes) into the device symbol W.
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(W, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t)));
}
||||
|
||||
/**
 * Runs the five neoscrypt kernels over `threads` nonces starting at
 * startNounce and returns the nonce reported by the final kernel.
 *
 * @param stratum      forwarded to the kernels (nonce byte order)
 * @param thr_id       miner thread index (selects the d_NNonce slot)
 * @param threads      number of nonces to hash
 * @param startNounce  first nonce of the batch
 * @param order        forwarded to MyStreamSynchronize()
 * @return the found nonce, or 0xffffffff when none was reported
 */
__host__
uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order)
{
	// Single sentinel-initialised word. The former per-GPU array initialised
	// only element 0, so a failed copy for thr_id > 0 returned 0, which the
	// caller would treat as a found nonce.
	uint32_t result = 0xffffffff;

	// A zero-sized grid is an invalid launch configuration.
	if (threads == 0)
		return result;

	cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t));

	const uint32_t threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	// The stages communicate through W and rely on in-order execution on the
	// default stream.
	neoscrypt_gpu_hash_k0 << <grid, block >> >(stratum,threads, startNounce);
	neoscrypt_gpu_hash_k01 << <grid, block >> >(threads, startNounce);
	neoscrypt_gpu_hash_k2 << <grid, block >> >(threads, startNounce);
	neoscrypt_gpu_hash_k3 << <grid, block >> >(threads, startNounce);
	neoscrypt_gpu_hash_k4 << <grid, block >> >(stratum,threads, startNounce, d_NNonce[thr_id]);

	MyStreamSynchronize(NULL, order, thr_id);
	cudaMemcpy(&result, d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

	return result;
}
||||
|
||||
/**
 * Uploads the per-work constants: the target, the 80-byte header replicated
 * four times (c_data), and the first Blake2S input/key pair precomputed on the
 * host (input_init / key_init).
 *
 * @param pdata   block header, at least 80 bytes
 * @param target  8-word target
 */
__host__
void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target)
{
	unsigned char PaddedMessage[80*4]; //bring balance to the force
	// Over-aligned because Blake2Shost() accesses `input` through a vector
	// struct type.
	__align__(64) uint32_t input[16];
	__align__(64) uint32_t key[16] = { 0 };

	for (int copy = 0; copy < 4; copy++)
		memcpy(PaddedMessage + 80 * copy, pdata, 80);

	// Plain byte copies: pdata is an ordinary uint32_t array, so reading it
	// through 64-/32-byte-aligned vector struct casts is not safe on the host.
	memcpy(input, pdata, 16 * sizeof(uint32_t));
	memcpy(key, pdata, 8 * sizeof(uint32_t));

	Blake2Shost(input,key);

	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, target, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(input_init, input, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(key_init, key, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice));

	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, PaddedMessage, 40 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
}
||||
|
@ -0,0 +1,992 @@
@@ -0,0 +1,992 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Colin Percival, 2011 ArtForz |
||||
* Copyright (c) 2012 Andrew Moon (floodyberry) |
||||
* Copyright (c) 2012 Samuel Neves <sneves@dei.uc.pt> |
||||
* Copyright (c) 2014 John Doering <ghostlander@phoenixcoin.org> |
||||
* All rights reserved. |
||||
* |
||||
* Redistribution and use in source and binary forms, with or without |
||||
* modification, are permitted provided that the following conditions |
||||
* are met: |
||||
* 1. Redistributions of source code must retain the above copyright |
||||
* notice, this list of conditions and the following disclaimer. |
||||
* 2. Redistributions in binary form must reproduce the above copyright |
||||
* notice, this list of conditions and the following disclaimer in the |
||||
* documentation and/or other materials provided with the distribution. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
||||
* SUCH DAMAGE. |
||||
*/ |
||||
|
||||
|
||||
#include <stdlib.h> |
||||
#include <stdint.h> |
||||
#include <string.h> |
||||
|
||||
#include "neoscrypt.h" |
||||
|
||||
|
||||
/* Short integer aliases used throughout this file. */
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned int bool;

#if (WINDOWS)
/* sizeof(unsigned long) = 4 for MinGW64 */
typedef unsigned long long ulong;
#else
typedef unsigned long ulong;
#endif
||||
|
||||
|
||||
/* Smaller / larger of two values. Every use of an argument is parenthesised
 * so expression arguments expand safely; arguments are still evaluated twice,
 * so do not pass expressions with side effects. */
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
||||
|
||||
|
||||
/* SHA-256 */ |
||||
|
||||
/* Per-step additive constants, indexed by step number in STEP(). */
static const uint32_t sha256_constants[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
||||
|
||||
/* Bitwise selection / majority functions. */
#define Ch(x,y,z)  (z ^ (x & (y ^ z)))
#define Maj(x,y,z) (((x | y) & z) | (x & y))

/* Rotation mixes used by the step function (S0/S1) and by the message
 * schedule (G0/G1). */
#define S0(x) (ROTR32(x,  2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x,  6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
#define G0(x) (ROTR32(x,  7) ^ ROTR32(x, 18) ^ (x >>  3))
#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10))

/* Message schedule: W0 loads word i from the input through U8TO32_BE, W1
 * derives word i from earlier entries of the local array `w`. */
#define W0(in,i) (U8TO32_BE(&in[i * 4]))
#define W1(i)    (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])

/* One step. Expands to several statements and uses the locals r[], w[], t0
 * and t1 of the calling function, so it must be invoked inside a braced block. */
#define STEP(i) \
	t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
	t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \
	r[7] = r[6]; \
	r[6] = r[5]; \
	r[5] = r[4]; \
	r[4] = r[3] + t0; \
	r[3] = r[2]; \
	r[2] = r[1]; \
	r[1] = r[0]; \
	r[0] = t0 + t1;
||||
|
||||
|
||||
typedef struct sha256_hash_state_t { |
||||
uint32_t H[8]; |
||||
uint64_t T; |
||||
uint32_t leftover; |
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; |
||||
} sha256_hash_state; |
||||
|
||||
|
||||
/* Compress `blocks` consecutive SCRYPT_HASH_BLOCK_SIZE-byte blocks starting at
 * `in` into the state S, advancing the bit counter S->T for each block. */
static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks)
{
	/* r, w, t0 and t1 are referenced by name from the STEP/W1 macros. */
	uint32_t r[8], w[64], t0, t1;
	size_t i;

	/* Working registers start from the current chaining value. */
	memcpy(r, S->H, sizeof(r));

	for (; blocks > 0; blocks--) {
		/* message schedule */
		for (i = 0U; i < 64; i++) {
			if (i < 16) {
				w[i] = W0(in, i);
			} else {
				w[i] = W1(i);
			}
		}

		/* 64 steps */
		for (i = 0U; i < 64; i++) {
			STEP(i);
		}

		/* feed-forward; r keeps the new chaining value for the next block */
		for (i = 0U; i < 8U; i++) {
			r[i] += S->H[i];
			S->H[i] = r[i];
		}

		in += SCRYPT_HASH_BLOCK_SIZE;
		S->T += SCRYPT_HASH_BLOCK_SIZE * 8;
	}
}
||||
|
||||
/* Reset S to the initial state: standard initial chaining value, no bits
 * processed, empty buffer. */
static void neoscrypt_hash_init_sha256(sha256_hash_state *S)
{
	static const uint32_t initial_H[8] = {
		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
	};
	size_t k;

	for (k = 0; k < 8; k++)
		S->H[k] = initial_H[k];

	S->leftover = 0;
	S->T = 0;
}
||||
|
||||
/* Absorb `inlen` bytes from `in`: first top up any partially filled buffer,
 * then compress all whole blocks straight from the input, and finally stash
 * the remaining tail in the buffer. */
static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen)
{
	size_t whole, fill;

	/* complete a previously buffered partial block */
	if (S->leftover) {
		fill = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
		if (fill > inlen)
			fill = inlen;
		memcpy(S->buffer + S->leftover, in, fill);
		S->leftover += (uint32_t)fill;

		/* still not a full block: nothing more to do */
		if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
			return;

		in += fill;
		inlen -= fill;
		sha256_blocks(S, S->buffer, 1);
	}

	/* byte count of the whole blocks available in the input */
	whole = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
	S->leftover = (uint32_t)(inlen - whole);
	if (whole) {
		sha256_blocks(S, in, whole / SCRYPT_HASH_BLOCK_SIZE);
		in += whole;
	}

	/* keep the tail for the next call */
	if (S->leftover)
		memcpy(S->buffer, in, S->leftover);
}
||||
|
||||
/* Pad the buffered tail, append the total bit length, compress the final
 * block(s) and write the 32-byte digest to `hash` via U32TO8_BE. */
static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash)
{
	uint64_t t = S->T + (S->leftover * 8);
	size_t word;

	/* mandatory 0x80 marker directly after the data */
	S->buffer[S->leftover] = 0x80;

	if (S->leftover > 55) {
		/* no room for the 8-byte length: finish this block, start a new one */
		memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
		sha256_blocks(S, S->buffer, 1);
		memset(S->buffer, 0, 56);
	} else {
		memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
	}

	U64TO8_BE(S->buffer + 56, t);
	sha256_blocks(S, S->buffer, 1);

	for (word = 0; word < 8; word++) {
		U32TO8_BE(&hash[4 * word], S->H[word]);
	}
}
||||
|
||||
/* One-shot helper: hash the mlen bytes at m and store the digest in hash. */
static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen)
{
    sha256_hash_state ctx;

    neoscrypt_hash_init_sha256(&ctx);
    neoscrypt_hash_update_sha256(&ctx, m, mlen);
    neoscrypt_hash_finish_sha256(&ctx, hash);
}
||||
|
||||
|
||||
/* HMAC for SHA-256 */ |
||||
|
||||
typedef struct sha256_hmac_state_t { |
||||
sha256_hash_state inner, outer; |
||||
} sha256_hmac_state; |
||||
|
||||
/*
 * Start an HMAC computation keyed with key[0..keylen).
 * Keys longer than SCRYPT_HASH_BLOCK_SIZE are hashed first; shorter keys are
 * used directly, zero padded to one block. The inner and outer hash states
 * are then primed with the key block XORed with 0x36 and 0x5c respectively.
 */
static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen)
{
    uint8_t block[SCRYPT_HASH_BLOCK_SIZE] = {0};
    size_t n;

    neoscrypt_hash_init_sha256(&st->inner);
    neoscrypt_hash_init_sha256(&st->outer);

    if (keylen > SCRYPT_HASH_BLOCK_SIZE) {
        /* oversized key: replace it by its digest */
        neoscrypt_hash_sha256(block, key, keylen);
    } else {
        /* key fits into one block: take it verbatim */
        memcpy(block, key, keylen);
    }

    /* inner state absorbs (key ^ 0x36) */
    for (n = 0; n < SCRYPT_HASH_BLOCK_SIZE; n++)
        block[n] ^= 0x36;
    neoscrypt_hash_update_sha256(&st->inner, block, SCRYPT_HASH_BLOCK_SIZE);

    /* outer state absorbs (key ^ 0x5c); the XOR below undoes 0x36 and applies 0x5c */
    for (n = 0; n < SCRYPT_HASH_BLOCK_SIZE; n++)
        block[n] ^= (0x5c ^ 0x36);
    neoscrypt_hash_update_sha256(&st->outer, block, SCRYPT_HASH_BLOCK_SIZE);
}
||||
|
||||
/* Feed mlen message bytes from m into the inner hash of the HMAC state. */
static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen)
{
    /* only the inner hash sees the message: h(inner || m...) */
    neoscrypt_hash_update_sha256(&st->inner, m, mlen);
}
||||
|
||||
/*
 * Complete the HMAC: finish the inner hash, feed its digest to the outer
 * hash and write the outer digest to mac.
 */
static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac)
{
    hash_digest inner_digest;

    /* h(inner || m) */
    neoscrypt_hash_finish_sha256(&st->inner, inner_digest);

    /* h(outer || h(inner || m)) */
    neoscrypt_hash_update_sha256(&st->outer, inner_digest, sizeof(inner_digest));
    neoscrypt_hash_finish_sha256(&st->outer, mac);
}
||||
|
||||
|
||||
/* PBKDF2 for SHA-256 */ |
||||
|
||||
static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, |
||||
const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) |
||||
{ |
||||
sha256_hmac_state hmac_pw, hmac_pw_salt, work; |
||||
hash_digest ti, u; |
||||
uint8_t be[4]; |
||||
uint32_t i, j, k, blocks; |
||||
|
||||
/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ |
||||
|
||||
/* hmac(password, ...) */ |
||||
neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); |
||||
|
||||
/* hmac(password, salt...) */ |
||||
hmac_pw_salt = hmac_pw; |
||||
neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); |
||||
|
||||
blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; |
||||
for(i = 1; i <= blocks; i++) { |
||||
/* U1 = hmac(password, salt || be(i)) */ |
||||
U32TO8_BE(be, i); |
||||
work = hmac_pw_salt; |
||||
neoscrypt_hmac_update_sha256(&work, be, 4); |
||||
neoscrypt_hmac_finish_sha256(&work, ti); |
||||
memcpy(u, ti, sizeof(u)); |
||||
|
||||
/* T[i] = U1 ^ U2 ^ U3... */ |
||||
for(j = 0; j < N - 1; j++) { |
||||
/* UX = hmac(password, U{X-1}) */ |
||||
work = hmac_pw; |
||||
neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); |
||||
neoscrypt_hmac_finish_sha256(&work, u); |
||||
|
||||
/* T[i] ^= UX */ |
||||
for(k = 0; k < sizeof(u); k++) |
||||
ti[k] ^= u[k]; |
||||
} |
||||
|
||||
memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len); |
||||
output += SCRYPT_HASH_DIGEST_SIZE; |
||||
output_len -= SCRYPT_HASH_DIGEST_SIZE; |
||||
} |
||||
} |
||||
|
||||
|
||||
/* NeoScrypt */ |
||||
|
||||
#if defined(ASM) |
||||
|
||||
/* Mixing primitives, defined outside this file when ASM is defined */
extern void neoscrypt_salsa(uint *X, uint rounds);
extern void neoscrypt_salsa_tangle(uint *X, uint count);
extern void neoscrypt_chacha(uint *X, uint rounds);

/* Block copy / swap / XOR helpers, likewise defined outside this file */
extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len);
extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len);
extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len);
||||
|
||||
#else |
||||
|
||||
/* Salsa20, rounds must be a multiple of 2 */ |
||||
static void neoscrypt_salsa(uint *X, uint rounds) |
||||
{ |
||||
uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; |
||||
|
||||
x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; |
||||
x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; |
||||
x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; |
||||
x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; |
||||
|
||||
#define quarter(a, b, c, d) \ |
||||
t = a + d; t = ROTL32(t, 7); b ^= t; \ |
||||
t = b + a; t = ROTL32(t, 9); c ^= t; \ |
||||
t = c + b; t = ROTL32(t, 13); d ^= t; \ |
||||
t = d + c; t = ROTL32(t, 18); a ^= t; |
||||
|
||||
for(; rounds; rounds -= 2) { |
||||
quarter( x0, x4, x8, x12); |
||||
quarter( x5, x9, x13, x1); |
||||
quarter(x10, x14, x2, x6); |
||||
quarter(x15, x3, x7, x11); |
||||
quarter( x0, x1, x2, x3); |
||||
quarter( x5, x6, x7, x4); |
||||
quarter(x10, x11, x8, x9); |
||||
quarter(x15, x12, x13, x14); |
||||
} |
||||
|
||||
X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; |
||||
X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; |
||||
X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; |
||||
X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; |
||||
|
||||
#undef quarter |
||||
} |
||||
|
||||
/* ChaCha20, rounds must be a multiple of 2 */ |
||||
static void neoscrypt_chacha(uint *X, uint rounds) |
||||
{ |
||||
uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; |
||||
|
||||
x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; |
||||
x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; |
||||
x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; |
||||
x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; |
||||
|
||||
#define quarter(a,b,c,d) \ |
||||
a += b; t = d ^ a; d = ROTL32(t, 16); \ |
||||
c += d; t = b ^ c; b = ROTL32(t, 12); \ |
||||
a += b; t = d ^ a; d = ROTL32(t, 8); \ |
||||
c += d; t = b ^ c; b = ROTL32(t, 7); |
||||
|
||||
for(; rounds; rounds -= 2) { |
||||
quarter( x0, x4, x8, x12); |
||||
quarter( x1, x5, x9, x13); |
||||
quarter( x2, x6, x10, x14); |
||||
quarter( x3, x7, x11, x15); |
||||
quarter( x0, x5, x10, x15); |
||||
quarter( x1, x6, x11, x12); |
||||
quarter( x2, x7, x8, x13); |
||||
quarter( x3, x4, x9, x14); |
||||
} |
||||
|
||||
X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; |
||||
X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; |
||||
X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; |
||||
X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; |
||||
|
||||
#undef quarter |
||||
} |
||||
|
||||
|
||||
/* Fast 32-bit / 64-bit memcpy();
|
||||
* len must be a multiple of 32 bytes */ |
||||
static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) |
||||
{ |
||||
ulong *dst = (ulong *) dstp; |
||||
ulong *src = (ulong *) srcp; |
||||
uint i; |
||||
|
||||
for(i = 0; i < (len / sizeof(ulong)); i += 4) { |
||||
dst[i] = src[i]; |
||||
dst[i + 1] = src[i + 1]; |
||||
dst[i + 2] = src[i + 2]; |
||||
dst[i + 3] = src[i + 3]; |
||||
} |
||||
} |
||||
|
||||
/* Fast 32-bit / 64-bit block swapper;
|
||||
* len must be a multiple of 32 bytes */ |
||||
static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) |
||||
{ |
||||
ulong *blkA = (ulong *) blkAp; |
||||
ulong *blkB = (ulong *) blkBp; |
||||
register ulong t0, t1, t2, t3; |
||||
uint i; |
||||
|
||||
for(i = 0; i < (len / sizeof(ulong)); i += 4) { |
||||
t0 = blkA[i]; |
||||
t1 = blkA[i + 1]; |
||||
t2 = blkA[i + 2]; |
||||
t3 = blkA[i + 3]; |
||||
blkA[i] = blkB[i]; |
||||
blkA[i + 1] = blkB[i + 1]; |
||||
blkA[i + 2] = blkB[i + 2]; |
||||
blkA[i + 3] = blkB[i + 3]; |
||||
blkB[i] = t0; |
||||
blkB[i + 1] = t1; |
||||
blkB[i + 2] = t2; |
||||
blkB[i + 3] = t3; |
||||
} |
||||
} |
||||
|
||||
/* Fast 32-bit / 64-bit block XOR engine;
|
||||
* len must be a multiple of 32 bytes */ |
||||
static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) |
||||
{ |
||||
ulong *dst = (ulong *) dstp; |
||||
ulong *src = (ulong *) srcp; |
||||
uint i; |
||||
|
||||
for (i = 0; i < (len / sizeof(ulong)); i += 4) { |
||||
dst[i] ^= src[i]; |
||||
dst[i + 1] ^= src[i + 1]; |
||||
dst[i + 2] ^= src[i + 2]; |
||||
dst[i + 3] ^= src[i + 3]; |
||||
} |
||||
} |
||||
|
||||
#endif |
||||
|
||||
/* 32-bit / 64-bit optimised memcpy() */ |
||||
static void neoscrypt_copy(void *dstp, const void *srcp, uint len) |
||||
{ |
||||
ulong *dst = (ulong *) dstp; |
||||
ulong *src = (ulong *) srcp; |
||||
uint i, tail; |
||||
|
||||
for(i = 0; i < (len / sizeof(ulong)); i++) |
||||
dst[i] = src[i]; |
||||
|
||||
tail = len & (sizeof(ulong) - 1); |
||||
if(tail) { |
||||
uchar *dstb = (uchar *) dstp; |
||||
uchar *srcb = (uchar *) srcp; |
||||
|
||||
for(i = len - tail; i < len; i++) |
||||
dstb[i] = srcb[i]; |
||||
} |
||||
} |
||||
|
||||
/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ |
||||
static void neoscrypt_erase(void *dstp, uint len) |
||||
{ |
||||
const ulong null = 0; |
||||
ulong *dst = (ulong *) dstp; |
||||
uint i, tail; |
||||
|
||||
for (i = 0; i < (len / sizeof(ulong)); i++) |
||||
dst[i] = null; |
||||
|
||||
tail = len & (sizeof(ulong) - 1); |
||||
if (tail) { |
||||
uchar *dstb = (uchar *) dstp; |
||||
|
||||
for(i = len - tail; i < len; i++) |
||||
dstb[i] = (uchar)null; |
||||
} |
||||
} |
||||
|
||||
/* 32-bit / 64-bit optimised XOR engine */ |
||||
static void neoscrypt_xor(void *dstp, const void *srcp, uint len) |
||||
{ |
||||
ulong *dst = (ulong *) dstp; |
||||
ulong *src = (ulong *) srcp; |
||||
uint i, tail; |
||||
|
||||
for (i = 0; i < (len / sizeof(ulong)); i++) |
||||
dst[i] ^= src[i]; |
||||
|
||||
tail = len & (sizeof(ulong) - 1); |
||||
if (tail) { |
||||
uchar *dstb = (uchar *) dstp; |
||||
uchar *srcb = (uchar *) srcp; |
||||
|
||||
for(i = len - tail; i < len; i++) |
||||
dstb[i] ^= srcb[i]; |
||||
} |
||||
} |
||||
|
||||
|
||||
/* BLAKE2s */

/* Sizes in bytes used by the BLAKE2s routines below */
#define BLAKE2S_BLOCK_SIZE    64U   /* one compression input block */
#define BLAKE2S_OUT_SIZE      32U   /* digest size */
#define BLAKE2S_KEY_SIZE      32U   /* key size */
||||
|
||||
/* Parameter block of 32 bytes */ |
||||
typedef struct blake2s_param_t { |
||||
uchar digest_length; |
||||
uchar key_length; |
||||
uchar fanout; |
||||
uchar depth; |
||||
uint leaf_length; |
||||
uchar node_offset[6]; |
||||
uchar node_depth; |
||||
uchar inner_length; |
||||
uchar salt[8]; |
||||
uchar personal[8]; |
||||
} blake2s_param; |
||||
|
||||
/* State block of 180 bytes */ |
||||
typedef struct blake2s_state_t { |
||||
uint h[8]; |
||||
uint t[2]; |
||||
uint f[2]; |
||||
uchar buf[2 * BLAKE2S_BLOCK_SIZE]; |
||||
uint buflen; |
||||
} blake2s_state; |
||||
|
||||
/* Initial values for h[] and for the second half of the compression work vector */
static const uint blake2s_IV[8] = {
    0x6A09E667,
    0xBB67AE85,
    0x3C6EF372,
    0xA54FF53A,
    0x510E527F,
    0x9B05688C,
    0x1F83D9AB,
    0x5BE0CD19
};
||||
|
||||
/* Message word selection: row r lists which of the 16 message words each
 * mixing step of round r consumes, two entries per step */
static const uint8_t blake2s_sigma[10][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};
||||
|
||||
/*
 * Compress one 64-byte block (16 words at buf) into the chaining value S->h.
 * The work vector starts as S->h followed by blake2s_IV, with the IV half
 * XORed by the counter S->t and the flags S->f; ten rounds of mixing follow,
 * each taking its message word order from the matching blake2s_sigma row.
 */
static void blake2s_compress(blake2s_state *S, const uint *buf)
{
    uint msg[16];
    uint work[16];
    const uint8_t *perm;
    uint n, round;

    /* local copies: buf normally points into S->buf */
    for (n = 0; n < 16; n++)
        msg[n] = buf[n];

    for (n = 0; n < 8; n++) {
        work[n] = S->h[n];
        work[n + 8] = blake2s_IV[n];
    }
    work[12] ^= S->t[0];
    work[13] ^= S->t[1];
    work[14] ^= S->f[0];
    work[15] ^= S->f[1];

    /* one mixing step on work[a..d]; slot selects the pair of message words */
#define BLAKE2S_MIX(slot, a, b, c, d) do { \
        work[a] = work[a] + work[b] + msg[perm[2 * (slot) + 0]]; \
        work[d] = ROTR32(work[d] ^ work[a], 16); \
        work[c] = work[c] + work[d]; \
        work[b] = ROTR32(work[b] ^ work[c], 12); \
        work[a] = work[a] + work[b] + msg[perm[2 * (slot) + 1]]; \
        work[d] = ROTR32(work[d] ^ work[a], 8); \
        work[c] = work[c] + work[d]; \
        work[b] = ROTR32(work[b] ^ work[c], 7); \
    } while (0)

    for (round = 0; round < 10; round++) {
        perm = blake2s_sigma[round];
        /* columns */
        BLAKE2S_MIX(0, 0, 4,  8, 12);
        BLAKE2S_MIX(1, 1, 5,  9, 13);
        BLAKE2S_MIX(2, 2, 6, 10, 14);
        BLAKE2S_MIX(3, 3, 7, 11, 15);
        /* diagonals */
        BLAKE2S_MIX(4, 0, 5, 10, 15);
        BLAKE2S_MIX(5, 1, 6, 11, 12);
        BLAKE2S_MIX(6, 2, 7,  8, 13);
        BLAKE2S_MIX(7, 3, 4,  9, 14);
    }

#undef BLAKE2S_MIX

    /* fold both halves of the work vector back into the chaining value */
    for (n = 0; n < 8; n++)
        S->h[n] ^= work[n] ^ work[n + 8];
}
||||
|
||||
/*
 * Append input_size bytes from input to the two-block buffer in S.
 * Each time the incoming data would overflow the buffer, the buffer is
 * filled, its first block is compressed (advancing S->t[0] by one block
 * size) and the second block is moved to the front. Data that merely fits
 * is stored without compressing.
 */
static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size)
{
    uint room = 2 * BLAKE2S_BLOCK_SIZE - S->buflen;

    while (input_size > room) {
        /* fill the buffer completely */
        neoscrypt_copy(S->buf + S->buflen, input, room);
        S->buflen += room;
        /* count and compress the first block */
        S->t[0] += BLAKE2S_BLOCK_SIZE;
        blake2s_compress(S, (uint *) S->buf);
        /* move the second block to the front */
        neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE);
        S->buflen -= BLAKE2S_BLOCK_SIZE;
        input += room;
        input_size -= room;
        room = 2 * BLAKE2S_BLOCK_SIZE - S->buflen;
    }

    /* whatever is left fits into the buffer; do not compress */
    if (input_size > 0) {
        neoscrypt_copy(S->buf + S->buflen, input, input_size);
        S->buflen += input_size;
    }
}
||||
|
||||
static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size, |
||||
void *output, const uchar output_size) |
||||
{ |
||||
uchar block[BLAKE2S_BLOCK_SIZE]; |
||||
blake2s_param P[1]; |
||||
blake2s_state S[1]; |
||||
|
||||
/* Initialise */ |
||||
neoscrypt_erase(P, 32); |
||||
P->digest_length = output_size; |
||||
P->key_length = key_size; |
||||
P->fanout = 1; |
||||
P->depth = 1; |
||||
|
||||
neoscrypt_erase(S, 180); |
||||
neoscrypt_copy(S, blake2s_IV, 32); |
||||
neoscrypt_xor(S, P, 32); |
||||
|
||||
neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); |
||||
neoscrypt_copy(block, key, key_size); |
||||
blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); |
||||
|
||||
/* Update */ |
||||
blake2s_update(S, (uchar *) input, input_size); |
||||
|
||||
/* Finish */ |
||||
if(S->buflen > BLAKE2S_BLOCK_SIZE) { |
||||
S->t[0] += BLAKE2S_BLOCK_SIZE; |
||||
blake2s_compress(S, (uint *) S->buf); |
||||
S->buflen -= BLAKE2S_BLOCK_SIZE; |
||||
neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); |
||||
} |
||||
S->t[0] += S->buflen; |
||||
S->f[0] = ~0U; |
||||
neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); |
||||
blake2s_compress(S, (uint *) S->buf); |
||||
|
||||
/* Write back */ |
||||
neoscrypt_copy(output, S, output_size); |
||||
|
||||
//for (int k = 0; k<4; k++) { printf("cpu blake %d %08x %08x\n", k, ((unsigned int*)output)[2 * k], ((unsigned int*)output)[2 * k + 1]); }
|
||||
} |
||||
|
||||
|
||||
#define FASTKDF_BUFFER_SIZE 256U |
||||
|
||||
/* FastKDF, a fast buffered key derivation function:
|
||||
* FASTKDF_BUFFER_SIZE must be a power of 2; |
||||
* password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; |
||||
* prf_output_size must be <= prf_key_size; */ |
||||
static void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len, |
||||
uint N, uchar *output, uint output_len) |
||||
{ |
||||
//for (int i = 0; i<10; i++) { printf("cpu password %d %08x %08x\n", i, ((unsigned int*)password)[2 * i], ((unsigned int*)password)[2 * i+1]); }
|
||||
const uint stack_align = 0x40; |
||||
const uint kdf_buf_size = 256U; //FASTKDF_BUFFER_SIZE
|
||||
const uint prf_input_size = 64U; //BLAKE2S_BLOCK_SIZE
|
||||
const uint prf_key_size = 32U; //BLAKE2S_KEY_SIZE
|
||||
const uint prf_output_size = 32U; //BLAKE2S_OUT_SIZE
|
||||
uint bufptr, a, b, i, j; |
||||
uchar *A, *B, *prf_input, *prf_key, *prf_output; |
||||
uchar *stack; |
||||
stack = (uchar*)malloc(sizeof(uchar) * 2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align); |
||||
/* Align and set up the buffers in stack */ |
||||
//uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align];
|
||||
|
||||
A = &stack[stack_align & ~(stack_align - 1)]; |
||||
B = &A[kdf_buf_size + prf_input_size]; |
||||
prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size]; |
||||
|
||||
/* Initialise the password buffer */ |
||||
if(password_len > kdf_buf_size) |
||||
password_len = kdf_buf_size; |
||||
|
||||
a = kdf_buf_size / password_len; |
||||
for(i = 0; i < a; i++) |
||||
neoscrypt_copy(&A[i * password_len], &password[0], password_len); |
||||
b = kdf_buf_size - a * password_len; |
||||
if(b) |
||||
neoscrypt_copy(&A[a * password_len], &password[0], b); |
||||
neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size); |
||||
|
||||
/* Initialise the salt buffer */ |
||||
if(salt_len > kdf_buf_size) |
||||
salt_len = kdf_buf_size; |
||||
|
||||
a = kdf_buf_size / salt_len; |
||||
for(i = 0; i < a; i++) |
||||
neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len); |
||||
b = kdf_buf_size - a * salt_len; |
||||
if(b) |
||||
neoscrypt_copy(&B[a * salt_len], &salt[0], b); |
||||
neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size); |
||||
|
||||
/* The primary iteration */ |
||||
for(i = 0, bufptr = 0; i < N; i++) { |
||||
|
||||
/* Map the PRF input buffer */ |
||||
prf_input = &A[bufptr]; |
||||
|
||||
/* Map the PRF key buffer */ |
||||
prf_key = &B[bufptr]; |
||||
|
||||
/* PRF */ |
||||
// for (int k = 0; k<(prf_input_size/4); k++) { printf("cpu bufptr %08x before blake %d %d %08x \n",bufptr, i, k, ((unsigned int*)prf_input)[k]); }
|
||||
neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size); |
||||
// for (int k = 0; k<(prf_output_size/4); k++) { printf("cpu after blake %d %d %08x \n", i, k, ((unsigned int*)prf_output)[k]); }
|
||||
|
||||
/* Calculate the next buffer pointer */ |
||||
for(j = 0, bufptr = 0; j < prf_output_size; j++) |
||||
bufptr += prf_output[j]; |
||||
bufptr &= (kdf_buf_size - 1); |
||||
|
||||
/* Modify the salt buffer */ |
||||
neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size); |
||||
|
||||
/* Head modified, tail updated */ |
||||
if(bufptr < prf_key_size) |
||||
neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr)); |
||||
|
||||
/* Tail modified, head updated */ |
||||
if((kdf_buf_size - bufptr) < prf_output_size) |
||||
neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr)); |
||||
} |
||||
|
||||
/* Modify and copy into the output buffer */ |
||||
if(output_len > kdf_buf_size) |
||||
output_len = kdf_buf_size; |
||||
|
||||
a = kdf_buf_size - bufptr; |
||||
if(a >= output_len) { |
||||
neoscrypt_xor(&B[bufptr], &A[0], output_len); |
||||
neoscrypt_copy(&output[0], &B[bufptr], output_len); |
||||
} else { |
||||
neoscrypt_xor(&B[bufptr], &A[0], a); |
||||
neoscrypt_xor(&B[0], &A[a], output_len - a); |
||||
neoscrypt_copy(&output[0], &B[bufptr], a); |
||||
neoscrypt_copy(&output[a], &B[0], output_len - a); |
||||
} |
||||
// for (int i = 0; i<10; i++) { printf("cpu fastkdf %d %08x %08x\n", i, ((unsigned int*)output)[2 * i], ((unsigned int*)output)[2 * i + 1]); }
|
||||
} |
||||
|
||||
|
||||
/* Configurable optimised block mixer */ |
||||
static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) |
||||
{ |
||||
uint i, mixer, rounds; |
||||
|
||||
mixer = mixmode >> 8; |
||||
rounds = mixmode & 0xFF; |
||||
|
||||
/* NeoScrypt flow: Scrypt flow:
|
||||
Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; |
||||
Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; |
||||
Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; |
||||
Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; |
||||
Xa" = Ya; Xb" = Yc; |
||||
Xc" = Yb; Xd" = Yd; */ |
||||
|
||||
if (r == 1) { |
||||
neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); |
||||
if(mixer) |
||||
neoscrypt_chacha(&X[0], rounds); |
||||
else |
||||
neoscrypt_salsa(&X[0], rounds); |
||||
neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); |
||||
if(mixer) |
||||
neoscrypt_chacha(&X[16], rounds); |
||||
else |
||||
neoscrypt_salsa(&X[16], rounds); |
||||
return; |
||||
} |
||||
|
||||
if (r == 2) { |
||||
neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); |
||||
if(mixer) |
||||
neoscrypt_chacha(&X[0], rounds); |
||||
else |
||||
neoscrypt_salsa(&X[0], rounds); |
||||
neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); |
||||
if(mixer) |
||||
neoscrypt_chacha(&X[16], rounds); |
||||
else |
||||
neoscrypt_salsa(&X[16], rounds); |
||||
neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE); |
||||
if(mixer) |
||||
neoscrypt_chacha(&X[32], rounds); |
||||
else |
||||
neoscrypt_salsa(&X[32], rounds); |
||||
neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); |
||||
if(mixer) |
||||
neoscrypt_chacha(&X[48], rounds); |
||||
else |
||||
neoscrypt_salsa(&X[48], rounds); |
||||
neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); |
||||
return; |
||||
} |
||||
|
||||
/* Reference code for any reasonable r */ |
||||
for (i = 0; i < 2 * r; i++) { |
||||
if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); |
||||
else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); |
||||
if(mixer) |
||||
neoscrypt_chacha(&X[16 * i], rounds); |
||||
else |
||||
neoscrypt_salsa(&X[16 * i], rounds); |
||||
neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); |
||||
} |
||||
for (i = 0; i < r; i++) |
||||
neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE); |
||||
for (i = 0; i < r; i++) |
||||
neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE); |
||||
} |
||||
|
||||
/* NeoScrypt core engine:
|
||||
* p = 1, salt = password; |
||||
* Basic customisation (required): |
||||
* profile bit 0: |
||||
* 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; |
||||
* 1 = Scrypt(1024, 1, 1) with Salsa20/8; |
||||
* profile bits 4 to 1: |
||||
* 0000 = FastKDF-BLAKE2s; |
||||
* 0001 = PBKDF2-HMAC-SHA256; |
||||
* Extended customisation (optional): |
||||
* profile bit 31: |
||||
* 0 = extended customisation absent; |
||||
* 1 = extended customisation present; |
||||
* profile bits 7 to 5 (rfactor): |
||||
* 000 = r of 1; |
||||
* 001 = r of 2; |
||||
* 010 = r of 4; |
||||
* ... |
||||
* 111 = r of 128; |
||||
* profile bits 12 to 8 (Nfactor): |
||||
* 00000 = N of 2; |
||||
* 00001 = N of 4; |
||||
* 00010 = N of 8; |
||||
* ..... |
||||
* 00110 = N of 128; |
||||
* ..... |
||||
* 01001 = N of 1024; |
||||
* ..... |
||||
* 11110 = N of 2147483648; |
||||
* profile bits 30 to 13 are reserved */ |
||||
void neoscrypt(const uchar *password, uchar *output, uint profile) |
||||
{ |
||||
uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40; |
||||
uint kdf, i, j; |
||||
uint *X, *Y, *Z, *V; |
||||
|
||||
if(profile & 0x1) { |
||||
N = 1024; /* N = (1 << (Nfactor + 1)); */ |
||||
r = 1; /* r = (1 << rfactor); */ |
||||
dblmix = 0; /* Salsa only */ |
||||
mixmode = 0x08; /* 8 rounds */ |
||||
} |
||||
|
||||
if(profile >> 31) { |
||||
N = (1 << (((profile >> 8) & 0x1F) + 1)); |
||||
r = (1 << ((profile >> 5) & 0x7)); |
||||
} |
||||
uchar *stack; |
||||
stack = (uchar*)malloc(((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align)*sizeof(uchar)); |
||||
/* X = r * 2 * SCRYPT_BLOCK_SIZE */ |
||||
X = (uint *) &stack[stack_align & ~(stack_align - 1)]; |
||||
/* Z is a copy of X for ChaCha */ |
||||
Z = &X[32 * r]; |
||||
/* Y is an X sized temporal space */ |
||||
Y = &X[64 * r]; |
||||
/* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ |
||||
V = &X[96 * r]; |
||||
|
||||
/* X = KDF(password, salt) */ |
||||
kdf = (profile >> 1) & 0xF; |
||||
|
||||
switch(kdf) { |
||||
|
||||
default: |
||||
case(0x0): |
||||
neoscrypt_fastkdf(password, 80, password, 80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); |
||||
break; |
||||
|
||||
case(0x1): |
||||
neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); |
||||
break; |
||||
} |
||||
|
||||
/* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */ |
||||
|
||||
if(dblmix) { |
||||
/* blkcpy(Z, X) */ |
||||
neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); |
||||
|
||||
/* Z = SMix(Z) */ |
||||
for(i = 0; i < N; i++) { |
||||
/* blkcpy(V, Z) */ |
||||
neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); |
||||
/* blkmix(Z, Y) */ |
||||
neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); |
||||
} |
||||
for(i = 0; i < N; i++) { |
||||
/* integerify(Z) mod N */ |
||||
j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1)); |
||||
/* blkxor(Z, V) */ |
||||
neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); |
||||
/* blkmix(Z, Y) */ |
||||
neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); |
||||
} |
||||
} |
||||
|
||||
#if (ASM) |
||||
/* Must be called before and after SSE2 Salsa */ |
||||
neoscrypt_salsa_tangle(&X[0], r * 2); |
||||
#endif |
||||
|
||||
/* X = SMix(X) */ |
||||
for(i = 0; i < N; i++) { |
||||
/* blkcpy(V, X) */ |
||||
neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); |
||||
/* blkmix(X, Y) */ |
||||
neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); |
||||
} |
||||
for(i = 0; i < N; i++) { |
||||
/* integerify(X) mod N */ |
||||
j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); |
||||
/* blkxor(X, V) */ |
||||
neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); |
||||
/* blkmix(X, Y) */ |
||||
neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); |
||||
} |
||||
|
||||
#if (ASM) |
||||
neoscrypt_salsa_tangle(&X[0], r * 2); |
||||
#endif |
||||
|
||||
if(dblmix) |
||||
/* blkxor(X, Z) */ |
||||
neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); |
||||
|
||||
/* output = KDF(password, X) */ |
||||
switch(kdf) { |
||||
|
||||
default: |
||||
case(0x0): |
||||
neoscrypt_fastkdf(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); |
||||
break; |
||||
|
||||
case(0x1): |
||||
neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); |
||||
break; |
||||
} |
||||
} |
||||
|
@ -0,0 +1,33 @@
@@ -0,0 +1,33 @@
|
||||
#if (__cplusplus)
extern "C" {
#endif

/* NeoScrypt hash of input into output, configured by the profile bit field */
void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile);

#if (__cplusplus)
}
#else

/* Sizes in bytes */
#define SCRYPT_BLOCK_SIZE 64
#define SCRYPT_HASH_BLOCK_SIZE 64
#define SCRYPT_HASH_DIGEST_SIZE 32

typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];

/* 32-bit rotations; b is fully parenthesised so that an expression argument
 * such as ROTL32(x, n + 1) rotates by the intended amount */
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))

/* Read a big-endian 32-bit value from the 4 bytes at p */
#define U8TO32_BE(p) \
    (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
     ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])))

/* Store v big-endian into the 4 bytes at p. Wrapped in do/while(0) so the
 * macro is a single statement and is safe under an unbraced if/else;
 * p and v are evaluated more than once */
#define U32TO8_BE(p, v) \
    do { \
        (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
        (p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      ); \
    } while (0)

/* Store the 64-bit value v big-endian into the 8 bytes at p */
#define U64TO8_BE(p, v) \
    do { \
        U32TO8_BE((p),     (uint32_t)((v) >> 32)); \
        U32TO8_BE((p) + 4, (uint32_t)((v)      )); \
    } while (0)

#endif
||||
|
Loading…
Reference in new issue