From 65c1d787e7194ae5abc9228be8bd73bed28bfb46 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 3 May 2015 14:28:10 +0200 Subject: [PATCH] add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... --- Makefile.am | 1 + README.txt | 7 +- ccminer.cpp | 12 +- ccminer.vcxproj | 3 + miner.h | 4 + neoscrypt.cu | 83 +++ neoscrypt/cuda_neoscrypt.cu | 607 +++++++++++++++++++ neoscrypt/cuda_vectors.h | 1109 +++++++++++++++++++++++++++++++++++ neoscrypt/neoscrypt.c | 992 +++++++++++++++++++++++++++++++ neoscrypt/neoscrypt.h | 33 ++ util.cpp | 3 + 11 files changed, 2852 insertions(+), 2 deletions(-) create mode 100644 neoscrypt.cu create mode 100644 neoscrypt/cuda_neoscrypt.cu create mode 100644 neoscrypt/cuda_vectors.h create mode 100644 neoscrypt/neoscrypt.c create mode 100644 neoscrypt/neoscrypt.h diff --git a/Makefile.am b/Makefile.am index 2ca7e6f..e6ee928 100644 --- a/Makefile.am +++ b/Makefile.am @@ -41,6 +41,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \ quark/quarkcoin.cu quark/animecoin.cu \ quark/cuda_quark_compactionTest.cu \ + neoscrypt.cu neoscrypt/neoscrypt.c neoscrypt/cuda_neoscrypt.cu \ cuda_nist5.cu pentablake.cu skein.cu skein2.cu zr5.cu \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ diff --git a/README.txt b/README.txt index a45e534..e13479c 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.6.2-tpruvot (Apr 2015) - "Scrypt/N/Jane algos" +ccMiner release 1.6.3-tpruvot (May 2015) - "Neoscrypt" --------------------------------------------------------------- *************************************************************** @@ -37,6 +37,7 @@ Deep, Doom and Qubit Keccak (Maxcoin) Pentablake (Blake 512 x5) 1Coin Triple S +Neoscrypt (FeatherCoin) Scrypt and Scrypt:N Scrypt-Jane (Chacha) Skein (Skein + SHA) @@ -76,6 +77,7 @@ its command line interface and options. lyra2 use to mine Vertcoin mjollnir use to mine Mjollnircoin myr-gr use to mine Myriad-Groest + neoscrypt use to mine FeatherCoin nist5 use to mine TalkCoin penta use to mine Joincoin / Pentablake pluck use to mine Supcoin @@ -211,6 +213,9 @@ features. >>> RELEASE HISTORY <<< + Not released!! v1.6.3 + Import Neoscrypt from djm34 work + Apr. 
21st 2015 v1.6.2 Import Scrypt, Scrypt:N and Scrypt-jane from Cudaminer Add the --time-limit command line parameter diff --git a/ccminer.cpp b/ccminer.cpp index 9fa1241..3bebf98 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -98,6 +98,7 @@ enum sha_algos { ALGO_LYRA2, ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MYR_GR, + ALGO_NEOSCRYPT, ALGO_NIST5, ALGO_PENTABLAKE, ALGO_PLUCK, @@ -135,6 +136,7 @@ static const char *algo_names[] = { "lyra2", "mjollnir", "myr-gr", + "neoscrypt", "nist5", "penta", "pluck", @@ -273,6 +275,7 @@ Options:\n\ lyra2 VertCoin\n\ mjollnir Mjollnircoin\n\ myr-gr Myriad-Groestl\n\ + neoscrypt FeatherCoin\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ pluck SupCoin\n\ @@ -537,7 +540,7 @@ static bool work_decode(const json_t *val, struct work *work) int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target); int i; - if (opt_algo == ALGO_ZR5) { + if (opt_algo == ALGO_NEOSCRYPT || opt_algo == ALGO_ZR5) { data_size = 80; adata_sz = 20; } @@ -1241,6 +1244,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) switch (opt_algo) { case ALGO_JACKPOT: + case ALGO_NEOSCRYPT: case ALGO_PLUCK: case ALGO_SCRYPT: case ALGO_SCRYPT_JANE: @@ -1472,6 +1476,7 @@ static void *miner_thread(void *userdata) minmax = 0x400000; break; case ALGO_LYRA2: + case ALGO_NEOSCRYPT: case ALGO_SCRYPT: case ALGO_SCRYPT_JANE: minmax = 0x100000; @@ -1599,6 +1604,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_NEOSCRYPT: + rc = scanhash_neoscrypt(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_NIST5: rc = scanhash_nist5(thr_id, work.data, work.target, max_nonce, &hashes_done); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index e05120e..9322821 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -265,6 +265,7 @@ + @@ -435,6 +436,8 @@ -Xptxas "-abi=yes" %(AdditionalOptions) -Xptxas "-abi=yes" %(AdditionalOptions) + + diff --git a/miner.h b/miner.h index 1a90a2a..2c17fec 100644 --- a/miner.h +++ b/miner.h @@ -324,6 +324,9 @@ extern int scanhash_lyra2(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_neoscrypt(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + extern int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -689,6 +692,7 @@ unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); void lyra2_hash(void *state, const void *input); void myriadhash(void *state, const void *input); +void neoscrypt(const uchar *password, uchar *output, uint profile); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N); diff --git a/neoscrypt.cu b/neoscrypt.cu new file mode 100644 index 0000000..792812c --- /dev/null +++ b/neoscrypt.cu @@ -0,0 +1,83 @@ + +extern "C" { +#include "neoscrypt/neoscrypt.h" +} + +#include "cuda_helper.h" +#include "miner.h" + +static uint32_t *d_hash[MAX_GPUS]; +extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget); +extern void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t* hash); +extern uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order); + +#define SHIFT 130 + +int
scanhash_neoscrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + const int stratum = have_stratum; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + int intensity = is_windows() ? 18 : 19; + uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); + throughput = throughput / 32; /* set for max intensity ~= 20 */ + throughput = min(throughput, max_nonce - first_nonce + 1); + + static bool init[MAX_GPUS] = { 0 }; + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * SHIFT * sizeof(uint64_t) * throughput)); + neoscrypt_cpu_init(thr_id, throughput, d_hash[thr_id]); + + applog(LOG_INFO, "Using %d cuda threads", throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + if (stratum) { + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + } else { + for (int k = 0; k < 20; k++) + endiandata[k] = pdata[k]; + } + + neoscrypt_setBlockTarget(endiandata,ptarget); + + do { + uint32_t foundNonce = neoscrypt_cpu_hash_k4(stratum, thr_id, throughput, pdata[19], 0); + if (foundNonce != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash64[8]; + + *hashes_done = pdata[19] - first_nonce + 1; + + if (stratum) { + be32enc(&endiandata[19], foundNonce); + } else { + endiandata[19] = foundNonce; + } + neoscrypt((uchar*) endiandata, (uchar*)vhash64, 0x80000620); + + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { + pdata[19] = foundNonce; + return 1; + } else { + applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce); + } + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu new file mode 100644 index 0000000..652d3ca --- /dev/null +++ b/neoscrypt/cuda_neoscrypt.cu @@ -0,0 +1,607 @@ +#include <stdio.h> +#include <memory.h> + +#include "cuda_helper.h" +#include "cuda_vectors.h" + + __device__ uint4 * W; +uint32_t *d_NNonce[MAX_GPUS]; +uint32_t *d_nnounce[MAX_GPUS]; +__constant__ uint32_t pTarget[8]; +__constant__ uint32_t key_init[16]; +__constant__ uint32_t input_init[16]; +__constant__ uint32_t c_data[80]; + + +#define SALSA_SMALL_UNROLL 1 +#define CHACHA_SMALL_UNROLL 1 +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U +#define BLOCK_SIZE 64U +#define FASTKDF_BUFFER_SIZE 256U +#define PASSWORD_LEN 80U +/// constants /// + +static const __constant__ uint8 BLAKE2S_IV_Vec = + { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + }; + + +static const uint8 BLAKE2S_IV_Vechost = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint32_t BLAKE2S_SIGMA_host[10][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1,
3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; +__constant__ uint32_t BLAKE2S_SIGMA[10][16]; + +// Blake2S + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +#if __CUDA_ARCH__ >= 500 +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a,0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a,0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} +#else +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ +} +#endif + +#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) + +#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,16); \ + c += d; b = ROTR32(b^c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,8); \ + c += d; b = ROTR32(b^c, 7); \ +} + + +static __forceinline__ __device__ void Blake2S(uint32_t * inout, const uint32_t * TheKey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for (int x = 0; x < 10; ++x) + { + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for (int x = 0; x < 10; ++x) + { + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0]=V.lo; + +} + +static __forceinline__ __host__ void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for (int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, 
V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for (int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; +} + +static __forceinline__ __device__ void fastkdf256(const uint32_t* password, uint8_t* output) +{ + uint8_t bufidx = 0; + uchar4 bufhelper; + uint8_t A[320],B[288]; + + ((uintx64*)A)[0] = ((uintx64*)password)[0]; + ((uint816 *)A)[4] = ((uint816 *)password)[0]; + + ((uintx64*)B)[0] = ((uintx64*)password)[0]; + ((uint48 *)B)[8] = ((uint48 *)password)[0]; + + uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 }; + + ((uint816*)input)[0] = ((uint816*)input_init)[0]; + ((uint48*)key)[0] = ((uint48*)key_init)[0]; + + for (int i = 0; i < 32; ++i) + { + bufhelper = ((uchar4*)input)[0]; + for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + bufhelper += ((uchar4*)input)[x]; + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + int qbuf = bufidx/4; + int rbuf = bufidx&3; + int bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R2(shifted, ((uint8*)input)[0], bitbuf); + + for (int k = 0; k < 9; ++k) { + ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; + } + + if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} + else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} + + if (i<31) { + for (int k = 0; k FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} +// MyUnion Test; + + for (uint8_t k = 0; k =500 +#define CHACHA_STEP(a,b,c,d) { \ + a += b; d = __byte_perm(d^a,0,0x1032); \ + c += d; b = rotate(b^c, 12); \ + a += b; d = __byte_perm(d^a,0,0x2103); \ + c += d; b = rotate(b^c, 7); \ +} +#else +#define CHACHA_STEP(a,b,c,d) { \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotate(b^c, 12); \ + a += b; d = rotate(d^a,8); \ + c += d; b = rotate(b^c, 7); \ +} +#endif + +#define CHACHA_CORE_PARALLEL(state) { \ + CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ + CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ + CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ + CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ + CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ + CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ + CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ + CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ +} + + +static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X) +{ + uint16 state = X; + uint32_t t; + + for (int i = 0; i < 10; ++i) { 
SALSA_CORE(state);} + + return(X + state); +} + +static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X) +{ + uint16 st = X; + + for (int i = 0; i < 10; ++i) {CHACHA_CORE_PARALLEL(st);} + return(X + st); +} + +static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV) +{ + XV[0] ^= XV[3]; + uint16 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + temp = chacha_small_parallel_rnd(XV[1]); XV[2] ^= temp; + XV[1] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[1]; + XV[3] = chacha_small_parallel_rnd(XV[3]); + XV[2] = temp; +} + +static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV) +{ + XV[0] ^= XV[3]; + uint16 temp; + + XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + temp = salsa_small_scalar_rnd(XV[1]); XV[2] ^= temp; + XV[1] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[1]; + XV[3] = salsa_small_scalar_rnd(XV[3]); + XV[2] = temp; +} + + +#define SHIFT 130 + +__global__ __launch_bounds__(128, 1) +void neoscrypt_gpu_hash_k0(int stratum, uint32_t threads, uint32_t startNonce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t shift = SHIFT * 16 * thread; +// if (thread < threads) + { + uint32_t data[80]; + uint16 X[4]; + const uint32_t nonce = startNonce + thread; + + for (int i = 0; i<20; i++) { + ((uint4*)data)[i] = ((uint4 *)c_data)[i]; + } //ld.local.v4 + data[19] = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! + data[39] = data[19]; + data[59] = data[19]; + + fastkdf256(data, (uint8_t*)X); + + ((uintx64 *)(W + shift))[0] = ((uintx64 *)X)[0]; +// ((ulonglong16 *)(W + shift))[0] = ((ulonglong16 *)X)[0]; + } +} + +__global__ __launch_bounds__(128, 1) +void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t shift = SHIFT * 16 * thread; +// if (thread < threads) + { + uint16 X[4]; + ((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]); + + //#pragma unroll + for (int i = 0; i < 128; ++i) + { + neoscrypt_chacha(X); + ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)X)[0]; +// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)X)[0]; + } + } +} + +__global__ __launch_bounds__(128, 1) +void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t shift = SHIFT * 16 * thread; +// if (thread < threads) + { + uint16 X[4]; + ((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]); + + for (int t = 0; t < 128; t++) + { + int idx = X[3].lo.s0 & 0x7F; + ((uintx64 *)X)[0] ^= __ldg32(&(W + shift)[idx << 4]); + neoscrypt_chacha(X); + + } + ((uintx64 *)(W + shift))[129] = ((uintx64*)X)[0]; // best checked + + } +} + +__global__ __launch_bounds__(128, 1) +void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + uint32_t shift = SHIFT * 16 * thread; + uint16 Z[4]; + + ((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]); + + //#pragma unroll + for (int i = 0; i < 128; ++i) { + neoscrypt_salsa(Z); + ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)Z)[0]; +// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)Z)[0]; + } + } +} + +__global__ __launch_bounds__(128, 1) +void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, uint32_t *nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t shift = SHIFT * 16 * thread; + uint16 Z[4]; 
+ uint32_t outbuf[8]; + uint32_t data[80]; + + for (int i=0; i<20; i++) { + ((uint4*)data)[i] = ((uint4 *)c_data)[i]; + } + + data[19] = (stratum) ? cuda_swab32(nonce) : nonce; + data[39] = data[19]; + data[59] = data[19]; + ((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]); + for (int t = 0; t < 128; t++) + { + int idx = Z[3].lo.s0 & 0x7F; + ((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[idx << 4]); + neoscrypt_salsa(Z); + } + ((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[2064]); + fastkdf32(data, (uint32_t*)Z, outbuf); + if (outbuf[7] <= pTarget[7]) { + uint32_t tmp = atomicExch(&nonceVector[0], nonce); + } + } +} + +void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t *hash) +{ + cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(W, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice); + cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t)); +} + +__host__ +uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order) +{ + uint32_t result[MAX_GPUS] = { 0xffffffff }; + cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t)); + + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + +// neoscrypt_gpu_hash_orig <<< grid, block >>>(threads, startNounce, d_NNonce[thr_id]); + + neoscrypt_gpu_hash_k0 <<< grid, block >>>(stratum,threads, startNounce); + neoscrypt_gpu_hash_k01 <<< grid, block >>>(threads, startNounce); + neoscrypt_gpu_hash_k2 <<< grid, block >>>(threads, startNounce); + neoscrypt_gpu_hash_k3 <<< grid, block >>>(threads, startNounce); + neoscrypt_gpu_hash_k4 <<< grid, block >>>(stratum,threads, startNounce, d_NNonce[thr_id]); + + MyStreamSynchronize(NULL, order, thr_id); + cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + return result[thr_id]; +} + +__host__ +void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target) +{ + unsigned char PaddedMessage[80*4]; //bring balance to the force + uint32_t input[16], key[16] = { 0 }; + memcpy(PaddedMessage, pdata, 80); + memcpy(PaddedMessage + 80, pdata, 80); + memcpy(PaddedMessage + 160, pdata, 80); + memcpy(PaddedMessage + 240, pdata, 80); + + ((uint16*)input)[0] = ((uint16*)pdata)[0]; + ((uint8*)key)[0] = ((uint8*)pdata)[0]; +// for (int i = 0; i<10; i++) { printf(" pdata/input %d %08x %08x \n",i,pdata[2*i],pdata[2*i+1]); } + + Blake2Shost(input,key); + + cudaMemcpyToSymbol(pTarget, target, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(input_init, input, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(key_init, key, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + + cudaMemcpyToSymbol(c_data, PaddedMessage, 40 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + diff --git a/neoscrypt/cuda_vectors.h b/neoscrypt/cuda_vectors.h new file mode 100644 index 0000000..a654f7d --- /dev/null +++ b/neoscrypt/cuda_vectors.h @@ -0,0 +1,1109 @@ +#ifndef CUDA_VECTOR_H +#define CUDA_VECTOR_H + + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +//typedef __device_builtin__ struct ulong16 ulong16; + + +typedef struct __align__(32) uint8 +{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) ulonglong2to8 +{ +ulonglong2 l0,l1,l2,l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 +{ + ulonglong2to8
lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32 +{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64 +{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + + + +typedef struct __align__(1024) ulonglonglong +{ + ulonglong8to16 s0,s1,s2,s3,s4,s5,s6,s7; +} ulonglonglong; + + + + +typedef struct __align__(64) uint16 +{ + union { + struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;}; + uint8 lo; + }; + union { + struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;}; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint32 +{ + + uint16 lo,hi; +} uint32; + + + +struct __align__(128) ulong8 +{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + + +typedef struct __align__(256) ulonglong16 +{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf; +} ulonglong16; + +typedef struct __align__(32) uint48 +{ + uint4 s0, s1; + +} uint48; + +typedef struct __align__(64) uint816 +{ + uint48 s0, s1; + +} uint816; + +typedef struct __align__(128) uint1632 +{ + uint816 s0, s1; + +} uint1632; + +typedef struct __align__(256) uintx64 +{ + uint1632 s0, s1; + +} uintx64; + +typedef struct __align__(512) uintx128 +{ + uintx64 s0, s1; + +} uintx128; + +typedef struct __align__(1024) uintx256 +{ + uintx128 s0, s1; + +} uintx256; + + + +typedef struct __align__(256) uint4x16 +{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ +ulonglong2to8 t; t.l0=s0; t.l1=s1; t.l2=s2; t.l3=s3; +return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong8to16 &s0, const ulonglong8to16 &s1, const ulonglong8to16 &s2, const ulonglong8to16 &s3, + const ulonglong8to16 &s4, const ulonglong8to16 &s5, const ulonglong8to16 &s6, const ulonglong8to16 &s7) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint816 make_uint816(const uint48 &s0, const uint48 &s1) +{ + uint816 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint1632 make_uint1632(const uint816 &s0, const uint816 &s1) +{ + uint1632 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64 make_uintx64(const uint1632 &s0, const uint1632 &s1) +{ + uintx64 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx128 make_uintx128(const uintx64 &s0, const uintx64 &s1) +{ + uintx128 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx256 make_uintx256(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __device__ uintx256 
make_uintx64(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + + + + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo=a; t.hi=b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong2 &s0, const ulonglong2 &s1, + const ulonglong2 &s2, const ulonglong2 &s3, const ulonglong2 &s4, const ulonglong2 &s5, const ulonglong2 &s6, const ulonglong2 &s7, + const ulonglong2 &s8, const ulonglong2 &s9, + const ulonglong2 &sa, const ulonglong2 &sb, const ulonglong2 &sc, const ulonglong2 &sd, const ulonglong2 &se, const ulonglong2 &sf +) { + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + + + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + + + + + +static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, 
ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x + b.x, a.y + b.y); } + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} //, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} //, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + +////////////// mess++ ////// + +static __forceinline__ __device__ uint48 operator^ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint816 operator^ (const uint816 &a, const uint816 &b) { + return make_uint816(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint1632 operator^ (const uint1632 &a, const uint1632 &b) { + return make_uint1632(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + + +static __forceinline__ __device__ uintx64 operator^ (const uintx64 &a, const uintx64 &b) { + return make_uintx64(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx128 operator^ (const uintx128 &a, const uintx128 &b) { + return make_uintx128(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx256 operator^ (const uintx256 &a, const uintx256 &b) { + return make_uintx256(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf +); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + 
b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf +); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx64 &a, const uintx64 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uintx128 &a, const uintx128 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx256 &a, const uintx256 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator^= (uint816 &a, const uint816 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uint48 &a, const uint48 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) { a = a + b; } + + +static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) { a = a + b; } + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong 
&b) { + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; } +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) { a = a ^ b; } + +#if __CUDA_ARCH__ < 320 + +#define rotate ROTL32 +#define rotateR ROTR32 + +#else + +static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint4 rotate4R(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + + +static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + + + +static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), 
"=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr) +{ + uint32_t test; + asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +static __device__ __inline__ uint32_t __ldgtoint64(const uint8_t *ptr) +{ + uint64_t test; + asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr) +{ + uint32_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{a,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr) +{ + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d,e,f,g,h; \n\t" + ".reg .u32 i,j; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "ld.global.nc.u8 e,[%1+4]; \n\t" + "ld.global.nc.u8 f,[%1+5]; \n\t" + "ld.global.nc.u8 g,[%1+6]; \n\t" + "ld.global.nc.u8 h,[%1+7]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b32 j,{e,f,g,h}; \n\t" + "mov.b64 %0,{i,j}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr) +{ + uint32_t zero = 0; + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + ".reg .u32 i; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b64 %0,{i,%1}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr), "r"(zero)); + return (test); +} + + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr) +{ + uint32_t test; + asm("{\n\t" + ".reg .u8 e,b,c,d; \n\t" + "ld.global.nc.u8 e,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{e,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +#endif + +static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, uint32_t shift) +{ + uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[8] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s6); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[7] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s5); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[6] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s4); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[5] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s3); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[4] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s2); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[3] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s1); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[2] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s0); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : 
"=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[1] = cuda_swab32(truc); + asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); + ret[0] = cuda_swab32(truc); + +} + +#define shift256R3(ret,vec4, shift) \ +{ \ + \ +uint32_t truc=0,truc2=cuda_swab32(vec4.s7),truc3=0; \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[8] = cuda_swab32(truc); \ +truc2=cuda_swab32(vec4.s6);truc3=cuda_swab32(vec4.s7); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[7] = cuda_swab32(truc); \ +truc2=cuda_swab32(vec4.s5);truc3=cuda_swab32(vec4.s6); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[6] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s4); truc3 = cuda_swab32(vec4.s5); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[5] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s3); truc3 = cuda_swab32(vec4.s4); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[4] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s2); truc3 = cuda_swab32(vec4.s3); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[3] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s1); truc3 = cuda_swab32(vec4.s2); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[2] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s0); truc3 = cuda_swab32(vec4.s1); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[1] = cuda_swab32(truc); \ +truc3 = cuda_swab32(vec4.s0); \ + asm volatile ("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); \ + ret[0] = cuda_swab32(truc); \ + \ + \ +} + + +static __device__ __inline__ uint32 __ldg32b(const uint32 *ptr) +{ + uint32 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.lo.s0), "=r"(ret.lo.s1), "=r"(ret.lo.s2), "=r"(ret.lo.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.lo.s4), "=r"(ret.lo.s5), "=r"(ret.lo.s6), "=r"(ret.lo.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.lo.s8), "=r"(ret.lo.s9), "=r"(ret.lo.sa), "=r"(ret.lo.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.lo.sc), "=r"(ret.lo.sd), "=r"(ret.lo.se), "=r"(ret.lo.sf) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.hi.s0), "=r"(ret.hi.s1), "=r"(ret.hi.s2), "=r"(ret.hi.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.hi.s4), "=r"(ret.hi.s5), "=r"(ret.hi.s6), "=r"(ret.hi.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.hi.s8), "=r"(ret.hi.s9), "=r"(ret.hi.sa), "=r"(ret.hi.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.hi.sc), "=r"(ret.hi.sd), "=r"(ret.hi.se), "=r"(ret.hi.sf) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uint16 __ldg16b(const uint16 *ptr) +{ + uint16 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0), "=r"(ret.s1), "=r"(ret.s2), "=r"(ret.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s4), "=r"(ret.s5), "=r"(ret.s6), "=r"(ret.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 
{%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s8), "=r"(ret.s9), "=r"(ret.sa), "=r"(ret.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.sc), "=r"(ret.sd), "=r"(ret.se), "=r"(ret.sf) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64 __ldg32c(const uintx64 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + 
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ uintx128 __ldg128(const uintx128 *ptr) +{ + uintx128 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s0.s1.s0.s0.x), "=r"(ret.s0.s0.s1.s0.s0.y), "=r"(ret.s0.s0.s1.s0.s0.z), "=r"(ret.s0.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s0.s1.s0.s1.x), "=r"(ret.s0.s0.s1.s0.s1.y), 
"=r"(ret.s0.s0.s1.s0.s1.z), "=r"(ret.s0.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s0.s1.s1.s0.x), "=r"(ret.s0.s0.s1.s1.s0.y), "=r"(ret.s0.s0.s1.s1.s0.z), "=r"(ret.s0.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s0.s1.s1.s1.x), "=r"(ret.s0.s0.s1.s1.s1.y), "=r"(ret.s0.s0.s1.s1.s1.z), "=r"(ret.s0.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s0.s1.s0.s0.s0.x), "=r"(ret.s0.s1.s0.s0.s0.y), "=r"(ret.s0.s1.s0.s0.s0.z), "=r"(ret.s0.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s0.s1.s0.s0.s1.x), "=r"(ret.s0.s1.s0.s0.s1.y), "=r"(ret.s0.s1.s0.s0.s1.z), "=r"(ret.s0.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s0.s1.s0.s1.s0.x), "=r"(ret.s0.s1.s0.s1.s0.y), "=r"(ret.s0.s1.s0.s1.s0.z), "=r"(ret.s0.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s0.s1.s0.s1.s1.x), "=r"(ret.s0.s1.s0.s1.s1.y), "=r"(ret.s0.s1.s0.s1.s1.z), "=r"(ret.s0.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s0.s1.s1.s0.s0.x), "=r"(ret.s0.s1.s1.s0.s0.y), "=r"(ret.s0.s1.s1.s0.s0.z), "=r"(ret.s0.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s0.s1.s1.s0.s1.x), "=r"(ret.s0.s1.s1.s0.s1.y), "=r"(ret.s0.s1.s1.s0.s1.z), "=r"(ret.s0.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s0.s1.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s0.s1.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+256];" : "=r"(ret.s1.s0.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+272];" : "=r"(ret.s1.s0.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+288];" : "=r"(ret.s1.s0.s0.s1.s0.x), "=r"(ret.s1.s0.s0.s1.s0.y), "=r"(ret.s1.s0.s0.s1.s0.z), "=r"(ret.s1.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+304];" : "=r"(ret.s1.s0.s0.s1.s1.x), "=r"(ret.s1.s0.s0.s1.s1.y), "=r"(ret.s1.s0.s0.s1.s1.z), "=r"(ret.s1.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+320];" : "=r"(ret.s1.s0.s1.s0.s0.x), "=r"(ret.s1.s0.s1.s0.s0.y), "=r"(ret.s1.s0.s1.s0.s0.z), "=r"(ret.s1.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+336];" : "=r"(ret.s1.s0.s1.s0.s1.x), "=r"(ret.s1.s0.s1.s0.s1.y), "=r"(ret.s1.s0.s1.s0.s1.z), "=r"(ret.s1.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+352];" : "=r"(ret.s1.s0.s1.s1.s0.x), "=r"(ret.s1.s0.s1.s1.s0.y), "=r"(ret.s1.s0.s1.s1.s0.z), "=r"(ret.s1.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+368];" : "=r"(ret.s1.s0.s1.s1.s1.x), "=r"(ret.s1.s0.s1.s1.s1.y), "=r"(ret.s1.s0.s1.s1.s1.z), "=r"(ret.s1.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+384];" : "=r"(ret.s1.s1.s0.s0.s0.x), "=r"(ret.s1.s1.s0.s0.s0.y), "=r"(ret.s1.s1.s0.s0.s0.z), 
"=r"(ret.s1.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+400];" : "=r"(ret.s1.s1.s0.s0.s1.x), "=r"(ret.s1.s1.s0.s0.s1.y), "=r"(ret.s1.s1.s0.s0.s1.z), "=r"(ret.s1.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+416];" : "=r"(ret.s1.s1.s0.s1.s0.x), "=r"(ret.s1.s1.s0.s1.s0.y), "=r"(ret.s1.s1.s0.s1.s0.z), "=r"(ret.s1.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+432];" : "=r"(ret.s1.s1.s0.s1.s1.x), "=r"(ret.s1.s1.s0.s1.s1.y), "=r"(ret.s1.s1.s0.s1.s1.z), "=r"(ret.s1.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+448];" : "=r"(ret.s1.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+464];" : "=r"(ret.s1.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+480];" : "=r"(ret.s1.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+496];" : "=r"(ret.s1.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ uintx256 __ldg256(const uintx256 *ptr) +{ + uintx256 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s0.s0.s1.s0.s0.x), "=r"(ret.s0.s0.s0.s1.s0.s0.y), "=r"(ret.s0.s0.s0.s1.s0.s0.z), "=r"(ret.s0.s0.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s0.s0.s1.s0.s1.x), "=r"(ret.s0.s0.s0.s1.s0.s1.y), "=r"(ret.s0.s0.s0.s1.s0.s1.z), "=r"(ret.s0.s0.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s0.s0.s1.s1.s0.x), "=r"(ret.s0.s0.s0.s1.s1.s0.y), "=r"(ret.s0.s0.s0.s1.s1.s0.z), "=r"(ret.s0.s0.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s0.s0.s1.s1.s1.x), "=r"(ret.s0.s0.s0.s1.s1.s1.y), "=r"(ret.s0.s0.s0.s1.s1.s1.z), "=r"(ret.s0.s0.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s0.s0.s1.s0.s0.s0.x), "=r"(ret.s0.s0.s1.s0.s0.s0.y), "=r"(ret.s0.s0.s1.s0.s0.s0.z), "=r"(ret.s0.s0.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s0.s0.s1.s0.s0.s1.x), "=r"(ret.s0.s0.s1.s0.s0.s1.y), "=r"(ret.s0.s0.s1.s0.s0.s1.z), "=r"(ret.s0.s0.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s0.s0.s1.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.s1.s0.y), 
"=r"(ret.s0.s0.s1.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s0.s0.s1.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s0.s0.s1.s1.s0.s0.x), "=r"(ret.s0.s0.s1.s1.s0.s0.y), "=r"(ret.s0.s0.s1.s1.s0.s0.z), "=r"(ret.s0.s0.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s0.s0.s1.s1.s0.s1.x), "=r"(ret.s0.s0.s1.s1.s0.s1.y), "=r"(ret.s0.s0.s1.s1.s0.s1.z), "=r"(ret.s0.s0.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s0.s0.s1.s1.s1.s0.x), "=r"(ret.s0.s0.s1.s1.s1.s0.y), "=r"(ret.s0.s0.s1.s1.s1.s0.z), "=r"(ret.s0.s0.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s0.s0.s1.s1.s1.s1.x), "=r"(ret.s0.s0.s1.s1.s1.s1.y), "=r"(ret.s0.s0.s1.s1.s1.s1.z), "=r"(ret.s0.s0.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+256];" : "=r"(ret.s0.s1.s0.s0.s0.s0.x), "=r"(ret.s0.s1.s0.s0.s0.s0.y), "=r"(ret.s0.s1.s0.s0.s0.s0.z), "=r"(ret.s0.s1.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+272];" : "=r"(ret.s0.s1.s0.s0.s0.s1.x), "=r"(ret.s0.s1.s0.s0.s0.s1.y), "=r"(ret.s0.s1.s0.s0.s0.s1.z), "=r"(ret.s0.s1.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+288];" : "=r"(ret.s0.s1.s0.s0.s1.s0.x), "=r"(ret.s0.s1.s0.s0.s1.s0.y), "=r"(ret.s0.s1.s0.s0.s1.s0.z), "=r"(ret.s0.s1.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+304];" : "=r"(ret.s0.s1.s0.s0.s1.s1.x), "=r"(ret.s0.s1.s0.s0.s1.s1.y), "=r"(ret.s0.s1.s0.s0.s1.s1.z), "=r"(ret.s0.s1.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+320];" : "=r"(ret.s0.s1.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+336];" : "=r"(ret.s0.s1.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+352];" : "=r"(ret.s0.s1.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+368];" : "=r"(ret.s0.s1.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+384];" : "=r"(ret.s0.s1.s1.s0.s0.s0.x), "=r"(ret.s0.s1.s1.s0.s0.s0.y), "=r"(ret.s0.s1.s1.s0.s0.s0.z), "=r"(ret.s0.s1.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+400];" : "=r"(ret.s0.s1.s1.s0.s0.s1.x), "=r"(ret.s0.s1.s1.s0.s0.s1.y), "=r"(ret.s0.s1.s1.s0.s0.s1.z), "=r"(ret.s0.s1.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+416];" : "=r"(ret.s0.s1.s1.s0.s1.s0.x), "=r"(ret.s0.s1.s1.s0.s1.s0.y), "=r"(ret.s0.s1.s1.s0.s1.s0.z), "=r"(ret.s0.s1.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+432];" : "=r"(ret.s0.s1.s1.s0.s1.s1.x), "=r"(ret.s0.s1.s1.s0.s1.s1.y), "=r"(ret.s0.s1.s1.s0.s1.s1.z), "=r"(ret.s0.s1.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+448];" : "=r"(ret.s0.s1.s1.s1.s0.s0.x), "=r"(ret.s0.s1.s1.s1.s0.s0.y), 
"=r"(ret.s0.s1.s1.s1.s0.s0.z), "=r"(ret.s0.s1.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+464];" : "=r"(ret.s0.s1.s1.s1.s0.s1.x), "=r"(ret.s0.s1.s1.s1.s0.s1.y), "=r"(ret.s0.s1.s1.s1.s0.s1.z), "=r"(ret.s0.s1.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+480];" : "=r"(ret.s0.s1.s1.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+496];" : "=r"(ret.s0.s1.s1.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+512];" : "=r"(ret.s1.s0.s0.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+528];" : "=r"(ret.s1.s0.s0.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+544];" : "=r"(ret.s1.s0.s0.s0.s1.s0.x), "=r"(ret.s1.s0.s0.s0.s1.s0.y), "=r"(ret.s1.s0.s0.s0.s1.s0.z), "=r"(ret.s1.s0.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+560];" : "=r"(ret.s1.s0.s0.s0.s1.s1.x), "=r"(ret.s1.s0.s0.s0.s1.s1.y), "=r"(ret.s1.s0.s0.s0.s1.s1.z), "=r"(ret.s1.s0.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+576];" : "=r"(ret.s1.s0.s0.s1.s0.s0.x), "=r"(ret.s1.s0.s0.s1.s0.s0.y), "=r"(ret.s1.s0.s0.s1.s0.s0.z), "=r"(ret.s1.s0.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+592];" : "=r"(ret.s1.s0.s0.s1.s0.s1.x), "=r"(ret.s1.s0.s0.s1.s0.s1.y), "=r"(ret.s1.s0.s0.s1.s0.s1.z), "=r"(ret.s1.s0.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+608];" : "=r"(ret.s1.s0.s0.s1.s1.s0.x), "=r"(ret.s1.s0.s0.s1.s1.s0.y), "=r"(ret.s1.s0.s0.s1.s1.s0.z), "=r"(ret.s1.s0.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+624];" : "=r"(ret.s1.s0.s0.s1.s1.s1.x), "=r"(ret.s1.s0.s0.s1.s1.s1.y), "=r"(ret.s1.s0.s0.s1.s1.s1.z), "=r"(ret.s1.s0.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+640];" : "=r"(ret.s1.s0.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+656];" : "=r"(ret.s1.s0.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+672];" : "=r"(ret.s1.s0.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+688];" : "=r"(ret.s1.s0.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+704];" : "=r"(ret.s1.s0.s1.s1.s0.s0.x), "=r"(ret.s1.s0.s1.s1.s0.s0.y), "=r"(ret.s1.s0.s1.s1.s0.s0.z), "=r"(ret.s1.s0.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+720];" : "=r"(ret.s1.s0.s1.s1.s0.s1.x), "=r"(ret.s1.s0.s1.s1.s0.s1.y), "=r"(ret.s1.s0.s1.s1.s0.s1.z), "=r"(ret.s1.s0.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+736];" : "=r"(ret.s1.s0.s1.s1.s1.s0.x), "=r"(ret.s1.s0.s1.s1.s1.s0.y), 
"=r"(ret.s1.s0.s1.s1.s1.s0.z), "=r"(ret.s1.s0.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+752];" : "=r"(ret.s1.s0.s1.s1.s1.s1.x), "=r"(ret.s1.s0.s1.s1.s1.s1.y), "=r"(ret.s1.s0.s1.s1.s1.s1.z), "=r"(ret.s1.s0.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+768];" : "=r"(ret.s1.s1.s0.s0.s0.s0.x), "=r"(ret.s1.s1.s0.s0.s0.s0.y), "=r"(ret.s1.s1.s0.s0.s0.s0.z), "=r"(ret.s1.s1.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+784];" : "=r"(ret.s1.s1.s0.s0.s0.s1.x), "=r"(ret.s1.s1.s0.s0.s0.s1.y), "=r"(ret.s1.s1.s0.s0.s0.s1.z), "=r"(ret.s1.s1.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+800];" : "=r"(ret.s1.s1.s0.s0.s1.s0.x), "=r"(ret.s1.s1.s0.s0.s1.s0.y), "=r"(ret.s1.s1.s0.s0.s1.s0.z), "=r"(ret.s1.s1.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+816];" : "=r"(ret.s1.s1.s0.s0.s1.s1.x), "=r"(ret.s1.s1.s0.s0.s1.s1.y), "=r"(ret.s1.s1.s0.s0.s1.s1.z), "=r"(ret.s1.s1.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+832];" : "=r"(ret.s1.s1.s0.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+848];" : "=r"(ret.s1.s1.s0.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+864];" : "=r"(ret.s1.s1.s0.s1.s1.s0.x), "=r"(ret.s1.s1.s0.s1.s1.s0.y), "=r"(ret.s1.s1.s0.s1.s1.s0.z), "=r"(ret.s1.s1.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+880];" : "=r"(ret.s1.s1.s0.s1.s1.s1.x), "=r"(ret.s1.s1.s0.s1.s1.s1.y), "=r"(ret.s1.s1.s0.s1.s1.s1.z), "=r"(ret.s1.s1.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+896];" : "=r"(ret.s1.s1.s1.s0.s0.s0.x), "=r"(ret.s1.s1.s1.s0.s0.s0.y), "=r"(ret.s1.s1.s1.s0.s0.s0.z), "=r"(ret.s1.s1.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+912];" : "=r"(ret.s1.s1.s1.s0.s0.s1.x), "=r"(ret.s1.s1.s1.s0.s0.s1.y), "=r"(ret.s1.s1.s1.s0.s0.s1.z), "=r"(ret.s1.s1.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+928];" : "=r"(ret.s1.s1.s1.s0.s1.s0.x), "=r"(ret.s1.s1.s1.s0.s1.s0.y), "=r"(ret.s1.s1.s1.s0.s1.s0.z), "=r"(ret.s1.s1.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+944];" : "=r"(ret.s1.s1.s1.s0.s1.s1.x), "=r"(ret.s1.s1.s1.s0.s1.s1.y), "=r"(ret.s1.s1.s1.s0.s1.s1.z), "=r"(ret.s1.s1.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+960];" : "=r"(ret.s1.s1.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+976];" : "=r"(ret.s1.s1.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+992];" : "=r"(ret.s1.s1.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+1008];" : "=r"(ret.s1.s1.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ ulonglong2 __ldg2(const ulonglong2 *ptr) +{ + ulonglong2 ret; + 
asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); +return ret; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + ulonglong4 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ ulonglong2to8 __ldg2to8(const ulonglong2to8 *ptr) +{ + ulonglong2to8 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.l0.x), "=l"(ret.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.l1.x), "=l"(ret.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.l2.x), "=l"(ret.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.l3.x), "=l"(ret.l3.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ ulonglong8to16 __ldg8to16(const ulonglong8to16 *ptr) +{ + ulonglong8to16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.lo.l0.x), "=l"(ret.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.lo.l1.x), "=l"(ret.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.lo.l2.x), "=l"(ret.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.lo.l3.x), "=l"(ret.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.hi.l0.x), "=l"(ret.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.hi.l1.x), "=l"(ret.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.hi.l2.x), "=l"(ret.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.hi.l3.x), "=l"(ret.hi.l3.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ ulonglonglong __ldgxtralong(const ulonglonglong *ptr) +{ + ulonglonglong ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.lo.l0.x), "=l"(ret.s0.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s0.lo.l1.x), "=l"(ret.s0.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s0.lo.l2.x), "=l"(ret.s0.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s0.lo.l3.x), "=l"(ret.s0.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s0.hi.l0.x), "=l"(ret.s0.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s0.hi.l1.x), "=l"(ret.s0.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s0.hi.l2.x), "=l"(ret.s0.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s0.hi.l3.x), "=l"(ret.s0.hi.l3.y) : __LDG_PTR(ptr)); + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s1.lo.l0.x), "=l"(ret.s1.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s1.lo.l1.x), "=l"(ret.s1.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.s1.lo.l2.x), "=l"(ret.s1.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.s1.lo.l3.x), "=l"(ret.s1.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.s1.hi.l0.x), "=l"(ret.s1.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.s1.hi.l1.x), "=l"(ret.s1.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 
{%0,%1}, [%2+224];" : "=l"(ret.s1.hi.l2.x), "=l"(ret.s1.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.s1.hi.l3.x), "=l"(ret.s1.hi.l3.y) : __LDG_PTR(ptr)); + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+256];" : "=l"(ret.s2.lo.l0.x), "=l"(ret.s2.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+272];" : "=l"(ret.s2.lo.l1.x), "=l"(ret.s2.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+288];" : "=l"(ret.s2.lo.l2.x), "=l"(ret.s2.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+304];" : "=l"(ret.s2.lo.l3.x), "=l"(ret.s2.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+320];" : "=l"(ret.s2.hi.l0.x), "=l"(ret.s2.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+336];" : "=l"(ret.s2.hi.l1.x), "=l"(ret.s2.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+352];" : "=l"(ret.s2.hi.l2.x), "=l"(ret.s2.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+368];" : "=l"(ret.s2.hi.l3.x), "=l"(ret.s2.hi.l3.y) : __LDG_PTR(ptr)); + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+384];" : "=l"(ret.s3.lo.l0.x), "=l"(ret.s3.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+400];" : "=l"(ret.s3.lo.l1.x), "=l"(ret.s3.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+416];" : "=l"(ret.s3.lo.l2.x), "=l"(ret.s3.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+432];" : "=l"(ret.s3.lo.l3.x), "=l"(ret.s3.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+448];" : "=l"(ret.s3.hi.l0.x), "=l"(ret.s3.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+464];" : "=l"(ret.s3.hi.l1.x), "=l"(ret.s3.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+480];" : "=l"(ret.s3.hi.l2.x), "=l"(ret.s3.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+496];" : "=l"(ret.s3.hi.l3.x), "=l"(ret.s3.hi.l3.y) : __LDG_PTR(ptr)); + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+512];" : "=l"(ret.s4.lo.l0.x), "=l"(ret.s4.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+528];" : "=l"(ret.s4.lo.l1.x), "=l"(ret.s4.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+544];" : "=l"(ret.s4.lo.l2.x), "=l"(ret.s4.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+560];" : "=l"(ret.s4.lo.l3.x), "=l"(ret.s4.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+576];" : "=l"(ret.s4.hi.l0.x), "=l"(ret.s4.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+592];" : "=l"(ret.s4.hi.l1.x), "=l"(ret.s4.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+608];" : "=l"(ret.s4.hi.l2.x), "=l"(ret.s4.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+624];" : "=l"(ret.s4.hi.l3.x), "=l"(ret.s4.hi.l3.y) : __LDG_PTR(ptr)); + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+640];" : "=l"(ret.s5.lo.l0.x), "=l"(ret.s5.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+656];" : "=l"(ret.s5.lo.l1.x), "=l"(ret.s5.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+672];" : "=l"(ret.s5.lo.l2.x), "=l"(ret.s5.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+688];" : "=l"(ret.s5.lo.l3.x), "=l"(ret.s5.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+704];" : "=l"(ret.s5.hi.l0.x), "=l"(ret.s5.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+720];" : "=l"(ret.s5.hi.l1.x), "=l"(ret.s5.hi.l1.y) : __LDG_PTR(ptr)); + 
asm("ld.global.nc.v2.u64 {%0,%1}, [%2+736];" : "=l"(ret.s5.hi.l2.x), "=l"(ret.s5.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+752];" : "=l"(ret.s5.hi.l3.x), "=l"(ret.s5.hi.l3.y) : __LDG_PTR(ptr)); + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+768];" : "=l"(ret.s6.lo.l0.x), "=l"(ret.s6.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+784];" : "=l"(ret.s6.lo.l1.x), "=l"(ret.s6.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+800];" : "=l"(ret.s6.lo.l2.x), "=l"(ret.s6.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+816];" : "=l"(ret.s6.lo.l3.x), "=l"(ret.s6.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+832];" : "=l"(ret.s6.hi.l0.x), "=l"(ret.s6.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+848];" : "=l"(ret.s6.hi.l1.x), "=l"(ret.s6.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+864];" : "=l"(ret.s6.hi.l2.x), "=l"(ret.s6.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+880];" : "=l"(ret.s6.hi.l3.x), "=l"(ret.s6.hi.l3.y) : __LDG_PTR(ptr)); + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+896];" : "=l"(ret.s7.lo.l0.x), "=l"(ret.s7.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+912];" : "=l"(ret.s7.lo.l1.x), "=l"(ret.s7.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+928];" : "=l"(ret.s7.lo.l2.x), "=l"(ret.s7.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+944];" : "=l"(ret.s7.lo.l3.x), "=l"(ret.s7.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+960];" : "=l"(ret.s7.hi.l0.x), "=l"(ret.s7.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+976];" : "=l"(ret.s7.hi.l1.x), "=l"(ret.s7.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+992];" : "=l"(ret.s7.hi.l2.x), "=l"(ret.s7.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+1008];" : "=l"(ret.s7.hi.l3.x), "=l"(ret.s7.hi.l3.y) : __LDG_PTR(ptr)); + + + + return ret; +} + + +static __device__ __inline__ ulonglong16 __ldg64(const ulonglong2 *ptr) +{ + ulonglong16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s8.x), "=l"(ret.s8.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s9.x), "=l"(ret.s9.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.sa.x), "=l"(ret.sa.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.sb.x), "=l"(ret.sb.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.sc.x), "=l"(ret.sc.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.sd.x), "=l"(ret.sd.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : 
"=l"(ret.se.x), "=l"(ret.se.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.sf.x), "=l"(ret.sf.y) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ ulonglong16 __ldg64b(const ulonglong16 *ptr) +{ + ulonglong16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s8.x), "=l"(ret.s8.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s9.x), "=l"(ret.s9.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.sa.x), "=l"(ret.sa.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.sb.x), "=l"(ret.sb.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.sc.x), "=l"(ret.sc.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.sd.x), "=l"(ret.sd.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.se.x), "=l"(ret.se.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.sf.x), "=l"(ret.sf.y) : __LDG_PTR(ptr)); + return ret; +} + + + +static __device__ __inline__ ulonglong16 __ldg64b(const uint32 *ptr) +{ + ulonglong16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s8.x), "=l"(ret.s8.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s9.x), "=l"(ret.s9.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.sa.x), "=l"(ret.sa.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.sb.x), "=l"(ret.sb.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.sc.x), "=l"(ret.sc.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.sd.x), "=l"(ret.sd.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.se.x), "=l"(ret.se.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.sf.x), "=l"(ret.sf.y) : __LDG_PTR(ptr)); + return ret; +} + + + 
+static __forceinline__ __device__ uint8 swapvec(const uint8 &buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + return vec; +} + + +static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + vec.s8 = cuda_swab32(buf[0].s8); + vec.s9 = cuda_swab32(buf[0].s9); + vec.sa = cuda_swab32(buf[0].sa); + vec.sb = cuda_swab32(buf[0].sb); + vec.sc = cuda_swab32(buf[0].sc); + vec.sd = cuda_swab32(buf[0].sd); + vec.se = cuda_swab32(buf[0].se); + vec.sf = cuda_swab32(buf[0].sf); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 &buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + vec.s8 = cuda_swab32(buf.s8); + vec.s9 = cuda_swab32(buf.s9); + vec.sa = cuda_swab32(buf.sa); + vec.sb = cuda_swab32(buf.sb); + vec.sc = cuda_swab32(buf.sc); + vec.sd = cuda_swab32(buf.sd); + vec.se = cuda_swab32(buf.se); + vec.sf = cuda_swab32(buf.sf); + return vec; +} + +#endif // #ifndef CUDA_VECTOR_H diff --git a/neoscrypt/neoscrypt.c b/neoscrypt/neoscrypt.c new file mode 100644 index 0000000..59150ed --- /dev/null +++ b/neoscrypt/neoscrypt.c @@ -0,0 +1,992 @@ +/* + * Copyright (c) 2009 Colin Percival, 2011 ArtForz + * Copyright (c) 2012 Andrew Moon (floodyberry) + * Copyright (c) 2012 Samuel Neves + * Copyright (c) 2014 John Doering + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "neoscrypt.h"
+
+
+#if (WINDOWS)
+/* sizeof(unsigned long) = 4 for MinGW64 */
+typedef unsigned long long ulong;
+#else
+typedef unsigned long ulong;
+#endif
+typedef unsigned int uint;
+typedef unsigned char uchar;
+typedef unsigned int bool;
+
+
+#define MIN(a, b) ((a) < (b) ? a : b)
+#define MAX(a, b) ((a) > (b) ? a : b)
+
+
+/* SHA-256 */
+
+static const uint32_t sha256_constants[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) (((x | y) & z) | (x & y))
+#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
+#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
+#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3))
+#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10))
+#define W0(in,i) (U8TO32_BE(&in[i * 4]))
+#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])
+#define STEP(i) \
+ t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
+ t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \
+ r[7] = r[6]; \
+ r[6] = r[5]; \
+ r[5] = r[4]; \
+ r[4] = r[3] + t0; \
+ r[3] = r[2]; \
+ r[2] = r[1]; \
+ r[1] = r[0]; \
+ r[0] = t0 + t1;
+
+
+typedef struct sha256_hash_state_t {
+ uint32_t H[8];
+ uint64_t T;
+ uint32_t leftover;
+ uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} sha256_hash_state;
+
+
+static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks)
+{
+ uint32_t r[8], w[64], t0, t1;
+ size_t i;
+
+ for (i = 0; i < 8; i++)
+  r[i] = S->H[i];
+
+ while (blocks--) {
+  for (i = 0U; i < 16; i++) {
+   w[i] = W0(in, i);
+  }
+  for (i = 16; i < 64; i++) {
+   w[i] = W1(i);
+  }
+  for (i = 0U; i < 64; i++) {
+   STEP(i);
+  }
+  for (i = 0U; i < 8U; i++) {
+   r[i] += S->H[i];
+   S->H[i] = r[i];
+  }
+  S->T += SCRYPT_HASH_BLOCK_SIZE * 8;
+  in += SCRYPT_HASH_BLOCK_SIZE;
+ }
+}
+
+static void neoscrypt_hash_init_sha256(sha256_hash_state *S)
+{
+ S->H[0] = 0x6a09e667;
+ S->H[1] = 0xbb67ae85;
+ S->H[2] = 0x3c6ef372;
+ S->H[3] = 0xa54ff53a;
+ S->H[4] = 0x510e527f;
+ S->H[5] = 0x9b05688c;
+ S->H[6] = 0x1f83d9ab;
+ S->H[7] = 0x5be0cd19;
+ S->T = 0;
+ S->leftover = 0;
+}
+
+static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen)
+{
+ size_t
blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) +{ + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen) +{ + sha256_hash_state st; + neoscrypt_hash_init_sha256(&st); + neoscrypt_hash_update_sha256(&st, m, mlen); + neoscrypt_hash_finish_sha256(&st, hash); +} + + +/* HMAC for SHA-256 */ + +typedef struct sha256_hmac_state_t { + sha256_hash_state inner, outer; +} sha256_hmac_state; + +static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) +{ + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + neoscrypt_hash_init_sha256(&st->inner); + neoscrypt_hash_init_sha256(&st->outer); + + if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + neoscrypt_hash_sha256(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen) +{ + /* h(inner || m...) 
*/ + neoscrypt_hash_update_sha256(&st->inner, m, mlen); +} + +static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) +{ + /* h(inner || m) */ + hash_digest innerhash; + neoscrypt_hash_finish_sha256(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash)); + neoscrypt_hash_finish_sha256(&st->outer, mac); +} + + +/* PBKDF2 for SHA-256 */ + +static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) +{ + sha256_hmac_state hmac_pw, hmac_pw_salt, work; + hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, k, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for(i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + neoscrypt_hmac_update_sha256(&work, be, 4); + neoscrypt_hmac_finish_sha256(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for(j = 0; j < N - 1; j++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_finish_sha256(&work, u); + + /* T[i] ^= UX */ + for(k = 0; k < sizeof(u); k++) + ti[k] ^= u[k]; + } + + memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len); + output += SCRYPT_HASH_DIGEST_SIZE; + output_len -= SCRYPT_HASH_DIGEST_SIZE; + } +} + + +/* NeoScrypt */ + +#if defined(ASM) + +extern void neoscrypt_salsa(uint *X, uint rounds); +extern void neoscrypt_salsa_tangle(uint *X, uint count); +extern void neoscrypt_chacha(uint *X, uint rounds); + +extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len); +extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len); +extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len); + +#else + +/* Salsa20, rounds must be a multiple of 2 */ +static void neoscrypt_salsa(uint *X, uint rounds) +{ + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a, b, c, d) \ + t = a + d; t = ROTL32(t, 7); b ^= t; \ + t = b + a; t = ROTL32(t, 9); c ^= t; \ + t = c + b; t = ROTL32(t, 13); d ^= t; \ + t = d + c; t = ROTL32(t, 18); a ^= t; + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x5, x9, x13, x1); + quarter(x10, x14, x2, x6); + quarter(x15, x3, x7, x11); + quarter( x0, x1, x2, x3); + quarter( x5, x6, x7, x4); + quarter(x10, x11, x8, x9); + quarter(x15, x12, x13, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + +/* ChaCha20, rounds must be a multiple of 2 */ +static void neoscrypt_chacha(uint *X, uint rounds) +{ + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, 
x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a,b,c,d) \ + a += b; t = d ^ a; d = ROTL32(t, 16); \ + c += d; t = b ^ c; b = ROTL32(t, 12); \ + a += b; t = d ^ a; d = ROTL32(t, 8); \ + c += d; t = b ^ c; b = ROTL32(t, 7); + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x1, x5, x9, x13); + quarter( x2, x6, x10, x14); + quarter( x3, x7, x11, x15); + quarter( x0, x5, x10, x15); + quarter( x1, x6, x11, x12); + quarter( x2, x7, x8, x13); + quarter( x3, x4, x9, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + + +/* Fast 32-bit / 64-bit memcpy(); + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) +{ + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] = src[i]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 2]; + dst[i + 3] = src[i + 3]; + } +} + +/* Fast 32-bit / 64-bit block swapper; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) +{ + ulong *blkA = (ulong *) blkAp; + ulong *blkB = (ulong *) blkBp; + register ulong t0, t1, t2, t3; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + t0 = blkA[i]; + t1 = blkA[i + 1]; + t2 = blkA[i + 2]; + t3 = blkA[i + 3]; + blkA[i] = blkB[i]; + blkA[i + 1] = blkB[i + 1]; + blkA[i + 2] = blkB[i + 2]; + blkA[i + 3] = blkB[i + 3]; + blkB[i] = t0; + blkB[i + 1] = t1; + blkB[i + 2] = t2; + blkB[i + 3] = t3; + } +} + +/* Fast 32-bit / 64-bit block XOR engine; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) +{ + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for (i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] ^= src[i]; + dst[i + 1] ^= src[i + 1]; + dst[i + 2] ^= src[i + 2]; + dst[i + 3] ^= src[i + 3]; + } +} + +#endif + +/* 32-bit / 64-bit optimised memcpy() */ +static void neoscrypt_copy(void *dstp, const void *srcp, uint len) +{ + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] = srcb[i]; + } +} + +/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ +static void neoscrypt_erase(void *dstp, uint len) +{ + const ulong null = 0; + ulong *dst = (ulong *) dstp; + uint i, tail; + + for (i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = null; + + tail = len & (sizeof(ulong) - 1); + if (tail) { + uchar *dstb = (uchar *) dstp; + + for(i = len - tail; i < len; i++) + dstb[i] = (uchar)null; + } +} + +/* 32-bit / 64-bit optimised XOR engine */ +static void neoscrypt_xor(void *dstp, const void *srcp, uint len) +{ + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for (i = 0; i < (len / sizeof(ulong)); i++) + dst[i] ^= src[i]; + + tail = len & (sizeof(ulong) - 1); + if (tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < 
len; i++) + dstb[i] ^= srcb[i]; + } +} + + +/* BLAKE2s */ + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +/* Parameter block of 32 bytes */ +typedef struct blake2s_param_t { + uchar digest_length; + uchar key_length; + uchar fanout; + uchar depth; + uint leaf_length; + uchar node_offset[6]; + uchar node_depth; + uchar inner_length; + uchar salt[8]; + uchar personal[8]; +} blake2s_param; + +/* State block of 180 bytes */ +typedef struct blake2s_state_t { + uint h[8]; + uint t[2]; + uint f[2]; + uchar buf[2 * BLAKE2S_BLOCK_SIZE]; + uint buflen; +} blake2s_state; + +static const uint blake2s_IV[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +static void blake2s_compress(blake2s_state *S, const uint *buf) +{ + uint i; + uint m[16]; + uint v[16]; + + neoscrypt_copy(m, buf, 64); + neoscrypt_copy(v, S, 32); + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; + +#define G(r,i,a,b,c,d) do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = ROTR32(d ^ a, 16); \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = ROTR32(d ^ a, 8); \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ +} while(0) + +#define ROUND(r) do { \ + G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[ 2], v[ 6], v[10], v[14]); \ + G(r, 3, v[ 3], v[ 7], v[11], v[15]); \ + G(r, 4, v[ 0], v[ 5], v[10], v[15]); \ + G(r, 5, v[ 1], v[ 6], v[11], v[12]); \ + G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ +} while(0) + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + for (i = 0; i < 8; i++) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND +} + +static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) +{ + uint left, fill; + + while(input_size > 0) { + left = S->buflen; + fill = 2 * BLAKE2S_BLOCK_SIZE - left; + if(input_size > fill) { + /* Buffer fill */ + neoscrypt_copy(S->buf + left, input, fill); + S->buflen += fill; + /* Counter increment */ + S->t[0] += BLAKE2S_BLOCK_SIZE; + /* Compress */ + blake2s_compress(S, (uint *) S->buf); + /* Shift buffer left */ + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE); + S->buflen -= BLAKE2S_BLOCK_SIZE; + input += fill; + input_size -= fill; + } else { + neoscrypt_copy(S->buf + left, input, input_size); + S->buflen += input_size; + /* Do not compress */ + input += input_size; + input_size = 0; + } + } +} + +static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const 
uchar key_size, + void *output, const uchar output_size) +{ + uchar block[BLAKE2S_BLOCK_SIZE]; + blake2s_param P[1]; + blake2s_state S[1]; + + /* Initialise */ + neoscrypt_erase(P, 32); + P->digest_length = output_size; + P->key_length = key_size; + P->fanout = 1; + P->depth = 1; + + neoscrypt_erase(S, 180); + neoscrypt_copy(S, blake2s_IV, 32); + neoscrypt_xor(S, P, 32); + + neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); + neoscrypt_copy(block, key, key_size); + blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); + + /* Update */ + blake2s_update(S, (uchar *) input, input_size); + + /* Finish */ + if(S->buflen > BLAKE2S_BLOCK_SIZE) { + S->t[0] += BLAKE2S_BLOCK_SIZE; + blake2s_compress(S, (uint *) S->buf); + S->buflen -= BLAKE2S_BLOCK_SIZE; + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); + } + S->t[0] += S->buflen; + S->f[0] = ~0U; + neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); + blake2s_compress(S, (uint *) S->buf); + + /* Write back */ + neoscrypt_copy(output, S, output_size); + + //for (int k = 0; k<4; k++) { printf("cpu blake %d %08x %08x\n", k, ((unsigned int*)output)[2 * k], ((unsigned int*)output)[2 * k + 1]); } +} + + +#define FASTKDF_BUFFER_SIZE 256U + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +static void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len, + uint N, uchar *output, uint output_len) +{ + //for (int i = 0; i<10; i++) { printf("cpu password %d %08x %08x\n", i, ((unsigned int*)password)[2 * i], ((unsigned int*)password)[2 * i+1]); } + const uint stack_align = 0x40; + const uint kdf_buf_size = 256U; //FASTKDF_BUFFER_SIZE + const uint prf_input_size = 64U; //BLAKE2S_BLOCK_SIZE + const uint prf_key_size = 32U; //BLAKE2S_KEY_SIZE + const uint prf_output_size = 32U; //BLAKE2S_OUT_SIZE + uint bufptr, a, b, i, j; + uchar *A, *B, *prf_input, *prf_key, *prf_output; + uchar *stack; + stack = (uchar*)malloc(sizeof(uchar) * 2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align); + /* Align and set up the buffers in stack */ + //uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align]; + + A = &stack[stack_align & ~(stack_align - 1)]; + B = &A[kdf_buf_size + prf_input_size]; + prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size]; + + /* Initialise the password buffer */ + if(password_len > kdf_buf_size) + password_len = kdf_buf_size; + + a = kdf_buf_size / password_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&A[i * password_len], &password[0], password_len); + b = kdf_buf_size - a * password_len; + if(b) + neoscrypt_copy(&A[a * password_len], &password[0], b); + neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size); + + /* Initialise the salt buffer */ + if(salt_len > kdf_buf_size) + salt_len = kdf_buf_size; + + a = kdf_buf_size / salt_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len); + b = kdf_buf_size - a * salt_len; + if(b) + neoscrypt_copy(&B[a * salt_len], &salt[0], b); + neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size); + + /* The primary iteration */ + for(i = 0, bufptr = 0; i < N; i++) { + + /* Map the PRF input buffer */ + prf_input = &A[bufptr]; + + /* Map the PRF key buffer */ + prf_key = &B[bufptr]; + + /* PRF */ + // for (int k = 0; k<(prf_input_size/4); k++) { 
printf("cpu bufptr %08x before blake %d %d %08x \n",bufptr, i, k, ((unsigned int*)prf_input)[k]); } + neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size); + // for (int k = 0; k<(prf_output_size/4); k++) { printf("cpu after blake %d %d %08x \n", i, k, ((unsigned int*)prf_output)[k]); } + + /* Calculate the next buffer pointer */ + for(j = 0, bufptr = 0; j < prf_output_size; j++) + bufptr += prf_output[j]; + bufptr &= (kdf_buf_size - 1); + + /* Modify the salt buffer */ + neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size); + + /* Head modified, tail updated */ + if(bufptr < prf_key_size) + neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr)); + + /* Tail modified, head updated */ + if((kdf_buf_size - bufptr) < prf_output_size) + neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr)); + } + + /* Modify and copy into the output buffer */ + if(output_len > kdf_buf_size) + output_len = kdf_buf_size; + + a = kdf_buf_size - bufptr; + if(a >= output_len) { + neoscrypt_xor(&B[bufptr], &A[0], output_len); + neoscrypt_copy(&output[0], &B[bufptr], output_len); + } else { + neoscrypt_xor(&B[bufptr], &A[0], a); + neoscrypt_xor(&B[0], &A[a], output_len - a); + neoscrypt_copy(&output[0], &B[bufptr], a); + neoscrypt_copy(&output[a], &B[0], output_len - a); + } + // for (int i = 0; i<10; i++) { printf("cpu fastkdf %d %08x %08x\n", i, ((unsigned int*)output)[2 * i], ((unsigned int*)output)[2 * i + 1]); } +} + + +/* Configurable optimised block mixer */ +static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) +{ + uint i, mixer, rounds; + + mixer = mixmode >> 8; + rounds = mixmode & 0xFF; + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + + if (r == 1) { + neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + return; + } + + if (r == 2) { + neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[32], rounds); + else + neoscrypt_salsa(&X[32], rounds); + neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[48], rounds); + else + neoscrypt_salsa(&X[48], rounds); + neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); + return; + } + + /* Reference code for any reasonable r */ + for (i = 0; i < 2 * r; i++) { + if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); + else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16 * i], rounds); + else + neoscrypt_salsa(&X[16 * i], rounds); + neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); + } + for (i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE); + for (i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * 
(2 * i + 1)], SCRYPT_BLOCK_SIZE); +} + +/* NeoScrypt core engine: + * p = 1, salt = password; + * Basic customisation (required): + * profile bit 0: + * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; + * 1 = Scrypt(1024, 1, 1) with Salsa20/8; + * profile bits 4 to 1: + * 0000 = FastKDF-BLAKE2s; + * 0001 = PBKDF2-HMAC-SHA256; + * Extended customisation (optional): + * profile bit 31: + * 0 = extended customisation absent; + * 1 = extended customisation present; + * profile bits 7 to 5 (rfactor): + * 000 = r of 1; + * 001 = r of 2; + * 010 = r of 4; + * ... + * 111 = r of 128; + * profile bits 12 to 8 (Nfactor): + * 00000 = N of 2; + * 00001 = N of 4; + * 00010 = N of 8; + * ..... + * 00110 = N of 128; + * ..... + * 01001 = N of 1024; + * ..... + * 11110 = N of 2147483648; + * profile bits 30 to 13 are reserved */ +void neoscrypt(const uchar *password, uchar *output, uint profile) +{ + uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40; + uint kdf, i, j; + uint *X, *Y, *Z, *V; + + if(profile & 0x1) { + N = 1024; /* N = (1 << (Nfactor + 1)); */ + r = 1; /* r = (1 << rfactor); */ + dblmix = 0; /* Salsa only */ + mixmode = 0x08; /* 8 rounds */ + } + + if(profile >> 31) { + N = (1 << (((profile >> 8) & 0x1F) + 1)); + r = (1 << ((profile >> 5) & 0x7)); + } + uchar *stack; + stack = (uchar*)malloc(((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align)*sizeof(uchar)); + /* X = r * 2 * SCRYPT_BLOCK_SIZE */ + X = (uint *) &stack[stack_align & ~(stack_align - 1)]; + /* Z is a copy of X for ChaCha */ + Z = &X[32 * r]; + /* Y is an X sized temporal space */ + Y = &X[64 * r]; + /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ + V = &X[96 * r]; + + /* X = KDF(password, salt) */ + kdf = (profile >> 1) & 0xF; + + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, password, 80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + } + + /* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */ + + if(dblmix) { + /* blkcpy(Z, X) */ + neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* Z = SMix(Z) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, Z) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + for(i = 0; i < N; i++) { + /* integerify(Z) mod N */ + j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(Z, V) */ + neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + } + +#if (ASM) + /* Must be called before and after SSE2 Salsa */ + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + /* X = SMix(X) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, X) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + for(i = 0; i < N; i++) { + /* integerify(X) mod N */ + j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(X, V) */ + neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + +#if (ASM) + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + if(dblmix) + /* blkxor(X, Z) */ + neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* output = KDF(password, X) */ + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, (uchar 
*) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); + break; + + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + break; + } +} + diff --git a/neoscrypt/neoscrypt.h b/neoscrypt/neoscrypt.h new file mode 100644 index 0000000..5c4d4e4 --- /dev/null +++ b/neoscrypt/neoscrypt.h @@ -0,0 +1,33 @@ +#if (__cplusplus) +extern "C" { +#endif + +void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile); + +#if (__cplusplus) +} +#else + +#define SCRYPT_BLOCK_SIZE 64 +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) +#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#endif + diff --git a/util.cpp b/util.cpp index 577e87a..6c8e298 100644 --- a/util.cpp +++ b/util.cpp @@ -1777,6 +1777,9 @@ void print_hash_tests(void) myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash); + neoscrypt(&buf[0], &hash[0], 80000620); + printpfx("neoscrypt", hash); + nist5hash(&hash[0], &buf[0]); printpfx("nist5", hash);
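Note on the neoscrypt() self-test call added above: its third argument is the
profile word whose bit layout is documented in neoscrypt/neoscrypt.c.  The
literal 80000620 is decimal, so bit 31 is clear and the function falls back to
its defaults (N = 128, r = 2, dual Salsa20/20 + ChaCha20/20, FastKDF-BLAKE2s),
which happen to match the FeatherCoin profile normally written as the
hexadecimal constant 0x80000620.  A minimal decode sketch, assuming only the
bit layout described in that file (names are illustrative):

    unsigned int profile = 0x80000620;
    unsigned int N = (profile >> 31) ? (1U << (((profile >> 8) & 0x1F) + 1)) : 128;
    unsigned int r = (profile >> 31) ? (1U << ((profile >> 5) & 0x7)) : 2;
    /* 0x80000620 -> Nfactor 6 (N = 128), rfactor 1 (r = 2), FastKDF-BLAKE2s */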