diff --git a/Makefile.am b/Makefile.am
index 2ca7e6f..e6ee928 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -41,6 +41,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \
+ neoscrypt.cu neoscrypt/neoscrypt.c neoscrypt/cuda_neoscrypt.cu \
cuda_nist5.cu pentablake.cu skein.cu skein2.cu zr5.cu \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
diff --git a/README.txt b/README.txt
index a45e534..e13479c 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
-ccMiner release 1.6.2-tpruvot (Apr 2015) - "Scrypt/N/Jane algos"
+ccMiner release 1.6.3-tpruvot (May 2015) - "Neoscrypt"
---------------------------------------------------------------
***************************************************************
@@ -37,6 +37,7 @@ Deep, Doom and Qubit
Keccak (Maxcoin)
Pentablake (Blake 512 x5)
1Coin Triple S
+Neoscrypt (FeatherCoin)
Scrypt and Scrypt:N
Scrypt-Jane (Chacha)
Skein (Skein + SHA)
@@ -76,6 +77,7 @@ its command line interface and options.
lyra2 use to mine Vertcoin
mjollnir use to mine Mjollnircoin
myr-gr use to mine Myriad-Groest
+ neoscrypt use to mine FeatherCoin
nist5 use to mine TalkCoin
penta use to mine Joincoin / Pentablake
pluck use to mine Supcoin
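
Example invocation for the new algo (the pool URL and wallet are placeholders,
not a real endpoint):

	ccminer -a neoscrypt -o stratum+tcp://pool.example.com:3333 -u <wallet> -p x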
@@ -211,6 +213,9 @@ features.
>>> RELEASE HISTORY <<<
+ Not released!! v1.6.3
+ Import Neoscrypt from djm34's work
+
Apr. 21th 2015 v1.6.2
Import Scrypt, Scrypt:N and Scrypt-jane from Cudaminer
Add the --time-limit command line parameter
diff --git a/ccminer.cpp b/ccminer.cpp
index 9fa1241..3bebf98 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -98,6 +98,7 @@ enum sha_algos {
ALGO_LYRA2,
ALGO_MJOLLNIR, /* Hefty hash */
ALGO_MYR_GR,
+ ALGO_NEOSCRYPT,
ALGO_NIST5,
ALGO_PENTABLAKE,
ALGO_PLUCK,
@@ -135,6 +136,7 @@ static const char *algo_names[] = {
"lyra2",
"mjollnir",
"myr-gr",
+ "neoscrypt",
"nist5",
"penta",
"pluck",
@@ -273,6 +275,7 @@ Options:\n\
lyra2 VertCoin\n\
mjollnir Mjollnircoin\n\
myr-gr Myriad-Groestl\n\
+ neoscrypt FeatherCoin\n\
nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\
pluck SupCoin\n\
@@ -537,7 +540,7 @@ static bool work_decode(const json_t *val, struct work *work)
int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target);
int i;
- if (opt_algo == ALGO_ZR5) {
+ if (opt_algo == ALGO_NEOSCRYPT || opt_algo == ALGO_ZR5) {
data_size = 80; adata_sz = 20;
}
@@ -1241,6 +1244,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
switch (opt_algo) {
case ALGO_JACKPOT:
+ case ALGO_NEOSCRYPT:
case ALGO_PLUCK:
case ALGO_SCRYPT:
case ALGO_SCRYPT_JANE:
@@ -1472,6 +1476,7 @@ static void *miner_thread(void *userdata)
minmax = 0x400000;
break;
case ALGO_LYRA2:
+ case ALGO_NEOSCRYPT:
case ALGO_SCRYPT:
case ALGO_SCRYPT_JANE:
minmax = 0x100000;
@@ -1599,6 +1604,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
+ case ALGO_NEOSCRYPT:
+ rc = scanhash_neoscrypt(thr_id, work.data, work.target,
+ max_nonce, &hashes_done);
+ break;
+
case ALGO_NIST5:
rc = scanhash_nist5(thr_id, work.data, work.target,
max_nonce, &hashes_done);
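
Note on the work_decode change above: like ZR5, neoscrypt hashes the raw
80-byte block header (20 data words) rather than the padded 128-byte buffer.
For orientation, a sketch of the Bitcoin-style header those 20 words cover
(field names here are illustrative, not ccminer identifiers):

	struct block_header {          /* 80 bytes = 20 x uint32_t */
		uint32_t version;
		uint8_t  prev_block[32];
		uint8_t  merkle_root[32];
		uint32_t time;
		uint32_t bits;         /* compact difficulty target */
		uint32_t nonce;        /* word 19, the word scanhash iterates */
	};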
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index e05120e..9322821 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -265,6 +265,7 @@
+    <ClCompile Include="neoscrypt\neoscrypt.c" />
@@ -435,6 +436,8 @@
     <AdditionalOptions>-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
     <AdditionalOptions>-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
+    <CudaCompile Include="neoscrypt.cu" />
+    <CudaCompile Include="neoscrypt\cuda_neoscrypt.cu" />
diff --git a/miner.h b/miner.h
index 1a90a2a..2c17fec 100644
--- a/miner.h
+++ b/miner.h
@@ -324,6 +324,9 @@ extern int scanhash_lyra2(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
+extern int scanhash_neoscrypt(int thr_id, uint32_t *pdata,
+ const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
+
extern int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
@@ -689,6 +692,7 @@ unsigned int jackpothash(void *state, const void *input);
void groestlhash(void *state, const void *input);
void lyra2_hash(void *state, const void *input);
void myriadhash(void *state, const void *input);
+void neoscrypt(const uchar *password, uchar *output, uint profile);
void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input);
void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
diff --git a/neoscrypt.cu b/neoscrypt.cu
new file mode 100644
index 0000000..792812c
--- /dev/null
+++ b/neoscrypt.cu
@@ -0,0 +1,83 @@
+
+extern "C" {
+#include "neoscrypt/neoscrypt.h"
+}
+
+#include "cuda_helper.h"
+#include "miner.h"
+
+static uint32_t *d_hash[MAX_GPUS] ;
+extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget);
+extern void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t* hash);
+extern uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order);
+
+#define SHIFT 130
+
+int scanhash_neoscrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
+{
+ const uint32_t first_nonce = pdata[19];
+ const int stratum = have_stratum;
+
+ if (opt_benchmark)
+ ((uint32_t*)ptarget)[7] = 0x0000ff;
+
+ int intensity = is_windows() ? 18 : 19;
+ uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
+ throughput = throughput / 32; /* set for max intensity ~= 20 */
+ throughput = min(throughput, max_nonce - first_nonce + 1);
+
+ static bool init[MAX_GPUS] = { 0 };
+ if (!init[thr_id])
+ {
+ cudaSetDevice(device_map[thr_id]);
+ cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+
+ CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * SHIFT * sizeof(uint64_t) * throughput));
+ neoscrypt_cpu_init(thr_id, throughput, d_hash[thr_id]);
+
+ applog(LOG_INFO, "Using %d cuda threads", throughput);
+
+ init[thr_id] = true;
+ }
+
+ uint32_t endiandata[20];
+ if (stratum) {
+ for (int k = 0; k < 20; k++)
+ be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+ } else {
+ for (int k = 0; k < 20; k++)
+ endiandata[k] = pdata[k];
+ }
+
+ neoscrypt_setBlockTarget(endiandata,ptarget);
+
+ do {
+ uint32_t foundNonce = neoscrypt_cpu_hash_k4(stratum, thr_id, throughput, pdata[19], 0);
+ if (foundNonce != UINT32_MAX)
+ {
+ uint32_t _ALIGN(64) vhash64[8];
+
+ *hashes_done = pdata[19] - first_nonce + 1;
+
+ if (stratum) {
+ be32enc(&endiandata[19], foundNonce);
+ } else {
+ endiandata[19] = foundNonce;
+ }
+ neoscrypt((uchar*) endiandata, (uchar*)vhash64, 0x80000620);
+
+ if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
+ pdata[19] = foundNonce;
+ return 1;
+ } else {
+ applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+ }
+ }
+
+ pdata[19] += throughput;
+
+ } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));
+
+ *hashes_done = pdata[19] - first_nonce + 1;
+ return 0;
+}
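
The cudaMalloc above reserves 32 * SHIFT * sizeof(uint64_t) = 33,280 bytes of
scratchpad per thread (130 cells of 256 bytes), which is presumably why the
intensity-derived throughput is divided by 32. A minimal sketch of the sizing
(helper name is hypothetical):

	/* bytes of scratchpad implied by a given thread count */
	size_t neoscrypt_scratchpad_bytes(uint32_t throughput)
	{
		return (size_t)32 * 130 * sizeof(uint64_t) * throughput; /* SHIFT = 130 */
	}
	/* e.g. 1U << 14 threads -> ~545 MB, 1U << 13 -> ~273 MB */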
diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu
new file mode 100644
index 0000000..652d3ca
--- /dev/null
+++ b/neoscrypt/cuda_neoscrypt.cu
@@ -0,0 +1,607 @@
+#include <stdio.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+#include "cuda_vectors.h"
+
+__device__ uint4 *W;
+uint32_t *d_NNonce[MAX_GPUS];
+__constant__ uint32_t pTarget[8];
+__constant__ uint32_t key_init[16];
+__constant__ uint32_t input_init[16];
+__constant__ uint32_t c_data[80];
+
+
+#define SALSA_SMALL_UNROLL 1
+#define CHACHA_SMALL_UNROLL 1
+#define BLAKE2S_BLOCK_SIZE 64U
+#define BLAKE2S_OUT_SIZE 32U
+#define BLAKE2S_KEY_SIZE 32U
+#define BLOCK_SIZE 64U
+#define FASTKDF_BUFFER_SIZE 256U
+#define PASSWORD_LEN 80U
+/// constants ///
+
+static const __constant__ uint8 BLAKE2S_IV_Vec =
+ {
+ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+ 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+ };
+
+
+static const uint8 BLAKE2S_IV_Vechost =
+{
+ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+ 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+};
+
+static const uint32_t BLAKE2S_SIGMA_host[10][16] =
+{
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+__constant__ uint32_t BLAKE2S_SIGMA[10][16];
+
+// Blake2S
+
+#define BLAKE2S_BLOCK_SIZE 64U
+#define BLAKE2S_OUT_SIZE 32U
+#define BLAKE2S_KEY_SIZE 32U
+
+#if __CUDA_ARCH__ >= 500
+#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \
+ idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \
+ a += b; d = __byte_perm(d^a,0, 0x1032); \
+ c += d; b = rotateR(b^c, 12); \
+ idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \
+ a += b; d = __byte_perm(d^a,0, 0x0321); \
+ c += d; b = rotateR(b^c, 7); \
+}
+#else
+#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \
+ idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \
+ a += b; d = rotate(d^a,16); \
+ c += d; b = rotateR(b^c, 12); \
+ idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \
+ a += b; d = rotateR(d^a,8); \
+ c += d; b = rotateR(b^c, 7); \
+}
+#endif
+
+#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+
+#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \
+ idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \
+ a += b; d = ROTR32(d^a,16); \
+ c += d; b = ROTR32(b^c, 12); \
+ idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \
+ a += b; d = ROTR32(d^a,8); \
+ c += d; b = ROTR32(b^c, 7); \
+}
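+
+/* For reference, one textbook Blake2s G evaluation, which BLAKE_G and
+   BLAKE_Ghost unroll with the message-word add hoisted in front:
+     a = a + b + m[sigma[r][2*i]];   d = rotr32(d ^ a, 16);
+     c = c + d;                      b = rotr32(b ^ c, 12);
+     a = a + b + m[sigma[r][2*i+1]]; d = rotr32(d ^ a, 8);
+     c = c + d;                      b = rotr32(b ^ c, 7);
+   On sm_50+ the 16-bit and 8-bit rotates become __byte_perm shuffles. */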
+
+
+static __forceinline__ __device__ void Blake2S(uint32_t * inout, const uint32_t * TheKey)
+{
+ uint16 V;
+ uint32_t idx;
+ uint8 tmpblock;
+
+ V.hi = BLAKE2S_IV_Vec;
+ V.lo = BLAKE2S_IV_Vec;
+ V.lo.s0 ^= 0x01012020;
+
+ // Copy input block for later
+ tmpblock = V.lo;
+
+ V.hi.s4 ^= BLAKE2S_BLOCK_SIZE;
+
+ for (int x = 0; x < 10; ++x)
+ {
+ BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey);
+ BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey);
+ BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey);
+ BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey);
+ BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey);
+ BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey);
+ BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey);
+ BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey);
+ }
+
+ V.lo ^= V.hi;
+ V.lo ^= tmpblock;
+
+ V.hi = BLAKE2S_IV_Vec;
+ tmpblock = V.lo;
+
+ V.hi.s4 ^= 128;
+ V.hi.s6 = ~V.hi.s6;
+
+ for (int x = 0; x < 10; ++x)
+ {
+ BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout);
+ BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout);
+ BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout);
+ BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout);
+ BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout);
+ BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout);
+ BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout);
+ BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout);
+ }
+
+ V.lo ^= V.hi ^ tmpblock;
+
+ ((uint8*)inout)[0]=V.lo;
+
+}
+
+static __forceinline__ __host__ void Blake2Shost(uint32_t * inout, const uint32_t * inkey)
+{
+ uint16 V;
+ uint32_t idx;
+ uint8 tmpblock;
+
+ V.hi = BLAKE2S_IV_Vechost;
+ V.lo = BLAKE2S_IV_Vechost;
+ V.lo.s0 ^= 0x01012020;
+
+ // Copy input block for later
+ tmpblock = V.lo;
+
+ V.hi.s4 ^= BLAKE2S_BLOCK_SIZE;
+
+ for (int x = 0; x < 10; ++x)
+ {
+ BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey);
+ BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey);
+ BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey);
+ BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey);
+ BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey);
+ BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey);
+ BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey);
+ BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey);
+ }
+
+ V.lo ^= V.hi;
+ V.lo ^= tmpblock;
+
+ V.hi = BLAKE2S_IV_Vechost;
+ tmpblock = V.lo;
+
+ V.hi.s4 ^= 128;
+ V.hi.s6 = ~V.hi.s6;
+
+ for (int x = 0; x < 10; ++x)
+ {
+ BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout);
+ BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout);
+ BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout);
+ BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout);
+ BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout);
+ BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout);
+ BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout);
+ BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout);
+ }
+
+ V.lo ^= V.hi ^ tmpblock;
+
+ ((uint8*)inout)[0] = V.lo;
+}
+
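+
+/* fastkdf256 below is NeoScrypt's FastKDF with a 256-byte output. A holds
+   the 256-byte password buffer plus a 64-byte mirror, B the 256-byte salt
+   buffer plus a 32-byte mirror. Each of the 32 rounds Blake2S-hashes a
+   64-byte window of B, then XORs the 32-byte state into B at a byte
+   offset given by the byte sum of the state (bufidx, a uint8_t, so the
+   offset wraps mod 256); shift256R2 handles the sub-word bit offset and
+   the mirror copies keep the wrap-around window reads linear. */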
+static __forceinline__ __device__ void fastkdf256(const uint32_t* password, uint8_t* output)
+{
+ uint8_t bufidx = 0;
+ uchar4 bufhelper;
+ uint8_t A[320],B[288];
+
+ ((uintx64*)A)[0] = ((uintx64*)password)[0];
+ ((uint816 *)A)[4] = ((uint816 *)password)[0];
+
+ ((uintx64*)B)[0] = ((uintx64*)password)[0];
+ ((uint48 *)B)[8] = ((uint48 *)password)[0];
+
+ uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 };
+
+ ((uint816*)input)[0] = ((uint816*)input_init)[0];
+ ((uint48*)key)[0] = ((uint48*)key_init)[0];
+
+ for (int i = 0; i < 32; ++i)
+ {
+ bufhelper = ((uchar4*)input)[0];
+ for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x)
+ bufhelper += ((uchar4*)input)[x];
+ bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;
+
+ int qbuf = bufidx/4;
+ int rbuf = bufidx&3;
+ int bitbuf = rbuf << 3;
+ uint32_t shifted[9];
+
+ shift256R2(shifted, ((uint8*)input)[0], bitbuf);
+
+ for (int k = 0; k < 9; ++k) {
+ ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
+ }
+
+ if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];}
+ else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}
+
+ if (i<31) {
+ /* reload the next 64-byte input and 32-byte key from B at offset
+    bufidx, then chain: Blake2S(input, key); (bodies elided) */
+ }
+ }
+
+ /* final pass (elided): XOR A into B at the final offset to form the
+    256-byte output */
+}
+
+/* fastkdf32(), the 32-byte-output variant used by the last kernel, is
+   defined at this point (elided). */
+
+/* standard Salsa20 double round (column round, then row round) */
+#define SALSA_CORE(state) { \
+ state.s4 ^= rotate(state.s0 + state.sc, 7); state.s8 ^= rotate(state.s4 + state.s0, 9); state.sc ^= rotate(state.s8 + state.s4, 13); state.s0 ^= rotate(state.sc + state.s8, 18); \
+ state.s9 ^= rotate(state.s5 + state.s1, 7); state.sd ^= rotate(state.s9 + state.s5, 9); state.s1 ^= rotate(state.sd + state.s9, 13); state.s5 ^= rotate(state.s1 + state.sd, 18); \
+ state.se ^= rotate(state.sa + state.s6, 7); state.s2 ^= rotate(state.se + state.sa, 9); state.s6 ^= rotate(state.s2 + state.se, 13); state.sa ^= rotate(state.s6 + state.s2, 18); \
+ state.s3 ^= rotate(state.sf + state.sb, 7); state.s7 ^= rotate(state.s3 + state.sf, 9); state.sb ^= rotate(state.s7 + state.s3, 13); state.sf ^= rotate(state.sb + state.s7, 18); \
+ state.s1 ^= rotate(state.s0 + state.s3, 7); state.s2 ^= rotate(state.s1 + state.s0, 9); state.s3 ^= rotate(state.s2 + state.s1, 13); state.s0 ^= rotate(state.s3 + state.s2, 18); \
+ state.s6 ^= rotate(state.s5 + state.s4, 7); state.s7 ^= rotate(state.s6 + state.s5, 9); state.s4 ^= rotate(state.s7 + state.s6, 13); state.s5 ^= rotate(state.s4 + state.s7, 18); \
+ state.sb ^= rotate(state.sa + state.s9, 7); state.s8 ^= rotate(state.sb + state.sa, 9); state.s9 ^= rotate(state.s8 + state.sb, 13); state.sa ^= rotate(state.s9 + state.s8, 18); \
+ state.sc ^= rotate(state.sf + state.se, 7); state.sd ^= rotate(state.sc + state.sf, 9); state.se ^= rotate(state.sd + state.sc, 13); state.sf ^= rotate(state.se + state.sd, 18); \
+}
+
+#if __CUDA_ARCH__ >= 500
+#define CHACHA_STEP(a,b,c,d) { \
+ a += b; d = __byte_perm(d^a,0,0x1032); \
+ c += d; b = rotate(b^c, 12); \
+ a += b; d = __byte_perm(d^a,0,0x2103); \
+ c += d; b = rotate(b^c, 7); \
+}
+#else
+#define CHACHA_STEP(a,b,c,d) { \
+ a += b; d = rotate(d^a,16); \
+ c += d; b = rotate(b^c, 12); \
+ a += b; d = rotate(d^a,8); \
+ c += d; b = rotate(b^c, 7); \
+}
+#endif
+
+#define CHACHA_CORE_PARALLEL(state) { \
+ CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \
+ CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \
+ CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \
+ CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \
+ CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \
+ CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \
+ CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \
+ CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \
+}
+
+
+static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
+{
+ uint16 state = X;
+ uint32_t t;
+
+ for (int i = 0; i < 10; ++i) { SALSA_CORE(state);}
+
+ return(X + state);
+}
+
+static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X)
+{
+ uint16 st = X;
+
+ for (int i = 0; i < 10; ++i) {CHACHA_CORE_PARALLEL(st);}
+ return(X + st);
+}
+
+static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV)
+{
+ XV[0] ^= XV[3];
+ uint16 temp;
+
+ XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0];
+ temp = chacha_small_parallel_rnd(XV[1]); XV[2] ^= temp;
+ XV[1] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[1];
+ XV[3] = chacha_small_parallel_rnd(XV[3]);
+ XV[2] = temp;
+}
+
+static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV)
+{
+ XV[0] ^= XV[3];
+ uint16 temp;
+
+ XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];
+ temp = salsa_small_scalar_rnd(XV[1]); XV[2] ^= temp;
+ XV[1] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[1];
+ XV[3] = salsa_small_scalar_rnd(XV[3]);
+ XV[2] = temp;
+}
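+
+/* The kernels below implement NeoScrypt's two scrypt-style SMix passes,
+   ChaCha first (k01/k2), then Salsa (k3/k4), over a 128-cell scratchpad.
+   A host-side sketch of the pattern each pair implements, with X the
+   256-byte state and V the per-thread scratchpad:
+     for (i = 0; i < 128; i++) { V[i] = X; blkmix(X); }              // k01, k3
+     for (i = 0; i < 128; i++) { X ^= V[X[48] & 127]; blkmix(X); }   // k2, k4
+   k2 parks its result in a spare cell so k4 can XOR the ChaCha and Salsa
+   outputs together before fastkdf32. */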
+
+
+#define SHIFT 130
+
+__global__ __launch_bounds__(128, 1)
+void neoscrypt_gpu_hash_k0(int stratum, uint32_t threads, uint32_t startNonce)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ uint32_t shift = SHIFT * 16 * thread;
+// if (thread < threads)
+ {
+ uint32_t data[80];
+ uint16 X[4];
+ const uint32_t nonce = startNonce + thread;
+
+ for (int i = 0; i<20; i++) {
+ ((uint4*)data)[i] = ((uint4 *)c_data)[i];
+ } //ld.local.v4
+ data[19] = (stratum) ? cuda_swab32(nonce) : nonce; // stratum headers are byte-swapped, so swap the nonce to match
+ data[39] = data[19];
+ data[59] = data[19];
+
+ fastkdf256(data, (uint8_t*)X);
+
+ ((uintx64 *)(W + shift))[0] = ((uintx64 *)X)[0];
+// ((ulonglong16 *)(W + shift))[0] = ((ulonglong16 *)X)[0];
+ }
+}
+
+__global__ __launch_bounds__(128, 1)
+void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ uint32_t shift = SHIFT * 16 * thread;
+// if (thread < threads)
+ {
+ uint16 X[4];
+ ((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]);
+
+ //#pragma unroll
+ for (int i = 0; i < 128; ++i)
+ {
+ neoscrypt_chacha(X);
+ ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)X)[0];
+// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)X)[0];
+ }
+ }
+}
+
+__global__ __launch_bounds__(128, 1)
+void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ uint32_t shift = SHIFT * 16 * thread;
+// if (thread < threads)
+ {
+ uint16 X[4];
+ ((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]);
+
+ for (int t = 0; t < 128; t++)
+ {
+ int idx = X[3].lo.s0 & 0x7F;
+ ((uintx64 *)X)[0] ^= __ldg32(&(W + shift)[idx << 4]);
+ neoscrypt_chacha(X);
+
+ }
+ ((uintx64 *)(W + shift))[129] = ((uintx64*)X)[0]; // best checked
+
+ }
+}
+
+__global__ __launch_bounds__(128, 1)
+void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+// if (thread < threads)
+ {
+ uint32_t shift = SHIFT * 16 * thread;
+ uint16 Z[4];
+
+ ((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]);
+
+ //#pragma unroll
+ for (int i = 0; i < 128; ++i) {
+ neoscrypt_salsa(Z);
+ ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)Z)[0];
+// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)Z)[0];
+ }
+ }
+}
+
+__global__ __launch_bounds__(128, 1)
+void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+// if (thread < threads)
+ {
+ const uint32_t nonce = startNonce + thread;
+
+ uint32_t shift = SHIFT * 16 * thread;
+ uint16 Z[4];
+ uint32_t outbuf[8];
+ uint32_t data[80];
+
+ for (int i=0; i<20; i++) {
+ ((uint4*)data)[i] = ((uint4 *)c_data)[i];
+ }
+
+ data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
+ data[39] = data[19];
+ data[59] = data[19];
+ ((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]);
+ for (int t = 0; t < 128; t++)
+ {
+ int idx = Z[3].lo.s0 & 0x7F;
+ ((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[idx << 4]);
+ neoscrypt_salsa(Z);
+ }
+ ((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[2064]);
+ fastkdf32(data, (uint32_t*)Z, outbuf);
+ if (outbuf[7] <= pTarget[7]) {
+ atomicExch(&nonceVector[0], nonce);
+ }
+ }
+}
+
+void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t *hash)
+{
+ cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(W, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
+ cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t));
+}
+
+__host__
+uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order)
+{
+ uint32_t result[MAX_GPUS] = { 0xffffffff };
+ cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t));
+
+ const uint32_t threadsperblock = 128;
+ dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+ dim3 block(threadsperblock);
+
+// neoscrypt_gpu_hash_orig <<<grid, block>>> (threads, startNounce, d_NNonce[thr_id]);
+
+ neoscrypt_gpu_hash_k0 <<<grid, block>>> (stratum, threads, startNounce);
+ neoscrypt_gpu_hash_k01 <<<grid, block>>> (threads, startNounce);
+ neoscrypt_gpu_hash_k2 <<<grid, block>>> (threads, startNounce);
+ neoscrypt_gpu_hash_k3 <<<grid, block>>> (threads, startNounce);
+ neoscrypt_gpu_hash_k4 <<<grid, block>>> (stratum, threads, startNounce, d_NNonce[thr_id]);
+
+ MyStreamSynchronize(NULL, order, thr_id);
+ cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+ return result[thr_id];
+}
+
+__host__
+void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target)
+{
+ unsigned char PaddedMessage[80*4]; // four copies of the 80-byte header
+ uint32_t input[16], key[16] = { 0 };
+ memcpy(PaddedMessage, pdata, 80);
+ memcpy(PaddedMessage + 80, pdata, 80);
+ memcpy(PaddedMessage + 160, pdata, 80);
+ memcpy(PaddedMessage + 240, pdata, 80);
+
+ ((uint16*)input)[0] = ((uint16*)pdata)[0];
+ ((uint8*)key)[0] = ((uint8*)pdata)[0];
+// for (int i = 0; i<10; i++) { printf(" pdata/input %d %08x %08x \n",i,pdata[2*i],pdata[2*i+1]); }
+
+ Blake2Shost(input,key);
+
+ cudaMemcpyToSymbol(pTarget, target, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(input_init, input, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(key_init, key, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+
+ cudaMemcpyToSymbol(c_data, PaddedMessage, 40 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+}
+
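
For reference, the per-thread layout of W that all five kernels assume; a
minimal sketch with a hypothetical helper name (the offsets match the code
above: uint4 indices 2048 and 2064 are cells 128 and 129):

	/* one cell = 256 bytes = 16 uint4; SHIFT = 130 cells per thread */
	__device__ uint4* w_cell(uint4 *W, uint32_t thread, uint32_t cell)
	{
		return W + (size_t)130 * 16 * thread + (size_t)cell * 16;
	}
	/* cell 0: fastkdf256 output; cells 1..128: sequential pass;
	   cell 129: ChaCha result saved by k2 for the final XOR in k4 */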
diff --git a/neoscrypt/cuda_vectors.h b/neoscrypt/cuda_vectors.h
new file mode 100644
index 0000000..a654f7d
--- /dev/null
+++ b/neoscrypt/cuda_vectors.h
@@ -0,0 +1,1109 @@
+#ifndef CUDA_VECTOR_H
+#define CUDA_VECTOR_H
+
+
+///////////////////////////////////////////////////////////////////////////////////
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
+#define __LDG_PTR "l"
+#else
+#define __LDG_PTR "r"
+#endif
+
+#include "cuda_helper.h"
+
+//typedef __device_builtin__ struct ulong16 ulong16;
+
+
+typedef struct __align__(32) uint8
+{
+ unsigned int s0, s1, s2, s3, s4, s5, s6, s7;
+} uint8;
+
+typedef struct __align__(64) ulonglong2to8
+{
+ulonglong2 l0,l1,l2,l3;
+} ulonglong2to8;
+
+typedef struct __align__(128) ulonglong8to16
+{
+ ulonglong2to8 lo, hi;
+} ulonglong8to16;
+
+typedef struct __align__(256) ulonglong16to32
+{
+ ulonglong8to16 lo, hi;
+} ulonglong16to32;
+
+typedef struct __align__(512) ulonglong32to64
+{
+ ulonglong16to32 lo, hi;
+} ulonglong32to64;
+
+
+
+typedef struct __align__(1024) ulonglonglong
+{
+ ulonglong8to16 s0,s1,s2,s3,s4,s5,s6,s7;
+} ulonglonglong;
+
+
+
+
+typedef struct __align__(64) uint16
+{
+ union {
+ struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;};
+ uint8 lo;
+ };
+ union {
+ struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;};
+ uint8 hi;
+ };
+} uint16;
+
+typedef struct __align__(128) uint32
+{
+
+ uint16 lo,hi;
+} uint32;
+
+
+
+struct __align__(128) ulong8
+{
+ ulonglong4 s0, s1, s2, s3;
+};
+typedef __device_builtin__ struct ulong8 ulong8;
+
+
+typedef struct __align__(256) ulonglong16
+{
+ ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf;
+} ulonglong16;
+
+typedef struct __align__(32) uint48
+{
+ uint4 s0, s1;
+
+} uint48;
+
+typedef struct __align__(64) uint816
+{
+ uint48 s0, s1;
+
+} uint816;
+
+typedef struct __align__(128) uint1632
+{
+ uint816 s0, s1;
+
+} uint1632;
+
+typedef struct __align__(256) uintx64
+{
+ uint1632 s0, s1;
+
+} uintx64;
+
+typedef struct __align__(512) uintx128
+{
+ uintx64 s0, s1;
+
+} uintx128;
+
+typedef struct __align__(1024) uintx256
+{
+ uintx128 s0, s1;
+
+} uintx256;
+
+
+
+typedef struct __align__(256) uint4x16
+{
+ uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+} uint4x16;
+
+static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3)
+{
+ulonglong2to8 t; t.l0=s0; t.l1=s1; t.l2=s2; t.l3=s3;
+return t;
+}
+
+static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1)
+{
+ ulonglong8to16 t; t.lo = s0; t.hi = s1;
+ return t;
+}
+
+static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1)
+{
+ ulonglong16to32 t; t.lo = s0; t.hi = s1;
+ return t;
+}
+
+static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1)
+{
+ ulonglong32to64 t; t.lo = s0; t.hi = s1;
+ return t;
+}
+
+
+static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong(
+ const ulonglong8to16 &s0, const ulonglong8to16 &s1, const ulonglong8to16 &s2, const ulonglong8to16 &s3,
+ const ulonglong8to16 &s4, const ulonglong8to16 &s5, const ulonglong8to16 &s6, const ulonglong8to16 &s7)
+{
+ ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+ return t;
+}
+
+
+static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1)
+{
+ uint48 t; t.s0 = s0; t.s1 = s1;
+ return t;
+}
+
+static __inline__ __device__ uint816 make_uint816(const uint48 &s0, const uint48 &s1)
+{
+ uint816 t; t.s0 = s0; t.s1 = s1;
+ return t;
+}
+
+static __inline__ __device__ uint1632 make_uint1632(const uint816 &s0, const uint816 &s1)
+{
+ uint1632 t; t.s0 = s0; t.s1 = s1;
+ return t;
+}
+
+static __inline__ __device__ uintx64 make_uintx64(const uint1632 &s0, const uint1632 &s1)
+{
+ uintx64 t; t.s0 = s0; t.s1 = s1;
+ return t;
+}
+
+static __inline__ __device__ uintx128 make_uintx128(const uintx64 &s0, const uintx64 &s1)
+{
+ uintx128 t; t.s0 = s0; t.s1 = s1;
+ return t;
+}
+
+static __inline__ __device__ uintx256 make_uintx256(const uintx128 &s0, const uintx128 &s1)
+{
+ uintx256 t; t.s0 = s0; t.s1 = s1;
+ return t;
+}
+
+
+static __inline__ __device__ uintx256 make_uintx64(const uintx128 &s0, const uintx128 &s1)
+{
+ uintx256 t; t.s0 = s0; t.s1 = s1;
+ return t;
+}
+
+
+static __inline__ __host__ __device__ uint4x16 make_uint4x16(
+ uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7,
+ uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf)
+{
+ uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+ t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf;
+ return t;
+}
+
+
+
+
+static __inline__ __host__ __device__ uint16 make_uint16(
+ unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7,
+ unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf)
+{
+ uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+ t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf;
+ return t;
+}
+
+static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b)
+{
+ uint16 t; t.lo=a; t.hi=b; return t;
+}
+
+static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b)
+{
+ uint32 t; t.lo = a; t.hi = b; return t;
+}
+
+
+static __inline__ __host__ __device__ uint8 make_uint8(
+ unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7)
+{
+ uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+ return t;
+}
+
+
+static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong2 &s0, const ulonglong2 &s1,
+ const ulonglong2 &s2, const ulonglong2 &s3, const ulonglong2 &s4, const ulonglong2 &s5, const ulonglong2 &s6, const ulonglong2 &s7,
+ const ulonglong2 &s8, const ulonglong2 &s9,
+ const ulonglong2 &sa, const ulonglong2 &sb, const ulonglong2 &sc, const ulonglong2 &sd, const ulonglong2 &se, const ulonglong2 &sf
+) {
+ ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+ t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf;
+ return t;
+}
+
+
+
+static __inline__ __host__ __device__ ulong8 make_ulong8(
+ ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3)
+{
+ ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
+ return t;
+}
+
+
+static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+
+
+
+
+
+static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+
+
+static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
+static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); }
+static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x + b.x, a.y + b.y); }
+
+static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) {
+ return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3);
+} //, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); }
+
+static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) {
+ return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3);
+} //, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); }
+
+
+static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); }
+
+static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); }
+
+////////////// mess++ //////
+
+static __forceinline__ __device__ uint48 operator^ (const uint48 &a, const uint48 &b) {
+ return make_uint48(a.s0 ^ b.s0, a.s1 ^ b.s1);
+}
+
+static __forceinline__ __device__ uint816 operator^ (const uint816 &a, const uint816 &b) {
+ return make_uint816(a.s0 ^ b.s0, a.s1 ^ b.s1);
+}
+
+static __forceinline__ __device__ uint1632 operator^ (const uint1632 &a, const uint1632 &b) {
+ return make_uint1632(a.s0 ^ b.s0, a.s1 ^ b.s1);
+}
+
+
+static __forceinline__ __device__ uintx64 operator^ (const uintx64 &a, const uintx64 &b) {
+ return make_uintx64(a.s0 ^ b.s0, a.s1 ^ b.s1);
+}
+
+static __forceinline__ __device__ uintx128 operator^ (const uintx128 &a, const uintx128 &b) {
+ return make_uintx128(a.s0 ^ b.s0, a.s1 ^ b.s1);
+}
+
+static __forceinline__ __device__ uintx256 operator^ (const uintx256 &a, const uintx256 &b) {
+ return make_uintx256(a.s0 ^ b.s0, a.s1 ^ b.s1);
+}
+
+/////////////////////////
+
+static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) {
+ return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7,
+ a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf);
+}
+
+static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) {
+ return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7,
+ a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf);
+}
+
+static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) {
+ return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) {
+ return make_uint32(a.lo + b.lo, a.hi + b.hi);
+}
+
+static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) {
+ return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7,
+ a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf
+);
+}
+
+static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) {
+ return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7,
+ a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf
+);
+}
+
+static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (uintx64 &a, const uintx64 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator^= (uintx128 &a, const uintx128 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (uintx256 &a, const uintx256 &b) { a = a ^ b; }
+
+
+static __forceinline__ __device__ void operator^= (uint816 &a, const uint816 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator^= (uint48 &a, const uint48 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) { a = a + b; }
+
+
+static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; }
+static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; }
+static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) { a = a + b; }
+
+static __forceinline__ __device__
+ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b)
+{
+ return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3);
+}
+static __forceinline__ __device__
+ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b)
+{
+ return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3);
+}
+
+
+static __forceinline__ __device__
+ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b)
+{
+ return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b)
+{
+ return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b)
+{
+ return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b)
+{
+ return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b)
+{
+ return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b)
+{
+ return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi);
+}
+
+
+static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) {
+ return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7);
+}
+
+static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) {
+ return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7);
+}
+
+
+static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) { a = a ^ b; }
+
+
+static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a ^ b; }
+
+
+static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) { a = a ^ b; }
+
+#if __CUDA_ARCH__ < 320
+
+#define rotate ROTL32
+#define rotateR ROTR32
+
+#else
+
+static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift)
+{
+ uint4 ret;
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift));
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift));
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift));
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift));
+ return ret;
+}
+
+static __forceinline__ __device__ uint4 rotate4R(uint4 vec4, uint32_t shift)
+{
+ uint4 ret;
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift));
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift));
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift));
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift));
+ return ret;
+}
+
+static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift)
+{
+ uint32_t ret;
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
+ return ret;
+}
+
+
+static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift)
+{
+ uint32_t ret;
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
+ return ret;
+}
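+
+/* shf.{l,r}.wrap with both b32 sources equal is a 32-bit rotate: for n in
+   1..31 the two helpers above match the portable forms
+     rotate(x, n)  == (x << n) | (x >> (32 - n))
+     rotateR(x, n) == (x >> n) | (x << (32 - n))
+   but compile to a single funnel-shift instruction on sm_32+. */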
+
+
+
+static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr)
+{
+ uint8 test;
+ asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr));
+ asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
+static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr)
+{
+ uint32_t test;
+ asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+static __device__ __inline__ uint64_t __ldgtoint64(const uint8_t *ptr)
+{
+ uint64_t test;
+ asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
+static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr)
+{
+ uint32_t test;
+ asm volatile ("{\n\t"
+ ".reg .u8 a,b,c,d; \n\t"
+ "ld.global.nc.u8 a,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "mov.b32 %0,{a,b,c,d}; }\n\t"
+ : "=r"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr)
+{
+ uint64_t test;
+ asm volatile ("{\n\t"
+ ".reg .u8 a,b,c,d,e,f,g,h; \n\t"
+ ".reg .u32 i,j; \n\t"
+ "ld.global.nc.u8 a,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "ld.global.nc.u8 e,[%1+4]; \n\t"
+ "ld.global.nc.u8 f,[%1+5]; \n\t"
+ "ld.global.nc.u8 g,[%1+6]; \n\t"
+ "ld.global.nc.u8 h,[%1+7]; \n\t"
+ "mov.b32 i,{a,b,c,d}; \n\t"
+ "mov.b32 j,{e,f,g,h}; \n\t"
+ "mov.b64 %0,{i,j}; }\n\t"
+ : "=l"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
+static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr)
+{
+ uint32_t zero = 0;
+ uint64_t test;
+ asm volatile ("{\n\t"
+ ".reg .u8 a,b,c,d; \n\t"
+ ".reg .u32 i; \n\t"
+ "ld.global.nc.u8 a,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "mov.b32 i,{a,b,c,d}; \n\t"
+ "mov.b64 %0,{i,%1}; }\n\t"
+ : "=l"(test) : __LDG_PTR(ptr), "r"(zero));
+ return (test);
+}
+
+
+
+static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr)
+{
+ uint32_t test;
+ asm("{\n\t"
+ ".reg .u8 e,b,c,d; \n\t"
+ "ld.global.nc.u8 e,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "mov.b32 %0,{e,b,c,d}; }\n\t"
+ : "=r"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+#endif
+
+static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, uint32_t shift)
+{
+ uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
+ asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
+ ret[8] = cuda_swab32(truc);
+ truc3 = cuda_swab32(vec4.s6);
+ asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift));
+ ret[7] = cuda_swab32(truc);
+ truc2 = cuda_swab32(vec4.s5);
+ asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
+ ret[6] = cuda_swab32(truc);
+ truc3 = cuda_swab32(vec4.s4);
+ asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift));
+ ret[5] = cuda_swab32(truc);
+ truc2 = cuda_swab32(vec4.s3);
+ asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
+ ret[4] = cuda_swab32(truc);
+ truc3 = cuda_swab32(vec4.s2);
+ asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift));
+ ret[3] = cuda_swab32(truc);
+ truc2 = cuda_swab32(vec4.s1);
+ asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
+ ret[2] = cuda_swab32(truc);
+ truc3 = cuda_swab32(vec4.s0);
+ asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift));
+ ret[1] = cuda_swab32(truc);
+ asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift));
+ ret[0] = cuda_swab32(truc);
+
+}
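+
+/* shift256R2 treats the uint8 as a 256-bit big-endian quantity and shifts
+   it right by 'shift' bits (0..31), spreading the result across nine
+   32-bit words; the cuda_swab32 calls adapt the little-endian words to
+   the byte order fastkdf256 expects when it XORs the Blake2S state into
+   B at a sub-word bit offset. */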
+
+#define shift256R3(ret,vec4, shift) \
+{ \
+ \
+uint32_t truc=0,truc2=cuda_swab32(vec4.s7),truc3=0; \
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \
+ ret[8] = cuda_swab32(truc); \
+truc2=cuda_swab32(vec4.s6);truc3=cuda_swab32(vec4.s7); \
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \
+ ret[7] = cuda_swab32(truc); \
+truc2=cuda_swab32(vec4.s5);truc3=cuda_swab32(vec4.s6); \
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \
+ ret[6] = cuda_swab32(truc); \
+truc2 = cuda_swab32(vec4.s4); truc3 = cuda_swab32(vec4.s5); \
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \
+ ret[5] = cuda_swab32(truc); \
+truc2 = cuda_swab32(vec4.s3); truc3 = cuda_swab32(vec4.s4); \
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \
+ ret[4] = cuda_swab32(truc); \
+truc2 = cuda_swab32(vec4.s2); truc3 = cuda_swab32(vec4.s3); \
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \
+ ret[3] = cuda_swab32(truc); \
+truc2 = cuda_swab32(vec4.s1); truc3 = cuda_swab32(vec4.s2); \
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \
+ ret[2] = cuda_swab32(truc); \
+truc2 = cuda_swab32(vec4.s0); truc3 = cuda_swab32(vec4.s1); \
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \
+ ret[1] = cuda_swab32(truc); \
+truc3 = cuda_swab32(vec4.s0); \
+ asm volatile ("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); \
+ ret[0] = cuda_swab32(truc); \
+ \
+ \
+}
+
+
+static __device__ __inline__ uint32 __ldg32b(const uint32 *ptr)
+{
+ uint32 ret;
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.lo.s0), "=r"(ret.lo.s1), "=r"(ret.lo.s2), "=r"(ret.lo.s3) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.lo.s4), "=r"(ret.lo.s5), "=r"(ret.lo.s6), "=r"(ret.lo.s7) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.lo.s8), "=r"(ret.lo.s9), "=r"(ret.lo.sa), "=r"(ret.lo.sb) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.lo.sc), "=r"(ret.lo.sd), "=r"(ret.lo.se), "=r"(ret.lo.sf) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.hi.s0), "=r"(ret.hi.s1), "=r"(ret.hi.s2), "=r"(ret.hi.s3) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.hi.s4), "=r"(ret.hi.s5), "=r"(ret.hi.s6), "=r"(ret.hi.s7) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.hi.s8), "=r"(ret.hi.s9), "=r"(ret.hi.sa), "=r"(ret.hi.sb) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.hi.sc), "=r"(ret.hi.sd), "=r"(ret.hi.se), "=r"(ret.hi.sf) : __LDG_PTR(ptr));
+ return ret;
+}
+
+static __device__ __inline__ uint16 __ldg16b(const uint16 *ptr)
+{
+ uint16 ret;
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0), "=r"(ret.s1), "=r"(ret.s2), "=r"(ret.s3) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s4), "=r"(ret.s5), "=r"(ret.s6), "=r"(ret.s7) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s8), "=r"(ret.s9), "=r"(ret.sa), "=r"(ret.sb) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.sc), "=r"(ret.sd), "=r"(ret.se), "=r"(ret.sf) : __LDG_PTR(ptr));
+ return ret;
+}
+
+
+static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
+{
+ uintx64 ret;
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
+ return ret;
+}
+
+static __device__ __inline__ uintx64 __ldg32c(const uintx64 *ptr)
+{
+ uintx64 ret;
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
+
+ return ret;
+}
+
+static __device__ __inline__ uintx128 __ldg128(const uintx128 *ptr)
+{
+ uintx128 ret;
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s0.s1.s0.s0.x), "=r"(ret.s0.s0.s1.s0.s0.y), "=r"(ret.s0.s0.s1.s0.s0.z), "=r"(ret.s0.s0.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s0.s1.s0.s1.x), "=r"(ret.s0.s0.s1.s0.s1.y), "=r"(ret.s0.s0.s1.s0.s1.z), "=r"(ret.s0.s0.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s0.s1.s1.s0.x), "=r"(ret.s0.s0.s1.s1.s0.y), "=r"(ret.s0.s0.s1.s1.s0.z), "=r"(ret.s0.s0.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s0.s1.s1.s1.x), "=r"(ret.s0.s0.s1.s1.s1.y), "=r"(ret.s0.s0.s1.s1.s1.z), "=r"(ret.s0.s0.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s0.s1.s0.s0.s0.x), "=r"(ret.s0.s1.s0.s0.s0.y), "=r"(ret.s0.s1.s0.s0.s0.z), "=r"(ret.s0.s1.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s0.s1.s0.s0.s1.x), "=r"(ret.s0.s1.s0.s0.s1.y), "=r"(ret.s0.s1.s0.s0.s1.z), "=r"(ret.s0.s1.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s0.s1.s0.s1.s0.x), "=r"(ret.s0.s1.s0.s1.s0.y), "=r"(ret.s0.s1.s0.s1.s0.z), "=r"(ret.s0.s1.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s0.s1.s0.s1.s1.x), "=r"(ret.s0.s1.s0.s1.s1.y), "=r"(ret.s0.s1.s0.s1.s1.z), "=r"(ret.s0.s1.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s0.s1.s1.s0.s0.x), "=r"(ret.s0.s1.s1.s0.s0.y), "=r"(ret.s0.s1.s1.s0.s0.z), "=r"(ret.s0.s1.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s0.s1.s1.s0.s1.x), "=r"(ret.s0.s1.s1.s0.s1.y), "=r"(ret.s0.s1.s1.s0.s1.z), "=r"(ret.s0.s1.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s0.s1.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s0.s1.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+256];" : "=r"(ret.s1.s0.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+272];" : "=r"(ret.s1.s0.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+288];" : "=r"(ret.s1.s0.s0.s1.s0.x), "=r"(ret.s1.s0.s0.s1.s0.y), "=r"(ret.s1.s0.s0.s1.s0.z), "=r"(ret.s1.s0.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+304];" : "=r"(ret.s1.s0.s0.s1.s1.x), "=r"(ret.s1.s0.s0.s1.s1.y), "=r"(ret.s1.s0.s0.s1.s1.z), "=r"(ret.s1.s0.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+320];" : "=r"(ret.s1.s0.s1.s0.s0.x), "=r"(ret.s1.s0.s1.s0.s0.y), "=r"(ret.s1.s0.s1.s0.s0.z), "=r"(ret.s1.s0.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+336];" : "=r"(ret.s1.s0.s1.s0.s1.x), "=r"(ret.s1.s0.s1.s0.s1.y), "=r"(ret.s1.s0.s1.s0.s1.z), "=r"(ret.s1.s0.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+352];" : "=r"(ret.s1.s0.s1.s1.s0.x), "=r"(ret.s1.s0.s1.s1.s0.y), "=r"(ret.s1.s0.s1.s1.s0.z), "=r"(ret.s1.s0.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+368];" : "=r"(ret.s1.s0.s1.s1.s1.x), "=r"(ret.s1.s0.s1.s1.s1.y), "=r"(ret.s1.s0.s1.s1.s1.z), "=r"(ret.s1.s0.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+384];" : "=r"(ret.s1.s1.s0.s0.s0.x), "=r"(ret.s1.s1.s0.s0.s0.y), "=r"(ret.s1.s1.s0.s0.s0.z), "=r"(ret.s1.s1.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+400];" : "=r"(ret.s1.s1.s0.s0.s1.x), "=r"(ret.s1.s1.s0.s0.s1.y), "=r"(ret.s1.s1.s0.s0.s1.z), "=r"(ret.s1.s1.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+416];" : "=r"(ret.s1.s1.s0.s1.s0.x), "=r"(ret.s1.s1.s0.s1.s0.y), "=r"(ret.s1.s1.s0.s1.s0.z), "=r"(ret.s1.s1.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+432];" : "=r"(ret.s1.s1.s0.s1.s1.x), "=r"(ret.s1.s1.s0.s1.s1.y), "=r"(ret.s1.s1.s0.s1.s1.z), "=r"(ret.s1.s1.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+448];" : "=r"(ret.s1.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+464];" : "=r"(ret.s1.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+480];" : "=r"(ret.s1.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+496];" : "=r"(ret.s1.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
+
+ return ret;
+}
+
+static __device__ __inline__ uintx256 __ldg256(const uintx256 *ptr)
+{
+ uintx256 ret;
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s0.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s0.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s0.s0.s1.s0.s0.x), "=r"(ret.s0.s0.s0.s1.s0.s0.y), "=r"(ret.s0.s0.s0.s1.s0.s0.z), "=r"(ret.s0.s0.s0.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s0.s0.s1.s0.s1.x), "=r"(ret.s0.s0.s0.s1.s0.s1.y), "=r"(ret.s0.s0.s0.s1.s0.s1.z), "=r"(ret.s0.s0.s0.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s0.s0.s1.s1.s0.x), "=r"(ret.s0.s0.s0.s1.s1.s0.y), "=r"(ret.s0.s0.s0.s1.s1.s0.z), "=r"(ret.s0.s0.s0.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s0.s0.s1.s1.s1.x), "=r"(ret.s0.s0.s0.s1.s1.s1.y), "=r"(ret.s0.s0.s0.s1.s1.s1.z), "=r"(ret.s0.s0.s0.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s0.s0.s1.s0.s0.s0.x), "=r"(ret.s0.s0.s1.s0.s0.s0.y), "=r"(ret.s0.s0.s1.s0.s0.s0.z), "=r"(ret.s0.s0.s1.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s0.s0.s1.s0.s0.s1.x), "=r"(ret.s0.s0.s1.s0.s0.s1.y), "=r"(ret.s0.s0.s1.s0.s0.s1.z), "=r"(ret.s0.s0.s1.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s0.s0.s1.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s0.s0.s1.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s0.s0.s1.s1.s0.s0.x), "=r"(ret.s0.s0.s1.s1.s0.s0.y), "=r"(ret.s0.s0.s1.s1.s0.s0.z), "=r"(ret.s0.s0.s1.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s0.s0.s1.s1.s0.s1.x), "=r"(ret.s0.s0.s1.s1.s0.s1.y), "=r"(ret.s0.s0.s1.s1.s0.s1.z), "=r"(ret.s0.s0.s1.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s0.s0.s1.s1.s1.s0.x), "=r"(ret.s0.s0.s1.s1.s1.s0.y), "=r"(ret.s0.s0.s1.s1.s1.s0.z), "=r"(ret.s0.s0.s1.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s0.s0.s1.s1.s1.s1.x), "=r"(ret.s0.s0.s1.s1.s1.s1.y), "=r"(ret.s0.s0.s1.s1.s1.s1.z), "=r"(ret.s0.s0.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+256];" : "=r"(ret.s0.s1.s0.s0.s0.s0.x), "=r"(ret.s0.s1.s0.s0.s0.s0.y), "=r"(ret.s0.s1.s0.s0.s0.s0.z), "=r"(ret.s0.s1.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+272];" : "=r"(ret.s0.s1.s0.s0.s0.s1.x), "=r"(ret.s0.s1.s0.s0.s0.s1.y), "=r"(ret.s0.s1.s0.s0.s0.s1.z), "=r"(ret.s0.s1.s0.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+288];" : "=r"(ret.s0.s1.s0.s0.s1.s0.x), "=r"(ret.s0.s1.s0.s0.s1.s0.y), "=r"(ret.s0.s1.s0.s0.s1.s0.z), "=r"(ret.s0.s1.s0.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+304];" : "=r"(ret.s0.s1.s0.s0.s1.s1.x), "=r"(ret.s0.s1.s0.s0.s1.s1.y), "=r"(ret.s0.s1.s0.s0.s1.s1.z), "=r"(ret.s0.s1.s0.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+320];" : "=r"(ret.s0.s1.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+336];" : "=r"(ret.s0.s1.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+352];" : "=r"(ret.s0.s1.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s0.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+368];" : "=r"(ret.s0.s1.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s0.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+384];" : "=r"(ret.s0.s1.s1.s0.s0.s0.x), "=r"(ret.s0.s1.s1.s0.s0.s0.y), "=r"(ret.s0.s1.s1.s0.s0.s0.z), "=r"(ret.s0.s1.s1.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+400];" : "=r"(ret.s0.s1.s1.s0.s0.s1.x), "=r"(ret.s0.s1.s1.s0.s0.s1.y), "=r"(ret.s0.s1.s1.s0.s0.s1.z), "=r"(ret.s0.s1.s1.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+416];" : "=r"(ret.s0.s1.s1.s0.s1.s0.x), "=r"(ret.s0.s1.s1.s0.s1.s0.y), "=r"(ret.s0.s1.s1.s0.s1.s0.z), "=r"(ret.s0.s1.s1.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+432];" : "=r"(ret.s0.s1.s1.s0.s1.s1.x), "=r"(ret.s0.s1.s1.s0.s1.s1.y), "=r"(ret.s0.s1.s1.s0.s1.s1.z), "=r"(ret.s0.s1.s1.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+448];" : "=r"(ret.s0.s1.s1.s1.s0.s0.x), "=r"(ret.s0.s1.s1.s1.s0.s0.y), "=r"(ret.s0.s1.s1.s1.s0.s0.z), "=r"(ret.s0.s1.s1.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+464];" : "=r"(ret.s0.s1.s1.s1.s0.s1.x), "=r"(ret.s0.s1.s1.s1.s0.s1.y), "=r"(ret.s0.s1.s1.s1.s0.s1.z), "=r"(ret.s0.s1.s1.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+480];" : "=r"(ret.s0.s1.s1.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+496];" : "=r"(ret.s0.s1.s1.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
+
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+512];" : "=r"(ret.s1.s0.s0.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+528];" : "=r"(ret.s1.s0.s0.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+544];" : "=r"(ret.s1.s0.s0.s0.s1.s0.x), "=r"(ret.s1.s0.s0.s0.s1.s0.y), "=r"(ret.s1.s0.s0.s0.s1.s0.z), "=r"(ret.s1.s0.s0.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+560];" : "=r"(ret.s1.s0.s0.s0.s1.s1.x), "=r"(ret.s1.s0.s0.s0.s1.s1.y), "=r"(ret.s1.s0.s0.s0.s1.s1.z), "=r"(ret.s1.s0.s0.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+576];" : "=r"(ret.s1.s0.s0.s1.s0.s0.x), "=r"(ret.s1.s0.s0.s1.s0.s0.y), "=r"(ret.s1.s0.s0.s1.s0.s0.z), "=r"(ret.s1.s0.s0.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+592];" : "=r"(ret.s1.s0.s0.s1.s0.s1.x), "=r"(ret.s1.s0.s0.s1.s0.s1.y), "=r"(ret.s1.s0.s0.s1.s0.s1.z), "=r"(ret.s1.s0.s0.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+608];" : "=r"(ret.s1.s0.s0.s1.s1.s0.x), "=r"(ret.s1.s0.s0.s1.s1.s0.y), "=r"(ret.s1.s0.s0.s1.s1.s0.z), "=r"(ret.s1.s0.s0.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+624];" : "=r"(ret.s1.s0.s0.s1.s1.s1.x), "=r"(ret.s1.s0.s0.s1.s1.s1.y), "=r"(ret.s1.s0.s0.s1.s1.s1.z), "=r"(ret.s1.s0.s0.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+640];" : "=r"(ret.s1.s0.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s1.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+656];" : "=r"(ret.s1.s0.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s1.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+672];" : "=r"(ret.s1.s0.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+688];" : "=r"(ret.s1.s0.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+704];" : "=r"(ret.s1.s0.s1.s1.s0.s0.x), "=r"(ret.s1.s0.s1.s1.s0.s0.y), "=r"(ret.s1.s0.s1.s1.s0.s0.z), "=r"(ret.s1.s0.s1.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+720];" : "=r"(ret.s1.s0.s1.s1.s0.s1.x), "=r"(ret.s1.s0.s1.s1.s0.s1.y), "=r"(ret.s1.s0.s1.s1.s0.s1.z), "=r"(ret.s1.s0.s1.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+736];" : "=r"(ret.s1.s0.s1.s1.s1.s0.x), "=r"(ret.s1.s0.s1.s1.s1.s0.y), "=r"(ret.s1.s0.s1.s1.s1.s0.z), "=r"(ret.s1.s0.s1.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+752];" : "=r"(ret.s1.s0.s1.s1.s1.s1.x), "=r"(ret.s1.s0.s1.s1.s1.s1.y), "=r"(ret.s1.s0.s1.s1.s1.s1.z), "=r"(ret.s1.s0.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+768];" : "=r"(ret.s1.s1.s0.s0.s0.s0.x), "=r"(ret.s1.s1.s0.s0.s0.s0.y), "=r"(ret.s1.s1.s0.s0.s0.s0.z), "=r"(ret.s1.s1.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+784];" : "=r"(ret.s1.s1.s0.s0.s0.s1.x), "=r"(ret.s1.s1.s0.s0.s0.s1.y), "=r"(ret.s1.s1.s0.s0.s0.s1.z), "=r"(ret.s1.s1.s0.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+800];" : "=r"(ret.s1.s1.s0.s0.s1.s0.x), "=r"(ret.s1.s1.s0.s0.s1.s0.y), "=r"(ret.s1.s1.s0.s0.s1.s0.z), "=r"(ret.s1.s1.s0.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+816];" : "=r"(ret.s1.s1.s0.s0.s1.s1.x), "=r"(ret.s1.s1.s0.s0.s1.s1.y), "=r"(ret.s1.s1.s0.s0.s1.s1.z), "=r"(ret.s1.s1.s0.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+832];" : "=r"(ret.s1.s1.s0.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+848];" : "=r"(ret.s1.s1.s0.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+864];" : "=r"(ret.s1.s1.s0.s1.s1.s0.x), "=r"(ret.s1.s1.s0.s1.s1.s0.y), "=r"(ret.s1.s1.s0.s1.s1.s0.z), "=r"(ret.s1.s1.s0.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+880];" : "=r"(ret.s1.s1.s0.s1.s1.s1.x), "=r"(ret.s1.s1.s0.s1.s1.s1.y), "=r"(ret.s1.s1.s0.s1.s1.s1.z), "=r"(ret.s1.s1.s0.s1.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+896];" : "=r"(ret.s1.s1.s1.s0.s0.s0.x), "=r"(ret.s1.s1.s1.s0.s0.s0.y), "=r"(ret.s1.s1.s1.s0.s0.s0.z), "=r"(ret.s1.s1.s1.s0.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+912];" : "=r"(ret.s1.s1.s1.s0.s0.s1.x), "=r"(ret.s1.s1.s1.s0.s0.s1.y), "=r"(ret.s1.s1.s1.s0.s0.s1.z), "=r"(ret.s1.s1.s1.s0.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+928];" : "=r"(ret.s1.s1.s1.s0.s1.s0.x), "=r"(ret.s1.s1.s1.s0.s1.s0.y), "=r"(ret.s1.s1.s1.s0.s1.s0.z), "=r"(ret.s1.s1.s1.s0.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+944];" : "=r"(ret.s1.s1.s1.s0.s1.s1.x), "=r"(ret.s1.s1.s1.s0.s1.s1.y), "=r"(ret.s1.s1.s1.s0.s1.s1.z), "=r"(ret.s1.s1.s1.s0.s1.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+960];" : "=r"(ret.s1.s1.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s1.s1.s0.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+976];" : "=r"(ret.s1.s1.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s1.s1.s0.s1.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+992];" : "=r"(ret.s1.s1.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s1.s1.s0.w) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+1008];" : "=r"(ret.s1.s1.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
+
+ return ret;
+}
+
+static __device__ __inline__ ulonglong2 __ldg2(const ulonglong2 *ptr)
+{
+ ulonglong2 ret;
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr));
+ return ret;
+}
+
+static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr)
+{
+ ulonglong4 ret;
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr));
+ return ret;
+}
+
+
+static __device__ __inline__ ulonglong2to8 __ldg2to8(const ulonglong2to8 *ptr)
+{
+ ulonglong2to8 ret;
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.l0.x), "=l"(ret.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.l1.x), "=l"(ret.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.l2.x), "=l"(ret.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.l3.x), "=l"(ret.l3.y) : __LDG_PTR(ptr));
+ return ret;
+}
+
+static __device__ __inline__ ulonglong8to16 __ldg8to16(const ulonglong8to16 *ptr)
+{
+ ulonglong8to16 ret;
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.lo.l0.x), "=l"(ret.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.lo.l1.x), "=l"(ret.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.lo.l2.x), "=l"(ret.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.lo.l3.x), "=l"(ret.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.hi.l0.x), "=l"(ret.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.hi.l1.x), "=l"(ret.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.hi.l2.x), "=l"(ret.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.hi.l3.x), "=l"(ret.hi.l3.y) : __LDG_PTR(ptr));
+ return ret;
+}
+
+static __device__ __inline__ ulonglonglong __ldgxtralong(const ulonglonglong *ptr)
+{
+ ulonglonglong ret;
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.lo.l0.x), "=l"(ret.s0.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s0.lo.l1.x), "=l"(ret.s0.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s0.lo.l2.x), "=l"(ret.s0.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s0.lo.l3.x), "=l"(ret.s0.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s0.hi.l0.x), "=l"(ret.s0.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s0.hi.l1.x), "=l"(ret.s0.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s0.hi.l2.x), "=l"(ret.s0.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s0.hi.l3.x), "=l"(ret.s0.hi.l3.y) : __LDG_PTR(ptr));
+
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s1.lo.l0.x), "=l"(ret.s1.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s1.lo.l1.x), "=l"(ret.s1.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.s1.lo.l2.x), "=l"(ret.s1.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.s1.lo.l3.x), "=l"(ret.s1.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.s1.hi.l0.x), "=l"(ret.s1.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.s1.hi.l1.x), "=l"(ret.s1.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.s1.hi.l2.x), "=l"(ret.s1.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.s1.hi.l3.x), "=l"(ret.s1.hi.l3.y) : __LDG_PTR(ptr));
+
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+256];" : "=l"(ret.s2.lo.l0.x), "=l"(ret.s2.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+272];" : "=l"(ret.s2.lo.l1.x), "=l"(ret.s2.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+288];" : "=l"(ret.s2.lo.l2.x), "=l"(ret.s2.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+304];" : "=l"(ret.s2.lo.l3.x), "=l"(ret.s2.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+320];" : "=l"(ret.s2.hi.l0.x), "=l"(ret.s2.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+336];" : "=l"(ret.s2.hi.l1.x), "=l"(ret.s2.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+352];" : "=l"(ret.s2.hi.l2.x), "=l"(ret.s2.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+368];" : "=l"(ret.s2.hi.l3.x), "=l"(ret.s2.hi.l3.y) : __LDG_PTR(ptr));
+
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+384];" : "=l"(ret.s3.lo.l0.x), "=l"(ret.s3.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+400];" : "=l"(ret.s3.lo.l1.x), "=l"(ret.s3.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+416];" : "=l"(ret.s3.lo.l2.x), "=l"(ret.s3.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+432];" : "=l"(ret.s3.lo.l3.x), "=l"(ret.s3.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+448];" : "=l"(ret.s3.hi.l0.x), "=l"(ret.s3.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+464];" : "=l"(ret.s3.hi.l1.x), "=l"(ret.s3.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+480];" : "=l"(ret.s3.hi.l2.x), "=l"(ret.s3.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+496];" : "=l"(ret.s3.hi.l3.x), "=l"(ret.s3.hi.l3.y) : __LDG_PTR(ptr));
+
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+512];" : "=l"(ret.s4.lo.l0.x), "=l"(ret.s4.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+528];" : "=l"(ret.s4.lo.l1.x), "=l"(ret.s4.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+544];" : "=l"(ret.s4.lo.l2.x), "=l"(ret.s4.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+560];" : "=l"(ret.s4.lo.l3.x), "=l"(ret.s4.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+576];" : "=l"(ret.s4.hi.l0.x), "=l"(ret.s4.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+592];" : "=l"(ret.s4.hi.l1.x), "=l"(ret.s4.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+608];" : "=l"(ret.s4.hi.l2.x), "=l"(ret.s4.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+624];" : "=l"(ret.s4.hi.l3.x), "=l"(ret.s4.hi.l3.y) : __LDG_PTR(ptr));
+
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+640];" : "=l"(ret.s5.lo.l0.x), "=l"(ret.s5.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+656];" : "=l"(ret.s5.lo.l1.x), "=l"(ret.s5.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+672];" : "=l"(ret.s5.lo.l2.x), "=l"(ret.s5.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+688];" : "=l"(ret.s5.lo.l3.x), "=l"(ret.s5.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+704];" : "=l"(ret.s5.hi.l0.x), "=l"(ret.s5.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+720];" : "=l"(ret.s5.hi.l1.x), "=l"(ret.s5.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+736];" : "=l"(ret.s5.hi.l2.x), "=l"(ret.s5.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+752];" : "=l"(ret.s5.hi.l3.x), "=l"(ret.s5.hi.l3.y) : __LDG_PTR(ptr));
+
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+768];" : "=l"(ret.s6.lo.l0.x), "=l"(ret.s6.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+784];" : "=l"(ret.s6.lo.l1.x), "=l"(ret.s6.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+800];" : "=l"(ret.s6.lo.l2.x), "=l"(ret.s6.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+816];" : "=l"(ret.s6.lo.l3.x), "=l"(ret.s6.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+832];" : "=l"(ret.s6.hi.l0.x), "=l"(ret.s6.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+848];" : "=l"(ret.s6.hi.l1.x), "=l"(ret.s6.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+864];" : "=l"(ret.s6.hi.l2.x), "=l"(ret.s6.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+880];" : "=l"(ret.s6.hi.l3.x), "=l"(ret.s6.hi.l3.y) : __LDG_PTR(ptr));
+
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+896];" : "=l"(ret.s7.lo.l0.x), "=l"(ret.s7.lo.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+912];" : "=l"(ret.s7.lo.l1.x), "=l"(ret.s7.lo.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+928];" : "=l"(ret.s7.lo.l2.x), "=l"(ret.s7.lo.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+944];" : "=l"(ret.s7.lo.l3.x), "=l"(ret.s7.lo.l3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+960];" : "=l"(ret.s7.hi.l0.x), "=l"(ret.s7.hi.l0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+976];" : "=l"(ret.s7.hi.l1.x), "=l"(ret.s7.hi.l1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+992];" : "=l"(ret.s7.hi.l2.x), "=l"(ret.s7.hi.l2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+1008];" : "=l"(ret.s7.hi.l3.x), "=l"(ret.s7.hi.l3.y) : __LDG_PTR(ptr));
+
+ return ret;
+}
+
+
+static __device__ __inline__ ulonglong16 __ldg64(const ulonglong2 *ptr)
+{
+ ulonglong16 ret;
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s8.x), "=l"(ret.s8.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s9.x), "=l"(ret.s9.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.sa.x), "=l"(ret.sa.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.sb.x), "=l"(ret.sb.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.sc.x), "=l"(ret.sc.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.sd.x), "=l"(ret.sd.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.se.x), "=l"(ret.se.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.sf.x), "=l"(ret.sf.y) : __LDG_PTR(ptr));
+ return ret;
+}
+
+
+static __device__ __inline__ ulonglong16 __ldg64b(const ulonglong16 *ptr)
+{
+ ulonglong16 ret;
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s8.x), "=l"(ret.s8.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s9.x), "=l"(ret.s9.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.sa.x), "=l"(ret.sa.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.sb.x), "=l"(ret.sb.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.sc.x), "=l"(ret.sc.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.sd.x), "=l"(ret.sd.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.se.x), "=l"(ret.se.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.sf.x), "=l"(ret.sf.y) : __LDG_PTR(ptr));
+ return ret;
+}
+
+static __device__ __inline__ ulonglong16 __ldg64b(const uint32 *ptr)
+{
+ ulonglong16 ret;
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s8.x), "=l"(ret.s8.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s9.x), "=l"(ret.s9.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.sa.x), "=l"(ret.sa.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.sb.x), "=l"(ret.sb.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.sc.x), "=l"(ret.sc.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.sd.x), "=l"(ret.sd.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.se.x), "=l"(ret.se.y) : __LDG_PTR(ptr));
+ asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.sf.x), "=l"(ret.sf.y) : __LDG_PTR(ptr));
+ return ret;
+}
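+
+/* Illustrative only: a minimal sketch of how these wrappers are intended to
+ * be used. Each thread pulls one 128-byte line through the read-only data
+ * cache (ld.global.nc) instead of issuing plain global loads. The kernel and
+ * the CUDA_VECTOR_EXAMPLES guard are hypothetical and not part of the build. */
+#ifdef CUDA_VECTOR_EXAMPLES
+__global__ void example_ldg_copy(const ulonglong8to16 *in, ulonglong8to16 *out)
+{
+ unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ /* eight ld.global.nc.v2.u64 transactions per thread */
+ out[tid] = __ldg8to16(&in[tid]);
+}
+#endif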
+
+static __forceinline__ __device__ uint8 swapvec(const uint8 &buf)
+{
+ uint8 vec;
+ vec.s0 = cuda_swab32(buf.s0);
+ vec.s1 = cuda_swab32(buf.s1);
+ vec.s2 = cuda_swab32(buf.s2);
+ vec.s3 = cuda_swab32(buf.s3);
+ vec.s4 = cuda_swab32(buf.s4);
+ vec.s5 = cuda_swab32(buf.s5);
+ vec.s6 = cuda_swab32(buf.s6);
+ vec.s7 = cuda_swab32(buf.s7);
+ return vec;
+}
+
+
+static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
+{
+ uint8 vec;
+ vec.s0 = cuda_swab32(buf[0].s0);
+ vec.s1 = cuda_swab32(buf[0].s1);
+ vec.s2 = cuda_swab32(buf[0].s2);
+ vec.s3 = cuda_swab32(buf[0].s3);
+ vec.s4 = cuda_swab32(buf[0].s4);
+ vec.s5 = cuda_swab32(buf[0].s5);
+ vec.s6 = cuda_swab32(buf[0].s6);
+ vec.s7 = cuda_swab32(buf[0].s7);
+ return vec;
+}
+
+static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
+{
+ uint16 vec;
+ vec.s0 = cuda_swab32(buf[0].s0);
+ vec.s1 = cuda_swab32(buf[0].s1);
+ vec.s2 = cuda_swab32(buf[0].s2);
+ vec.s3 = cuda_swab32(buf[0].s3);
+ vec.s4 = cuda_swab32(buf[0].s4);
+ vec.s5 = cuda_swab32(buf[0].s5);
+ vec.s6 = cuda_swab32(buf[0].s6);
+ vec.s7 = cuda_swab32(buf[0].s7);
+ vec.s8 = cuda_swab32(buf[0].s8);
+ vec.s9 = cuda_swab32(buf[0].s9);
+ vec.sa = cuda_swab32(buf[0].sa);
+ vec.sb = cuda_swab32(buf[0].sb);
+ vec.sc = cuda_swab32(buf[0].sc);
+ vec.sd = cuda_swab32(buf[0].sd);
+ vec.se = cuda_swab32(buf[0].se);
+ vec.sf = cuda_swab32(buf[0].sf);
+ return vec;
+}
+
+static __forceinline__ __device__ uint16 swapvec(const uint16 &buf)
+{
+ uint16 vec;
+ vec.s0 = cuda_swab32(buf.s0);
+ vec.s1 = cuda_swab32(buf.s1);
+ vec.s2 = cuda_swab32(buf.s2);
+ vec.s3 = cuda_swab32(buf.s3);
+ vec.s4 = cuda_swab32(buf.s4);
+ vec.s5 = cuda_swab32(buf.s5);
+ vec.s6 = cuda_swab32(buf.s6);
+ vec.s7 = cuda_swab32(buf.s7);
+ vec.s8 = cuda_swab32(buf.s8);
+ vec.s9 = cuda_swab32(buf.s9);
+ vec.sa = cuda_swab32(buf.sa);
+ vec.sb = cuda_swab32(buf.sb);
+ vec.sc = cuda_swab32(buf.sc);
+ vec.sd = cuda_swab32(buf.sd);
+ vec.se = cuda_swab32(buf.se);
+ vec.sf = cuda_swab32(buf.sf);
+ return vec;
+}
+
+#endif // #ifndef CUDA_VECTOR_H
diff --git a/neoscrypt/neoscrypt.c b/neoscrypt/neoscrypt.c
new file mode 100644
index 0000000..59150ed
--- /dev/null
+++ b/neoscrypt/neoscrypt.c
@@ -0,0 +1,992 @@
+/*
+ * Copyright (c) 2009 Colin Percival, 2011 ArtForz
+ * Copyright (c) 2012 Andrew Moon (floodyberry)
+ * Copyright (c) 2012 Samuel Neves
+ * Copyright (c) 2014 John Doering
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "neoscrypt.h"
+
+
+#if (WINDOWS)
+/* sizeof(unsigned long) = 4 for MinGW64 */
+typedef unsigned long long ulong;
+#else
+typedef unsigned long ulong;
+#endif
+typedef unsigned int uint;
+typedef unsigned char uchar;
+typedef unsigned int bool;
+
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+
+/* SHA-256 */
+
+static const uint32_t sha256_constants[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) (((x | y) & z) | (x & y))
+#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
+#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
+#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3))
+#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10))
+#define W0(in,i) (U8TO32_BE(&in[i * 4]))
+#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])
+#define STEP(i) \
+ t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
+ t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \
+ r[7] = r[6]; \
+ r[6] = r[5]; \
+ r[5] = r[4]; \
+ r[4] = r[3] + t0; \
+ r[3] = r[2]; \
+ r[2] = r[1]; \
+ r[1] = r[0]; \
+ r[0] = t0 + t1;
+
+
+typedef struct sha256_hash_state_t {
+ uint32_t H[8];
+ uint64_t T;
+ uint32_t leftover;
+ uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} sha256_hash_state;
+
+
+static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks)
+{
+ uint32_t r[8], w[64], t0, t1;
+ size_t i;
+
+ for (i = 0; i < 8; i++)
+ r[i] = S->H[i];
+
+ while (blocks--) {
+ for (i = 0U; i < 16; i++) {
+ w[i] = W0(in, i);
+ }
+ for (i = 16; i < 64; i++) {
+ w[i] = W1(i);
+ }
+ for (i = 0U; i < 64; i++) {
+ STEP(i);
+ }
+ for (i = 0U; i < 8U; i++) {
+ r[i] += S->H[i];
+ S->H[i] = r[i];
+ }
+ S->T += SCRYPT_HASH_BLOCK_SIZE * 8;
+ in += SCRYPT_HASH_BLOCK_SIZE;
+ }
+}
+
+static void neoscrypt_hash_init_sha256(sha256_hash_state *S)
+{
+ S->H[0] = 0x6a09e667;
+ S->H[1] = 0xbb67ae85;
+ S->H[2] = 0x3c6ef372;
+ S->H[3] = 0xa54ff53a;
+ S->H[4] = 0x510e527f;
+ S->H[5] = 0x9b05688c;
+ S->H[6] = 0x1f83d9ab;
+ S->H[7] = 0x5be0cd19;
+ S->T = 0;
+ S->leftover = 0;
+}
+
+static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen)
+{
+ size_t blocks, want;
+
+ /* handle the previous data */
+ if (S->leftover) {
+ want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+ want = (want < inlen) ? want : inlen;
+ memcpy(S->buffer + S->leftover, in, want);
+ S->leftover += (uint32_t)want;
+ if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+ return;
+ in += want;
+ inlen -= want;
+ sha256_blocks(S, S->buffer, 1);
+ }
+
+ /* handle the current data */
+ blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
+ S->leftover = (uint32_t)(inlen - blocks);
+ if (blocks) {
+ sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
+ in += blocks;
+ }
+
+ /* handle leftover data */
+ if (S->leftover)
+ memcpy(S->buffer, in, S->leftover);
+}
+
+static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash)
+{
+ uint64_t t = S->T + (S->leftover * 8);
+
+ S->buffer[S->leftover] = 0x80;
+ if (S->leftover <= 55) {
+ memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
+ } else {
+ memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
+ sha256_blocks(S, S->buffer, 1);
+ memset(S->buffer, 0, 56);
+ }
+
+ U64TO8_BE(S->buffer + 56, t);
+ sha256_blocks(S, S->buffer, 1);
+
+ U32TO8_BE(&hash[ 0], S->H[0]);
+ U32TO8_BE(&hash[ 4], S->H[1]);
+ U32TO8_BE(&hash[ 8], S->H[2]);
+ U32TO8_BE(&hash[12], S->H[3]);
+ U32TO8_BE(&hash[16], S->H[4]);
+ U32TO8_BE(&hash[20], S->H[5]);
+ U32TO8_BE(&hash[24], S->H[6]);
+ U32TO8_BE(&hash[28], S->H[7]);
+}
+
+static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen)
+{
+ sha256_hash_state st;
+ neoscrypt_hash_init_sha256(&st);
+ neoscrypt_hash_update_sha256(&st, m, mlen);
+ neoscrypt_hash_finish_sha256(&st, hash);
+}
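+
+/* Illustrative only: a known-answer check against the FIPS 180-2 vector
+ * SHA-256("abc"); the NEOSCRYPT_SELFTEST guard is hypothetical and keeps
+ * this sketch out of the build. */
+#ifdef NEOSCRYPT_SELFTEST
+static int selftest_sha256(void)
+{
+ static const uint8_t expected[32] = {
+ 0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
+ 0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
+ 0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
+ 0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad
+ };
+ hash_digest digest;
+ neoscrypt_hash_sha256(digest, (const uint8_t *) "abc", 3);
+ return !memcmp(digest, expected, 32);
+}
+#endif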
+
+
+/* HMAC for SHA-256 */
+
+typedef struct sha256_hmac_state_t {
+ sha256_hash_state inner, outer;
+} sha256_hmac_state;
+
+static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen)
+{
+ uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
+ size_t i;
+
+ neoscrypt_hash_init_sha256(&st->inner);
+ neoscrypt_hash_init_sha256(&st->outer);
+
+ if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
+ /* use the key directly if it's <= blocksize bytes */
+ memcpy(pad, key, keylen);
+ } else {
+ /* if it's > blocksize bytes, hash it */
+ neoscrypt_hash_sha256(pad, key, keylen);
+ }
+
+ /* inner = (key ^ 0x36) */
+ /* h(inner || ...) */
+ for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
+ pad[i] ^= 0x36;
+ neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);
+
+ /* outer = (key ^ 0x5c) */
+ /* h(outer || ...) */
+ for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
+ pad[i] ^= (0x5c ^ 0x36);
+ neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
+}
+
+static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen)
+{
+ /* h(inner || m...) */
+ neoscrypt_hash_update_sha256(&st->inner, m, mlen);
+}
+
+static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac)
+{
+ /* h(inner || m) */
+ hash_digest innerhash;
+ neoscrypt_hash_finish_sha256(&st->inner, innerhash);
+
+ /* h(outer || h(inner || m)) */
+ neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash));
+ neoscrypt_hash_finish_sha256(&st->outer, mac);
+}
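+
+/* Illustrative only: the construction above follows RFC 2104, so with the
+ * well-known vector key = "key", m = "The quick brown fox jumps over the
+ * lazy dog" it should produce
+ * f7bc83f4 30538424 b13298e6 aa6fb143 ef4d59a1 49461759 97479dbc 2d1a3cd8.
+ * The NEOSCRYPT_SELFTEST guard is hypothetical. */
+#ifdef NEOSCRYPT_SELFTEST
+static void selftest_hmac_sha256(hash_digest mac)
+{
+ sha256_hmac_state st;
+ neoscrypt_hmac_init_sha256(&st, (const uint8_t *) "key", 3);
+ neoscrypt_hmac_update_sha256(&st,
+ (const uint8_t *) "The quick brown fox jumps over the lazy dog", 43);
+ neoscrypt_hmac_finish_sha256(&st, mac);
+}
+#endif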
+
+
+/* PBKDF2 for SHA-256 */
+
+static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len,
+ const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len)
+{
+ sha256_hmac_state hmac_pw, hmac_pw_salt, work;
+ hash_digest ti, u;
+ uint8_t be[4];
+ uint32_t i, j, k, blocks;
+
+ /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
+
+ /* hmac(password, ...) */
+ neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len);
+
+ /* hmac(password, salt...) */
+ hmac_pw_salt = hmac_pw;
+ neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len);
+
+ blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
+ for(i = 1; i <= blocks; i++) {
+ /* U1 = hmac(password, salt || be(i)) */
+ U32TO8_BE(be, i);
+ work = hmac_pw_salt;
+ neoscrypt_hmac_update_sha256(&work, be, 4);
+ neoscrypt_hmac_finish_sha256(&work, ti);
+ memcpy(u, ti, sizeof(u));
+
+ /* T[i] = U1 ^ U2 ^ U3... */
+ for(j = 0; j < N - 1; j++) {
+ /* UX = hmac(password, U{X-1}) */
+ work = hmac_pw;
+ neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE);
+ neoscrypt_hmac_finish_sha256(&work, u);
+
+ /* T[i] ^= UX */
+ for(k = 0; k < sizeof(u); k++)
+ ti[k] ^= u[k];
+ }
+
+ memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len);
+ output += SCRYPT_HASH_DIGEST_SIZE;
+ output_len -= SCRYPT_HASH_DIGEST_SIZE;
+ }
+}
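+
+/* Usage note: the Scrypt profile of the core below runs this PBKDF2 with a
+ * single iteration (N = 1) and the 80-byte block header as both password and
+ * salt; its only job there is to stretch the header into the working block X
+ * and, on the final pass, to fold X back into the output. */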
+
+
+/* NeoScrypt */
+
+#if defined(ASM)
+
+extern void neoscrypt_salsa(uint *X, uint rounds);
+extern void neoscrypt_salsa_tangle(uint *X, uint count);
+extern void neoscrypt_chacha(uint *X, uint rounds);
+
+extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len);
+extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len);
+extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len);
+
+#else
+
+/* Salsa20, rounds must be a multiple of 2 */
+static void neoscrypt_salsa(uint *X, uint rounds)
+{
+ uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t;
+
+ x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3];
+ x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7];
+ x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11];
+ x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15];
+
+#define quarter(a, b, c, d) \
+ t = a + d; t = ROTL32(t, 7); b ^= t; \
+ t = b + a; t = ROTL32(t, 9); c ^= t; \
+ t = c + b; t = ROTL32(t, 13); d ^= t; \
+ t = d + c; t = ROTL32(t, 18); a ^= t;
+
+ for(; rounds; rounds -= 2) {
+ quarter( x0, x4, x8, x12);
+ quarter( x5, x9, x13, x1);
+ quarter(x10, x14, x2, x6);
+ quarter(x15, x3, x7, x11);
+ quarter( x0, x1, x2, x3);
+ quarter( x5, x6, x7, x4);
+ quarter(x10, x11, x8, x9);
+ quarter(x15, x12, x13, x14);
+ }
+
+ X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3;
+ X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7;
+ X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11;
+ X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15;
+
+#undef quarter
+}
+
+/* ChaCha20, rounds must be a multiple of 2 */
+static void neoscrypt_chacha(uint *X, uint rounds)
+{
+ uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t;
+
+ x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3];
+ x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7];
+ x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11];
+ x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15];
+
+#define quarter(a,b,c,d) \
+ a += b; t = d ^ a; d = ROTL32(t, 16); \
+ c += d; t = b ^ c; b = ROTL32(t, 12); \
+ a += b; t = d ^ a; d = ROTL32(t, 8); \
+ c += d; t = b ^ c; b = ROTL32(t, 7);
+
+ for(; rounds; rounds -= 2) {
+ quarter( x0, x4, x8, x12);
+ quarter( x1, x5, x9, x13);
+ quarter( x2, x6, x10, x14);
+ quarter( x3, x7, x11, x15);
+ quarter( x0, x5, x10, x15);
+ quarter( x1, x6, x11, x12);
+ quarter( x2, x7, x8, x13);
+ quarter( x3, x4, x9, x14);
+ }
+
+ X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3;
+ X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7;
+ X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11;
+ X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15;
+
+#undef quarter
+}
+
+
+/* Fast 32-bit / 64-bit memcpy();
+ * len must be a multiple of 32 bytes */
+static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len)
+{
+ ulong *dst = (ulong *) dstp;
+ ulong *src = (ulong *) srcp;
+ uint i;
+
+ for(i = 0; i < (len / sizeof(ulong)); i += 4) {
+ dst[i] = src[i];
+ dst[i + 1] = src[i + 1];
+ dst[i + 2] = src[i + 2];
+ dst[i + 3] = src[i + 3];
+ }
+}
+
+/* Fast 32-bit / 64-bit block swapper;
+ * len must be a multiple of 32 bytes */
+static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len)
+{
+ ulong *blkA = (ulong *) blkAp;
+ ulong *blkB = (ulong *) blkBp;
+ register ulong t0, t1, t2, t3;
+ uint i;
+
+ for(i = 0; i < (len / sizeof(ulong)); i += 4) {
+ t0 = blkA[i];
+ t1 = blkA[i + 1];
+ t2 = blkA[i + 2];
+ t3 = blkA[i + 3];
+ blkA[i] = blkB[i];
+ blkA[i + 1] = blkB[i + 1];
+ blkA[i + 2] = blkB[i + 2];
+ blkA[i + 3] = blkB[i + 3];
+ blkB[i] = t0;
+ blkB[i + 1] = t1;
+ blkB[i + 2] = t2;
+ blkB[i + 3] = t3;
+ }
+}
+
+/* Fast 32-bit / 64-bit block XOR engine;
+ * len must be a multiple of 32 bytes */
+static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len)
+{
+ ulong *dst = (ulong *) dstp;
+ ulong *src = (ulong *) srcp;
+ uint i;
+
+ for (i = 0; i < (len / sizeof(ulong)); i += 4) {
+ dst[i] ^= src[i];
+ dst[i + 1] ^= src[i + 1];
+ dst[i + 2] ^= src[i + 2];
+ dst[i + 3] ^= src[i + 3];
+ }
+}
+
+#endif
+
+/* 32-bit / 64-bit optimised memcpy() */
+static void neoscrypt_copy(void *dstp, const void *srcp, uint len)
+{
+ ulong *dst = (ulong *) dstp;
+ ulong *src = (ulong *) srcp;
+ uint i, tail;
+
+ for(i = 0; i < (len / sizeof(ulong)); i++)
+ dst[i] = src[i];
+
+ tail = len & (sizeof(ulong) - 1);
+ if(tail) {
+ uchar *dstb = (uchar *) dstp;
+ uchar *srcb = (uchar *) srcp;
+
+ for(i = len - tail; i < len; i++)
+ dstb[i] = srcb[i];
+ }
+}
+
+/* 32-bit / 64-bit optimised memory erase aka memset() to zero */
+static void neoscrypt_erase(void *dstp, uint len)
+{
+ const ulong null = 0;
+ ulong *dst = (ulong *) dstp;
+ uint i, tail;
+
+ for (i = 0; i < (len / sizeof(ulong)); i++)
+ dst[i] = null;
+
+ tail = len & (sizeof(ulong) - 1);
+ if (tail) {
+ uchar *dstb = (uchar *) dstp;
+
+ for(i = len - tail; i < len; i++)
+ dstb[i] = (uchar)null;
+ }
+}
+
+/* 32-bit / 64-bit optimised XOR engine */
+static void neoscrypt_xor(void *dstp, const void *srcp, uint len)
+{
+ ulong *dst = (ulong *) dstp;
+ ulong *src = (ulong *) srcp;
+ uint i, tail;
+
+ for (i = 0; i < (len / sizeof(ulong)); i++)
+ dst[i] ^= src[i];
+
+ tail = len & (sizeof(ulong) - 1);
+ if (tail) {
+ uchar *dstb = (uchar *) dstp;
+ uchar *srcb = (uchar *) srcp;
+
+ for(i = len - tail; i < len; i++)
+ dstb[i] ^= srcb[i];
+ }
+}
+
+
+/* BLAKE2s */
+
+#define BLAKE2S_BLOCK_SIZE 64U
+#define BLAKE2S_OUT_SIZE 32U
+#define BLAKE2S_KEY_SIZE 32U
+
+/* Parameter block of 32 bytes */
+typedef struct blake2s_param_t {
+ uchar digest_length;
+ uchar key_length;
+ uchar fanout;
+ uchar depth;
+ uint leaf_length;
+ uchar node_offset[6];
+ uchar node_depth;
+ uchar inner_length;
+ uchar salt[8];
+ uchar personal[8];
+} blake2s_param;
+
+/* State block of 180 bytes */
+typedef struct blake2s_state_t {
+ uint h[8];
+ uint t[2];
+ uint f[2];
+ uchar buf[2 * BLAKE2S_BLOCK_SIZE];
+ uint buflen;
+} blake2s_state;
+
+static const uint blake2s_IV[8] = {
+ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+ 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+};
+
+static const uint8_t blake2s_sigma[10][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
+};
+
+static void blake2s_compress(blake2s_state *S, const uint *buf)
+{
+ uint i;
+ uint m[16];
+ uint v[16];
+
+ neoscrypt_copy(m, buf, 64);
+ neoscrypt_copy(v, S, 32);
+
+ v[ 8] = blake2s_IV[0];
+ v[ 9] = blake2s_IV[1];
+ v[10] = blake2s_IV[2];
+ v[11] = blake2s_IV[3];
+ v[12] = S->t[0] ^ blake2s_IV[4];
+ v[13] = S->t[1] ^ blake2s_IV[5];
+ v[14] = S->f[0] ^ blake2s_IV[6];
+ v[15] = S->f[1] ^ blake2s_IV[7];
+
+#define G(r,i,a,b,c,d) do { \
+ a = a + b + m[blake2s_sigma[r][2*i+0]]; \
+ d = ROTR32(d ^ a, 16); \
+ c = c + d; \
+ b = ROTR32(b ^ c, 12); \
+ a = a + b + m[blake2s_sigma[r][2*i+1]]; \
+ d = ROTR32(d ^ a, 8); \
+ c = c + d; \
+ b = ROTR32(b ^ c, 7); \
+} while(0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[ 2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[ 3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[ 0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[ 1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
+} while(0)
+
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+ for (i = 0; i < 8; i++)
+ S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+
+#undef G
+#undef ROUND
+}
+
+static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size)
+{
+ uint left, fill;
+
+ while(input_size > 0) {
+ left = S->buflen;
+ fill = 2 * BLAKE2S_BLOCK_SIZE - left;
+ if(input_size > fill) {
+ /* Buffer fill */
+ neoscrypt_copy(S->buf + left, input, fill);
+ S->buflen += fill;
+ /* Counter increment */
+ S->t[0] += BLAKE2S_BLOCK_SIZE;
+ /* Compress */
+ blake2s_compress(S, (uint *) S->buf);
+ /* Shift buffer left */
+ neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE);
+ S->buflen -= BLAKE2S_BLOCK_SIZE;
+ input += fill;
+ input_size -= fill;
+ } else {
+ neoscrypt_copy(S->buf + left, input, input_size);
+ S->buflen += input_size;
+ /* Do not compress */
+ input += input_size;
+ input_size = 0;
+ }
+ }
+}
+
+static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size,
+ void *output, const uchar output_size)
+{
+ uchar block[BLAKE2S_BLOCK_SIZE];
+ blake2s_param P[1];
+ blake2s_state S[1];
+
+ /* Initialise */
+ neoscrypt_erase(P, 32);
+ P->digest_length = output_size;
+ P->key_length = key_size;
+ P->fanout = 1;
+ P->depth = 1;
+
+ neoscrypt_erase(S, 180);
+ neoscrypt_copy(S, blake2s_IV, 32);
+ neoscrypt_xor(S, P, 32);
+
+ neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE);
+ neoscrypt_copy(block, key, key_size);
+ blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE);
+
+ /* Update */
+ blake2s_update(S, (uchar *) input, input_size);
+
+ /* Finish */
+ if(S->buflen > BLAKE2S_BLOCK_SIZE) {
+ S->t[0] += BLAKE2S_BLOCK_SIZE;
+ blake2s_compress(S, (uint *) S->buf);
+ S->buflen -= BLAKE2S_BLOCK_SIZE;
+ neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen);
+ }
+ S->t[0] += S->buflen;
+ S->f[0] = ~0U;
+ neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen);
+ blake2s_compress(S, (uint *) S->buf);
+
+ /* Write back */
+ neoscrypt_copy(output, S, output_size);
+
+ //for (int k = 0; k<4; k++) { printf("cpu blake %d %08x %08x\n", k, ((unsigned int*)output)[2 * k], ((unsigned int*)output)[2 * k + 1]); }
+}
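+
+/* Usage note: FastKDF below drives this as its PRF with a fixed geometry of
+ * a 64-byte input (BLAKE2S_BLOCK_SIZE), a 32-byte key (BLAKE2S_KEY_SIZE) and
+ * a 32-byte output (BLAKE2S_OUT_SIZE); since the key block is always hashed,
+ * this is the keyed BLAKE2s construction rather than the plain hash. */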
+
+
+#define FASTKDF_BUFFER_SIZE 256U
+
+/* FastKDF, a fast buffered key derivation function:
+ * FASTKDF_BUFFER_SIZE must be a power of 2;
+ * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
+ * prf_output_size must be <= prf_key_size; */
+static void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len,
+ uint N, uchar *output, uint output_len)
+{
+ //for (int i = 0; i<10; i++) { printf("cpu password %d %08x %08x\n", i, ((unsigned int*)password)[2 * i], ((unsigned int*)password)[2 * i+1]); }
+ const uint stack_align = 0x40;
+ const uint kdf_buf_size = 256U; //FASTKDF_BUFFER_SIZE
+ const uint prf_input_size = 64U; //BLAKE2S_BLOCK_SIZE
+ const uint prf_key_size = 32U; //BLAKE2S_KEY_SIZE
+ const uint prf_output_size = 32U; //BLAKE2S_OUT_SIZE
+ uint bufptr, a, b, i, j;
+ uchar *A, *B, *prf_input, *prf_key, *prf_output;
+ uchar *stack;
+ /* Heap allocation replaces the original stack buffer,
+ * uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align]; */
+ stack = (uchar *) malloc(2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align);
+
+ /* Align and set up the buffers */
+ A = &stack[stack_align & ~(stack_align - 1)];
+ B = &A[kdf_buf_size + prf_input_size];
+ prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size];
+
+ /* Initialise the password buffer */
+ if(password_len > kdf_buf_size)
+ password_len = kdf_buf_size;
+
+ a = kdf_buf_size / password_len;
+ for(i = 0; i < a; i++)
+ neoscrypt_copy(&A[i * password_len], &password[0], password_len);
+ b = kdf_buf_size - a * password_len;
+ if(b)
+ neoscrypt_copy(&A[a * password_len], &password[0], b);
+ neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size);
+
+ /* Initialise the salt buffer */
+ if(salt_len > kdf_buf_size)
+ salt_len = kdf_buf_size;
+
+ a = kdf_buf_size / salt_len;
+ for(i = 0; i < a; i++)
+ neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len);
+ b = kdf_buf_size - a * salt_len;
+ if(b)
+ neoscrypt_copy(&B[a * salt_len], &salt[0], b);
+ neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size);
+
+ /* The primary iteration */
+ for(i = 0, bufptr = 0; i < N; i++) {
+
+ /* Map the PRF input buffer */
+ prf_input = &A[bufptr];
+
+ /* Map the PRF key buffer */
+ prf_key = &B[bufptr];
+
+ /* PRF */
+ // for (int k = 0; k<(prf_input_size/4); k++) { printf("cpu bufptr %08x before blake %d %d %08x \n",bufptr, i, k, ((unsigned int*)prf_input)[k]); }
+ neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size);
+ // for (int k = 0; k<(prf_output_size/4); k++) { printf("cpu after blake %d %d %08x \n", i, k, ((unsigned int*)prf_output)[k]); }
+
+ /* Calculate the next buffer pointer */
+ for(j = 0, bufptr = 0; j < prf_output_size; j++)
+ bufptr += prf_output[j];
+ bufptr &= (kdf_buf_size - 1);
+
+ /* Modify the salt buffer */
+ neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size);
+
+ /* Head modified, tail updated */
+ if(bufptr < prf_key_size)
+ neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr));
+
+ /* Tail modified, head updated */
+ if((kdf_buf_size - bufptr) < prf_output_size)
+ neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr));
+ }
+
+ /* Modify and copy into the output buffer */
+ if(output_len > kdf_buf_size)
+ output_len = kdf_buf_size;
+
+ a = kdf_buf_size - bufptr;
+ if(a >= output_len) {
+ neoscrypt_xor(&B[bufptr], &A[0], output_len);
+ neoscrypt_copy(&output[0], &B[bufptr], output_len);
+ } else {
+ neoscrypt_xor(&B[bufptr], &A[0], a);
+ neoscrypt_xor(&B[0], &A[a], output_len - a);
+ neoscrypt_copy(&output[0], &B[bufptr], a);
+ neoscrypt_copy(&output[a], &B[0], output_len - a);
+ }
+ // for (int i = 0; i<10; i++) { printf("cpu fastkdf %d %08x %08x\n", i, ((unsigned int*)output)[2 * i], ((unsigned int*)output)[2 * i + 1]); }
+
+ free(stack);
+}
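+
+/* Illustrative only: the two ways the NeoScrypt core below invokes FastKDF.
+ * The first call stretches the 80-byte block header into the working block X
+ * over 32 mixing iterations; the second compresses the mixed X into the
+ * 32-byte hash. The NEOSCRYPT_SELFTEST guard is hypothetical. */
+#ifdef NEOSCRYPT_SELFTEST
+static void example_fastkdf_usage(const uchar *header, uchar *X, uint blk_len, uchar *hash)
+{
+ /* X = KDF(password, salt = password) */
+ neoscrypt_fastkdf(header, 80, header, 80, 32, X, blk_len);
+ /* output = KDF(password, salt = X) */
+ neoscrypt_fastkdf(header, 80, X, blk_len, 32, hash, 32);
+}
+#endif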
+
+
+/* Configurable optimised block mixer */
+static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode)
+{
+ uint i, mixer, rounds;
+
+ mixer = mixmode >> 8;
+ rounds = mixmode & 0xFF;
+
+ /* NeoScrypt flow: Scrypt flow:
+ Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa";
+ Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb";
+ Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya;
+ Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb;
+ Xa" = Ya; Xb" = Yc;
+ Xc" = Yb; Xd" = Yd; */
+
+ if (r == 1) {
+ neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE);
+ if(mixer)
+ neoscrypt_chacha(&X[0], rounds);
+ else
+ neoscrypt_salsa(&X[0], rounds);
+ neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE);
+ if(mixer)
+ neoscrypt_chacha(&X[16], rounds);
+ else
+ neoscrypt_salsa(&X[16], rounds);
+ return;
+ }
+
+ if (r == 2) {
+ neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE);
+ if(mixer)
+ neoscrypt_chacha(&X[0], rounds);
+ else
+ neoscrypt_salsa(&X[0], rounds);
+ neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE);
+ if(mixer)
+ neoscrypt_chacha(&X[16], rounds);
+ else
+ neoscrypt_salsa(&X[16], rounds);
+ neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE);
+ if(mixer)
+ neoscrypt_chacha(&X[32], rounds);
+ else
+ neoscrypt_salsa(&X[32], rounds);
+ neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE);
+ if(mixer)
+ neoscrypt_chacha(&X[48], rounds);
+ else
+ neoscrypt_salsa(&X[48], rounds);
+ neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE);
+ return;
+ }
+
+ /* Reference code for any reasonable r */
+ for (i = 0; i < 2 * r; i++) {
+ if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE);
+ else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE);
+ if(mixer)
+ neoscrypt_chacha(&X[16 * i], rounds);
+ else
+ neoscrypt_salsa(&X[16 * i], rounds);
+ neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE);
+ }
+ for (i = 0; i < r; i++)
+ neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE);
+ for (i = 0; i < r; i++)
+ neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE);
+}
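+
+/* Worked example for r = 1, i.e. two 64-byte blocks Xa = X[0..15] and
+ * Xb = X[16..31]: Xa ^= Xb; M(Xa); Xb ^= Xa; M(Xb); no final swap is needed
+ * because Ya and Yb already sit in place, which is exactly the r == 1 fast
+ * path above. */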
+
+/* NeoScrypt core engine:
+ * p = 1, salt = password;
+ * Basic customisation (required):
+ * profile bit 0:
+ * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20;
+ * 1 = Scrypt(1024, 1, 1) with Salsa20/8;
+ * profile bits 4 to 1:
+ * 0000 = FastKDF-BLAKE2s;
+ * 0001 = PBKDF2-HMAC-SHA256;
+ * Extended customisation (optional):
+ * profile bit 31:
+ * 0 = extended customisation absent;
+ * 1 = extended customisation present;
+ * profile bits 7 to 5 (rfactor):
+ * 000 = r of 1;
+ * 001 = r of 2;
+ * 010 = r of 4;
+ * ...
+ * 111 = r of 128;
+ * profile bits 12 to 8 (Nfactor):
+ * 00000 = N of 2;
+ * 00001 = N of 4;
+ * 00010 = N of 8;
+ * .....
+ * 00110 = N of 128;
+ * .....
+ * 01001 = N of 1024;
+ * .....
+ * 11110 = N of 2147483648;
+ * profile bits 30 to 13 are reserved */
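+/* Example: the profile 0x80000620 used by the built-in hash test decodes
+ * as bit 31 = 1 (extended customisation present), Nfactor = 00110 (N = 128),
+ * rfactor = 001 (r = 2), KDF = 0000 (FastKDF-BLAKE2s) and bit 0 = 0, i.e.
+ * standard NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20. */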
+void neoscrypt(const uchar *password, uchar *output, uint profile)
+{
+ uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40;
+ uint kdf, i, j;
+ uint *X, *Y, *Z, *V;
+
+ if(profile & 0x1) {
+ N = 1024; /* N = (1 << (Nfactor + 1)); */
+ r = 1; /* r = (1 << rfactor); */
+ dblmix = 0; /* Salsa only */
+ mixmode = 0x08; /* 8 rounds */
+ }
+
+ if(profile >> 31) {
+ N = (1 << (((profile >> 8) & 0x1F) + 1));
+ r = (1 << ((profile >> 5) & 0x7));
+ }
+ uchar *stack = (uchar *) malloc((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align);
+ /* X = r * 2 * SCRYPT_BLOCK_SIZE, rounded up to a stack_align boundary */
+ X = (uint *) (((size_t) stack + (stack_align - 1)) & ~((size_t) (stack_align - 1)));
+ /* Z is a copy of X for ChaCha */
+ Z = &X[32 * r];
+ /* Y is an X sized temporal space */
+ Y = &X[64 * r];
+ /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */
+ V = &X[96 * r];
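+ /* Layout check: X, Z and Y hold 32 * r words (r * 2 * SCRYPT_BLOCK_SIZE
+  * bytes) each and V holds N such blocks, (N + 3) * 32 * r words in
+  * total, which matches the allocation above. */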
+
+ /* X = KDF(password, salt) */
+ kdf = (profile >> 1) & 0xF;
+
+ switch(kdf) {
+
+ default:
+ case(0x0):
+ neoscrypt_fastkdf(password, 80, password, 80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE);
+ break;
+
+ case(0x1):
+ neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE);
+ break;
+ }
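+ /* Either way the 80-byte input (a block header) serves as both password
+  * and salt, and the derived r * 2 * SCRYPT_BLOCK_SIZE bytes become the
+  * initial working state X. */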
+
+ /* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */
+
+ if(dblmix) {
+ /* blkcpy(Z, X) */
+ neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE);
+
+ /* Z = SMix(Z) */
+ for(i = 0; i < N; i++) {
+ /* blkcpy(V, Z) */
+ neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE);
+ /* blkmix(Z, Y) */
+ neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100));
+ }
+ for(i = 0; i < N; i++) {
+ /* integerify(Z) mod N */
+ j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1));
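+ /* Z[16 * (2 * r - 1)] is the first word of the last 64-byte block;
+  * N is a power of two, so the mask implements mod N. */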
+ /* blkxor(Z, V) */
+ neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE);
+ /* blkmix(Z, Y) */
+ neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100));
+ }
+ }
+
+#if (ASM)
+ /* Must be called before and after SSE2 Salsa */
+ neoscrypt_salsa_tangle(&X[0], r * 2);
+#endif
+
+ /* X = SMix(X) */
+ for(i = 0; i < N; i++) {
+ /* blkcpy(V, X) */
+ neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE);
+ /* blkmix(X, Y) */
+ neoscrypt_blkmix(&X[0], &Y[0], r, mixmode);
+ }
+ for(i = 0; i < N; i++) {
+ /* integerify(X) mod N */
+ j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1));
+ /* blkxor(X, V) */
+ neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE);
+ /* blkmix(X, Y) */
+ neoscrypt_blkmix(&X[0], &Y[0], r, mixmode);
+ }
+
+#if (ASM)
+ neoscrypt_salsa_tangle(&X[0], r * 2);
+#endif
+
+ if(dblmix)
+ /* blkxor(X, Z) */
+ neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE);
+
+ /* output = KDF(password, X) */
+ switch(kdf) {
+
+ default:
+ case(0x0):
+ neoscrypt_fastkdf(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32);
+ break;
+
+ case(0x1):
+ neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32);
+ break;
+ }
+
+ free(stack);
+}
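+
+/* Usage sketch: hashing an 80-byte block header with the profile above
+ * ("header" is an illustrative name):
+ *
+ *   uchar hash[32];
+ *   neoscrypt(header, hash, 0x80000620);
+ */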
+
diff --git a/neoscrypt/neoscrypt.h b/neoscrypt/neoscrypt.h
new file mode 100644
index 0000000..5c4d4e4
--- /dev/null
+++ b/neoscrypt/neoscrypt.h
@@ -0,0 +1,33 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile);
+
+#ifdef __cplusplus
+}
+#else
+
+#define SCRYPT_BLOCK_SIZE 64
+#define SCRYPT_HASH_BLOCK_SIZE 64
+#define SCRYPT_HASH_DIGEST_SIZE 32
+
+typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
+
+#define U8TO32_BE(p) \
+ (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
+ ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3])))
+
+#define U32TO8_BE(p, v) \
+ (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
+ (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
+
+#define U64TO8_BE(p, v) \
+ U32TO8_BE((p), (uint32_t)((v) >> 32)); \
+ U32TO8_BE((p) + 4, (uint32_t)((v) ));
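+/* Example: U8TO32_BE on bytes {0x01, 0x02, 0x03, 0x04} yields 0x01020304;
+ * U32TO8_BE is its inverse and U64TO8_BE emits the high word first. */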
+
+#endif
+
diff --git a/util.cpp b/util.cpp
index 577e87a..6c8e298 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1777,6 +1777,9 @@ void print_hash_tests(void)
myriadhash(&hash[0], &buf[0]);
printpfx("myriad", hash);
+ neoscrypt(&buf[0], &hash[0], 0x80000620);
+ printpfx("neoscrypt", hash);
+
nist5hash(&hash[0], &buf[0]);
printpfx("nist5", hash);