From d4e191610ee4a6aabed87f4089d12d22bfdafcb3 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Tue, 18 Aug 2015 09:27:11 +0200
Subject: [PATCH] Import and adapt lyra2v2 (not tested on Windows or with SM <= 5)

---
 Algo256/cuda_bmw256.cu      |  320 ++++
 Algo256/cuda_cubehash256.cu |  260 ++++
 Makefile.am                 |    2 +
 ccminer.cpp                 |   15 +-
 ccminer.vcxproj             |    4 +
 ccminer.vcxproj.filters     |   12 +
 cuda_helper.h               |   31 +
 lyra2/Lyra2.c               |   95 +--
 lyra2/Lyra2.h               |   10 +-
 lyra2/Sponge.c              |  867 ++++++------
 lyra2/Sponge.h              |   90 ++-
 lyra2/cuda_lyra2.cu         |  674 +++++++----
 lyra2/cuda_lyra2_vectors.h  | 1138 +++++++++++
 lyra2/cuda_lyra2v2.cu       |  520 ++++
 lyra2/lyra2RE.cu            |   14 +-
 lyra2/lyra2REv2.cu          |  164 ++
 miner.h                     |    9 +-
 util.cpp                    |    5 +-
 18 files changed, 3328 insertions(+), 902 deletions(-)
 create mode 100644 Algo256/cuda_bmw256.cu
 create mode 100644 Algo256/cuda_cubehash256.cu
 create mode 100644 lyra2/cuda_lyra2_vectors.h
 create mode 100644 lyra2/cuda_lyra2v2.cu
 create mode 100644 lyra2/lyra2REv2.cu

diff --git a/Algo256/cuda_bmw256.cu b/Algo256/cuda_bmw256.cu
new file mode 100644
index 0000000..5086007
--- /dev/null
+++ b/Algo256/cuda_bmw256.cu
@@ -0,0 +1,320 @@
+#include <stdio.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+
+// the message with padding, for computation on the GPU
+__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
+__constant__ uint32_t ZDH[16];
+static uint32_t *d_gnounce[MAX_GPUS];
+static uint32_t *d_GNonce[MAX_GPUS];
+
+__constant__ uint32_t pTarget[8];
+
+#define shl(x, n) ((x) << (n))
+#define shr(x, n) ((x) >> (n))
+//#define SHR(x, n) SHR2(x, n)
+//#define SHL(x, n) SHL2(x, n)
+
+#define ROTL32host(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+// #define SPH_ROTL32 SPH_ROTL32
+#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
+#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23))
+#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
+#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
+#define ss4(x) (shr((x), 1) ^ (x))
+#define ss5(x) (shr((x), 2) ^ (x))
+#define rs1(x) SPH_ROTL32((x), 3)
+#define rs2(x) SPH_ROTL32((x), 7)
+#define rs3(x) SPH_ROTL32((x), 13)
+#define rs4(x) SPH_ROTL32((x), 16)
+#define rs5(x) SPH_ROTL32((x), 19)
+#define rs6(x) SPH_ROTL32((x), 23)
+#define rs7(x) SPH_ROTL32((x), 27)
+
+/* Message expansion function 1 */
+__forceinline__ __device__ uint32_t expand32_1(int i, uint32_t *M32, uint32_t *H, uint32_t *Q)
+{
+#undef SPH_ROTL32
+#define SPH_ROTL32 ROTL32
+	return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13])
+		+ ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9])
+		+ ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5])
+		+ ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1])
+		+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
+#undef SPH_ROTL32
+}
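+/* Note on the two expansion functions: expand32_1 applies the heavier
+ * ss0..ss3 mixing to all 16 predecessor words, while expand32_2 (below)
+ * replaces most of that with the cheap rotations rs1..rs7 plus ss4/ss5.
+ * Both share the same final "AddElement" term:
+ *   (i*0x05555555 + rotated message words) ^ H[(i - 16 + 7) % 16]
+ */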
+/* Message expansion function 2 */
+__forceinline__ __device__ uint32_t expand32_2(int i, uint32_t *M32, uint32_t *H, uint32_t *Q)
+{
+#undef SPH_ROTL32
+#define SPH_ROTL32 ROTL32
+	return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13])
+		+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9])
+		+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5])
+		+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])
+		+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
+#undef SPH_ROTL32
+}
+
+__forceinline__ __device__ void Compression256(uint32_t * M32, uint32_t * H)
+{
+#undef SPH_ROTL32
+#define SPH_ROTL32 ROTL32
+	int i;
+	uint32_t XL32, XH32, Q[32];
+
+	Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
+	Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
+	Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
+	Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
+	Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
+	Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
+	Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
+	Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
+	Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
+	Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
+	Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
+	Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
+	Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
+	Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
+	Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
+	Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);
+
+	/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe. */
+	Q[0] = ss0(Q[0]) + H[1];
+	Q[1] = ss1(Q[1]) + H[2];
+	Q[2] = ss2(Q[2]) + H[3];
+	Q[3] = ss3(Q[3]) + H[4];
+	Q[4] = ss4(Q[4]) + H[5];
+	Q[5] = ss0(Q[5]) + H[6];
+	Q[6] = ss1(Q[6]) + H[7];
+	Q[7] = ss2(Q[7]) + H[8];
+	Q[8] = ss3(Q[8]) + H[9];
+	Q[9] = ss4(Q[9]) + H[10];
+	Q[10] = ss0(Q[10]) + H[11];
+	Q[11] = ss1(Q[11]) + H[12];
+	Q[12] = ss2(Q[12]) + H[13];
+	Q[13] = ss3(Q[13]) + H[14];
+	Q[14] = ss4(Q[14]) + H[15];
+	Q[15] = ss0(Q[15]) + H[0];
+
+	/* This is the Message expansion or f_1 in the documentation. */
+	/* It has 16 rounds. */
+	/* Blue Midnight Wish has two tunable security parameters. */
+	/* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */
+	/* The following relation between these parameters should be satisfied: */
+	/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
+
+	for (i = 0; i < 2; i++)
+		Q[i + 16] = expand32_1(i + 16, M32, H, Q);
+
+	for (i = 2; i < 16; i++)
+		Q[i + 16] = expand32_2(i + 16, M32, H, Q);
+	/* Blue Midnight Wish has two temporary cumulative variables that accumulate via XORing */
+	/* the 16 new variables that are produced in the Message Expansion part. */
+	XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
+	XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
+
+	/* This part is the function f_2 in the documentation */
+
+	/* Compute the double chaining pipe for the next message block. */
+	H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
+	H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
+	H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
+	H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
+	H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
+	H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
+	H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
+	H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
+
+	H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
+	H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
+	H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
+	H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
+	H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
+	H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
+	H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
+	H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
+
+#undef SPH_ROTL32
+}
+
+__forceinline__ __device__ void Compression256_2(uint32_t * M32, uint32_t * H)
+{
+#undef SPH_ROTL32
+#define SPH_ROTL32 ROTL32
+	int i;
+	uint32_t XL32, XH32, Q[32];
+
+	/* This part is the function f0 in the documentation */
+
+	/* First we mix the message block *M32 (M in the documentation) */
+	/* with the previous double pipe *H. */
+	/* For a fixed previous double pipe, or fixed message block, this */
+	/* part is a bijection. */
+	/* This transformation diffuses every one-bit difference into 5 words. */
+	Q[0] = (H[5]) - (H[7]) + (H[10]) + (H[13]) + (0x280 ^ H[14]);
+	Q[1] = (H[6]) - (H[8]) + (H[11]) + (0x280 ^ H[14]) - (H[15]);
+	Q[2] = (M32[0] ^ H[0]) + (H[7]) + (H[9]) - (H[12]) + (H[15]);
+	Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (H[8]) - (H[10]) + (H[13]);
+	Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (H[9]) - (H[11]) - (0x280 ^ H[14]);
+	Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (H[10]) - (H[12]) + (H[15]);
+	Q[6] = (0x80 ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (H[11]) + (H[13]);
+	Q[7] = (M32[1] ^ H[1]) - (0x80 ^ H[4]) - (H[5]) - (H[12]) - (0x280 ^ H[14]);
+	Q[8] = (M32[2] ^ H[2]) - (H[5]) - (H[6]) + (H[13]) - (H[15]);
+	Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (H[6]) - (H[7]) + (0x280 ^ H[14]);
+	Q[10] = (H[8]) - (M32[1] ^ H[1]) - (0x80 ^ H[4]) - (H[7]) + (H[15]);
+	Q[11] = (H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (H[5]) + (H[9]);
+	Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (H[6]) - (H[9]) + (H[10]);
+	Q[13] = (M32[2] ^ H[2]) + (0x80 ^ H[4]) + (H[7]) + (H[10]) + (H[11]);
+	Q[14] = (M32[3] ^ H[3]) - (H[5]) + (H[8]) - (H[11]) - (H[12]);
+	Q[15] = (H[12]) - (0x80 ^ H[4]) - (H[6]) - (H[9]) + (H[13]);
+
+	/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe. */
+	Q[0] = ss0(Q[0]) + H[1];
+	Q[1] = ss1(Q[1]) + H[2];
+	Q[2] = ss2(Q[2]) + H[3];
+	Q[3] = ss3(Q[3]) + H[4];
+	Q[4] = ss4(Q[4]) + H[5];
+	Q[5] = ss0(Q[5]) + H[6];
+	Q[6] = ss1(Q[6]) + H[7];
+	Q[7] = ss2(Q[7]) + H[8];
+	Q[8] = ss3(Q[8]) + H[9];
+	Q[9] = ss4(Q[9]) + H[10];
+	Q[10] = ss0(Q[10]) + H[11];
+	Q[11] = ss1(Q[11]) + H[12];
+	Q[12] = ss2(Q[12]) + H[13];
+	Q[13] = ss3(Q[13]) + H[14];
+	Q[14] = ss4(Q[14]) + H[15];
+	Q[15] = ss0(Q[15]) + H[0];
+
+	/* This is the Message expansion or f_1 in the documentation. */
+	/* It has 16 rounds. */
+	/* Blue Midnight Wish has two tunable security parameters. */
+	/* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */
+	/* The following relation between these parameters should be satisfied: */
+	/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
+
+	for (i = 0; i < 2; i++)
+		Q[i + 16] = expand32_1(i + 16, M32, H, Q);
+
+	for (i = 2; i < 16; i++)
+		Q[i + 16] = expand32_2(i + 16, M32, H, Q);
+
+	/* Blue Midnight Wish has two temporary cumulative variables that accumulate via XORing */
+	/* the 16 new variables that are produced in the Message Expansion part. */
+	XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
+	XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
+
+	/* This part is the function f_2 in the documentation */
+
+	/* Compute the double chaining pipe for the next message block. */
+	H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
+	H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
+	H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
+	H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
+	H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
+	H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
+	H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
+	H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
+
+	H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
+	H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
+	H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
+	H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
+	H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
+	H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
+	H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
+	H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
+
+#undef SPH_ROTL32
+}
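+/* Note: Compression256_2 appears to be a specialization of Compression256
+ * for the final block of an 80-byte message: only M32[0..3] (the last 16
+ * bytes) remain variable, the padding bit 0x80 is folded in at word 4,
+ * and the total bit length 0x280 (= 640 bits = 80 bytes) at word 14.
+ * The 32-byte kernel below calls only Compression256, so this variant
+ * looks unused here; it is kept as imported.
+ */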
+#define TPB 512
+__global__ __launch_bounds__(TPB, 2)
+void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *const __restrict__ nonceVector)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t dh[16] = {
+			(0x40414243), (0x44454647),
+			(0x48494A4B), (0x4C4D4E4F),
+			(0x50515253), (0x54555657),
+			(0x58595A5B), (0x5C5D5E5F),
+			(0x60616263), (0x64656667),
+			(0x68696A6B), (0x6C6D6E6F),
+			(0x70717273), (0x74757677),
+			(0x78797A7B), (0x7C7D7E7F)
+		};
+
+		uint32_t final_s[16] = {
+			(0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2),
+			(0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5),
+			(0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8),
+			(0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab),
+			(0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae),
+			(0xaaaaaaaf)
+		};
+
+		uint32_t message[16] = { 0 };
+		LOHI(message[0], message[1], __ldg(&g_hash[thread]));
+		LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads]));
+		LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads]));
+		LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads]));
+
+		message[8] = 0x80;
+		message[14] = 0x100;
+		Compression256(message, dh);
+		Compression256(dh, final_s);
+
+		if (((uint64_t*)final_s)[7] <= ((uint64_t*)pTarget)[3])
+		{
+			uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread);
+			if (tmp != 0)
+				nonceVector[1] = tmp;
+		}
+	}
+}
+
+__host__
+void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces)
+{
+	cudaMemset(d_GNonce[thr_id], 0x0, 2 * sizeof(uint32_t));
+	const uint32_t threadsperblock = TPB;
+
+	// compute how many thread blocks we need
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	bmw256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, g_hash, d_GNonce[thr_id]);
+	cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	resultnonces[0] = *(d_gnounce[thr_id]);
+	resultnonces[1] = *(d_gnounce[thr_id] + 1);
+}
+
+__host__
+void bmw256_cpu_init(int thr_id, uint32_t threads)
+{
+	cudaMalloc(&d_GNonce[thr_id], 2 * sizeof(uint32_t));
+	cudaMallocHost(&d_gnounce[thr_id], 2 * sizeof(uint32_t));
+}
+
+__host__
+void bmw256_setTarget(const void *pTargetIn)
+{
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+}
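+/* Usage sketch (illustrative, not part of the imported file): how the
+ * host wrappers above are typically chained from a scanhash loop.
+ * `thr_id`, `throughput`, `d_hash` and `foundNonces` are placeholder
+ * names for this example.
+ *
+ *   bmw256_cpu_init(thr_id, throughput);   // once per GPU: result buffers
+ *   bmw256_setTarget(ptarget);             // whenever the target changes
+ *   bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash, foundNonces);
+ *   // foundNonces[0] != 0 -> candidate nonce; foundNonces[1] holds a
+ *   // second candidate if two threads matched in the same launch.
+ */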
diff --git a/Algo256/cuda_cubehash256.cu b/Algo256/cuda_cubehash256.cu
new file mode 100644
index 0000000..5a132b6
--- /dev/null
+++ b/Algo256/cuda_cubehash256.cu
@@ -0,0 +1,260 @@
+#include "cuda_helper.h"
+
+#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
+#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
+
+#if __CUDA_ARCH__ < 350
+#define LROT(x, bits) ((x << bits) | (x >> (32 - bits)))
+#else
+#define LROT(x, bits) __funnelshift_l(x, x, bits)
+#endif
+
+#define ROTATEUPWARDS7(a) LROT(a,7)
+#define ROTATEUPWARDS11(a) LROT(a,11)
+
+//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
+#define SWAP(a,b) { a ^= b; b ^= a; a ^= b; }
+
+__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2])
+{
+	int r;
+	int j;
+	int k;
+	int l;
+	int m;
+
+	#pragma unroll 2
+	for (r = 0; r < CUBEHASH_ROUNDS; ++r) {
+
+		/* "add x_0jklm into x_1jklm modulo 2^32" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (k = 0; k < 2; ++k)
+#pragma unroll 2
+				for (l = 0; l < 2; ++l)
+#pragma unroll 2
+					for (m = 0; m < 2; ++m)
+						x[1][j][k][l][m] += x[0][j][k][l][m];
+
+		/* "rotate x_0jklm upwards by 7 bits" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (k = 0; k < 2; ++k)
+#pragma unroll 2
+				for (l = 0; l < 2; ++l)
+#pragma unroll 2
+					for (m = 0; m < 2; ++m)
+						x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
+
+		/* "swap x_00klm with x_01klm" */
+#pragma unroll 2
+		for (k = 0; k < 2; ++k)
+#pragma unroll 2
+			for (l = 0; l < 2; ++l)
+#pragma unroll 2
+				for (m = 0; m < 2; ++m)
+					SWAP(x[0][0][k][l][m], x[0][1][k][l][m])
+
+		/* "xor x_1jklm into x_0jklm" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (k = 0; k < 2; ++k)
+#pragma unroll 2
+				for (l = 0; l < 2; ++l)
+#pragma unroll 2
+					for (m = 0; m < 2; ++m)
+						x[0][j][k][l][m] ^= x[1][j][k][l][m];
+
+		/* "swap x_1jk0m with x_1jk1m" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (k = 0; k < 2; ++k)
+#pragma unroll 2
+				for (m = 0; m < 2; ++m)
+					SWAP(x[1][j][k][0][m], x[1][j][k][1][m])
+
+		/* "add x_0jklm into x_1jklm modulo 2^32" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (k = 0; k < 2; ++k)
+#pragma unroll 2
+				for (l = 0; l < 2; ++l)
+#pragma unroll 2
+					for (m = 0; m < 2; ++m)
+						x[1][j][k][l][m] += x[0][j][k][l][m];
+
+		/* "rotate x_0jklm upwards by 11 bits" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (k = 0; k < 2; ++k)
+#pragma unroll 2
+				for (l = 0; l < 2; ++l)
+#pragma unroll 2
+					for (m = 0; m < 2; ++m)
+						x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
+
+		/* "swap x_0j0lm with x_0j1lm" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (l = 0; l < 2; ++l)
+#pragma unroll 2
+				for (m = 0; m < 2; ++m)
+					SWAP(x[0][j][0][l][m], x[0][j][1][l][m])
+
+		/* "xor x_1jklm into x_0jklm" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (k = 0; k < 2; ++k)
+#pragma unroll 2
+				for (l = 0; l < 2; ++l)
+#pragma unroll 2
+					for (m = 0; m < 2; ++m)
+						x[0][j][k][l][m] ^= x[1][j][k][l][m];
+
+		/* "swap x_1jkl0 with x_1jkl1" */
+#pragma unroll 2
+		for (j = 0; j < 2; ++j)
+#pragma unroll 2
+			for (k = 0; k < 2; ++k)
+#pragma unroll 2
+				for (l = 0; l < 2; ++l)
+					SWAP(x[1][j][k][l][0], x[1][j][k][l][1])
+
+	}
+}
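+/* Note: x[2][2][2][2][2] is CubeHash's 32-word state addressed by five
+ * binary indices, i.e. x[i][j][k][l][m] is flat word 16*i+8*j+4*k+2*l+m.
+ * The XOR-based SWAP above is only safe because every swap in rrounds
+ * pairs two distinct words; XOR-swapping a word with itself would zero it.
+ */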
+__device__ __forceinline__ void block_tox(const uint32_t *in, uint32_t x[2][2][2][2][2])
+{
+	x[0][0][0][0][0] ^= in[0];
+	x[0][0][0][0][1] ^= in[1];
+	x[0][0][0][1][0] ^= in[2];
+	x[0][0][0][1][1] ^= in[3];
+	x[0][0][1][0][0] ^= in[4];
+	x[0][0][1][0][1] ^= in[5];
+	x[0][0][1][1][0] ^= in[6];
+	x[0][0][1][1][1] ^= in[7];
+}
+
+__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2])
+{
+	out[0] = x[0][0][0][0][0];
+	out[1] = x[0][0][0][0][1];
+	out[2] = x[0][0][0][1][0];
+	out[3] = x[0][0][0][1][1];
+	out[4] = x[0][0][1][0][0];
+	out[5] = x[0][0][1][0][1];
+	out[6] = x[0][0][1][1][0];
+	out[7] = x[0][0][1][1][1];
+}
+
+void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32_t *data)
+{
+	/* "xor the block into the first b bytes of the state" */
+	/* "and then transform the state invertibly through r identical rounds" */
+	block_tox(data, x);
+	rrounds(x);
+}
+
+void __device__ __forceinline__ Update32_const(uint32_t x[2][2][2][2][2])
+{
+	x[0][0][0][0][0] ^= 0x80;
+	rrounds(x);
+}
+
+void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
+{
+	int i;
+
+	/* "the integer 1 is xored into the last state word x_11111" */
+	x[1][1][1][1][1] ^= 1;
+
+	/* "the state is then transformed invertibly through 10r identical rounds" */
+	#pragma unroll 2
+	for (i = 0; i < 10; ++i) rrounds(x);
+
+	/* "output the first h/8 bytes of the state" */
+	hash_fromx(hashval, x);
+}
+
+// The hash function
+#if __CUDA_ARCH__ < 500
+__global__ __launch_bounds__(576, 1)
+#else
+__global__ __launch_bounds__(576, 1)
+#endif
+void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t Hash[8]; // = &g_hash[16 * hashPosition];
+		LOHI(Hash[0], Hash[1], __ldg(&g_hash[thread]));
+		LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads]));
+		LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads]));
+		LOHI(Hash[6], Hash[7], __ldg(&g_hash[thread + 3 * threads]));
+
+		uint32_t x[2][2][2][2][2] =
+		{
+			0xEA2BD4B4, 0xCCD6F29F, 0x63117E71,
+			0x35481EAE, 0x22512D5B, 0xE5D94E63,
+			0x7E624131, 0xF4CC12BE, 0xC2D0B696,
+			0x42AF2070, 0xD0720C35, 0x3361DA8C,
+			0x28CCECA4, 0x8EF8AD83, 0x4680AC00,
+			0x40E5FBAB, 0xD89041C3, 0x6107FBD5,
+			0x6C859D41, 0xF0B26679, 0x09392549,
+			0x5FA25603, 0x65C892FD, 0x93CB6285,
+			0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD,
+			0x85254725, 0x15815AEB, 0x4AB6AAD6,
+			0x9CDAF8AF, 0xD6032C0A
+		};
+		x[0][0][0][0][0] ^= Hash[0];
+		x[0][0][0][0][1] ^= Hash[1];
+		x[0][0][0][1][0] ^= Hash[2];
+		x[0][0][0][1][1] ^= Hash[3];
+		x[0][0][1][0][0] ^= Hash[4];
+		x[0][0][1][0][1] ^= Hash[5];
+		x[0][0][1][1][0] ^= Hash[6];
+		x[0][0][1][1][1] ^= Hash[7];
+
+		rrounds(x);
+		x[0][0][0][0][0] ^= 0x80;
+		rrounds(x);
+
+		Final(x, Hash);
+
+		g_hash[thread] = ((uint64_t*)Hash)[0];
+		g_hash[1 * threads + thread] = ((uint64_t*)Hash)[1];
+		g_hash[2 * threads + thread] = ((uint64_t*)Hash)[2];
+		g_hash[3 * threads + thread] = ((uint64_t*)Hash)[3];
+	}
+}
+
+__host__
+void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash)
+{
+	uint32_t tpb;
+	if (device_sm[device_map[thr_id]] < 500)
+		tpb = 576;
+	else
+		tpb = 576;
+
+	// compute how many thread blocks we need
+	dim3 grid((threads + tpb - 1) / tpb);
+	dim3 block(tpb);
+
+	cubehash256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_hash);
+}
diff --git a/Makefile.am b/Makefile.am
index 7ac01a8..d36f478 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -33,6 +33,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
myriadgroestl.cpp cuda_myriadgroestl.cu \ lyra2/Lyra2.c lyra2/Sponge.c \ lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ + lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ + Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ Algo256/blake256.cu Algo256/keccak256.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ diff --git a/ccminer.cpp b/ccminer.cpp index de438a1..6ea6988 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -98,6 +98,7 @@ enum sha_algos { ALGO_JACKPOT, ALGO_LUFFA, ALGO_LYRA2, + ALGO_LYRA2v2, ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MYR_GR, ALGO_NEOSCRYPT, @@ -135,6 +136,7 @@ static const char *algo_names[] = { "jackpot", "luffa", "lyra2", + "lyra2v2", "mjollnir", "myr-gr", "neoscrypt", @@ -291,7 +293,8 @@ Options:\n\ jackpot Jackpot\n\ keccak Keccak-256 (Maxcoin)\n\ luffa Joincoin\n\ - lyra2 VertCoin\n\ + lyra2 LyraBar\n\ + lyra2v2 VertCoin\n\ mjollnir Mjollnircoin\n\ myr-gr Myriad-Groestl\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ @@ -1458,6 +1461,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) break; case ALGO_KECCAK: case ALGO_LYRA2: + case ALGO_LYRA2v2: diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty)); break; default: @@ -1767,6 +1771,7 @@ static void *miner_thread(void *userdata) minmax = 0x2000000; break; case ALGO_C11: + case ALGO_LYRA2v2: case ALGO_S3: case ALGO_X11: case ALGO_X13: @@ -1774,6 +1779,9 @@ static void *miner_thread(void *userdata) break; case ALGO_LYRA2: case ALGO_NEOSCRYPT: + case ALGO_X15: + minmax = 0x300000; + break; case ALGO_SCRYPT: case ALGO_SCRYPT_JANE: minmax = 0x100000; @@ -1902,6 +1910,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_LYRA2v2: + rc = scanhash_lyra2v2(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_NEOSCRYPT: rc = scanhash_neoscrypt(thr_id, work.data, work.target, max_nonce, &hashes_done); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index b52fe19..a50983d 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -392,6 +392,8 @@ + + @@ -440,6 +442,8 @@ + + 64 diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index f023047..fd3cf83 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -583,6 +583,12 @@ Source Files\CUDA\Algo256 + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + Source Files\CUDA\Algo256 @@ -601,6 +607,12 @@ Source Files\CUDA + + Source Files\CUDA + + + Source Files\CUDA + Source Files\CUDA diff --git a/cuda_helper.h b/cuda_helper.h index f89014e..9c6b290 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -606,6 +606,37 @@ uint2 SWAPUINT2(uint2 value) return make_uint2(value.y, value.x); } +/* Byte aligned Rotations (lyra2) */ +#ifdef __CUDA_ARCH__ +__device__ __inline__ uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + return result; +} + +__device__ __inline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} + +__device__ __inline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} +#else +#define ROL8(u) ((u) << 8) +#define ROR16(u) ((u) >> 16) +#define ROR24(u) ((u) >> 24) +#endif + /* uint2 for bmw512 - to double check later */ __device__ __forceinline__ diff --git 
a/lyra2/Lyra2.c b/lyra2/Lyra2.c index 697f435..46ff09b 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -44,7 +44,7 @@ * * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) */ -int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) +int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) { //============================= Basic variables ============================// int64_t row = 2; //index of row to be processed @@ -55,25 +55,32 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy //==========================================================================/ //========== Initializing the Memory Matrix and pointers to it =============// //Tries to allocate enough space for the whole memory matrix - i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i); + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + i = (int64_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = malloc(i); if (wholeMatrix == NULL) { return -1; } - memset(wholeMatrix, 0, (size_t) i); + memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = malloc((size_t) nRows * sizeof(uint64_t*)); + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); if (memMatrix == NULL) { return -1; } //Places the pointers in the correct positions uint64_t *ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) nRows; i++) { + for (i = 0; i < nRows; i++) { memMatrix[i] = ptrWord; ptrWord += ROW_LEN_INT64; } @@ -84,32 +91,38 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //but this ensures that the password copied locally will be overwritten as soon as possible //First, we clean enough blocks for the password, salt, basil and padding - uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; byte *ptrByte = (byte*) wholeMatrix; - memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); //Prepends the password - memcpy(ptrByte, pwd, (size_t) pwdlen); + memcpy(ptrByte, pwd, pwdlen); ptrByte += pwdlen; //Concatenates the salt - memcpy(ptrByte, salt, (size_t) saltlen); + memcpy(ptrByte, salt, saltlen); ptrByte += saltlen; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)); + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &saltlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &timeCost, sizeof (uint64_t)); - 
ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nRows, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nCols, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); //Now comes the padding *ptrByte = 0x80; //first byte of padding: right after the password @@ -120,30 +133,27 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = malloc(16 * sizeof (uint64_t)); - if (state == NULL) { - return -1; - } + uint64_t state[16]; initState(state); //==========================================================================/ //================================ Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) nBlocksInput; i++) { + for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) } //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here - reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //updates the value of row* (deterministically picked during Setup)) rowa = (rowa + step) & (window - 1); @@ -159,53 +169,46 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * gap = -gap; //inverts the modifier to the step } - } while (row < (int64_t) nRows); + } while (row < nRows); //==========================================================================/ //============================ Wandering Phase =============================// row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= (int64_t) timeCost; tau++) { + for (tau = 1; tau <= timeCost; tau++) { //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; do { //Selects a pseudorandom index row* //------------------------------------------------------------------------------------------ - //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //update prev: it now points to the last row ever computed prev = row; //updates row: goes to the next row to be computed //------------------------------------------------------------------------------------------ - //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ } while (row != 0); } - //==========================================================================/ //============================ Wrap-up Phase ===============================// //Absorbs the last block of the memory matrix absorbBlock(state, memMatrix[rowa]); //Squeezes the key - squeeze(state, K, (size_t) kLen); - //==========================================================================/ + squeeze(state, K, (unsigned int) kLen); //========================= Freeing the memory =============================// free(memMatrix); free(wholeMatrix); - //Wiping out the sponge's internal state before freeing it - memset(state, 0, 16 * sizeof (uint64_t)); - free(state); - //==========================================================================/ - return 0; } diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h index 229b2c9..edf9179 100644 --- a/lyra2/Lyra2.h +++ b/lyra2/Lyra2.h @@ -37,14 +37,6 @@ typedef unsigned char byte; #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes #endif -#ifndef N_COLS - #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default -#endif - -#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks -#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row - - -int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); +int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); #endif /* LYRA2_H_ */ diff --git a/lyra2/Sponge.c b/lyra2/Sponge.c index e0a001e..a698229 100644 --- a/lyra2/Sponge.c +++ b/lyra2/Sponge.c @@ -25,7 +25,6 @@ #include "Lyra2.h" - /** * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder * receive Blake2b's IV as per Blake2b's specification. 
Note: Even though sponges
 * typically have their internal state initialized with zeros, Blake2b's G function
 * has a fixed point: if the internal state and message are both filled with zeros, the
 * resulting permutation will always be a block filled with zeros; this happens because
 * Blake2b does not use the constants originally employed in Blake2 inside its G function,
 * relying on the IV for avoiding possible fixed points.
 *
 * @param state The 1024-bit array to be initialized
 */
-void initState(uint64_t state[/*16*/]) {
-    //First 512 bits are zeros
-    memset(state, 0, 64);
-    //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
-
-    state[8] = blake2b_IV[0];
-    state[9] = blake2b_IV[1];
-    state[10] = blake2b_IV[2];
-    state[11] = blake2b_IV[3];
-    state[12] = blake2b_IV[4];
-    state[13] = blake2b_IV[5];
-    state[14] = blake2b_IV[6];
-    state[15] = blake2b_IV[7];
-
+void initState(uint64_t state[/*16*/]) {
+	//First 512 bits are zeros
+	memset(state, 0, 64);
+	//Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
+	state[8] = blake2b_IV[0];
+	state[9] = blake2b_IV[1];
+	state[10] = blake2b_IV[2];
+	state[11] = blake2b_IV[3];
+	state[12] = blake2b_IV[4];
+	state[13] = blake2b_IV[5];
+	state[14] = blake2b_IV[6];
+	state[15] = blake2b_IV[7];
 }

 /**
@@ -59,18 +56,18 @@
  * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
  */
 __inline static void blake2bLyra(uint64_t *v) {
-    ROUND_LYRA(0);
-    ROUND_LYRA(1);
-    ROUND_LYRA(2);
-    ROUND_LYRA(3);
-    ROUND_LYRA(4);
-    ROUND_LYRA(5);
-    ROUND_LYRA(6);
-    ROUND_LYRA(7);
-    ROUND_LYRA(8);
-    ROUND_LYRA(9);
-    ROUND_LYRA(10);
-    ROUND_LYRA(11);
+	ROUND_LYRA(0);
+	ROUND_LYRA(1);
+	ROUND_LYRA(2);
+	ROUND_LYRA(3);
+	ROUND_LYRA(4);
+	ROUND_LYRA(5);
+	ROUND_LYRA(6);
+	ROUND_LYRA(7);
+	ROUND_LYRA(8);
+	ROUND_LYRA(9);
+	ROUND_LYRA(10);
+	ROUND_LYRA(11);
 }

 /**
@@ -78,7 +75,7 @@ __inline static void blake2bLyra(uint64_t *v) {
  * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
  */
 __inline static void reducedBlake2bLyra(uint64_t *v) {
-    ROUND_LYRA(0);
+	ROUND_LYRA(0);
 }

 /**
@@ -89,19 +86,20 @@ __inline static void reducedBlake2bLyra(uint64_t *v) {
  * @param out Array that will receive the data squeezed
  * @param len The number of bytes to be squeezed into the "out" array
  */
-void squeeze(uint64_t *state, byte *out, unsigned int len) {
-    int fullBlocks = len / BLOCK_LEN_BYTES;
-    byte *ptr = out;
-    int i;
-    //Squeezes full blocks
-    for (i = 0; i < fullBlocks; i++) {
-        memcpy(ptr, state, BLOCK_LEN_BYTES);
-        blake2bLyra(state);
-        ptr += BLOCK_LEN_BYTES;
-    }
-
-    //Squeezes remaining bytes
-    memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
+void squeeze(uint64_t *state, byte *out, unsigned int len)
+{
+	int fullBlocks = len / BLOCK_LEN_BYTES;
+	byte *ptr = out;
+	int i;
+	//Squeezes full blocks
+	for (i = 0; i < fullBlocks; i++) {
+		memcpy(ptr, state, BLOCK_LEN_BYTES);
+		blake2bLyra(state);
+		ptr += BLOCK_LEN_BYTES;
+	}
+
+	//Squeezes remaining bytes
+	memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
 }

 /**
@@ -111,23 +109,24 @@
 * @param state The current state of the sponge
 * @param in The block to be absorbed (BLOCK_LEN_INT64 words)
 */
-void absorbBlock(uint64_t *state, const uint64_t *in) {
-    //XORs the first BLOCK_LEN_INT64 words of "in" with the current state
-    state[0] ^= in[0];
-    state[1] ^= in[1];
-    state[2] ^= in[2];
-    state[3] ^= in[3];
-    state[4] ^= in[4];
-    state[5] ^= in[5];
-    state[6] ^= in[6];
-    state[7] ^= in[7];
-    state[8] ^= in[8];
-    state[9] ^= in[9];
-    state[10] ^= in[10];
-    state[11] ^= in[11];
-
-    //Applies the transformation f to the sponge's state
-    blake2bLyra(state);
+void absorbBlock(uint64_t *state, const uint64_t *in)
+{
+	//XORs the first BLOCK_LEN_INT64 words of "in" with the current state
+	state[0] ^= in[0];
+	state[1] ^= in[1];
+	state[2] ^= in[2];
+	state[3] ^= in[3];
+	state[4] ^= in[4];
+	state[5] ^= in[5];
+
state[6] ^= in[6]; + state[7] ^= in[7]; + state[8] ^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); } /** @@ -137,25 +136,21 @@ void absorbBlock(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { - //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state - state[0] ^= in[0]; - state[1] ^= in[1]; - state[2] ^= in[2]; - state[3] ^= in[3]; - state[4] ^= in[4]; - state[5] ^= in[5]; - state[6] ^= in[6]; - state[7] ^= in[7]; - - //Applies the transformation f to the sponge's state - blake2bLyra(state); -/* - for(int i = 0; i<16; i++) { - printf(" final state %d %08x %08x in %08x %08x\n", i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32), - (uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32)); - } -*/ +void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) +{ + //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state + + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); } /** @@ -166,36 +161,31 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { - uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] - int i; - //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < N_COLS; i++) { - - ptrWord[0] = state[0]; - ptrWord[1] = state[1]; - ptrWord[2] = state[2]; - ptrWord[3] = state[3]; - ptrWord[4] = state[4]; - ptrWord[5] = state[5]; - ptrWord[6] = state[6]; - ptrWord[7] = state[7]; - ptrWord[8] = state[8]; - ptrWord[9] = state[9]; - ptrWord[10] = state[10]; - ptrWord[11] = state[11]; - /* -for (int i = 0; i<12; i++) { - printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", i, (uint32_t)(ptrWord[i] & 0xFFFFFFFFULL), (uint32_t)(ptrWord[i] >> 32), - (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32)); +void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) +{ + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + unsigned int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < nCols; i++) { + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] = state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); } -*/ - //Goes to next block (column) that will receive the squeezed data - ptrWord -= BLOCK_LEN_INT64; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - } } /** @@ -207,50 +197,50 @@ for (int i = 0; i<12; i++) { * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ - 
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - int i; - - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[prev][col]" - state[0] ^= (ptrWordIn[0]); - state[1] ^= (ptrWordIn[1]); - state[2] ^= (ptrWordIn[2]); - state[3] ^= (ptrWordIn[3]); - state[4] ^= (ptrWordIn[4]); - state[5] ^= (ptrWordIn[5]); - state[6] ^= (ptrWordIn[6]); - state[7] ^= (ptrWordIn[7]); - state[8] ^= (ptrWordIn[8]); - state[9] ^= (ptrWordIn[9]); - state[10] ^= (ptrWordIn[10]); - state[11] ^= (ptrWordIn[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][C-1-col] = M[prev][col] XOR rand - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; - ptrWordOut[4] = ptrWordIn[4] ^ state[4]; - ptrWordOut[5] = ptrWordIn[5] ^ state[5]; - ptrWordOut[6] = ptrWordIn[6] ^ state[6]; - ptrWordOut[7] = ptrWordIn[7] ^ state[7]; - ptrWordOut[8] = ptrWordIn[8] ^ state[8]; - ptrWordOut[9] = ptrWordIn[9] ^ state[9]; - ptrWordOut[10] = ptrWordIn[10] ^ state[10]; - ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - - - //Input: next column (i.e., next block in sequence) - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) { + + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } } /** @@ -267,63 +257,66 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ - void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - //Absorbing "M[prev] [+] M[row*]" - state[0] ^= (ptrWordIn[0] + 
ptrWordInOut[0]); - state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); - state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); - state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); - state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); - state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); - state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); - state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); - state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); - state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); - state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); - state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][col] = M[prev][col] XOR rand - ptrWordOut[0] = ptrWordIn[0] ^ state[0]; - ptrWordOut[1] = ptrWordIn[1] ^ state[1]; - ptrWordOut[2] = ptrWordIn[2] ^ state[2]; - ptrWordOut[3] = ptrWordIn[3] ^ state[3]; - ptrWordOut[4] = ptrWordIn[4] ^ state[4]; - ptrWordOut[5] = ptrWordIn[5] ^ state[5]; - ptrWordOut[6] = ptrWordIn[6] ^ state[6]; - ptrWordOut[7] = ptrWordIn[7] ^ state[7]; - ptrWordOut[8] = ptrWordIn[8] ^ state[8]; - ptrWordOut[9] = ptrWordIn[9] ^ state[9]; - ptrWordOut[10] = ptrWordIn[10] ^ state[10]; - ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[11]; - ptrWordInOut[1] ^= state[0]; - ptrWordInOut[2] ^= state[1]; - ptrWordInOut[3] ^= state[2]; - ptrWordInOut[4] ^= state[3]; - ptrWordInOut[5] ^= state[4]; - ptrWordInOut[6] ^= state[5]; - ptrWordInOut[7] ^= state[6]; - ptrWordInOut[8] ^= state[7]; - ptrWordInOut[9] ^= state[8]; - ptrWordInOut[10] ^= state[9]; - ptrWordInOut[11] ^= state[10]; - - //Inputs: next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - //Output: goes to previous column - ptrWordOut -= BLOCK_LEN_INT64; - } +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //M[row*][col] = M[row*][col] XOR 
rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } } /** @@ -340,410 +333,72 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[prev] [+] M[row*]" - state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); - state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); - state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); - state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); - state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); - state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); - state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); - state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); - state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); - state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); - state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); - state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[rowOut][col] = M[rowOut][col] XOR rand - ptrWordOut[0] ^= state[0]; - ptrWordOut[1] ^= state[1]; - ptrWordOut[2] ^= state[2]; - ptrWordOut[3] ^= state[3]; - ptrWordOut[4] ^= state[4]; - ptrWordOut[5] ^= state[5]; - ptrWordOut[6] ^= state[6]; - ptrWordOut[7] ^= state[7]; - ptrWordOut[8] ^= state[8]; - ptrWordOut[9] ^= state[9]; - ptrWordOut[10] ^= state[10]; - ptrWordOut[11] ^= state[11]; - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[11]; - ptrWordInOut[1] ^= state[0]; - ptrWordInOut[2] ^= state[1]; - ptrWordInOut[3] ^= state[2]; - ptrWordInOut[4] ^= state[3]; - ptrWordInOut[5] ^= state[4]; - ptrWordInOut[6] ^= state[5]; - ptrWordInOut[7] ^= state[6]; - ptrWordInOut[8] ^= state[7]; - ptrWordInOut[9] ^= state[8]; - ptrWordInOut[10] ^= state[9]; - ptrWordInOut[11] ^= state[10]; - - //Goes to next block - ptrWordOut += BLOCK_LEN_INT64; - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - } -} - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/** - * Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. 
- * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[row][col] = rand - ptrWordOut[0] = state[0]; - ptrWordOut[1] = state[1]; - ptrWordOut[2] = state[2]; - ptrWordOut[3] = state[3]; - ptrWordOut[4] = state[4]; - ptrWordOut[5] = state[5]; - ptrWordOut[6] = state[6]; - ptrWordOut[7] = state[7]; - ptrWordOut[8] = state[8]; - ptrWordOut[9] = state[9]; - ptrWordOut[10] = state[10]; - ptrWordOut[11] = state[11]; - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += BLOCK_LEN_INT64; - } -} -*/ - -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. 
- * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += BLOCK_LEN_INT64; - } -} -*/ - -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit - * rotation to the left. 
- * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut; - int i; - - for (i = 0; i < N_COLS / 2; i++) { - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; - ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += 2 * BLOCK_LEN_INT64; - } - - ptrWordOut = rowOut + BLOCK_LEN_INT64; - for (i = 0; i < N_COLS / 2; i++) { - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - - //M[row*][col] = M[row*][col] XOR rotW(rand) - ptrWordInOut[0] ^= state[10]; - ptrWordInOut[1] ^= state[11]; - ptrWordInOut[2] ^= state[0]; - ptrWordInOut[3] ^= state[1]; - ptrWordInOut[4] ^= state[2]; - ptrWordInOut[5] ^= state[3]; - ptrWordInOut[6] ^= state[4]; - ptrWordInOut[7] ^= state[5]; - ptrWordInOut[8] ^= state[6]; - ptrWordInOut[9] ^= state[7]; - ptrWordInOut[10] ^= state[8]; 
- ptrWordInOut[11] ^= state[9]; - - - //M[row][col] = rand - ptrWordOut[0] = state[0] ^ ptrWordIn[0]; - ptrWordOut[1] = state[1] ^ ptrWordIn[1]; - ptrWordOut[2] = state[2] ^ ptrWordIn[2]; - ptrWordOut[3] = state[3] ^ ptrWordIn[3]; - ptrWordOut[4] = state[4] ^ ptrWordIn[4]; - ptrWordOut[5] = state[5] ^ ptrWordIn[5]; - ptrWordOut[6] = state[6] ^ ptrWordIn[6]; - ptrWordOut[7] = state[7] ^ ptrWordIn[7]; - ptrWordOut[8] = state[8] ^ ptrWordIn[8]; - ptrWordOut[9] = state[9] ^ ptrWordIn[9]; - ptrWordOut[10] = state[10] ^ ptrWordIn[10]; - ptrWordOut[11] = state[11] ^ ptrWordIn[11]; - - //Goes to next column (i.e., next block in sequence) - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - ptrWordOut += 2 * BLOCK_LEN_INT64; - } -} -*/ +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + unsigned int i; -/** - * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand" - * to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)", - * where rotW is a 64-bit rotation to the left. - * - * @param state The current state of the sponge - * @param rowIn Row used only as input - * @param rowInOut Row used as input and to receive output after rotation - * @param rowOut Row receiving the output - * - */ -/* -inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { - uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row - int i; - for (i = 0; i < N_COLS; i++) { - - //Absorbing "M[rowInOut] XOR M[rowIn]" - state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; - state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; - state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; - state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; - state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; - state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; - state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; - state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; - state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; - state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; - state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; - state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; - - //Applies the reduced-round transformation f to the sponge's state - reducedBlake2bLyra(state); - - //M[rowOut][col] = M[rowOut][col] XOR rand - ptrWordOut[0] ^= state[0]; - ptrWordOut[1] ^= state[1]; - ptrWordOut[2] ^= state[2]; - ptrWordOut[3] ^= state[3]; - ptrWordOut[4] ^= state[4]; - ptrWordOut[5] ^= state[5]; - ptrWordOut[6] ^= state[6]; - ptrWordOut[7] ^= state[7]; - ptrWordOut[8] ^= state[8]; - ptrWordOut[9] ^= state[9]; - ptrWordOut[10] ^= state[10]; - ptrWordOut[11] ^= state[11]; - - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - - - //Goes to next block - ptrWordOut += BLOCK_LEN_INT64; - ptrWordInOut += BLOCK_LEN_INT64; - ptrWordIn += BLOCK_LEN_INT64; - } + for (i = 0; i < nCols; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + 
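+			/* editor's note: Lyra2 v2 absorbs the wordwise *addition* of the two
+			   row words (the "[+]" above); v1 xored them instead, as in the
+			   removed reducedDuplexRowSetupOLD earlier in this file */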
ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } } -*/ /** - Prints an array of unsigned chars + * Prints an array of unsigned chars */ -void printArray(unsigned char *array, unsigned int size, char *name) { +void printArray(unsigned char *array, unsigned int size, char *name) +{ unsigned int i; printf("%s: ", name); for (i = 0; i < size; i++) { diff --git a/lyra2/Sponge.h b/lyra2/Sponge.h index 9bd8ed6..7fcd093 100644 --- a/lyra2/Sponge.h +++ b/lyra2/Sponge.h @@ -24,85 +24,65 @@ #include -#if defined(__GNUC__) -#define ALIGN __attribute__ ((aligned(32))) -#elif defined(_MSC_VER) -#define ALIGN __declspec(align(32)) -#else -#define ALIGN -#endif - - -/*Blake2b IV Array*/ +/* Blake2b IV Array */ static const uint64_t blake2b_IV[8] = { - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; -/*Blake2b's rotation*/ -static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){ - return ( w >> c ) | ( w << ( 64 - c ) ); +/* Blake2b's rotation */ +static __inline uint64_t rotr64(const uint64_t w, const unsigned c) { +#ifdef _MSC_VER + return _rotr64(w, c); +#else + return ( w >> c ) | ( w << ( 64 - c ) ); +#endif } -/*Blake2b's G function*/ -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ +/* Blake2b's G function */ +#define G(r,i,a,b,c,d) do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ } while(0) /*One Round of the Blake2b's compression function*/ -#define ROUND_LYRA(r) \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 
4],v[ 9],v[14]); - +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); //---- Housekeeping void initState(uint64_t state[/*16*/]); //---- Squeezes void squeeze(uint64_t *state, unsigned char *out, unsigned int len); -void reducedSqueezeRow0(uint64_t* state, uint64_t* row); +void reducedSqueezeRow0(uint64_t* state, uint64_t* row, const uint32_t nCols); //---- Absorbs void absorbBlock(uint64_t *state, const uint64_t *in); void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); //---- Duplexes -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols); +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols); +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols); //---- Misc void printArray(unsigned char *array, unsigned int size, char *name); -//////////////////////////////////////////////////////////////////////////////////////////////// - - -////TESTS//// -//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2); -//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -///////////// - - #endif /* SPONGE_H_ */ diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 8e96f36..05bd19f 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,223 +1,543 @@ +#include #include +#include "cuda_lyra2_vectors.h" +#define TPB 8 +// + +#if __CUDA_ARCH__ < 500 +#define vectype ulonglong4 +#define u64type uint64_t +#define memshift 4 +#elif __CUDA_ARCH__ == 500 +#define u64type uint2 +#define vectype uint28 +#define memshift 3 +#else +#define u64type uint2 +#define vectype uint28 +#define memshift 4 +#endif + +__device__ vectype *DMatrix; -#include "cuda_helper.h" - -#define TPB 160 - -static __constant__ uint2 blake2b_IV[8] = { - { 0xf3bcc908, 0x6a09e667 }, - { 0x84caa73b, 0xbb67ae85 }, - { 0xfe94f82b, 0x3c6ef372 }, - { 0x5f1d36f1, 0xa54ff53a }, - { 0xade682d1, 0x510e527f }, - { 0x2b3e6c1f, 0x9b05688c }, - { 0xfb41bd6b, 0x1f83d9ab }, - { 0x137e2179, 0x5be0cd19 } -}; - -#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ - for (int i = 0; i < 8; i++) { \ - for (int j = 0; j < 12; j++) \ - state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ - round_lyra(state); \ - for (int j = 0; j < 12; j++) \ - Matrix[j + 12 * i][rowOut] ^= state[j]; \ - Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ - Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ - Matrix[2 + 12 * i][rowInOut] ^= 
state[1]; \ - Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ - Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ - Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ - Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ - Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ - Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ - Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ - Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ - Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ - } \ - } - -#define absorbblock(in) { \ - state[0] ^= Matrix[0][in]; \ - state[1] ^= Matrix[1][in]; \ - state[2] ^= Matrix[2][in]; \ - state[3] ^= Matrix[3][in]; \ - state[4] ^= Matrix[4][in]; \ - state[5] ^= Matrix[5][in]; \ - state[6] ^= Matrix[6][in]; \ - state[7] ^= Matrix[7][in]; \ - state[8] ^= Matrix[8][in]; \ - state[9] ^= Matrix[9][in]; \ - state[10] ^= Matrix[10][in]; \ - state[11] ^= Matrix[11][in]; \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - } +#ifdef __CUDA_ARCH__ static __device__ __forceinline__ -void Gfunc(uint2 & a, uint2 &b, uint2 &c, uint2 &d) +void Gfunc_v35(uint2 &a, uint2 &b, uint2 &c, uint2 &d) { a += b; d ^= a; d = SWAPUINT2(d); - c += d; b ^= c; b = ROR2(b, 24); - a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); c += d; b ^= c; b = ROR2(b, 63); } -__device__ __forceinline__ -static void round_lyra(uint2 *s) +#if __CUDA_ARCH__ < 500 +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) { - Gfunc(s[0], s[4], s[8], s[12]); - Gfunc(s[1], s[5], s[9], s[13]); - Gfunc(s[2], s[6], s[10], s[14]); - Gfunc(s[3], s[7], s[11], s[15]); - Gfunc(s[0], s[5], s[10], s[15]); - Gfunc(s[1], s[6], s[11], s[12]); - Gfunc(s[2], s[7], s[8], s[13]); - Gfunc(s[3], s[4], s[9], s[14]); + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); } +#endif -__device__ __forceinline__ -void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8]) +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) { -#if __CUDA_ARCH__ > 500 - #pragma unroll + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} +#else +#define round_lyra_v35(s) {} #endif + +static __device__ __forceinline__ +void reduceDuplex(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (256 * thread); + uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread); + + #pragma unroll 4 for (int i = 0; i < 8; i++) { - #pragma unroll - for (int j = 0; j < 12; j++) - state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 12; j++) - Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; - - Matrix[0 + 12 * i][rowInOut] ^= state[11]; - Matrix[1 + 12 * i][rowInOut] ^= state[0]; - Matrix[2 + 12 * i][rowInOut] ^= state[1]; - Matrix[3 + 12 * i][rowInOut] ^= state[2]; - Matrix[4 + 12 * 
i][rowInOut] ^= state[3]; - Matrix[5 + 12 * i][rowInOut] ^= state[4]; - Matrix[6 + 12 * i][rowInOut] ^= state[5]; - Matrix[7 + 12 * i][rowInOut] ^= state[6]; - Matrix[8 + 12 * i][rowInOut] ^= state[7]; - Matrix[9 + 12 * i][rowInOut] ^= state[8]; - Matrix[10 + 12 * i][rowInOut] ^= state[9]; - Matrix[11 + 12 * i][rowInOut] ^= state[10]; + uint32_t s1 = ps1 + i*memshift; + uint32_t s2 = ps2 - i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix+s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; } } -__global__ __launch_bounds__(TPB, 1) -void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + vectype state1[3]; + uint32_t ps1 = (256 * thread); + // colomn row + uint32_t ps2 = (memshift * 7 * 8 + memshift * 1 + 64 * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < 8; i++) + { + uint32_t s1 = ps1 + 8 * i *memshift; + uint32_t s2 = ps2 - 8 * i *memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3],state1[3]; + + uint32_t ps1 = ( memshift * 8 * rowIn + 256 * thread); + uint32_t ps2 = ( memshift * 8 * rowInOut + 256 * thread); + uint32_t ps3 = (memshift*7 + memshift * 8 * rowOut + 256 * thread); + +#pragma unroll 1 + for (int i = 0; i < 8; i++) + { + uint32_t s1 = ps1 + i*memshift; + uint32_t s2 = ps2 + i*memshift; + uint32_t s3 = ps3 - i*memshift; + + for (int j = 0; j < 3; j++) + state1[j]= __ldg4(&(DMatrix + s1)[j]); + for (int j = 0; j < 3; j++) + state2[j]= __ldg4(&(DMatrix + s2)[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = ( memshift * rowIn + 64 * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + 64 * memshift* thread); + uint32_t ps3 = (8 * memshift * 7 + memshift * rowOut + 64 * memshift * thread); + /* + uint32_t ps1 = (256 * thread); + uint32_t ps2 = (256 * thread); + uint32_t ps3 = (256 * thread); + */ +#pragma nounroll + for (int i = 0; i < 8; i++) { - uint2 state[16]; + uint32_t s1 = ps1 + 8*i*memshift; + uint32_t s2 = ps2 + 8*i*memshift; + uint32_t s3 = ps3 - 8*i*memshift; - #pragma unroll - for (int i = 0; i<4; i++) { - LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); - } //password + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1 )[j]); + for (int j = 0; 
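+			/* editor's note: 3 vectype loads = 12 x 64-bit words, i.e. one Lyra2 column */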
j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2 )[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } - #pragma unroll - for (int i = 0; i<4; i++) { - state[i + 4] = state[i]; - } //salt + round_lyra_v35(state); - #pragma unroll - for (int i = 0; i<8; i++) { - state[i + 8] = blake2b_IV[i]; + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; } - // blake2blyra x2 - //#pragma unroll 24 - for (int i = 0; i<24; i++) { - round_lyra(state); - } //because 12 is not enough + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + + +static __device__ __forceinline__ +void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + + uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread); + uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread); + uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread); + +#pragma unroll 1 + for (int i = 0; i < 8; i++) + { + uint32_t s1 = ps1 + i*memshift; + uint32_t s2 = ps2 + i*memshift; + uint32_t s3 = ps3 + i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j]=state2[j]; + } + + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ - uint2 Matrix[96][8]; // not cool + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + 64 * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + 64 * memshift * thread); + uint32_t ps3 = (memshift * rowOut + 64 *memshift * thread); + +#pragma nounroll + for (int i = 0; i < 8; i++) + { + uint32_t s1 = ps1 + 8 * i*memshift; + uint32_t s2 = ps2 + 8 * i*memshift; + uint32_t s3 = ps3 + 8 * i*memshift; + + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } + else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + + + +#if __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(48, 1) +#elif __CUDA_ARCH__ == 500 +__global__ __launch_bounds__(16, 1) +#else +__global__ __launch_bounds__(TPB, 1) +#endif +void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, 
uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + vectype state[4]; + +#if __CUDA_ARCH__ > 350 + const uint28 blake2b_IV[2] = { + {{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }}, + {{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }} + }; +#else + const ulonglong4 blake2b_IV[2] = { + { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 }, + { 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 } + }; +#endif + +#if __CUDA_ARCH__ == 350 + if (thread < threads) +#endif + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); +// state[0] = __ldg4(&((vectype*)outputHash)[thread]); + state[1] = state[0]; + state[2] = ((vectype*)blake2b_IV)[0]; + state[3] = ((vectype*)blake2b_IV)[1]; + + + for (int i = 0; i<24; i++) { //because 12 is not enough + round_lyra_v35(state); + } + + uint32_t ps1 = (memshift * 7 + 256 * thread); - // reducedSqueezeRow0 - #pragma unroll 8 for (int i = 0; i < 8; i++) { - #pragma unroll 12 - for (int j = 0; j<12; j++) { - Matrix[j + 84 - 12 * i][0] = state[j]; - } - round_lyra(state); + uint32_t s1 = ps1 - memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); } - // reducedSqueezeRow1 - #pragma unroll 8 + + reduceDuplex(state, thread); + + reduceDuplexRowSetupV2(1, 0, 2, state, thread); + reduceDuplexRowSetupV2(2, 1, 3, state, thread); + reduceDuplexRowSetupV2(3, 0, 4, state, thread); + reduceDuplexRowSetupV2(4, 3, 5, state, thread); + reduceDuplexRowSetupV2(5, 2, 6, state, thread); + reduceDuplexRowSetupV2(6, 1, 7, state, thread); + uint32_t rowa = ((uint2*)state)[0].x & 7; + + reduceDuplexRowtV2(7, rowa, 0, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV2(0, rowa, 3, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV2(3, rowa, 6, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV2(6, rowa, 1, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV2(1, rowa, 4, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV2(4, rowa, 7, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV2(7, rowa, 2, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV2(2, rowa, 5, state, thread); + + uint32_t shift = (memshift * 8 * rowa + 256 * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + + outputHash[thread]= ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; +// ((vectype*)outputHash)[thread] = state[0]; + + } //thread +} + +#if __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(48, 1) +#elif __CUDA_ARCH__ == 500 +__global__ __launch_bounds__(16, 1) +#else +__global__ __launch_bounds__(TPB, 1) +#endif +void lyra2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + +#if __CUDA_ARCH__ > 350 + const uint28 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 
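+		/* editor's note: each uint2 packs one 64-bit Blake2b IV word as { lo32, hi32 } */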
0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; +#else + const ulonglong4 blake2b_IV[2] = { + { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 }, + { 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 } + }; +#endif + + +#if __CUDA_ARCH__ == 350 + if (thread < threads) +#endif + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + + state[1] = state[0]; + + state[2] = ((vectype*)blake2b_IV)[0]; + state[3] = ((vectype*)blake2b_IV)[1]; + + for (int i = 0; i<24; i++) + round_lyra_v35(state); //because 12 is not enough + + uint32_t ps1 = (8 * memshift * 7 + 64 * memshift * thread); + for (int i = 0; i < 8; i++) { - #pragma unroll 12 - for (int j = 0; j<12; j++) { - state[j] ^= Matrix[j + 12 * i][0]; - } - round_lyra(state); - #pragma unroll 12 - for (int j = 0; j<12; j++) { - Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; - } + uint32_t s1 = ps1 - 8 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); } - reduceDuplexRowSetup(1, 0, 2,state, Matrix); - reduceDuplexRowSetup(2, 1, 3, state, Matrix); - reduceDuplexRowSetup(3, 0, 4, state, Matrix); - reduceDuplexRowSetup(4, 3, 5, state, Matrix); - reduceDuplexRowSetup(5, 2, 6, state, Matrix); - reduceDuplexRowSetup(6, 1, 7, state, Matrix); - - uint32_t rowa; - rowa = state[0].x & 7; - reduceDuplexRow(7, rowa, 0); - rowa = state[0].x & 7; - reduceDuplexRow(0, rowa, 3); - rowa = state[0].x & 7; - reduceDuplexRow(3, rowa, 6); - rowa = state[0].x & 7; - reduceDuplexRow(6, rowa, 1); - rowa = state[0].x & 7; - reduceDuplexRow(1, rowa, 4); - rowa = state[0].x & 7; - reduceDuplexRow(4, rowa, 7); - rowa = state[0].x & 7; - reduceDuplexRow(7, rowa, 2); - rowa = state[0].x & 7; - reduceDuplexRow(2, rowa, 5); - - absorbblock(rowa); - - #pragma unroll - for (int i = 0; i<4; i++) { - outputHash[threads*i + thread] = devectorize(state[i]); - } //password + + reduceDuplexV3(state, thread); + + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + reduceDuplexRowSetupV3(3, 0, 4, state, thread); + reduceDuplexRowSetupV3(4, 3, 5, state, thread); + reduceDuplexRowSetupV3(5, 2, 6, state, thread); + reduceDuplexRowSetupV3(6, 1, 7, state, thread); + uint32_t rowa = ((uint2*)state)[0].x & 7; + + reduceDuplexRowtV3(7, rowa, 0, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV3(0, rowa, 3, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV3(3, rowa, 6, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV3(6, rowa, 1, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV3(1, rowa, 4, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV3(4, rowa, 7, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV3(7, rowa, 2, state, thread); + rowa = ((uint2*)state)[0].x & 7; + reduceDuplexRowtV3(2, rowa, 5, state, thread); + + uint32_t shift = (memshift * rowa + 64 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + + outputHash[thread] = ((uint2*)state)[0]; + 
outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; } //thread } __host__ +void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *hash) +{ + cudaMemcpyToSymbol(DMatrix, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice); +} + void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) { - const uint32_t threadsperblock = TPB; + uint32_t tpb; + if (device_sm[device_map[thr_id]]<500) + tpb = 48; + else if (device_sm[device_map[thr_id]]==500) + tpb = 16; + else + tpb = TPB; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); - dim3 grid((threads + threadsperblock - 1) / threadsperblock); - dim3 block(threadsperblock); + if (device_sm[device_map[thr_id]] == 500) + lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_outputHash); + else + lyra2_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)d_outputHash); - lyra2_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); + MyStreamSynchronize(NULL, order, thr_id); } diff --git a/lyra2/cuda_lyra2_vectors.h b/lyra2/cuda_lyra2_vectors.h new file mode 100644 index 0000000..045e526 --- /dev/null +++ b/lyra2/cuda_lyra2_vectors.h @@ -0,0 +1,1138 @@ +/* DJM CRAP */ + +#ifndef CUDA_LYRA_VECTOR_H +#define CUDA_LYRA_VECTOR_H + + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +//typedef __device_builtin__ struct ulong16 ulong16; + + +typedef struct __align__(32) uint8 +{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) uint2_8 +{ + uint2 s0, s1, s2, s3, s4, s5, s6, s7; +} uint2_8; + + +typedef struct __align__(64) ulonglong2to8 +{ +ulonglong2 l0,l1,l2,l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 +{ + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32 +{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64 +{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + + + +typedef struct __align__(128) ulonglonglong +{ + ulonglong2 s0,s1,s2,s3,s4,s5,s6,s7; +} ulonglonglong; + + + + +typedef struct __align__(64) uint16 +{ + union { + struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;}; + uint8 lo; + }; + union { + struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;}; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint2_16 +{ + union { + struct { uint2 s0, s1, s2, s3, s4, s5, s6, s7; }; + uint2_8 lo; + }; + union { + struct { uint2 s8, s9, sa, sb, sc, sd, se, sf; }; + uint2_8 hi; + }; +} uint2_16; + + + + +typedef struct __align__(128) uint32 +{ + + uint16 lo,hi; +} uint32; + + + +struct __align__(128) ulong8 +{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + +/* +typedef struct __align__(256) ulonglong16 +{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf; +} ulonglong16; +*/ +typedef struct __align__(256) ulonglong16 +{ + ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglong16; + + + +//typedef struct __align__(32) uint48 +//{ +// uint4 s0, s1; +// +//} uint48; + +typedef struct __align__(16) uint28 +{ + uint2 x, y, z, w; + +} uint28; + +/* +typedef struct __builtin_align__(32) uint48 +{ + union { + uint4 s0; + struct { uint2 x, y;}; + }; + union { + 
uint4 s1; + struct { uint2 z, w; }; + + }; +} uint48; +*/ + +typedef struct __builtin_align__(32) uint48 +{ + uint4 s0,s1; +} uint48; + +typedef struct __align__(64) uint816 +{ + uint48 s0, s1; + +} uint816; + +typedef struct __align__(128) uint1632 +{ + uint816 s0, s1; + +} uint1632; + +typedef struct __align__(256) uintx64 +{ + uint1632 s0, s1; + +} uintx64; + +typedef struct __builtin_align__(256) uintx64bis +{ + uint28 s0, s1, s2, s3, s4, s5, s6, s7; + +} uintx64bis; + +typedef struct __align__(256) uint4x16 +{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ +ulonglong2to8 t; t.l0=s0; t.l1=s1; t.l2=s2; t.l3=s3; +return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong2 &s0, const ulonglong2 &s1, const ulonglong2 &s2, const ulonglong2 &s3, + const ulonglong2 &s4, const ulonglong2 &s5) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; + return t; +} + + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} +/* +static __inline__ __device__ uint48 make_uint48(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint48 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __inline__ __device__ uint48 make_uint48(const uint28 &s0) +{ + uint48 t; t.x = s0.x; t.y = s0.y; t.z = s0.z; t.w = s0.w; + return t; +} +*/ +static __inline__ __device__ uint28 make_uint28(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint28 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + + +static __inline__ __device__ uint816 make_uint816(const uint48 &s0, const uint48 &s1) +{ + uint816 t; t.s0 = s0; t.s1 = s1; + return t; +} + + + + + +static __inline__ __device__ uint1632 make_uint1632(const uint816 &s0, const uint816 &s1) +{ + uint1632 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64 make_uintx64(const uint1632 &s0, const uint1632 &s1) +{ + uintx64 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64bis make_uintx64bis( + const uint28 &s0, const uint28 &s1, const uint28 &s2, const uint28 &s3, + const uint28 &s4, const uint28 &s5, const uint28 &s6, const uint28 &s7 +) +{ + uintx64bis t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + + +static __inline__ __device__ uint2_16 make_uint2_16( + uint2 
s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7, + uint2 s8, uint2 s9, uint2 sa, uint2 sb, uint2 sc, uint2 sd, uint2 se, uint2 sf) +{ + uint2_16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo=a; t.hi=b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ uint2_8 make_uint2_8( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7) +{ + uint2_8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong4 &s0, const ulonglong4 &s1, + const ulonglong4 &s2, const ulonglong4 &s3, const ulonglong4 &s4, const ulonglong4 &s5, const ulonglong4 &s6, const ulonglong4 &s7) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + + + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + + + + + +//static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x + b.x, a.y + b.y); } + 
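+/* Editor's sketch, not part of the original patch (the function name is
+ * hypothetical): the element-wise operators above let a single expression
+ * combine a whole 256-bit slice of a matrix row, which is how the SM<5.0
+ * Lyra2 path (vectype == ulonglong4) absorbs "M[prev] [+] M[row*]". */
+static __forceinline__ __device__
+ulonglong4 absorb_slice_sketch(const ulonglong4 &state, const ulonglong4 &prev, const ulonglong4 &rowa)
+{
+ /* wordwise add the two row slices, then xor the result into the state */
+ return state ^ (prev + rowa);
+}
+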
+static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} //, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} //, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + +static __forceinline__ __device__ uint2_8 operator^ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ uint2_8 operator+ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +////////////// mess++ ////// + +//static __forceinline__ __device__ uint48 operator^ (const uint48 &a, const uint48 &b) { +// return make_uint48(a.s0 ^ b.s0, a.s1 ^ b.s1); +//} + +static __forceinline__ __device__ uint28 operator^ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint28 operator+ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 + b.s0, a.s1 + b.s1); +} +/* +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) { + return make_uint48(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +*/ + +//static __forceinline__ __device__ uint816 operator^ (const uint816 &a, const uint816 &b) { +// return make_uint816(a.s0 ^ b.s0, a.s1 ^ b.s1); +//} + +static __forceinline__ __device__ uint816 operator+ (const uint816 &a, const uint816 &b) { + return make_uint816(a.s0 + b.s0, a.s1 + b.s1); +} + + +//static __forceinline__ __device__ uint1632 operator^ (const uint1632 &a, const uint1632 &b) { +// return make_uint1632(a.s0 ^ b.s0, a.s1 ^ b.s1); +//} + + +//static __forceinline__ __device__ uintx64 operator^ (const uintx64 &a, const uintx64 &b) { +// return make_uintx64(a.s0 ^ b.s0, a.s1 ^ b.s1); +//} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint2_16 operator^ (const uint2_16 &a, const uint2_16 &b) 
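+/* editor's note: lane-wise xor over all 16 uint2 words, i.e. one full 128-byte row */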
{ + return make_uint2_16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ uint2_16 operator+ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uintx64bis operator^ (const uintx64bis &a, const uintx64bis &b) { + return make_uintx64bis(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ uintx64bis operator+ (const uintx64bis &a, const uintx64bis &b) { + return make_uintx64bis(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) { a = a ^ b; } +//static __forceinline__ __device__ void operator^= (uintx64 &a, const uintx64 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx64bis &a, const uintx64bis &b) { a = a ^ b; } + +//static __forceinline__ __device__ void operator^= (uint816 &a, const uint816 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint816 &a, const uint816 &b) { a = a + b; } + + +//static __forceinline__ __device__ void operator^= (uint48 &a, const uint48 &b) { a = a ^ b; } + +//static __forceinline__ __device__ void operator+= (uint48 &a, const uint48 &b) { a = a + b; } + + +static __forceinline__ __device__ void operator^= (uint28 &a, const uint28 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint28 &a, const uint28 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uint2_8 &a, const uint2_8 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint2_8 &a, const uint2_8 &b) { a = a + b; } + + + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) { a = a + b; } + + +//static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (ulonglong16 
&a, const ulonglong16 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator+= (ulonglong4 &a, const ulonglong4 &b) { a = a + b; }
+
+static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; }
+static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) { a = a + b; }
+
+static __forceinline__ __device__
+ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b)
+{
+ return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3);
+}
+static __forceinline__ __device__
+ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b)
+{
+ return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3);
+}
+
+
+static __forceinline__ __device__
+ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b)
+{
+ return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b)
+{
+ return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b)
+{
+ return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b)
+{
+ return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b)
+{
+ return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b)
+{
+ return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi);
+}
+
+
+static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) {
+ return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5);
+}
+
+static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) {
+ return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5);
+}
+
+
+static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) { a = a ^ b; }
+
+
+static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= 
(ulonglong16to32 &a, const ulonglong16to32 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a ^ b; }
+
+
+static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) { a = a ^ b; }
+
+#if __CUDA_ARCH__ < 320
+
+#define rotate ROTL32
+#define rotateR ROTR32
+
+#else
+
+static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift)
+{
+ uint4 ret;
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift));
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift));
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift));
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift));
+ return ret;
+}
+
+static __forceinline__ __device__ uint4 rotate4R(uint4 vec4, uint32_t shift)
+{
+ uint4 ret;
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift));
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift));
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift));
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift));
+ return ret;
+}
+
+static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift)
+{
+ uint32_t ret;
+ asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
+ return ret;
+}
+
+
+static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift)
+{
+ uint32_t ret;
+ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
+ return ret;
+}
+
+
+
+static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr)
+{
+ uint8 test;
+ asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr));
+ asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
+static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr)
+{
+ uint32_t test;
+ asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+static __device__ __inline__ uint64_t __ldgtoint64(const uint8_t *ptr)
+{
+ uint64_t test;
+ asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+
+static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr)
+{
+ uint32_t test;
+ asm volatile ("{\n\t"
+ ".reg .u8 a,b,c,d; \n\t"
+ "ld.global.nc.u8 a,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ "ld.global.nc.u8 d,[%1+3]; \n\t"
+ "mov.b32 %0,{a,b,c,d}; }\n\t"
+ : "=r"(test) : __LDG_PTR(ptr));
+ return (test);
+}
+
+static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr)
+{
+ uint64_t test;
+ asm volatile ("{\n\t"
+ ".reg .u8 a,b,c,d,e,f,g,h; \n\t"
+ ".reg .u32 i,j; \n\t"
+ "ld.global.nc.u8 a,[%1]; \n\t"
+ "ld.global.nc.u8 b,[%1+1]; \n\t"
+ "ld.global.nc.u8 c,[%1+2]; \n\t"
+ 
"ld.global.nc.u8 d,[%1+3]; \n\t" + "ld.global.nc.u8 e,[%1+4]; \n\t" + "ld.global.nc.u8 f,[%1+5]; \n\t" + "ld.global.nc.u8 g,[%1+6]; \n\t" + "ld.global.nc.u8 h,[%1+7]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b32 j,{e,f,g,h}; \n\t" + "mov.b64 %0,{i,j}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr) +{ + uint32_t zero = 0; + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + ".reg .u32 i; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b64 %0,{i,%1}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr), "r"(zero)); + return (test); +} + + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr) +{ + uint32_t test; + asm("{\n\t" + ".reg .u8 e,b,c,d; \n\t" + "ld.global.nc.u8 e,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{e,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +#endif /* __CUDA_ARCH__ < 320 */ + +static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, const uint32_t shift) +{ + uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[8] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s6); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[7] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s5); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[6] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s4); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[5] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s3); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[4] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s2); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[3] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s1); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[2] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s0); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[1] = cuda_swab32(truc); + asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); + ret[0] = cuda_swab32(truc); + +} + +#define shift256R3(ret,vec4, shift) \ +{ \ + \ +uint32_t truc=0,truc2=cuda_swab32(vec4.s7),truc3=0; \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[8] = cuda_swab32(truc); \ +truc2=cuda_swab32(vec4.s6);truc3=cuda_swab32(vec4.s7); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[7] = cuda_swab32(truc); \ +truc2=cuda_swab32(vec4.s5);truc3=cuda_swab32(vec4.s6); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[6] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s4); truc3 = cuda_swab32(vec4.s5); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[5] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s3); truc3 = cuda_swab32(vec4.s4); \ + 
asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[4] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s2); truc3 = cuda_swab32(vec4.s3); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[3] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s1); truc3 = cuda_swab32(vec4.s2); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[2] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s0); truc3 = cuda_swab32(vec4.s1); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[1] = cuda_swab32(truc); \ +truc3 = cuda_swab32(vec4.s0); \ + asm volatile ("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); \ + ret[0] = cuda_swab32(truc); \ + \ + \ +} + +static __device__ __inline__ uint32 __ldg32b(const uint32 *ptr) +{ + uint32 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.lo.s0), "=r"(ret.lo.s1), "=r"(ret.lo.s2), "=r"(ret.lo.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.lo.s4), "=r"(ret.lo.s5), "=r"(ret.lo.s6), "=r"(ret.lo.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.lo.s8), "=r"(ret.lo.s9), "=r"(ret.lo.sa), "=r"(ret.lo.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.lo.sc), "=r"(ret.lo.sd), "=r"(ret.lo.se), "=r"(ret.lo.sf) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.hi.s0), "=r"(ret.hi.s1), "=r"(ret.hi.s2), "=r"(ret.hi.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.hi.s4), "=r"(ret.hi.s5), "=r"(ret.hi.s6), "=r"(ret.hi.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.hi.s8), "=r"(ret.hi.s9), "=r"(ret.hi.sa), "=r"(ret.hi.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.hi.sc), "=r"(ret.hi.sd), "=r"(ret.hi.se), "=r"(ret.hi.sf) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uint16 __ldg16b(const uint16 *ptr) +{ + uint16 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0), "=r"(ret.s1), "=r"(ret.s2), "=r"(ret.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s4), "=r"(ret.s5), "=r"(ret.s6), "=r"(ret.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s8), "=r"(ret.s9), "=r"(ret.sa), "=r"(ret.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.sc), "=r"(ret.sd), "=r"(ret.se), "=r"(ret.sf) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : 
"=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64 __ldg32c(const uintx64 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : 
"=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ ulonglong2 __ldg2(const ulonglong2 *ptr) +{ + ulonglong2 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + ulonglong4 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ void ldg4(const ulonglong4 *ptr,ulonglong4 *ret) +{ + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); +} +static __device__ __inline__ void ldg4xor(const ulonglong4 *ptr, ulonglong4 *ret, ulonglong4 *state) +{ + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + state[0] ^= ret[0]; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + state[1] ^= ret[1]; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); + state[2] ^= ret[2]; +} + + +static __device__ __inline__ uint28 __ldg4(const uint28 *ptr) +{ + uint28 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), 
"=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uint48 __ldg4(const uint48 *ptr) +{ + uint48 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ void ldg4(const uint28 *ptr, uint28 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} +static __device__ __inline__ void ldg4xor(const uint28 *ptr, uint28 *ret,uint28* state) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + state[0].x ^= ret[0].x; state[0].y ^= ret[0].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + state[0].z ^= ret[0].z; state[0].w ^= ret[0].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + state[1].x ^= ret[1].x; state[1].y ^= ret[1].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + state[1].z ^= ret[1].z; state[1].w ^= ret[1].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + state[2].x ^= ret[2].x; state[2].y ^= ret[2].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); + state[2].z ^= ret[2].z; state[2].w ^= ret[2].w; +} + + +static __device__ __inline__ ulonglong2to8 __ldg2to8(const ulonglong2to8 *ptr) +{ + ulonglong2to8 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.l0.x), "=l"(ret.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.l1.x), "=l"(ret.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.l2.x), "=l"(ret.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.l3.x), "=l"(ret.l3.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ ulonglong8to16 __ldg8to16(const ulonglong8to16 *ptr) +{ + ulonglong8to16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.lo.l0.x), "=l"(ret.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.lo.l1.x), "=l"(ret.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.lo.l2.x), 
"=l"(ret.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.lo.l3.x), "=l"(ret.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.hi.l0.x), "=l"(ret.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.hi.l1.x), "=l"(ret.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.hi.l2.x), "=l"(ret.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.hi.l3.x), "=l"(ret.hi.l3.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ ulonglonglong __ldgxtralong(const ulonglonglong *ptr) +{ + ulonglonglong ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ uint8 ldg8bis(const uint8 *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ ulonglong16 __ldg32(const ulonglong4 *ptr) +{ + ulonglong16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s0.z), "=l"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s1.z), "=l"(ret.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s2.z), "=l"(ret.s2.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s3.z), "=l"(ret.s3.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s4.z), "=l"(ret.s4.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.s5.z), "=l"(ret.s5.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.s6.z), "=l"(ret.s6.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.s7.z), "=l"(ret.s7.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64bis __ldg32(const uint28 *ptr) +{ + uintx64bis ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x.x), "=r"(ret.s0.x.y), "=r"(ret.s0.y.x), 
"=r"(ret.s0.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.z.x), "=r"(ret.s0.z.y), "=r"(ret.s0.w.x), "=r"(ret.s0.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s1.x.x), "=r"(ret.s1.x.y), "=r"(ret.s1.y.x), "=r"(ret.s1.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s1.z.x), "=r"(ret.s1.z.y), "=r"(ret.s1.w.x), "=r"(ret.s1.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s2.x.x), "=r"(ret.s2.x.y), "=r"(ret.s2.y.x), "=r"(ret.s2.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s2.z.x), "=r"(ret.s2.z.y), "=r"(ret.s2.w.x), "=r"(ret.s2.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s3.x.x), "=r"(ret.s3.x.y), "=r"(ret.s3.y.x), "=r"(ret.s3.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s3.z.x), "=r"(ret.s3.z.y), "=r"(ret.s3.w.x), "=r"(ret.s3.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s4.x.x), "=r"(ret.s4.x.y), "=r"(ret.s4.y.x), "=r"(ret.s4.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s4.z.x), "=r"(ret.s4.z.y), "=r"(ret.s4.w.x), "=r"(ret.s4.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s5.x.x), "=r"(ret.s5.x.y), "=r"(ret.s5.y.x), "=r"(ret.s5.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s5.z.x), "=r"(ret.s5.z.y), "=r"(ret.s5.w.x), "=r"(ret.s5.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s6.x.x), "=r"(ret.s6.x.y), "=r"(ret.s6.y.x), "=r"(ret.s6.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s6.z.x), "=r"(ret.s6.z.y), "=r"(ret.s6.w.x), "=r"(ret.s6.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s7.x.x), "=r"(ret.s7.x.y), "=r"(ret.s7.y.x), "=r"(ret.s7.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s7.z.x), "=r"(ret.s7.z.y), "=r"(ret.s7.w.x), "=r"(ret.s7.w.y) : __LDG_PTR(ptr)); + return ret; +} + + +static __forceinline__ __device__ uint8 swapvec(const uint8 &buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + return vec; +} + + +static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + vec.s8 = cuda_swab32(buf[0].s8); + vec.s9 = cuda_swab32(buf[0].s9); + vec.sa = cuda_swab32(buf[0].sa); + vec.sb = cuda_swab32(buf[0].sb); + vec.sc = cuda_swab32(buf[0].sc); + vec.sd = cuda_swab32(buf[0].sd); + vec.se = 
cuda_swab32(buf[0].se);
+    vec.sf = cuda_swab32(buf[0].sf);
+    return vec;
+}
+
+static __forceinline__ __device__ uint16 swapvec(const uint16 &buf)
+{
+    uint16 vec;
+    vec.s0 = cuda_swab32(buf.s0);
+    vec.s1 = cuda_swab32(buf.s1);
+    vec.s2 = cuda_swab32(buf.s2);
+    vec.s3 = cuda_swab32(buf.s3);
+    vec.s4 = cuda_swab32(buf.s4);
+    vec.s5 = cuda_swab32(buf.s5);
+    vec.s6 = cuda_swab32(buf.s6);
+    vec.s7 = cuda_swab32(buf.s7);
+    vec.s8 = cuda_swab32(buf.s8);
+    vec.s9 = cuda_swab32(buf.s9);
+    vec.sa = cuda_swab32(buf.sa);
+    vec.sb = cuda_swab32(buf.sb);
+    vec.sc = cuda_swab32(buf.sc);
+    vec.sd = cuda_swab32(buf.sd);
+    vec.se = cuda_swab32(buf.se);
+    vec.sf = cuda_swab32(buf.sf);
+    return vec;
+}
+
+// broadcast a uint28 from the given warp lane to all lanes
+static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane)
+{
+    uint28 res;
+    res.x.x = __shfl(var.x.x, lane);
+    res.x.y = __shfl(var.x.y, lane);
+    res.y.x = __shfl(var.y.x, lane);
+    res.y.y = __shfl(var.y.y, lane);
+    res.z.x = __shfl(var.z.x, lane);
+    res.z.y = __shfl(var.z.y, lane);
+    res.w.x = __shfl(var.w.x, lane);
+    res.w.y = __shfl(var.w.y, lane);
+    return res;
+}
+
+static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane)
+{
+    ulonglong4 res;
+    uint2 temp;
+    temp = vectorize(var.x);
+    temp.x = __shfl(temp.x, lane);
+    temp.y = __shfl(temp.y, lane);
+    res.x = devectorize(temp);
+    temp = vectorize(var.y);
+    temp.x = __shfl(temp.x, lane);
+    temp.y = __shfl(temp.y, lane);
+    res.y = devectorize(temp);
+    temp = vectorize(var.z);
+    temp.x = __shfl(temp.x, lane);
+    temp.y = __shfl(temp.y, lane);
+    res.z = devectorize(temp);
+    temp = vectorize(var.w);
+    temp.x = __shfl(temp.x, lane);
+    temp.y = __shfl(temp.y, lane);
+    res.w = devectorize(temp);
+    return res;
+}
+
+#endif // #ifndef CUDA_VECTOR_H
diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu
new file mode 100644
index 0000000..70869d6
--- /dev/null
+++ b/lyra2/cuda_lyra2v2.cu
@@ -0,0 +1,520 @@
+#include <stdio.h>
+#include <stdint.h>
+
+#include "cuda_lyra2_vectors.h"
+
+#define TPB 16
+
+#define Nrow 4
+#define Ncol 4
+
+// SM 5.x+ works on uint2 pairs (vectype uint28); older cards use plain 64-bit words
+#if __CUDA_ARCH__ < 500
+#define vectype ulonglong4
+#define u64type uint64_t
+#define memshift 4
+#else
+#define u64type uint2
+#define vectype uint28
+#define memshift 3
+#endif
+
+__device__ vectype *DMatrix;
+
+#ifdef __CUDA_ARCH__
+// Blake2b G function (no message words), as used by the Lyra2 sponge
+static __device__ __forceinline__
+void Gfunc_v35(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
+{
+	a += b; d ^= a; d = SWAPUINT2(d);
+	c += d; b ^= c; b = ROR24(b);
+	a += b; d ^= a; d = ROR16(d);
+	c += d; b ^= c; b = ROR2(b, 63);
+}
+
+#if __CUDA_ARCH__ < 500
+static __device__ __forceinline__
+void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
+{
+	a += b; d ^= a; d = ROTR64(d, 32);
+	c += d; b ^= c; b = ROTR64(b, 24);
+	a += b; d ^= a; d = ROTR64(d, 16);
+	c += d; b ^= c; b = ROTR64(b, 63);
+}
+#endif
+
+// one sponge round: G on the four columns, then G on the four diagonals
+static __device__ __forceinline__
+void round_lyra_v35(vectype* s)
+{
+	Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x);
+	Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y);
+	Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z);
+	Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w);
+
+	Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w);
+	Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x);
+	Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y);
+	Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
+}
+#else
+#define round_lyra_v35(s) {}
+#endif
+
+static __device__ __forceinline__
+void reduceDuplex(vectype state[4], uint32_t thread)
+{
+	vectype state1[3];
+	uint32_t ps1 = (Nrow * Ncol *
memshift * thread); + uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + i*memshift; + uint32_t s2 = ps2 - i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix+s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t s2 = ps2 - Nrow * i *memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3],state1[3]; + + uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); + + //#pragma unroll 1 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + i*memshift; + uint32_t s2 = ps2 + i*memshift; + uint32_t s3 = ps3 - i*memshift; + + for (int j = 0; j < 3; j++) + state1[j]= __ldg4(&(DMatrix + s1)[j]); + for (int j = 0; j < 3; j++) + state2[j]= __ldg4(&(DMatrix + s2)[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + uint32_t s3 = ps3 - Nrow*i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1 )[j]); + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2 )[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + + 
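+/*
+ * Note on the indexing used above (inferred from the code; not stated in the
+ * original submission): a thread's 4x4 matrix of 'memshift'-wide cells starts
+ * at base = Nrow * Ncol * memshift * thread. The V2 routines keep each row
+ * contiguous, placing cell (row, col) at
+ *     base + memshift * (Ncol * row + col)
+ * while the V3 routines interleave the rows, placing it at
+ *     base + memshift * (Nrow * col + row)
+ * e.g. with memshift = 3, cell (2,1) lands at base + 27 in the V2 layout but
+ * at base + 18 in the V3 layout. The interleaved form is presumably meant to
+ * give the pre-Maxwell (_v3) kernel a friendlier memory access pattern.
+ */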
+static __device__ __forceinline__ +void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3],state2[3]; + uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); + + //#pragma unroll 1 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + i*memshift; + uint32_t s2 = ps2 + i*memshift; + uint32_t s3 = ps3 + i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j]=state2[j]; + } + + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + + #pragma nounroll + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + + +#if __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(128, 1) +#elif __CUDA_ARCH__ == 500 +__global__ __launch_bounds__(16, 1) +#else +__global__ __launch_bounds__(TPB, 1) +#endif +void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + uint28 blake2b_IV[2]; + uint28 padding[2]; + + if (threadIdx.x == 0) { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85 , + 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a , + 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c , + 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0 , + 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 + ); + } + +#if __CUDA_ARCH__ == 350 + if (thread < threads) +#endif + { + ((uint2*)state)[0] = 
__ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + state[1] = state[0]; + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + //((vectype*)outputHash)[thread] = state[0]; + + } //thread +} + +#if __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(64, 1) +#elif __CUDA_ARCH__ == 500 +__global__ __launch_bounds__(32, 1) +#else +__global__ __launch_bounds__(TPB, 1) +#endif +void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + vectype state[4]; + uint28 blake2b_IV[2]; + uint28 padding[2]; + + if (threadIdx.x == 0) { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85 , + 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a , + 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c , + 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0 , + 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 + ); + } + +#if __CUDA_ARCH__ == 350 + if (thread < threads) +#endif + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + + state[1] = state[0]; + + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread); + + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 - memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplex(state, thread); + + reduceDuplexRowSetupV2(1, 0, 2, state, thread); + reduceDuplexRowSetupV2(2, 1, 3, state, thread); + + uint32_t rowa; + int prev=3; + + for (int i = 0; i < 4; i++) { + rowa = 
((uint2*)state)[0].x & 3;
+			reduceDuplexRowtV2(prev, rowa, i, state, thread);
+			prev = i;
+		}
+
+		uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread);
+
+		for (int j = 0; j < 3; j++)
+			state[j] ^= __ldg4(&(DMatrix + shift)[j]);
+
+		for (int i = 0; i < 12; i++)
+			round_lyra_v35(state);
+
+		outputHash[thread] = ((uint2*)state)[0];
+		outputHash[thread + threads] = ((uint2*)state)[1];
+		outputHash[thread + 2 * threads] = ((uint2*)state)[2];
+		outputHash[thread + 3 * threads] = ((uint2*)state)[3];
+		//((vectype*)outputHash)[thread] = state[0];
+
+	} //thread
+}
+
+__host__
+void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *hash)
+{
+	// store the device pointer to the scratch matrix in the DMatrix symbol
+	cudaMemcpyToSymbol(DMatrix, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+{
+	uint32_t tpb;
+	if (device_sm[device_map[thr_id]] < 500)
+		tpb = 64;
+	else if (device_sm[device_map[thr_id]] == 500)
+		tpb = 32;
+	else
+		tpb = TPB;
+
+	dim3 grid((threads + tpb - 1) / tpb);
+	dim3 block(tpb);
+
+	if (device_sm[device_map[thr_id]] >= 500)
+		lyra2v2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, (uint2*)d_outputHash);
+	else
+		lyra2v2_gpu_hash_32_v3 <<<grid, block>>> (threads, startNounce, (uint2*)d_outputHash);
+
+	MyStreamSynchronize(NULL, order, thr_id);
+}
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index e5fe1fe..e1a8d08 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -10,6 +10,7 @@ extern "C" {
 #include "cuda_helper.h"
 
 static uint64_t* d_hash[MAX_GPUS];
+static uint64_t* d_hash2[MAX_GPUS];
 
 extern void blake256_cpu_init(int thr_id, uint32_t threads);
 extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
@@ -19,6 +20,7 @@ extern void keccak256_cpu_init(int thr_id, uint32_t threads);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
 
+extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *hash);
 extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 
 extern void groestl256_cpu_init(int thr_id, uint32_t threads);
@@ -26,7 +28,7 @@ extern void groestl256_setTarget(const void *ptarget);
 extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
 extern uint32_t groestl256_getSecNonce(int thr_id, int num);
 
-extern "C" void lyra2_hash(void *state, const void *input)
+extern "C" void lyra2re_hash(void *state, const void *input)
 {
 	sph_blake256_context ctx_blake;
 	sph_keccak256_context ctx_keccak;
@@ -79,7 +81,11 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 		skein256_cpu_init(thr_id, throughput);
 		groestl256_cpu_init(thr_id, throughput);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
+		// DMatrix
+		cudaMalloc(&d_hash2[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput);
+		lyra2_cpu_init(thr_id, throughput, d_hash2[thr_id]);
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
 
 		init[thr_id] = true;
 	}
@@ -108,7 +114,7 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 			uint32_t _ALIGN(64) vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 
-			lyra2_hash(vhash64, endiandata);
+			lyra2re_hash(vhash64, endiandata);
 
 			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
@@ -116,7 +122,7 @@ extern "C"
int scanhash_lyra2(int thr_id, uint32_t *pdata, if (secNonce != UINT32_MAX) { be32enc(&endiandata[19], secNonce); - lyra2_hash(vhash64, endiandata); + lyra2re_hash(vhash64, endiandata); if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { if (opt_debug) applog(LOG_BLUE, "GPU #%d: found second nonce %08x", device_map[thr_id], secNonce); diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu new file mode 100644 index 0000000..04024dc --- /dev/null +++ b/lyra2/lyra2REv2.cu @@ -0,0 +1,164 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_skein.h" +#include "sph/sph_keccak.h" +#include "sph/sph_cubehash.h" +#include "lyra2/Lyra2.h" +} + +#include "miner.h" +#include "cuda_helper.h" + + +static _ALIGN(64) uint64_t *d_hash[MAX_GPUS]; +static uint64_t *d_hash2[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void keccak256_cpu_init(int thr_id, uint32_t threads); +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void skein256_cpu_init(int thr_id, uint32_t threads); + +extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* matrix); + +extern void bmw256_setTarget(const void *ptarget); +extern void bmw256_cpu_init(int thr_id, uint32_t threads); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); + +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash); + +void lyra2v2_hash(void *state, const void *input) +{ + uint32_t hashA[8], hashB[8]; + + sph_blake256_context ctx_blake; + sph_keccak256_context ctx_keccak; + sph_skein256_context ctx_skein; + sph_bmw256_context ctx_bmw; + sph_cubehash256_context ctx_cube; + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, hashA, 32); + sph_keccak256_close(&ctx_keccak, hashB); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_skein256_init(&ctx_skein); + sph_skein256(&ctx_skein, hashB, 32); + sph_skein256_close(&ctx_skein, hashA); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + + sph_bmw256_init(&ctx_bmw); + sph_bmw256(&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); + + memcpy(state, hashA, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 
18 : 17;
+	unsigned int defthr = 1U << intensity;
+	uint32_t throughput = device_intensity(device_map[thr_id], __func__, defthr);
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x00ff;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		//if (opt_n_gputhreads == 1)
+		//	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+		blake256_cpu_init(thr_id, throughput);
+		keccak256_cpu_init(thr_id, throughput);
+		skein256_cpu_init(thr_id, throughput);
+		bmw256_cpu_init(thr_id, throughput);
+
+		// DMatrix: per-thread Lyra2 scratch matrix (4 x 4 cells)
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash2[thr_id], 16 * 4 * 4 * sizeof(uint64_t) * throughput));
+		lyra2v2_cpu_init(thr_id, throughput, d_hash2[thr_id]);
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)throughput * 32));
+
+		init[thr_id] = true;
+	}
+
+	uint32_t endiandata[20];
+	for (int k = 0; k < 20; k++)
+		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+
+	blake256_cpu_setBlock_80(pdata);
+	bmw256_setTarget(ptarget);
+
+	do {
+		int order = 0;
+		uint32_t foundNonces[2] = { 0, 0 };
+
+		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
+
+		bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces);
+
+		if (foundNonces[0] != 0)
+		{
+			//CUDA_SAFE_CALL(cudaGetLastError());
+			const uint32_t Htarg = ptarget[7];
+			uint32_t vhash64[8];
+			be32enc(&endiandata[19], foundNonces[0]);
+			lyra2v2_hash(vhash64, endiandata);
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget))
+			{
+				int res = 1;
+				// check if there were other candidates...
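+				// Reporting convention (assumed from the rest of ccminer, not
+				// stated in this patch): the first valid nonce is returned in
+				// pdata[19]; if bmw256_cpu_hash_32 produced a second candidate
+				// it goes to pdata[21] and the return value is bumped to 2.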
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (foundNonces[1] != 0)
+				{
+					pdata[21] = foundNonces[1];
+					res++;
+					if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nonce %08x", thr_id, foundNonces[1]);
+				}
+				pdata[19] = foundNonces[0];
+				if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nonce %08x", thr_id, foundNonces[0]);
+				MyStreamSynchronize(NULL, 0, device_map[thr_id]);
+				return res;
+			}
+			else
+			{
+				if (vhash64[7] > Htarg) // don't show message if it is equal but fails fulltest
+					applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", thr_id);
+			}
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));
+
+	*hashes_done = pdata[19] - first_nonce + 1;
+	MyStreamSynchronize(NULL, 0, device_map[thr_id]);
+	return 0;
+}
diff --git a/miner.h b/miner.h
index 0d2b390..835eb39 100644
--- a/miner.h
+++ b/miner.h
@@ -316,8 +316,10 @@ extern int scanhash_fresh(int thr_id, uint32_t *pdata,
 	unsigned long *hashes_done);
 
 extern int scanhash_lyra2(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done);
+	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
+
+extern int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
 
 extern int scanhash_neoscrypt(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
@@ -781,7 +783,8 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void keccak256_hash(void *state, const void *input);
 unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
-void lyra2_hash(void *state, const void *input);
+void lyra2re_hash(void *state, const void *input);
+void lyra2v2_hash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
 void nist5hash(void *state, const void *input);
diff --git a/util.cpp b/util.cpp
index 1babd9e..8e7bc7b 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1841,9 +1841,12 @@ void print_hash_tests(void)
 	luffa_hash(&hash[0], &buf[0]);
 	printpfx("luffa", hash);
 
-	lyra2_hash(&hash[0], &buf[0]);
+	lyra2re_hash(&hash[0], &buf[0]);
 	printpfx("lyra2", hash);
 
+	lyra2v2_hash(&hash[0], &buf[0]);
+	printpfx("lyra2v2", hash);
+
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);