Tanguy Pruvot
9 years ago
18 changed files with 3328 additions and 902 deletions
@ -0,0 +1,320 @@ |
|||||||
|
#include <stdio.h> |
||||||
|
#include <memory.h> |
||||||
|
|
||||||
|
#include "cuda_helper.h" |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// die Message it Padding zur Berechnung auf der GPU |
||||||
|
__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) |
||||||
|
__constant__ uint32_t ZDH[16]; |
||||||
|
static uint32_t *d_gnounce[MAX_GPUS]; |
||||||
|
static uint32_t *d_GNonce[MAX_GPUS]; |
||||||
|
|
||||||
|
__constant__ uint32_t pTarget[8]; |
||||||
|
#define shl(x, n) ((x) << (n)) |
||||||
|
#define shr(x, n) ((x) >> (n)) |
||||||
|
//#define SHR(x, n) SHR2(x, n) |
||||||
|
//#define SHL(x, n) SHL2(x, n) |
||||||
|
|
||||||
|
#define ROTL32host(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) |
||||||
|
// #define SPH_ROTL32 SPH_ROTL32 |
||||||
|
#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) |
||||||
|
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23)) |
||||||
|
#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) |
||||||
|
#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) |
||||||
|
#define ss4(x) (shr((x), 1) ^ (x)) |
||||||
|
#define ss5(x) (shr((x), 2) ^ (x)) |
||||||
|
#define rs1(x) SPH_ROTL32((x), 3) |
||||||
|
#define rs2(x) SPH_ROTL32((x), 7) |
||||||
|
#define rs3(x) SPH_ROTL32((x), 13) |
||||||
|
#define rs4(x) SPH_ROTL32((x), 16) |
||||||
|
#define rs5(x) SPH_ROTL32((x), 19) |
||||||
|
#define rs6(x) SPH_ROTL32((x), 23) |
||||||
|
#define rs7(x) SPH_ROTL32((x), 27) |
||||||
|
|
||||||
|
/* Message expansion function 1 */ |
||||||
|
__forceinline__ __device__ uint32_t expand32_1(int i, uint32_t *M32, uint32_t *H, uint32_t *Q) |
||||||
|
{ |
||||||
|
#undef SPH_ROTL32 |
||||||
|
#define SPH_ROTL32 ROTL32 |
||||||
|
return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) |
||||||
|
+ ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) |
||||||
|
+ ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) |
||||||
|
+ ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) |
||||||
|
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); |
||||||
|
#undef SPH_ROTL32 |
||||||
|
} |
||||||
|
|
||||||
|
/* Message expansion function 2 */ |
||||||
|
__forceinline__ __device__ uint32_t expand32_2(int i, uint32_t *M32, uint32_t *H, uint32_t *Q) |
||||||
|
{ |
||||||
|
#undef SPH_ROTL32 |
||||||
|
#define SPH_ROTL32 ROTL32 |
||||||
|
return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13]) |
||||||
|
+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9]) |
||||||
|
+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5]) |
||||||
|
+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]) |
||||||
|
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); |
||||||
|
#undef SPH_ROTL32 |
||||||
|
} |
||||||
|
|
||||||
|
__forceinline__ __device__ void Compression256(uint32_t * M32, uint32_t * H) |
||||||
|
{ |
||||||
|
#undef SPH_ROTL32 |
||||||
|
#define SPH_ROTL32 ROTL32 |
||||||
|
int i; |
||||||
|
uint32_t XL32, XH32, Q[32]; |
||||||
|
|
||||||
|
|
||||||
|
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); |
||||||
|
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); |
||||||
|
Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); |
||||||
|
Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); |
||||||
|
Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); |
||||||
|
Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); |
||||||
|
Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); |
||||||
|
Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); |
||||||
|
Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); |
||||||
|
Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); |
||||||
|
Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); |
||||||
|
Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); |
||||||
|
Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]); |
||||||
|
Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); |
||||||
|
Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); |
||||||
|
Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); |
||||||
|
|
||||||
|
/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/ |
||||||
|
Q[0] = ss0(Q[0]) + H[1]; |
||||||
|
Q[1] = ss1(Q[1]) + H[2]; |
||||||
|
Q[2] = ss2(Q[2]) + H[3]; |
||||||
|
Q[3] = ss3(Q[3]) + H[4]; |
||||||
|
Q[4] = ss4(Q[4]) + H[5]; |
||||||
|
Q[5] = ss0(Q[5]) + H[6]; |
||||||
|
Q[6] = ss1(Q[6]) + H[7]; |
||||||
|
Q[7] = ss2(Q[7]) + H[8]; |
||||||
|
Q[8] = ss3(Q[8]) + H[9]; |
||||||
|
Q[9] = ss4(Q[9]) + H[10]; |
||||||
|
Q[10] = ss0(Q[10]) + H[11]; |
||||||
|
Q[11] = ss1(Q[11]) + H[12]; |
||||||
|
Q[12] = ss2(Q[12]) + H[13]; |
||||||
|
Q[13] = ss3(Q[13]) + H[14]; |
||||||
|
Q[14] = ss4(Q[14]) + H[15]; |
||||||
|
Q[15] = ss0(Q[15]) + H[0]; |
||||||
|
|
||||||
|
/* This is the Message expansion or f_1 in the documentation. */ |
||||||
|
/* It has 16 rounds. */ |
||||||
|
/* Blue Midnight Wish has two tunable security parameters. */ |
||||||
|
/* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */ |
||||||
|
/* The following relation for these parameters should is satisfied: */ |
||||||
|
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ |
||||||
|
|
||||||
|
for (i = 0; i<2; i++) |
||||||
|
Q[i + 16] = expand32_1(i + 16, M32, H, Q); |
||||||
|
|
||||||
|
for (i = 2; i<16; i++) |
||||||
|
Q[i + 16] = expand32_2(i + 16, M32, H, Q); |
||||||
|
|
||||||
|
/* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ |
||||||
|
/* 16 new variables that are prooduced in the Message Expansion part. */ |
||||||
|
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; |
||||||
|
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; |
||||||
|
|
||||||
|
|
||||||
|
/* This part is the function f_2 - in the documentation */ |
||||||
|
|
||||||
|
/* Compute the double chaining pipe for the next message block. */ |
||||||
|
H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]); |
||||||
|
H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]); |
||||||
|
H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); |
||||||
|
H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); |
||||||
|
H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]); |
||||||
|
H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]); |
||||||
|
H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]); |
||||||
|
H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]); |
||||||
|
|
||||||
|
H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]); |
||||||
|
H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]); |
||||||
|
H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]); |
||||||
|
H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]); |
||||||
|
H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]); |
||||||
|
H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]); |
||||||
|
H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); |
||||||
|
H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); |
||||||
|
|
||||||
|
#undef SPH_ROTL32 |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
__forceinline__ __device__ void Compression256_2(uint32_t * M32, uint32_t * H) |
||||||
|
{ |
||||||
|
#undef SPH_ROTL32 |
||||||
|
#define SPH_ROTL32 ROTL32 |
||||||
|
int i; |
||||||
|
uint32_t XL32, XH32, Q[32]; |
||||||
|
|
||||||
|
/* This part is the function f0 - in the documentation */ |
||||||
|
|
||||||
|
/* First we mix the message block *M32 (M in the documatation) */ |
||||||
|
/* with the previous double pipe *H. */ |
||||||
|
/* For a fixed previous double pipe, or fixed message block, this */ |
||||||
|
/* part is bijection. */ |
||||||
|
/* This transformation diffuses every one bit difference in 5 words. */ |
||||||
|
|
||||||
|
Q[0] = (H[5]) - (H[7]) + (H[10]) + (H[13]) + (0x280 ^ H[14]); |
||||||
|
Q[1] = (H[6]) - (H[8]) + (H[11]) + (0x280 ^ H[14]) - (H[15]); |
||||||
|
Q[2] = (M32[0] ^ H[0]) + (H[7]) + (H[9]) - (H[12]) + (H[15]); |
||||||
|
Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (H[8]) - (H[10]) + (H[13]); |
||||||
|
Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (H[9]) - (H[11]) - (0x280 ^ H[14]); |
||||||
|
Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (H[10]) - (H[12]) + (H[15]); |
||||||
|
Q[6] = (0x80 ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (H[11]) + (H[13]); |
||||||
|
Q[7] = (M32[1] ^ H[1]) - (0x80 ^ H[4]) - (H[5]) - (H[12]) - (0x280 ^ H[14]); |
||||||
|
Q[8] = (M32[2] ^ H[2]) - (H[5]) - (H[6]) + (H[13]) - (H[15]); |
||||||
|
Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (H[6]) - (H[7]) + (0x280 ^ H[14]); |
||||||
|
Q[10] = (H[8]) - (M32[1] ^ H[1]) - (0x80 ^ H[4]) - (H[7]) + (H[15]); |
||||||
|
Q[11] = (H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (H[5]) + (H[9]); |
||||||
|
Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (H[6]) - (H[9]) + (H[10]); |
||||||
|
Q[13] = (M32[2] ^ H[2]) + (0x80 ^ H[4]) + (H[7]) + (H[10]) + (H[11]); |
||||||
|
Q[14] = (M32[3] ^ H[3]) - (H[5]) + (H[8]) - (H[11]) - (H[12]); |
||||||
|
Q[15] = (H[12]) - (0x80 ^ H[4]) - (H[6]) - (H[9]) + (H[13]); |
||||||
|
|
||||||
|
/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/ |
||||||
|
|
||||||
|
Q[0] = ss0(Q[0]) + H[1]; |
||||||
|
Q[1] = ss1(Q[1]) + H[2]; |
||||||
|
Q[2] = ss2(Q[2]) + H[3]; |
||||||
|
Q[3] = ss3(Q[3]) + H[4]; |
||||||
|
Q[4] = ss4(Q[4]) + H[5]; |
||||||
|
Q[5] = ss0(Q[5]) + H[6]; |
||||||
|
Q[6] = ss1(Q[6]) + H[7]; |
||||||
|
Q[7] = ss2(Q[7]) + H[8]; |
||||||
|
Q[8] = ss3(Q[8]) + H[9]; |
||||||
|
Q[9] = ss4(Q[9]) + H[10]; |
||||||
|
Q[10] = ss0(Q[10]) + H[11]; |
||||||
|
Q[11] = ss1(Q[11]) + H[12]; |
||||||
|
Q[12] = ss2(Q[12]) + H[13]; |
||||||
|
Q[13] = ss3(Q[13]) + H[14]; |
||||||
|
Q[14] = ss4(Q[14]) + H[15]; |
||||||
|
Q[15] = ss0(Q[15]) + H[0]; |
||||||
|
|
||||||
|
/* This is the Message expansion or f_1 in the documentation. */ |
||||||
|
/* It has 16 rounds. */ |
||||||
|
/* Blue Midnight Wish has two tunable security parameters. */ |
||||||
|
/* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */ |
||||||
|
/* The following relation for these parameters should is satisfied: */ |
||||||
|
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ |
||||||
|
|
||||||
|
for (i = 0; i<2; i++) |
||||||
|
Q[i + 16] = expand32_1(i + 16, M32, H, Q); |
||||||
|
|
||||||
|
for (i = 2; i<16; i++) |
||||||
|
Q[i + 16] = expand32_2(i + 16, M32, H, Q); |
||||||
|
|
||||||
|
/* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ |
||||||
|
/* 16 new variables that are prooduced in the Message Expansion part. */ |
||||||
|
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; |
||||||
|
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; |
||||||
|
|
||||||
|
|
||||||
|
/* This part is the function f_2 - in the documentation */ |
||||||
|
|
||||||
|
/* Compute the double chaining pipe for the next message block. */ |
||||||
|
H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]); |
||||||
|
H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]); |
||||||
|
H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); |
||||||
|
H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); |
||||||
|
H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]); |
||||||
|
H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]); |
||||||
|
H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]); |
||||||
|
H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]); |
||||||
|
|
||||||
|
H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]); |
||||||
|
H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]); |
||||||
|
H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]); |
||||||
|
H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]); |
||||||
|
H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]); |
||||||
|
H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]); |
||||||
|
H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); |
||||||
|
H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); |
||||||
|
|
||||||
|
#undef SPH_ROTL32 |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
#define TPB 512 |
||||||
|
__global__ __launch_bounds__(TPB, 2) |
||||||
|
void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *const __restrict__ nonceVector) |
||||||
|
{ |
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||||
|
if (thread < threads) |
||||||
|
{ |
||||||
|
uint32_t dh[16] = { |
||||||
|
(0x40414243), (0x44454647), |
||||||
|
(0x48494A4B), (0x4C4D4E4F), |
||||||
|
(0x50515253), (0x54555657), |
||||||
|
(0x58595A5B), (0x5C5D5E5F), |
||||||
|
(0x60616263), (0x64656667), |
||||||
|
(0x68696A6B), (0x6C6D6E6F), |
||||||
|
(0x70717273), (0x74757677), |
||||||
|
(0x78797A7B), (0x7C7D7E7F) |
||||||
|
}; |
||||||
|
|
||||||
|
|
||||||
|
uint32_t final_s[16] = { |
||||||
|
(0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2), |
||||||
|
(0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5), |
||||||
|
(0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8), |
||||||
|
(0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab), |
||||||
|
(0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae), |
||||||
|
(0xaaaaaaaf) |
||||||
|
}; |
||||||
|
uint32_t message[16]={0}; |
||||||
|
LOHI(message[0], message[1], __ldg(&g_hash[thread])); |
||||||
|
LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads])); |
||||||
|
LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads])); |
||||||
|
LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads])); |
||||||
|
|
||||||
|
message[8]=0x80; |
||||||
|
message[14]=0x100; |
||||||
|
Compression256(message, dh); |
||||||
|
Compression256(dh, final_s); |
||||||
|
|
||||||
|
if (((uint64_t*)final_s)[7] <= ((uint64_t*)pTarget)[3]) |
||||||
|
{ |
||||||
|
uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread); |
||||||
|
if (tmp != 0) |
||||||
|
nonceVector[1] = tmp; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
__host__ |
||||||
|
void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash,uint32_t *resultnonces) |
||||||
|
{ |
||||||
|
cudaMemset(d_GNonce[thr_id], 0x0, 2 * sizeof(uint32_t)); |
||||||
|
const uint32_t threadsperblock = TPB; |
||||||
|
|
||||||
|
// berechne wie viele Thread Blocks wir brauchen |
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock); |
||||||
|
dim3 block(threadsperblock); |
||||||
|
|
||||||
|
bmw256_gpu_hash_32 << <grid, block >> >(threads, startNounce, g_hash, d_GNonce[thr_id]); |
||||||
|
cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||||
|
resultnonces[0] = *(d_gnounce[thr_id]); |
||||||
|
resultnonces[1] = *(d_gnounce[thr_id] + 1); |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
__host__ |
||||||
|
void bmw256_cpu_init(int thr_id, uint32_t threads) |
||||||
|
{ |
||||||
|
cudaMalloc(&d_GNonce[thr_id], 2 * sizeof(uint32_t)); |
||||||
|
cudaMallocHost(&d_gnounce[thr_id], 2 * sizeof(uint32_t)); |
||||||
|
} |
||||||
|
|
||||||
|
__host__ |
||||||
|
void bmw256_setTarget(const void *pTargetIn) |
||||||
|
{ |
||||||
|
cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); |
||||||
|
} |
@ -0,0 +1,260 @@ |
|||||||
|
#include "cuda_helper.h" |
||||||
|
|
||||||
|
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ |
||||||
|
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ < 350 |
||||||
|
#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) |
||||||
|
#else |
||||||
|
#define LROT(x, bits) __funnelshift_l(x, x, bits) |
||||||
|
#endif |
||||||
|
|
||||||
|
#define ROTATEUPWARDS7(a) LROT(a,7) |
||||||
|
#define ROTATEUPWARDS11(a) LROT(a,11) |
||||||
|
|
||||||
|
//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } |
||||||
|
#define SWAP(a,b) { a ^= b; b ^=a; a ^=b;} |
||||||
|
__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) |
||||||
|
{ |
||||||
|
int r; |
||||||
|
int j; |
||||||
|
int k; |
||||||
|
int l; |
||||||
|
int m; |
||||||
|
|
||||||
|
#pragma unroll 2 |
||||||
|
for (r = 0; r < CUBEHASH_ROUNDS; ++r) { |
||||||
|
|
||||||
|
/* "add x_0jklm into x_1jklmn modulo 2^32" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
x[1][j][k][l][m] += x[0][j][k][l][m]; |
||||||
|
|
||||||
|
/* "rotate x_0jklm upwards by 7 bits" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); |
||||||
|
|
||||||
|
/* "swap x_00klm with x_01klm" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) |
||||||
|
|
||||||
|
/* "xor x_1jklm into x_0jklm" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
x[0][j][k][l][m] ^= x[1][j][k][l][m]; |
||||||
|
|
||||||
|
/* "swap x_1jk0m with x_1jk1m" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) |
||||||
|
|
||||||
|
/* "add x_0jklm into x_1jklm modulo 2^32" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
x[1][j][k][l][m] += x[0][j][k][l][m]; |
||||||
|
|
||||||
|
/* "rotate x_0jklm upwards by 11 bits" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); |
||||||
|
|
||||||
|
/* "swap x_0j0lm with x_0j1lm" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) |
||||||
|
|
||||||
|
/* "xor x_1jklm into x_0jklm" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
#pragma unroll 2 |
||||||
|
for (m = 0; m < 2; ++m) |
||||||
|
x[0][j][k][l][m] ^= x[1][j][k][l][m]; |
||||||
|
|
||||||
|
/* "swap x_1jkl0 with x_1jkl1" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (j = 0; j < 2; ++j) |
||||||
|
#pragma unroll 2 |
||||||
|
for (k = 0; k < 2; ++k) |
||||||
|
#pragma unroll 2 |
||||||
|
for (l = 0; l < 2; ++l) |
||||||
|
SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) |
||||||
|
|
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
__device__ __forceinline__ void block_tox(const uint32_t *in, uint32_t x[2][2][2][2][2]) |
||||||
|
{ |
||||||
|
x[0][0][0][0][0] ^= in[0]; |
||||||
|
x[0][0][0][0][1] ^= in[1]; |
||||||
|
x[0][0][0][1][0] ^= in[2]; |
||||||
|
x[0][0][0][1][1] ^= in[3]; |
||||||
|
x[0][0][1][0][0] ^= in[4]; |
||||||
|
x[0][0][1][0][1] ^= in[5]; |
||||||
|
x[0][0][1][1][0] ^= in[6]; |
||||||
|
x[0][0][1][1][1] ^= in[7]; |
||||||
|
} |
||||||
|
|
||||||
|
__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) |
||||||
|
{ |
||||||
|
out[0] = x[0][0][0][0][0]; |
||||||
|
out[1] = x[0][0][0][0][1]; |
||||||
|
out[2] = x[0][0][0][1][0]; |
||||||
|
out[3] = x[0][0][0][1][1]; |
||||||
|
out[4] = x[0][0][1][0][0]; |
||||||
|
out[5] = x[0][0][1][0][1]; |
||||||
|
out[6] = x[0][0][1][1][0]; |
||||||
|
out[7] = x[0][0][1][1][1]; |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32_t *data) |
||||||
|
{ |
||||||
|
/* "xor the block into the first b bytes of the state" */ |
||||||
|
/* "and then transform the state invertibly through r identical rounds" */ |
||||||
|
block_tox(data, x); |
||||||
|
rrounds(x); |
||||||
|
} |
||||||
|
|
||||||
|
void __device__ __forceinline__ Update32_const(uint32_t x[2][2][2][2][2]) |
||||||
|
{ |
||||||
|
x[0][0][0][0][0] ^= 0x80; |
||||||
|
rrounds(x); |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) |
||||||
|
{ |
||||||
|
int i; |
||||||
|
|
||||||
|
/* "the integer 1 is xored into the last state word x_11111" */ |
||||||
|
x[1][1][1][1][1] ^= 1; |
||||||
|
|
||||||
|
/* "the state is then transformed invertibly through 10r identical rounds" */ |
||||||
|
#pragma unroll 2 |
||||||
|
for (i = 0; i < 10; ++i) rrounds(x); |
||||||
|
|
||||||
|
/* "output the first h/8 bytes of the state" */ |
||||||
|
hash_fromx(hashval, x); |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
// Die Hash-Funktion |
||||||
|
#if __CUDA_ARCH__ <500 |
||||||
|
__global__ __launch_bounds__(576,1) |
||||||
|
#else |
||||||
|
__global__ __launch_bounds__(576,1) |
||||||
|
#endif |
||||||
|
void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) |
||||||
|
{ |
||||||
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||||
|
if (thread < threads) |
||||||
|
{ |
||||||
|
|
||||||
|
uint32_t Hash[8]; // = &g_hash[16 * hashPosition]; |
||||||
|
LOHI(Hash[0], Hash[1], __ldg(&g_hash[thread])); |
||||||
|
LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads])); |
||||||
|
LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads])); |
||||||
|
LOHI(Hash[6], Hash[7], __ldg(&g_hash[thread + 3 * threads])); |
||||||
|
|
||||||
|
uint32_t x[2][2][2][2][2] = |
||||||
|
{ |
||||||
|
0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, |
||||||
|
0x35481EAE, 0x22512D5B, 0xE5D94E63, |
||||||
|
0x7E624131, 0xF4CC12BE, 0xC2D0B696, |
||||||
|
0x42AF2070, 0xD0720C35, 0x3361DA8C, |
||||||
|
0x28CCECA4, 0x8EF8AD83, 0x4680AC00, |
||||||
|
0x40E5FBAB, 0xD89041C3, 0x6107FBD5, |
||||||
|
0x6C859D41, 0xF0B26679, 0x09392549, |
||||||
|
0x5FA25603, 0x65C892FD, 0x93CB6285, |
||||||
|
0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, |
||||||
|
0x85254725, 0x15815AEB, 0x4AB6AAD6, |
||||||
|
0x9CDAF8AF, 0xD6032C0A |
||||||
|
|
||||||
|
}; |
||||||
|
x[0][0][0][0][0] ^= Hash[0]; |
||||||
|
x[0][0][0][0][1] ^= Hash[1]; |
||||||
|
x[0][0][0][1][0] ^= Hash[2]; |
||||||
|
x[0][0][0][1][1] ^= Hash[3]; |
||||||
|
x[0][0][1][0][0] ^= Hash[4]; |
||||||
|
x[0][0][1][0][1] ^= Hash[5]; |
||||||
|
x[0][0][1][1][0] ^= Hash[6]; |
||||||
|
x[0][0][1][1][1] ^= Hash[7]; |
||||||
|
|
||||||
|
rrounds(x); |
||||||
|
x[0][0][0][0][0] ^= 0x80; |
||||||
|
rrounds(x); |
||||||
|
|
||||||
|
Final(x, Hash); |
||||||
|
|
||||||
|
g_hash[thread] = ((uint64_t*)Hash)[0]; |
||||||
|
g_hash[1 * threads + thread] = ((uint64_t*)Hash)[1]; |
||||||
|
g_hash[2 * threads + thread] = ((uint64_t*)Hash)[2]; |
||||||
|
g_hash[3 * threads + thread] = ((uint64_t*)Hash)[3]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
__host__ |
||||||
|
void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash) |
||||||
|
{ |
||||||
|
|
||||||
|
uint32_t tpb; |
||||||
|
if (device_sm[device_map[thr_id]]<500) |
||||||
|
tpb = 576; |
||||||
|
else |
||||||
|
tpb = 576; |
||||||
|
// berechne wie viele Thread Blocks wir brauchen |
||||||
|
dim3 grid((threads + tpb-1)/tpb); |
||||||
|
dim3 block(tpb); |
||||||
|
|
||||||
|
cubehash256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_hash); |
||||||
|
} |
@ -1,223 +1,543 @@ |
|||||||
|
#include <stdio.h> |
||||||
#include <memory.h> |
#include <memory.h> |
||||||
|
#include "cuda_lyra2_vectors.h" |
||||||
|
#define TPB 8 |
||||||
|
// |
||||||
|
|
||||||
#include "cuda_helper.h" |
#if __CUDA_ARCH__ < 500 |
||||||
|
#define vectype ulonglong4 |
||||||
#define TPB 160 |
#define u64type uint64_t |
||||||
|
#define memshift 4 |
||||||
static __constant__ uint2 blake2b_IV[8] = { |
#elif __CUDA_ARCH__ == 500 |
||||||
{ 0xf3bcc908, 0x6a09e667 }, |
#define u64type uint2 |
||||||
{ 0x84caa73b, 0xbb67ae85 }, |
#define vectype uint28 |
||||||
{ 0xfe94f82b, 0x3c6ef372 }, |
#define memshift 3 |
||||||
{ 0x5f1d36f1, 0xa54ff53a }, |
#else |
||||||
{ 0xade682d1, 0x510e527f }, |
#define u64type uint2 |
||||||
{ 0x2b3e6c1f, 0x9b05688c }, |
#define vectype uint28 |
||||||
{ 0xfb41bd6b, 0x1f83d9ab }, |
#define memshift 4 |
||||||
{ 0x137e2179, 0x5be0cd19 } |
#endif |
||||||
}; |
|
||||||
|
__device__ vectype *DMatrix; |
||||||
#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ |
|
||||||
for (int i = 0; i < 8; i++) { \ |
|
||||||
for (int j = 0; j < 12; j++) \ |
|
||||||
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ |
|
||||||
round_lyra(state); \ |
|
||||||
for (int j = 0; j < 12; j++) \ |
|
||||||
Matrix[j + 12 * i][rowOut] ^= state[j]; \ |
|
||||||
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ |
|
||||||
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ |
|
||||||
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ |
|
||||||
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ |
|
||||||
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ |
|
||||||
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ |
|
||||||
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ |
|
||||||
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ |
|
||||||
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ |
|
||||||
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ |
|
||||||
Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ |
|
||||||
Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ |
|
||||||
} \ |
|
||||||
} |
|
||||||
|
|
||||||
#define absorbblock(in) { \ |
|
||||||
state[0] ^= Matrix[0][in]; \ |
|
||||||
state[1] ^= Matrix[1][in]; \ |
|
||||||
state[2] ^= Matrix[2][in]; \ |
|
||||||
state[3] ^= Matrix[3][in]; \ |
|
||||||
state[4] ^= Matrix[4][in]; \ |
|
||||||
state[5] ^= Matrix[5][in]; \ |
|
||||||
state[6] ^= Matrix[6][in]; \ |
|
||||||
state[7] ^= Matrix[7][in]; \ |
|
||||||
state[8] ^= Matrix[8][in]; \ |
|
||||||
state[9] ^= Matrix[9][in]; \ |
|
||||||
state[10] ^= Matrix[10][in]; \ |
|
||||||
state[11] ^= Matrix[11][in]; \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
round_lyra(state); \ |
|
||||||
} |
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __CUDA_ARCH__ |
||||||
static __device__ __forceinline__ |
static __device__ __forceinline__ |
||||||
void Gfunc(uint2 & a, uint2 &b, uint2 &c, uint2 &d) |
void Gfunc_v35(uint2 &a, uint2 &b, uint2 &c, uint2 &d) |
||||||
{ |
{ |
||||||
a += b; d ^= a; d = SWAPUINT2(d); |
a += b; d ^= a; d = SWAPUINT2(d); |
||||||
c += d; b ^= c; b = ROR2(b, 24); |
c += d; b ^= c; b = ROR24(b); |
||||||
a += b; d ^= a; d = ROR2(d, 16); |
a += b; d ^= a; d = ROR16(d); |
||||||
c += d; b ^= c; b = ROR2(b, 63); |
c += d; b ^= c; b = ROR2(b, 63); |
||||||
} |
} |
||||||
|
|
||||||
__device__ __forceinline__ |
#if __CUDA_ARCH__ < 500 |
||||||
static void round_lyra(uint2 *s) |
static __device__ __forceinline__ |
||||||
|
void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) |
||||||
{ |
{ |
||||||
Gfunc(s[0], s[4], s[8], s[12]); |
a += b; d ^= a; d = ROTR64(d, 32); |
||||||
Gfunc(s[1], s[5], s[9], s[13]); |
c += d; b ^= c; b = ROTR64(b, 24); |
||||||
Gfunc(s[2], s[6], s[10], s[14]); |
a += b; d ^= a; d = ROTR64(d, 16); |
||||||
Gfunc(s[3], s[7], s[11], s[15]); |
c += d; b ^= c; b = ROTR64(b, 63); |
||||||
Gfunc(s[0], s[5], s[10], s[15]); |
|
||||||
Gfunc(s[1], s[6], s[11], s[12]); |
|
||||||
Gfunc(s[2], s[7], s[8], s[13]); |
|
||||||
Gfunc(s[3], s[4], s[9], s[14]); |
|
||||||
} |
} |
||||||
|
#endif |
||||||
|
|
||||||
__device__ __forceinline__ |
static __device__ __forceinline__ |
||||||
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8]) |
void round_lyra_v35(vectype* s) |
||||||
{ |
{ |
||||||
#if __CUDA_ARCH__ > 500 |
Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); |
||||||
#pragma unroll |
Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); |
||||||
|
Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); |
||||||
|
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); |
||||||
|
|
||||||
|
Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); |
||||||
|
Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); |
||||||
|
Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); |
||||||
|
Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); |
||||||
|
} |
||||||
|
#else |
||||||
|
#define round_lyra_v35(s) {} |
||||||
#endif |
#endif |
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplex(vectype state[4], uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state1[3]; |
||||||
|
uint32_t ps1 = (256 * thread); |
||||||
|
uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread); |
||||||
|
|
||||||
|
#pragma unroll 4 |
||||||
for (int i = 0; i < 8; i++) |
for (int i = 0; i < 8; i++) |
||||||
{ |
{ |
||||||
#pragma unroll |
uint32_t s1 = ps1 + i*memshift; |
||||||
for (int j = 0; j < 12; j++) |
uint32_t s2 = ps2 - i*memshift; |
||||||
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; |
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
round_lyra(state); |
state1[j] = __ldg4(&(DMatrix+s1)[j]); |
||||||
|
|
||||||
#pragma unroll |
for (int j = 0; j < 3; j++) |
||||||
for (int j = 0; j < 12; j++) |
state[j] ^= state1[j]; |
||||||
Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; |
|
||||||
|
round_lyra_v35(state); |
||||||
Matrix[0 + 12 * i][rowInOut] ^= state[11]; |
|
||||||
Matrix[1 + 12 * i][rowInOut] ^= state[0]; |
for (int j = 0; j < 3; j++) |
||||||
Matrix[2 + 12 * i][rowInOut] ^= state[1]; |
state1[j] ^= state[j]; |
||||||
Matrix[3 + 12 * i][rowInOut] ^= state[2]; |
|
||||||
Matrix[4 + 12 * i][rowInOut] ^= state[3]; |
for (int j = 0; j < 3; j++) |
||||||
Matrix[5 + 12 * i][rowInOut] ^= state[4]; |
(DMatrix + s2)[j] = state1[j]; |
||||||
Matrix[6 + 12 * i][rowInOut] ^= state[5]; |
|
||||||
Matrix[7 + 12 * i][rowInOut] ^= state[6]; |
|
||||||
Matrix[8 + 12 * i][rowInOut] ^= state[7]; |
|
||||||
Matrix[9 + 12 * i][rowInOut] ^= state[8]; |
|
||||||
Matrix[10 + 12 * i][rowInOut] ^= state[9]; |
|
||||||
Matrix[11 + 12 * i][rowInOut] ^= state[10]; |
|
||||||
} |
} |
||||||
} |
} |
||||||
|
|
||||||
__global__ __launch_bounds__(TPB, 1) |
static __device__ __forceinline__ |
||||||
void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) |
void reduceDuplexV3(vectype state[4], uint32_t thread) |
||||||
{ |
{ |
||||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
vectype state1[3]; |
||||||
if (thread < threads) |
uint32_t ps1 = (256 * thread); |
||||||
|
// colomn row |
||||||
|
uint32_t ps2 = (memshift * 7 * 8 + memshift * 1 + 64 * memshift * thread); |
||||||
|
|
||||||
|
#pragma unroll 4 |
||||||
|
for (int i = 0; i < 8; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + 8 * i *memshift; |
||||||
|
uint32_t s2 = ps2 - 8 * i *memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix + s1)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= state1[j]; |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] ^= state[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state1[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state2[3],state1[3]; |
||||||
|
|
||||||
|
uint32_t ps1 = ( memshift * 8 * rowIn + 256 * thread); |
||||||
|
uint32_t ps2 = ( memshift * 8 * rowInOut + 256 * thread); |
||||||
|
uint32_t ps3 = (memshift*7 + memshift * 8 * rowOut + 256 * thread); |
||||||
|
|
||||||
|
#pragma unroll 1 |
||||||
|
for (int i = 0; i < 8; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + i*memshift; |
||||||
|
uint32_t s2 = ps2 + i*memshift; |
||||||
|
uint32_t s3 = ps3 - i*memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j]= __ldg4(&(DMatrix + s1)[j]); |
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j]= __ldg4(&(DMatrix + s2)[j]); |
||||||
|
for (int j = 0; j < 3; j++) { |
||||||
|
vectype tmp = state1[j] + state2[j]; |
||||||
|
state[j] ^= tmp; |
||||||
|
} |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) { |
||||||
|
state1[j] ^= state[j]; |
||||||
|
(DMatrix + s3)[j] = state1[j]; |
||||||
|
} |
||||||
|
|
||||||
|
((uint2*)state2)[0] ^= ((uint2*)state)[11]; |
||||||
|
for (int j = 0; j < 11; j++) |
||||||
|
((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state2[3], state1[3]; |
||||||
|
|
||||||
|
uint32_t ps1 = ( memshift * rowIn + 64 * memshift * thread); |
||||||
|
uint32_t ps2 = (memshift * rowInOut + 64 * memshift* thread); |
||||||
|
uint32_t ps3 = (8 * memshift * 7 + memshift * rowOut + 64 * memshift * thread); |
||||||
|
/* |
||||||
|
uint32_t ps1 = (256 * thread); |
||||||
|
uint32_t ps2 = (256 * thread); |
||||||
|
uint32_t ps3 = (256 * thread); |
||||||
|
*/ |
||||||
|
#pragma nounroll |
||||||
|
for (int i = 0; i < 8; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + 8*i*memshift; |
||||||
|
uint32_t s2 = ps2 + 8*i*memshift; |
||||||
|
uint32_t s3 = ps3 - 8*i*memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix + s1 )[j]); |
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] = __ldg4(&(DMatrix + s2 )[j]); |
||||||
|
for (int j = 0; j < 3; j++) { |
||||||
|
vectype tmp = state1[j] + state2[j]; |
||||||
|
state[j] ^= tmp; |
||||||
|
} |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) { |
||||||
|
state1[j] ^= state[j]; |
||||||
|
(DMatrix + s3)[j] = state1[j]; |
||||||
|
} |
||||||
|
|
||||||
|
((uint2*)state2)[0] ^= ((uint2*)state)[11]; |
||||||
|
for (int j = 0; j < 11; j++) |
||||||
|
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state1[3], state2[3]; |
||||||
|
|
||||||
|
uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread); |
||||||
|
uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread); |
||||||
|
uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread); |
||||||
|
|
||||||
|
#pragma unroll 1 |
||||||
|
for (int i = 0; i < 8; i++) |
||||||
{ |
{ |
||||||
uint2 state[16]; |
uint32_t s1 = ps1 + i*memshift; |
||||||
|
uint32_t s2 = ps2 + i*memshift; |
||||||
|
uint32_t s3 = ps3 + i*memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix + s1)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] = __ldg4(&(DMatrix + s2)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] += state2[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= state1[j]; |
||||||
|
|
||||||
#pragma unroll |
round_lyra_v35(state); |
||||||
for (int i = 0; i<4; i++) { |
|
||||||
LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); |
|
||||||
} //password |
|
||||||
|
|
||||||
#pragma unroll |
((uint2*)state2)[0] ^= ((uint2*)state)[11]; |
||||||
for (int i = 0; i<4; i++) { |
for (int j = 0; j < 11; j++) |
||||||
state[i + 4] = state[i]; |
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; |
||||||
} //salt |
|
||||||
|
|
||||||
#pragma unroll |
if (rowInOut != rowOut) { |
||||||
for (int i = 0; i<8; i++) { |
|
||||||
state[i + 8] = blake2b_IV[i]; |
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s3)[j] ^= state[j]; |
||||||
|
|
||||||
|
} else { |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] ^= state[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j]=state2[j]; |
||||||
} |
} |
||||||
|
|
||||||
// blake2blyra x2 |
} |
||||||
//#pragma unroll 24 |
} |
||||||
for (int i = 0; i<24; i++) { |
|
||||||
round_lyra(state); |
static __device__ __forceinline__ |
||||||
} //because 12 is not enough |
void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) |
||||||
|
{ |
||||||
|
|
||||||
uint2 Matrix[96][8]; // not cool |
vectype state1[3], state2[3]; |
||||||
|
uint32_t ps1 = (memshift * rowIn + 64 * memshift * thread); |
||||||
|
uint32_t ps2 = (memshift * rowInOut + 64 * memshift * thread); |
||||||
|
uint32_t ps3 = (memshift * rowOut + 64 *memshift * thread); |
||||||
|
|
||||||
// reducedSqueezeRow0 |
#pragma nounroll |
||||||
#pragma unroll 8 |
|
||||||
for (int i = 0; i < 8; i++) |
for (int i = 0; i < 8; i++) |
||||||
{ |
{ |
||||||
#pragma unroll 12 |
uint32_t s1 = ps1 + 8 * i*memshift; |
||||||
for (int j = 0; j<12; j++) { |
uint32_t s2 = ps2 + 8 * i*memshift; |
||||||
Matrix[j + 84 - 12 * i][0] = state[j]; |
uint32_t s3 = ps3 + 8 * i*memshift; |
||||||
|
|
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix + s1)[j]); |
||||||
|
|
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] = __ldg4(&(DMatrix + s2)[j]); |
||||||
|
|
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] += state2[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= state1[j]; |
||||||
|
|
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
((uint2*)state2)[0] ^= ((uint2*)state)[11]; |
||||||
|
for (int j = 0; j < 11; j++) |
||||||
|
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; |
||||||
|
|
||||||
|
if (rowInOut != rowOut) { |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s3)[j] ^= state[j]; |
||||||
|
|
||||||
|
} |
||||||
|
else { |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] ^= state[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ < 500 |
||||||
|
__global__ __launch_bounds__(48, 1) |
||||||
|
#elif __CUDA_ARCH__ == 500 |
||||||
|
__global__ __launch_bounds__(16, 1) |
||||||
|
#else |
||||||
|
__global__ __launch_bounds__(TPB, 1) |
||||||
|
#endif |
||||||
|
void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) |
||||||
|
{ |
||||||
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||||
|
vectype state[4]; |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ > 350 |
||||||
|
const uint28 blake2b_IV[2] = { |
||||||
|
{{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }}, |
||||||
|
{{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }} |
||||||
|
}; |
||||||
|
#else |
||||||
|
const ulonglong4 blake2b_IV[2] = { |
||||||
|
{ 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 }, |
||||||
|
{ 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 } |
||||||
|
}; |
||||||
|
#endif |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ == 350 |
||||||
|
if (thread < threads) |
||||||
|
#endif |
||||||
|
{ |
||||||
|
((uint2*)state)[0] = __ldg(&outputHash[thread]); |
||||||
|
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); |
||||||
|
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); |
||||||
|
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); |
||||||
|
// state[0] = __ldg4(&((vectype*)outputHash)[thread]); |
||||||
|
state[1] = state[0]; |
||||||
|
state[2] = ((vectype*)blake2b_IV)[0]; |
||||||
|
state[3] = ((vectype*)blake2b_IV)[1]; |
||||||
|
|
||||||
|
|
||||||
|
for (int i = 0; i<24; i++) { //because 12 is not enough |
||||||
|
round_lyra_v35(state); |
||||||
} |
} |
||||||
round_lyra(state); |
|
||||||
|
uint32_t ps1 = (memshift * 7 + 256 * thread); |
||||||
|
|
||||||
|
for (int i = 0; i < 8; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 - memshift * i; |
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s1)[j] = (state)[j]; |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
} |
} |
||||||
|
|
||||||
// reducedSqueezeRow1 |
|
||||||
#pragma unroll 8 |
reduceDuplex(state, thread); |
||||||
|
|
||||||
|
reduceDuplexRowSetupV2(1, 0, 2, state, thread); |
||||||
|
reduceDuplexRowSetupV2(2, 1, 3, state, thread); |
||||||
|
reduceDuplexRowSetupV2(3, 0, 4, state, thread); |
||||||
|
reduceDuplexRowSetupV2(4, 3, 5, state, thread); |
||||||
|
reduceDuplexRowSetupV2(5, 2, 6, state, thread); |
||||||
|
reduceDuplexRowSetupV2(6, 1, 7, state, thread); |
||||||
|
uint32_t rowa = ((uint2*)state)[0].x & 7; |
||||||
|
|
||||||
|
reduceDuplexRowtV2(7, rowa, 0, state, thread); |
||||||
|
rowa = ((uint2*)state)[0].x & 7; |
||||||
|
reduceDuplexRowtV2(0, rowa, 3, state, thread); |
||||||
|
rowa = ((uint2*)state)[0].x & 7; |
||||||
|
reduceDuplexRowtV2(3, rowa, 6, state, thread); |
||||||
|
rowa = ((uint2*)state)[0].x & 7; |
||||||
|
reduceDuplexRowtV2(6, rowa, 1, state, thread); |
||||||
|
rowa = ((uint2*)state)[0].x & 7; |
||||||
|
reduceDuplexRowtV2(1, rowa, 4, state, thread); |
||||||
|
rowa = ((uint2*)state)[0].x & 7; |
||||||
|
reduceDuplexRowtV2(4, rowa, 7, state, thread); |
||||||
|
rowa = ((uint2*)state)[0].x & 7; |
||||||
|
reduceDuplexRowtV2(7, rowa, 2, state, thread); |
||||||
|
rowa = ((uint2*)state)[0].x & 7; |
||||||
|
reduceDuplexRowtV2(2, rowa, 5, state, thread); |
||||||
|
|
||||||
|
uint32_t shift = (memshift * 8 * rowa + 256 * thread); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= __ldg4(&(DMatrix + shift)[j]); |
||||||
|
|
||||||
|
for (int i = 0; i < 12; i++) |
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
|
||||||
|
outputHash[thread]= ((uint2*)state)[0]; |
||||||
|
outputHash[thread + threads] = ((uint2*)state)[1]; |
||||||
|
outputHash[thread + 2 * threads] = ((uint2*)state)[2]; |
||||||
|
outputHash[thread + 3 * threads] = ((uint2*)state)[3]; |
||||||
|
// ((vectype*)outputHash)[thread] = state[0]; |
||||||
|
|
||||||
|
} //thread |
||||||
|
} |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ < 500 |
||||||
|
__global__ __launch_bounds__(48, 1) |
||||||
|
#elif __CUDA_ARCH__ == 500 |
||||||
|
__global__ __launch_bounds__(16, 1) |
||||||
|
#else |
||||||
|
__global__ __launch_bounds__(TPB, 1) |
||||||
|
#endif |
||||||
|
void lyra2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) |
||||||
|
{ |
||||||
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||||
|
|
||||||
|
vectype state[4]; |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ > 350 |
||||||
|
const uint28 blake2b_IV[2] = { |
||||||
|
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, |
||||||
|
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } |
||||||
|
}; |
||||||
|
#else |
||||||
|
const ulonglong4 blake2b_IV[2] = { |
||||||
|
{ 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 }, |
||||||
|
{ 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 } |
||||||
|
}; |
||||||
|
#endif |
||||||
|
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ == 350 |
||||||
|
if (thread < threads) |
||||||
|
#endif |
||||||
|
{ |
||||||
|
((uint2*)state)[0] = __ldg(&outputHash[thread]); |
||||||
|
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); |
||||||
|
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); |
||||||
|
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); |
||||||
|
|
||||||
|
state[1] = state[0]; |
||||||
|
|
||||||
|
state[2] = ((vectype*)blake2b_IV)[0]; |
||||||
|
state[3] = ((vectype*)blake2b_IV)[1]; |
||||||
|
|
||||||
|
for (int i = 0; i<24; i++) |
||||||
|
round_lyra_v35(state); //because 12 is not enough |
||||||
|
|
||||||
|
uint32_t ps1 = (8 * memshift * 7 + 64 * memshift * thread); |
||||||
|
|
||||||
for (int i = 0; i < 8; i++) |
for (int i = 0; i < 8; i++) |
||||||
{ |
{ |
||||||
#pragma unroll 12 |
uint32_t s1 = ps1 - 8 * memshift * i; |
||||||
for (int j = 0; j<12; j++) { |
for (int j = 0; j < 3; j++) |
||||||
state[j] ^= Matrix[j + 12 * i][0]; |
(DMatrix + s1)[j] = (state)[j]; |
||||||
} |
|
||||||
round_lyra(state); |
round_lyra_v35(state); |
||||||
#pragma unroll 12 |
} |
||||||
for (int j = 0; j<12; j++) { |
|
||||||
Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; |
|
||||||
} |
reduceDuplexV3(state, thread); |
||||||
} |
|
||||||
|
reduceDuplexRowSetupV3(1, 0, 2, state, thread); |
||||||
reduceDuplexRowSetup(1, 0, 2,state, Matrix); |
reduceDuplexRowSetupV3(2, 1, 3, state, thread); |
||||||
reduceDuplexRowSetup(2, 1, 3, state, Matrix); |
reduceDuplexRowSetupV3(3, 0, 4, state, thread); |
||||||
reduceDuplexRowSetup(3, 0, 4, state, Matrix); |
reduceDuplexRowSetupV3(4, 3, 5, state, thread); |
||||||
reduceDuplexRowSetup(4, 3, 5, state, Matrix); |
reduceDuplexRowSetupV3(5, 2, 6, state, thread); |
||||||
reduceDuplexRowSetup(5, 2, 6, state, Matrix); |
reduceDuplexRowSetupV3(6, 1, 7, state, thread); |
||||||
reduceDuplexRowSetup(6, 1, 7, state, Matrix); |
uint32_t rowa = ((uint2*)state)[0].x & 7; |
||||||
|
|
||||||
uint32_t rowa; |
reduceDuplexRowtV3(7, rowa, 0, state, thread); |
||||||
rowa = state[0].x & 7; |
rowa = ((uint2*)state)[0].x & 7; |
||||||
reduceDuplexRow(7, rowa, 0); |
reduceDuplexRowtV3(0, rowa, 3, state, thread); |
||||||
rowa = state[0].x & 7; |
rowa = ((uint2*)state)[0].x & 7; |
||||||
reduceDuplexRow(0, rowa, 3); |
reduceDuplexRowtV3(3, rowa, 6, state, thread); |
||||||
rowa = state[0].x & 7; |
rowa = ((uint2*)state)[0].x & 7; |
||||||
reduceDuplexRow(3, rowa, 6); |
reduceDuplexRowtV3(6, rowa, 1, state, thread); |
||||||
rowa = state[0].x & 7; |
rowa = ((uint2*)state)[0].x & 7; |
||||||
reduceDuplexRow(6, rowa, 1); |
reduceDuplexRowtV3(1, rowa, 4, state, thread); |
||||||
rowa = state[0].x & 7; |
rowa = ((uint2*)state)[0].x & 7; |
||||||
reduceDuplexRow(1, rowa, 4); |
reduceDuplexRowtV3(4, rowa, 7, state, thread); |
||||||
rowa = state[0].x & 7; |
rowa = ((uint2*)state)[0].x & 7; |
||||||
reduceDuplexRow(4, rowa, 7); |
reduceDuplexRowtV3(7, rowa, 2, state, thread); |
||||||
rowa = state[0].x & 7; |
rowa = ((uint2*)state)[0].x & 7; |
||||||
reduceDuplexRow(7, rowa, 2); |
reduceDuplexRowtV3(2, rowa, 5, state, thread); |
||||||
rowa = state[0].x & 7; |
|
||||||
reduceDuplexRow(2, rowa, 5); |
uint32_t shift = (memshift * rowa + 64 * memshift * thread); |
||||||
|
|
||||||
absorbblock(rowa); |
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= __ldg4(&(DMatrix + shift)[j]); |
||||||
#pragma unroll |
|
||||||
for (int i = 0; i<4; i++) { |
for (int i = 0; i < 12; i++) |
||||||
outputHash[threads*i + thread] = devectorize(state[i]); |
round_lyra_v35(state); |
||||||
} //password |
|
||||||
|
|
||||||
|
outputHash[thread] = ((uint2*)state)[0]; |
||||||
|
outputHash[thread + threads] = ((uint2*)state)[1]; |
||||||
|
outputHash[thread + 2 * threads] = ((uint2*)state)[2]; |
||||||
|
outputHash[thread + 3 * threads] = ((uint2*)state)[3]; |
||||||
|
|
||||||
} //thread |
} //thread |
||||||
} |
} |
||||||
|
|
||||||
__host__ |
__host__ |
||||||
|
void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *hash) |
||||||
|
{ |
||||||
|
cudaMemcpyToSymbol(DMatrix, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice); |
||||||
|
} |
||||||
|
|
||||||
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) |
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) |
||||||
{ |
{ |
||||||
const uint32_t threadsperblock = TPB; |
uint32_t tpb; |
||||||
|
if (device_sm[device_map[thr_id]]<500) |
||||||
|
tpb = 48; |
||||||
|
else if (device_sm[device_map[thr_id]]==500) |
||||||
|
tpb = 16; |
||||||
|
else |
||||||
|
tpb = TPB; |
||||||
|
|
||||||
|
dim3 grid((threads + tpb - 1) / tpb); |
||||||
|
dim3 block(tpb); |
||||||
|
|
||||||
dim3 grid((threads + threadsperblock - 1) / threadsperblock); |
if (device_sm[device_map[thr_id]] == 500) |
||||||
dim3 block(threadsperblock); |
lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_outputHash); |
||||||
|
else |
||||||
|
lyra2_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)d_outputHash); |
||||||
|
|
||||||
lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash); |
MyStreamSynchronize(NULL, order, thr_id); |
||||||
} |
} |
||||||
|
@ -0,0 +1,520 @@ |
|||||||
|
#include <stdio.h> |
||||||
|
#include <memory.h> |
||||||
|
|
||||||
|
#include "cuda_lyra2_vectors.h" |
||||||
|
|
||||||
|
#define TPB 16 |
||||||
|
|
||||||
|
#define Nrow 4 |
||||||
|
#define Ncol 4 |
||||||
|
#if __CUDA_ARCH__ < 500 |
||||||
|
#define vectype ulonglong4 |
||||||
|
#define u64type uint64_t |
||||||
|
#define memshift 4 |
||||||
|
#elif __CUDA_ARCH__ == 500 |
||||||
|
#define u64type uint2 |
||||||
|
#define vectype uint28 |
||||||
|
#define memshift 3 |
||||||
|
#else |
||||||
|
#define u64type uint2 |
||||||
|
#define vectype uint28 |
||||||
|
#define memshift 3 |
||||||
|
#endif |
||||||
|
|
||||||
|
__device__ vectype *DMatrix; |
||||||
|
|
||||||
|
#ifdef __CUDA_ARCH__ |
||||||
|
static __device__ __forceinline__ |
||||||
|
void Gfunc_v35(uint2 &a, uint2 &b, uint2 &c, uint2 &d) |
||||||
|
{ |
||||||
|
a += b; d ^= a; d = SWAPUINT2(d); |
||||||
|
c += d; b ^= c; b = ROR24(b); |
||||||
|
a += b; d ^= a; d = ROR16(d); |
||||||
|
c += d; b ^= c; b = ROR2(b, 63); |
||||||
|
} |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ < 500 |
||||||
|
static __device__ __forceinline__ |
||||||
|
void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) |
||||||
|
{ |
||||||
|
a += b; d ^= a; d = ROTR64(d, 32); |
||||||
|
c += d; b ^= c; b = ROTR64(b, 24); |
||||||
|
a += b; d ^= a; d = ROTR64(d, 16); |
||||||
|
c += d; b ^= c; b = ROTR64(b, 63); |
||||||
|
} |
||||||
|
#endif |
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void round_lyra_v35(vectype* s) |
||||||
|
{ |
||||||
|
Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); |
||||||
|
Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); |
||||||
|
Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); |
||||||
|
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); |
||||||
|
|
||||||
|
Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); |
||||||
|
Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); |
||||||
|
Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); |
||||||
|
Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); |
||||||
|
} |
||||||
|
#else |
||||||
|
#define round_lyra_v35(s) {} |
||||||
|
#endif |
||||||
|
|
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplex(vectype state[4], uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state1[3]; |
||||||
|
uint32_t ps1 = (Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread); |
||||||
|
|
||||||
|
#pragma unroll 4 |
||||||
|
for (int i = 0; i < Ncol; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + i*memshift; |
||||||
|
uint32_t s2 = ps2 - i*memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix+s1)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= state1[j]; |
||||||
|
round_lyra_v35(state); |
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] ^= state[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state1[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplexV3(vectype state[4], uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state1[3]; |
||||||
|
uint32_t ps1 = (Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); |
||||||
|
|
||||||
|
#pragma unroll 4 |
||||||
|
for (int i = 0; i < Ncol; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + Nrow * i *memshift; |
||||||
|
uint32_t s2 = ps2 - Nrow * i *memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix + s1)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= state1[j]; |
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] ^= state[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state1[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state2[3],state1[3]; |
||||||
|
|
||||||
|
uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); |
||||||
|
|
||||||
|
//#pragma unroll 1 |
||||||
|
for (int i = 0; i < Ncol; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + i*memshift; |
||||||
|
uint32_t s2 = ps2 + i*memshift; |
||||||
|
uint32_t s3 = ps3 - i*memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j]= __ldg4(&(DMatrix + s1)[j]); |
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j]= __ldg4(&(DMatrix + s2)[j]); |
||||||
|
for (int j = 0; j < 3; j++) { |
||||||
|
vectype tmp = state1[j] + state2[j]; |
||||||
|
state[j] ^= tmp; |
||||||
|
} |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) { |
||||||
|
state1[j] ^= state[j]; |
||||||
|
(DMatrix + s3)[j] = state1[j]; |
||||||
|
} |
||||||
|
|
||||||
|
((uint2*)state2)[0] ^= ((uint2*)state)[11]; |
||||||
|
|
||||||
|
for (int j = 0; j < 11; j++) |
||||||
|
((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state2[3], state1[3]; |
||||||
|
|
||||||
|
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); |
||||||
|
|
||||||
|
for (int i = 0; i < Ncol; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + Nrow*i*memshift; |
||||||
|
uint32_t s2 = ps2 + Nrow*i*memshift; |
||||||
|
uint32_t s3 = ps3 - Nrow*i*memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix + s1 )[j]); |
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] = __ldg4(&(DMatrix + s2 )[j]); |
||||||
|
for (int j = 0; j < 3; j++) { |
||||||
|
vectype tmp = state1[j] + state2[j]; |
||||||
|
state[j] ^= tmp; |
||||||
|
} |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) { |
||||||
|
state1[j] ^= state[j]; |
||||||
|
(DMatrix + s3)[j] = state1[j]; |
||||||
|
} |
||||||
|
|
||||||
|
((uint2*)state2)[0] ^= ((uint2*)state)[11]; |
||||||
|
for (int j = 0; j < 11; j++) |
||||||
|
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state1[3],state2[3]; |
||||||
|
uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); |
||||||
|
|
||||||
|
//#pragma unroll 1 |
||||||
|
for (int i = 0; i < Ncol; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + i*memshift; |
||||||
|
uint32_t s2 = ps2 + i*memshift; |
||||||
|
uint32_t s3 = ps3 + i*memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix + s1)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] = __ldg4(&(DMatrix + s2)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] += state2[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= state1[j]; |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
((uint2*)state2)[0] ^= ((uint2*)state)[11]; |
||||||
|
for (int j = 0; j < 11; j++) |
||||||
|
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; |
||||||
|
|
||||||
|
if (rowInOut != rowOut) { |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s3)[j] ^= state[j]; |
||||||
|
|
||||||
|
} else { |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] ^= state[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j]=state2[j]; |
||||||
|
} |
||||||
|
|
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
static __device__ __forceinline__ |
||||||
|
void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) |
||||||
|
{ |
||||||
|
vectype state1[3], state2[3]; |
||||||
|
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); |
||||||
|
uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); |
||||||
|
|
||||||
|
#pragma nounroll |
||||||
|
for (int i = 0; i < Ncol; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 + Nrow * i*memshift; |
||||||
|
uint32_t s2 = ps2 + Nrow * i*memshift; |
||||||
|
uint32_t s3 = ps3 + Nrow * i*memshift; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] = __ldg4(&(DMatrix + s1)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] = __ldg4(&(DMatrix + s2)[j]); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state1[j] += state2[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= state1[j]; |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
((uint2*)state2)[0] ^= ((uint2*)state)[11]; |
||||||
|
|
||||||
|
for (int j = 0; j < 11; j++) |
||||||
|
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; |
||||||
|
|
||||||
|
if (rowInOut != rowOut) { |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s3)[j] ^= state[j]; |
||||||
|
|
||||||
|
} else { |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state2[j] ^= state[j]; |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s2)[j] = state2[j]; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ < 500 |
||||||
|
__global__ __launch_bounds__(128, 1) |
||||||
|
#elif __CUDA_ARCH__ == 500 |
||||||
|
__global__ __launch_bounds__(16, 1) |
||||||
|
#else |
||||||
|
__global__ __launch_bounds__(TPB, 1) |
||||||
|
#endif |
||||||
|
void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) |
||||||
|
{ |
||||||
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||||
|
|
||||||
|
vectype state[4]; |
||||||
|
uint28 blake2b_IV[2]; |
||||||
|
uint28 padding[2]; |
||||||
|
|
||||||
|
if (threadIdx.x == 0) { |
||||||
|
|
||||||
|
((uint16*)blake2b_IV)[0] = make_uint16( |
||||||
|
0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85 , |
||||||
|
0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a , |
||||||
|
0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c , |
||||||
|
0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 |
||||||
|
); |
||||||
|
((uint16*)padding)[0] = make_uint16( |
||||||
|
0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0 , |
||||||
|
0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 |
||||||
|
); |
||||||
|
} |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ == 350 |
||||||
|
if (thread < threads) |
||||||
|
#endif |
||||||
|
{ |
||||||
|
((uint2*)state)[0] = __ldg(&outputHash[thread]); |
||||||
|
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); |
||||||
|
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); |
||||||
|
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); |
||||||
|
state[1] = state[0]; |
||||||
|
state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); |
||||||
|
state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); |
||||||
|
|
||||||
|
for (int i = 0; i<12; i++) |
||||||
|
round_lyra_v35(state); |
||||||
|
state[0] ^= shuffle4(((vectype*)padding)[0], 0); |
||||||
|
state[1] ^= shuffle4(((vectype*)padding)[1], 0); |
||||||
|
|
||||||
|
for (int i = 0; i<12; i++) |
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); |
||||||
|
|
||||||
|
//#pragma unroll 4 |
||||||
|
for (int i = 0; i < 4; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 - 4 * memshift * i; |
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s1)[j] = (state)[j]; |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
} |
||||||
|
|
||||||
|
reduceDuplexV3(state, thread); |
||||||
|
reduceDuplexRowSetupV3(1, 0, 2, state, thread); |
||||||
|
reduceDuplexRowSetupV3(2, 1, 3, state, thread); |
||||||
|
|
||||||
|
uint32_t rowa; |
||||||
|
int prev = 3; |
||||||
|
for (int i = 0; i < 4; i++) |
||||||
|
{ |
||||||
|
rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); |
||||||
|
prev = i; |
||||||
|
} |
||||||
|
|
||||||
|
uint32_t shift = (memshift * rowa + 16 * memshift * thread); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= __ldg4(&(DMatrix + shift)[j]); |
||||||
|
|
||||||
|
for (int i = 0; i < 12; i++) |
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
outputHash[thread] = ((uint2*)state)[0]; |
||||||
|
outputHash[thread + threads] = ((uint2*)state)[1]; |
||||||
|
outputHash[thread + 2 * threads] = ((uint2*)state)[2]; |
||||||
|
outputHash[thread + 3 * threads] = ((uint2*)state)[3]; |
||||||
|
//((vectype*)outputHash)[thread] = state[0]; |
||||||
|
|
||||||
|
} //thread |
||||||
|
} |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ < 500 |
||||||
|
__global__ __launch_bounds__(64, 1) |
||||||
|
#elif __CUDA_ARCH__ == 500 |
||||||
|
__global__ __launch_bounds__(32, 1) |
||||||
|
#else |
||||||
|
__global__ __launch_bounds__(TPB, 1) |
||||||
|
#endif |
||||||
|
void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) |
||||||
|
{ |
||||||
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||||
|
vectype state[4]; |
||||||
|
uint28 blake2b_IV[2]; |
||||||
|
uint28 padding[2]; |
||||||
|
|
||||||
|
if (threadIdx.x == 0) { |
||||||
|
|
||||||
|
((uint16*)blake2b_IV)[0] = make_uint16( |
||||||
|
0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85 , |
||||||
|
0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a , |
||||||
|
0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c , |
||||||
|
0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 |
||||||
|
); |
||||||
|
((uint16*)padding)[0] = make_uint16( |
||||||
|
0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0 , |
||||||
|
0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 |
||||||
|
); |
||||||
|
} |
||||||
|
|
||||||
|
#if __CUDA_ARCH__ == 350 |
||||||
|
if (thread < threads) |
||||||
|
#endif |
||||||
|
{ |
||||||
|
((uint2*)state)[0] = __ldg(&outputHash[thread]); |
||||||
|
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); |
||||||
|
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); |
||||||
|
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); |
||||||
|
|
||||||
|
state[1] = state[0]; |
||||||
|
|
||||||
|
state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); |
||||||
|
state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); |
||||||
|
|
||||||
|
for (int i = 0; i<12; i++) |
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
state[0] ^= shuffle4(((vectype*)padding)[0], 0); |
||||||
|
state[1] ^= shuffle4(((vectype*)padding)[1], 0); |
||||||
|
|
||||||
|
for (int i = 0; i<12; i++) |
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread); |
||||||
|
|
||||||
|
for (int i = 0; i < Ncol; i++) |
||||||
|
{ |
||||||
|
uint32_t s1 = ps1 - memshift * i; |
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
(DMatrix + s1)[j] = (state)[j]; |
||||||
|
|
||||||
|
round_lyra_v35(state); |
||||||
|
} |
||||||
|
|
||||||
|
reduceDuplex(state, thread); |
||||||
|
|
||||||
|
reduceDuplexRowSetupV2(1, 0, 2, state, thread); |
||||||
|
reduceDuplexRowSetupV2(2, 1, 3, state, thread); |
||||||
|
|
||||||
|
uint32_t rowa; |
||||||
|
int prev=3; |
||||||
|
|
||||||
|
for (int i = 0; i < 4; i++) { |
||||||
|
rowa = ((uint2*)state)[0].x & 3; |
||||||
|
reduceDuplexRowtV2(prev, rowa, i, state, thread); |
||||||
|
prev=i; |
||||||
|
} |
||||||
|
|
||||||
|
uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread); |
||||||
|
|
||||||
|
for (int j = 0; j < 3; j++) |
||||||
|
state[j] ^= __ldg4(&(DMatrix + shift)[j]); |
||||||
|
|
||||||
|
for (int i = 0; i < 12; i++) |
||||||
|
round_lyra_v35(state); |
||||||
|
|
||||||
|
outputHash[thread]= ((uint2*)state)[0]; |
||||||
|
outputHash[thread + threads] = ((uint2*)state)[1]; |
||||||
|
outputHash[thread + 2 * threads] = ((uint2*)state)[2]; |
||||||
|
outputHash[thread + 3 * threads] = ((uint2*)state)[3]; |
||||||
|
// ((vectype*)outputHash)[thread] = state[0]; |
||||||
|
|
||||||
|
} //thread |
||||||
|
} |
||||||
|
|
||||||
|
__host__ |
||||||
|
void lyra2v2_cpu_init(int thr_id, uint32_t threads,uint64_t *hash) |
||||||
|
{ |
||||||
|
cudaMemcpyToSymbol(DMatrix, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice); |
||||||
|
} |
||||||
|
|
||||||
|
__host__ |
||||||
|
void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) |
||||||
|
{ |
||||||
|
uint32_t tpb; |
||||||
|
if (device_sm[device_map[thr_id]] < 500) |
||||||
|
tpb = 64; |
||||||
|
else if (device_sm[device_map[thr_id]] == 500) |
||||||
|
tpb = 32; |
||||||
|
else |
||||||
|
tpb = TPB; |
||||||
|
|
||||||
|
dim3 grid((threads + tpb - 1) / tpb); |
||||||
|
dim3 block(tpb); |
||||||
|
|
||||||
|
if (device_sm[device_map[thr_id]] >= 500) |
||||||
|
lyra2v2_gpu_hash_32 << <grid, block >> > (threads, startNounce, (uint2*)d_outputHash); |
||||||
|
else |
||||||
|
lyra2v2_gpu_hash_32_v3 <<<grid, block>>> (threads, startNounce,(uint2*) d_outputHash); |
||||||
|
|
||||||
|
MyStreamSynchronize(NULL, order, thr_id); |
||||||
|
} |
||||||
|
|
@ -0,0 +1,164 @@ |
|||||||
|
extern "C" { |
||||||
|
#include "sph/sph_blake.h" |
||||||
|
#include "sph/sph_bmw.h" |
||||||
|
#include "sph/sph_skein.h" |
||||||
|
#include "sph/sph_keccak.h" |
||||||
|
#include "sph/sph_cubehash.h" |
||||||
|
#include "lyra2/Lyra2.h" |
||||||
|
} |
||||||
|
|
||||||
|
#include "miner.h" |
||||||
|
#include "cuda_helper.h" |
||||||
|
|
||||||
|
|
||||||
|
static _ALIGN(64) uint64_t *d_hash[MAX_GPUS]; |
||||||
|
static uint64_t *d_hash2[MAX_GPUS]; |
||||||
|
|
||||||
|
extern void blake256_cpu_init(int thr_id, uint32_t threads); |
||||||
|
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); |
||||||
|
extern void blake256_cpu_setBlock_80(uint32_t *pdata); |
||||||
|
extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); |
||||||
|
extern void keccak256_cpu_init(int thr_id, uint32_t threads); |
||||||
|
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); |
||||||
|
extern void skein256_cpu_init(int thr_id, uint32_t threads); |
||||||
|
|
||||||
|
extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); |
||||||
|
extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* matrix); |
||||||
|
|
||||||
|
extern void bmw256_setTarget(const void *ptarget); |
||||||
|
extern void bmw256_cpu_init(int thr_id, uint32_t threads); |
||||||
|
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); |
||||||
|
|
||||||
|
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash); |
||||||
|
|
||||||
|
void lyra2v2_hash(void *state, const void *input) |
||||||
|
{ |
||||||
|
uint32_t hashA[8], hashB[8]; |
||||||
|
|
||||||
|
sph_blake256_context ctx_blake; |
||||||
|
sph_keccak256_context ctx_keccak; |
||||||
|
sph_skein256_context ctx_skein; |
||||||
|
sph_bmw256_context ctx_bmw; |
||||||
|
sph_cubehash256_context ctx_cube; |
||||||
|
|
||||||
|
sph_blake256_init(&ctx_blake); |
||||||
|
sph_blake256(&ctx_blake, input, 80); |
||||||
|
sph_blake256_close(&ctx_blake, hashA); |
||||||
|
|
||||||
|
sph_keccak256_init(&ctx_keccak); |
||||||
|
sph_keccak256(&ctx_keccak, hashA, 32); |
||||||
|
sph_keccak256_close(&ctx_keccak, hashB); |
||||||
|
|
||||||
|
sph_cubehash256_init(&ctx_cube); |
||||||
|
sph_cubehash256(&ctx_cube, hashB, 32); |
||||||
|
sph_cubehash256_close(&ctx_cube, hashA); |
||||||
|
|
||||||
|
LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); |
||||||
|
|
||||||
|
sph_skein256_init(&ctx_skein); |
||||||
|
sph_skein256(&ctx_skein, hashB, 32); |
||||||
|
sph_skein256_close(&ctx_skein, hashA); |
||||||
|
|
||||||
|
sph_cubehash256_init(&ctx_cube); |
||||||
|
sph_cubehash256(&ctx_cube, hashA, 32); |
||||||
|
sph_cubehash256_close(&ctx_cube, hashB); |
||||||
|
|
||||||
|
sph_bmw256_init(&ctx_bmw); |
||||||
|
sph_bmw256(&ctx_bmw, hashB, 32); |
||||||
|
sph_bmw256_close(&ctx_bmw, hashA); |
||||||
|
|
||||||
|
memcpy(state, hashA, 32); |
||||||
|
} |
||||||
|
|
||||||
|
static bool init[MAX_GPUS] = { 0 }; |
||||||
|
|
||||||
|
extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata, |
||||||
|
const uint32_t *ptarget, uint32_t max_nonce, |
||||||
|
unsigned long *hashes_done) |
||||||
|
{ |
||||||
|
const uint32_t first_nonce = pdata[19]; |
||||||
|
int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 18 : 17; |
||||||
|
unsigned int defthr = 1U << intensity; |
||||||
|
uint32_t throughput = device_intensity(device_map[thr_id], __func__, defthr); |
||||||
|
|
||||||
|
if (opt_benchmark) |
||||||
|
((uint32_t*)ptarget)[7] = 0x00ff; |
||||||
|
|
||||||
|
if (!init[thr_id]) |
||||||
|
{ |
||||||
|
cudaSetDevice(device_map[thr_id]); |
||||||
|
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); |
||||||
|
//if (opt_n_gputhreads == 1) |
||||||
|
// cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); |
||||||
|
blake256_cpu_init(thr_id, throughput); |
||||||
|
keccak256_cpu_init(thr_id,throughput); |
||||||
|
skein256_cpu_init(thr_id, throughput); |
||||||
|
bmw256_cpu_init(thr_id, throughput); |
||||||
|
|
||||||
|
// DMatrix |
||||||
|
CUDA_SAFE_CALL(cudaMalloc(&d_hash2[thr_id], 16 * 4 * 4 * sizeof(uint64_t) * throughput)); |
||||||
|
lyra2v2_cpu_init(thr_id, throughput, d_hash2[thr_id]); |
||||||
|
|
||||||
|
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)throughput * 32)); |
||||||
|
|
||||||
|
init[thr_id] = true; |
||||||
|
} |
||||||
|
|
||||||
|
uint32_t endiandata[20]; |
||||||
|
for (int k=0; k < 20; k++) |
||||||
|
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); |
||||||
|
|
||||||
|
blake256_cpu_setBlock_80(pdata); |
||||||
|
bmw256_setTarget(ptarget); |
||||||
|
|
||||||
|
do { |
||||||
|
int order = 0; |
||||||
|
uint32_t foundNonces[2] = { 0, 0 }; |
||||||
|
|
||||||
|
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||||
|
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||||
|
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); |
||||||
|
lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||||
|
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||||
|
cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id]); |
||||||
|
|
||||||
|
bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces); |
||||||
|
|
||||||
|
if (foundNonces[0] != 0) |
||||||
|
{ |
||||||
|
// CUDA_SAFE_CALL(cudaGetLastError()); |
||||||
|
const uint32_t Htarg = ptarget[7]; |
||||||
|
uint32_t vhash64[8]; |
||||||
|
be32enc(&endiandata[19], foundNonces[0]); |
||||||
|
lyra2v2_hash(vhash64, endiandata); |
||||||
|
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) |
||||||
|
{ |
||||||
|
int res = 1; |
||||||
|
// check if there was some other ones... |
||||||
|
*hashes_done = pdata[19] - first_nonce + throughput; |
||||||
|
if (foundNonces[1] != 0) |
||||||
|
{ |
||||||
|
pdata[21] = foundNonces[1]; |
||||||
|
res++; |
||||||
|
if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, foundNonces[1], vhash64[7], Htarg); |
||||||
|
} |
||||||
|
pdata[19] = foundNonces[0]; |
||||||
|
if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce % 08x", thr_id, foundNonces[0], vhash64[7], Htarg); |
||||||
|
MyStreamSynchronize(NULL, 0, device_map[thr_id]); |
||||||
|
return res; |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
if (vhash64[7] > Htarg) // don't show message if it is equal but fails fulltest |
||||||
|
applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", thr_id); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
pdata[19] += throughput; |
||||||
|
|
||||||
|
} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); |
||||||
|
|
||||||
|
*hashes_done = pdata[19] - first_nonce + 1; |
||||||
|
MyStreamSynchronize(NULL, 0, device_map[thr_id]); |
||||||
|
return 0; |
||||||
|
} |
Loading…
Reference in new issue