lyra2v2, bmw256 and cubehash256 cleanup + diff fix

This commit is contained in:
Tanguy Pruvot 2015-08-18 11:02:01 +02:00
parent 4b04ca8027
commit 53cd591956
4 changed files with 170 additions and 226 deletions

View File

@ -3,28 +3,24 @@
#include "cuda_helper.h"
#undef SPH_ROTL32
#define SPH_ROTL32 ROTL32
// die Message it Padding zur Berechnung auf der GPU
__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
__constant__ uint32_t ZDH[16];
static uint32_t *d_gnounce[MAX_GPUS];
static uint32_t *d_GNonce[MAX_GPUS];
__constant__ uint32_t pTarget[8];
#define shl(x, n) ((x) << (n))
#define shr(x, n) ((x) >> (n))
//#define SHR(x, n) SHR2(x, n)
//#define SHL(x, n) SHL2(x, n)
__constant__ uint64_t pTarget[8];
#define shl(x, n) ((x) << (n))
#define shr(x, n) ((x) >> (n))
#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23))
#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
#define ss4(x) (shr((x), 1) ^ (x))
#define ss5(x) (shr((x), 2) ^ (x))
#define ROTL32host(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
// #define SPH_ROTL32 SPH_ROTL32
#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23))
#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
#define ss4(x) (shr((x), 1) ^ (x))
#define ss5(x) (shr((x), 2) ^ (x))
#define rs1(x) SPH_ROTL32((x), 3)
#define rs2(x) SPH_ROTL32((x), 7)
#define rs3(x) SPH_ROTL32((x), 13)
@ -34,38 +30,132 @@ __constant__ uint32_t pTarget[8];
#define rs7(x) SPH_ROTL32((x), 27)
/* Message expansion function 1 */
__forceinline__ __device__ uint32_t expand32_1(int i, uint32_t *M32, uint32_t *H, uint32_t *Q)
__forceinline__ __device__
uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
{
#undef SPH_ROTL32
#define SPH_ROTL32 ROTL32
return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13])
+ ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9])
+ ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5])
+ ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
#undef SPH_ROTL32
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}
/* Message expansion function 2 */
__forceinline__ __device__ uint32_t expand32_2(int i, uint32_t *M32, uint32_t *H, uint32_t *Q)
__forceinline__ __device__
uint32_t expand32_2(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
{
#undef SPH_ROTL32
#define SPH_ROTL32 ROTL32
return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13])
+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9])
+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5])
+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
#undef SPH_ROTL32
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}
__forceinline__ __device__ void Compression256(uint32_t * M32, uint32_t * H)
__forceinline__ __device__
void Compression256(uint32_t * M32)
{
uint32_t Q[32], XL32, XH32;
const uint32_t H[16] = {
0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F,
0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F,
0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F,
0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F
};
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);
/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe. */
Q[0] = ss0(Q[0]) + H[1];
Q[1] = ss1(Q[1]) + H[2];
Q[2] = ss2(Q[2]) + H[3];
Q[3] = ss3(Q[3]) + H[4];
Q[4] = ss4(Q[4]) + H[5];
Q[5] = ss0(Q[5]) + H[6];
Q[6] = ss1(Q[6]) + H[7];
Q[7] = ss2(Q[7]) + H[8];
Q[8] = ss3(Q[8]) + H[9];
Q[9] = ss4(Q[9]) + H[10];
Q[10] = ss0(Q[10]) + H[11];
Q[11] = ss1(Q[11]) + H[12];
Q[12] = ss2(Q[12]) + H[13];
Q[13] = ss3(Q[13]) + H[14];
Q[14] = ss4(Q[14]) + H[15];
Q[15] = ss0(Q[15]) + H[0];
/* This is the Message expansion or f_1 in the documentation. */
/* It has 16 rounds. */
/* Blue Midnight Wish has two tunable security parameters. */
/* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */
/* The following relation for these parameters should is satisfied: */
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
#pragma unroll
for (int i=16; i<18; i++)
Q[i] = expand32_1(i, M32, H, Q);
#pragma nounroll
for (int i=18; i<32; i++)
Q[i] = expand32_2(i, M32, H, Q);
/* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */
/* 16 new variables that are prooduced in the Message Expansion part. */
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
/* This part is the function f_2 - in the documentation */
/* Compute the double chaining pipe for the next message block. */
M32[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
M32[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
M32[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
M32[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
M32[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
M32[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
M32[8] = SPH_ROTL32(M32[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
M32[9] = SPH_ROTL32(M32[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
M32[10] = SPH_ROTL32(M32[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
M32[11] = SPH_ROTL32(M32[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
M32[12] = SPH_ROTL32(M32[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
M32[13] = SPH_ROTL32(M32[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
}
__forceinline__ __device__
void Compression256_2(uint32_t * M32)
{
#undef SPH_ROTL32
#define SPH_ROTL32 ROTL32
int i;
uint32_t XL32, XH32, Q[32];
const uint32_t H[16] = {
0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3,
0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7,
0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab,
0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf
};
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
@ -109,136 +199,23 @@ __forceinline__ __device__ void Compression256(uint32_t * M32, uint32_t * H)
/* The following relation for these parameters should is satisfied: */
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
for (i = 0; i<2; i++)
Q[i + 16] = expand32_1(i + 16, M32, H, Q);
#pragma unroll
for (int i = 16; i<18; i++)
Q[i] = expand32_1(i, M32, H, Q);
for (i = 2; i<16; i++)
Q[i + 16] = expand32_2(i + 16, M32, H, Q);
#pragma nounroll
for (int i = 18; i<32; i++)
Q[i] = expand32_2(i, M32, H, Q);
/* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */
/* 16 new variables that are prooduced in the Message Expansion part. */
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
/* This part is the function f_2 - in the documentation */
/* Compute the double chaining pipe for the next message block. */
H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
#undef SPH_ROTL32
}
__forceinline__ __device__ void Compression256_2(uint32_t * M32, uint32_t * H)
{
#undef SPH_ROTL32
#define SPH_ROTL32 ROTL32
int i;
uint32_t XL32, XH32, Q[32];
/* This part is the function f0 - in the documentation */
/* First we mix the message block *M32 (M in the documatation) */
/* with the previous double pipe *H. */
/* For a fixed previous double pipe, or fixed message block, this */
/* part is bijection. */
/* This transformation diffuses every one bit difference in 5 words. */
Q[0] = (H[5]) - (H[7]) + (H[10]) + (H[13]) + (0x280 ^ H[14]);
Q[1] = (H[6]) - (H[8]) + (H[11]) + (0x280 ^ H[14]) - (H[15]);
Q[2] = (M32[0] ^ H[0]) + (H[7]) + (H[9]) - (H[12]) + (H[15]);
Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (H[8]) - (H[10]) + (H[13]);
Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (H[9]) - (H[11]) - (0x280 ^ H[14]);
Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (H[10]) - (H[12]) + (H[15]);
Q[6] = (0x80 ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (H[11]) + (H[13]);
Q[7] = (M32[1] ^ H[1]) - (0x80 ^ H[4]) - (H[5]) - (H[12]) - (0x280 ^ H[14]);
Q[8] = (M32[2] ^ H[2]) - (H[5]) - (H[6]) + (H[13]) - (H[15]);
Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (H[6]) - (H[7]) + (0x280 ^ H[14]);
Q[10] = (H[8]) - (M32[1] ^ H[1]) - (0x80 ^ H[4]) - (H[7]) + (H[15]);
Q[11] = (H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (H[5]) + (H[9]);
Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (H[6]) - (H[9]) + (H[10]);
Q[13] = (M32[2] ^ H[2]) + (0x80 ^ H[4]) + (H[7]) + (H[10]) + (H[11]);
Q[14] = (M32[3] ^ H[3]) - (H[5]) + (H[8]) - (H[11]) - (H[12]);
Q[15] = (H[12]) - (0x80 ^ H[4]) - (H[6]) - (H[9]) + (H[13]);
/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/
Q[0] = ss0(Q[0]) + H[1];
Q[1] = ss1(Q[1]) + H[2];
Q[2] = ss2(Q[2]) + H[3];
Q[3] = ss3(Q[3]) + H[4];
Q[4] = ss4(Q[4]) + H[5];
Q[5] = ss0(Q[5]) + H[6];
Q[6] = ss1(Q[6]) + H[7];
Q[7] = ss2(Q[7]) + H[8];
Q[8] = ss3(Q[8]) + H[9];
Q[9] = ss4(Q[9]) + H[10];
Q[10] = ss0(Q[10]) + H[11];
Q[11] = ss1(Q[11]) + H[12];
Q[12] = ss2(Q[12]) + H[13];
Q[13] = ss3(Q[13]) + H[14];
Q[14] = ss4(Q[14]) + H[15];
Q[15] = ss0(Q[15]) + H[0];
/* This is the Message expansion or f_1 in the documentation. */
/* It has 16 rounds. */
/* Blue Midnight Wish has two tunable security parameters. */
/* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */
/* The following relation for these parameters should is satisfied: */
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
for (i = 0; i<2; i++)
Q[i + 16] = expand32_1(i + 16, M32, H, Q);
for (i = 2; i<16; i++)
Q[i + 16] = expand32_2(i + 16, M32, H, Q);
/* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */
/* 16 new variables that are prooduced in the Message Expansion part. */
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
/* This part is the function f_2 - in the documentation */
/* Compute the double chaining pipe for the next message block. */
H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
#undef SPH_ROTL32
XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
}
#define TPB 512
@ -248,27 +225,8 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t dh[16] = {
(0x40414243), (0x44454647),
(0x48494A4B), (0x4C4D4E4F),
(0x50515253), (0x54555657),
(0x58595A5B), (0x5C5D5E5F),
(0x60616263), (0x64656667),
(0x68696A6B), (0x6C6D6E6F),
(0x70717273), (0x74757677),
(0x78797A7B), (0x7C7D7E7F)
};
uint32_t message[16] = { 0 };
uint32_t final_s[16] = {
(0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2),
(0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5),
(0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8),
(0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab),
(0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae),
(0xaaaaaaaf)
};
uint32_t message[16]={0};
LOHI(message[0], message[1], __ldg(&g_hash[thread]));
LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads]));
LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads]));
@ -276,10 +234,10 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash
message[8]=0x80;
message[14]=0x100;
Compression256(message, dh);
Compression256(dh, final_s);
Compression256(message);
Compression256_2(message);
if (((uint64_t*)final_s)[7] <= ((uint64_t*)pTarget)[3])
if (((uint64_t*)message)[7] <= pTarget[3])
{
uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread);
if (tmp != 0)
@ -288,17 +246,15 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash
}
}
__host__
void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash,uint32_t *resultnonces)
void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces)
{
cudaMemset(d_GNonce[thr_id], 0x0, 2 * sizeof(uint32_t));
const uint32_t threadsperblock = TPB;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cudaMemset(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t));
bmw256_gpu_hash_32 << <grid, block >> >(threads, startNounce, g_hash, d_GNonce[thr_id]);
cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
resultnonces[0] = *(d_gnounce[thr_id]);
@ -316,5 +272,5 @@ void bmw256_cpu_init(int thr_id, uint32_t threads)
__host__
void bmw256_setTarget(const void *pTargetIn)
{
cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice);
}

View File

@ -13,7 +13,8 @@
#define ROTATEUPWARDS11(a) LROT(a,11)
//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
#define SWAP(a,b) { a ^= b; b ^=a; a ^=b;}
#define SWAP(a,b) { a ^= b; b ^= a; a ^= b; }
__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2])
{
int r;
@ -155,7 +156,8 @@ __device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2]
}
void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32_t *data)
__device__ __forceinline__
void Update32(uint32_t x[2][2][2][2][2], const uint32_t *data)
{
/* "xor the block into the first b bytes of the state" */
/* "and then transform the state invertibly through r identical rounds" */
@ -163,24 +165,22 @@ void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32
rrounds(x);
}
void __device__ __forceinline__ Update32_const(uint32_t x[2][2][2][2][2])
__device__ __forceinline__
void Update32_const(uint32_t x[2][2][2][2][2])
{
x[0][0][0][0][0] ^= 0x80;
rrounds(x);
}
void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
__device__ __forceinline__
void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
{
int i;
/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;
x[1][1][1][1][1] ^= 1U;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 2
for (i = 0; i < 10; ++i) rrounds(x);
for (int i = 0; i < 10; ++i) rrounds(x);
/* "output the first h/8 bytes of the state" */
hash_fromx(hashval, x);
@ -198,8 +198,8 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t Hash[8]; // = &g_hash[16 * hashPosition];
LOHI(Hash[0], Hash[1], __ldg(&g_hash[thread]));
LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads]));
LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads]));
@ -207,19 +207,16 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g
uint32_t x[2][2][2][2][2] =
{
0xEA2BD4B4, 0xCCD6F29F, 0x63117E71,
0x35481EAE, 0x22512D5B, 0xE5D94E63,
0x7E624131, 0xF4CC12BE, 0xC2D0B696,
0x42AF2070, 0xD0720C35, 0x3361DA8C,
0x28CCECA4, 0x8EF8AD83, 0x4680AC00,
0x40E5FBAB, 0xD89041C3, 0x6107FBD5,
0x6C859D41, 0xF0B26679, 0x09392549,
0x5FA25603, 0x65C892FD, 0x93CB6285,
0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD,
0x85254725, 0x15815AEB, 0x4AB6AAD6,
0x9CDAF8AF, 0xD6032C0A
0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, 0x35481EAE,
0x22512D5B, 0xE5D94E63, 0x7E624131, 0xF4CC12BE,
0xC2D0B696, 0x42AF2070, 0xD0720C35, 0x3361DA8C,
0x28CCECA4, 0x8EF8AD83, 0x4680AC00, 0x40E5FBAB,
0xD89041C3, 0x6107FBD5, 0x6C859D41, 0xF0B26679,
0x09392549, 0x5FA25603, 0x65C892FD, 0x93CB6285,
0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, 0x85254725,
0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A
};
x[0][0][0][0][0] ^= Hash[0];
x[0][0][0][0][1] ^= Hash[1];
x[0][0][0][1][0] ^= Hash[2];
@ -230,7 +227,7 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g
x[0][0][1][1][1] ^= Hash[7];
rrounds(x);
x[0][0][0][0][0] ^= 0x80;
x[0][0][0][0][0] ^= 0x80U;
rrounds(x);
Final(x, Hash);
@ -244,17 +241,12 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g
__host__
void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash)
void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order)
{
uint32_t tpb = 576;
uint32_t tpb;
if (device_sm[device_map[thr_id]]<500)
tpb = 576;
else
tpb = 576;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + tpb-1)/tpb);
dim3 block(tpb);
cubehash256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_hash);
cubehash256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_hash);
}

View File

@ -1457,11 +1457,11 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_FRESH:
case ALGO_FUGUE256:
case ALGO_GROESTL:
case ALGO_LYRA2v2:
diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
break;
case ALGO_KECCAK:
case ALGO_LYRA2:
case ALGO_LYRA2v2:
diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty));
break;
default:

View File

@ -21,6 +21,7 @@ extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNo
extern void keccak256_cpu_init(int thr_id, uint32_t threads);
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* matrix);
@ -29,8 +30,6 @@ extern void bmw256_setTarget(const void *ptarget);
extern void bmw256_cpu_init(int thr_id, uint32_t threads);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash);
void lyra2v2_hash(void *state, const void *input)
{
uint32_t hashA[8], hashB[8];
@ -117,16 +116,15 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id]);
cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++);
bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces);
if (foundNonces[0] != 0)
{
// CUDA_SAFE_CALL(cudaGetLastError());
const uint32_t Htarg = ptarget[7];
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonces[0]);
@ -134,16 +132,14 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget))
{
int res = 1;
// check if there was some other ones...
// check if there was another one...
*hashes_done = pdata[19] - first_nonce + throughput;
if (foundNonces[1] != 0)
{
pdata[21] = foundNonces[1];
res++;
if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, foundNonces[1], vhash64[7], Htarg);
}
pdata[19] = foundNonces[0];
if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce % 08x", thr_id, foundNonces[0], vhash64[7], Htarg);
MyStreamSynchronize(NULL, 0, device_map[thr_id]);
return res;
}
@ -156,7 +152,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
pdata[19] += throughput;
} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));
} while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput)));
*hashes_done = pdata[19] - first_nonce + 1;
MyStreamSynchronize(NULL, 0, device_map[thr_id]);