OpenCL GPU miner
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

185 lines
4.9 KiB

/*
* Lyra2 kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
* Copyright (c) 2014 djm34
*
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
/*
 * 64-bit rotation and lane-permutation helpers built on OpenCL built-ins.
 *
 * ROTL64(x,n) : rotate x left by n bits.
 * ROTR64(x,n) : rotate x right by n bits, expressed as a left rotation by
 *               (64 - n).
 * SWAP32(x)   : exchange the two 32-bit halves of x (a rotation by 32 bits).
 * SWAP24(x)   : reorder the eight bytes of x as s34567012.
 * SWAP16(x)   : reorder the eight bytes of x as s23456701.
 *               NOTE(review): on a little-endian device SWAP24/SWAP16 amount
 *               to right rotations by 24 and 16 bits -- confirm for the
 *               target device before substituting them for ROTR64.
 *
 * Every macro argument is parenthesized so that expression arguments such as
 * ROTR64(v, a + b) expand to the intended rotation count.
 */
#define ROTL64(x,n) rotate((x),(ulong)(n))
#define ROTR64(x,n) rotate((x),(ulong)(64-(n)))
#define SWAP32(x) as_ulong(as_uint2(x).s10)
#define SWAP24(x) as_ulong(as_uchar8(x).s34567012)
#define SWAP16(x) as_ulong(as_uchar8(x).s23456701)
#define G(a,b,c,d) \
do { \
a += b; d ^= a; d = SWAP32(d); \
c += d; b ^= c; b = ROTR64(b,24); \
a += b; d ^= a; d = ROTR64(d,16); \
c += d; b ^= c; b = ROTR64(b, 63); \
\
} while (0)
#define G_old(a,b,c,d) \
do { \
a += b; d ^= a; d = ROTR64(d, 32); \
c += d; b ^= c; b = ROTR64(b, 24); \
a += b; d ^= a; d = ROTR64(d, 16); \
c += d; b ^= c; b = ROTR64(b, 63); \
\
} while (0)
/*One Round of the Blake2b's compression function*/
#define round_lyra(s) \
do { \
G(s[0].x, s[1].x, s[2].x, s[3].x); \
G(s[0].y, s[1].y, s[2].y, s[3].y); \
G(s[0].z, s[1].z, s[2].z, s[3].z); \
G(s[0].w, s[1].w, s[2].w, s[3].w); \
G(s[0].x, s[1].y, s[2].z, s[3].w); \
G(s[0].y, s[1].z, s[2].w, s[3].x); \
G(s[0].z, s[1].w, s[2].x, s[3].y); \
G(s[0].w, s[1].x, s[2].y, s[3].z); \
} while(0)
/*
 * Duplexes four 3-element blocks of DMatrix through the sponge state.
 *
 * For col = 0..3 the block at offset col*memshift is read, XORed into
 * state[0..2], one round_lyra is applied, and the block XORed with the
 * updated state is written to offset (7 - col)*memshift. With the
 * memshift*4-per-row addressing used by the sibling row functions, that
 * reads row 0 in ascending column order and fills row 1 in descending order.
 *
 * state   : sponge state, updated in place (round_lyra touches state[0..3]).
 * DMatrix : matrix in __global memory; memshift (defined elsewhere) is the
 *           distance in ulong4 elements between consecutive column blocks.
 */
void reduceDuplexf(ulong4* state ,__global ulong4* DMatrix)
{
    ulong4 block[3];
    const uint srcBase = 0;
    const uint dstBase = (memshift * 3 + memshift * 4);

    for (int col = 0; col < 4; col++)
    {
        const uint src = srcBase + col*memshift;
        const uint dst = dstBase - col*memshift;

        /* Absorb the source block into the state. */
        for (int k = 0; k < 3; k++) {
            block[k] = DMatrix[k + src];
            state[k] ^= block[k];
        }

        round_lyra(state);

        /* Emit source block XOR updated state into the destination slot. */
        for (int k = 0; k < 3; k++) {
            block[k] ^= state[k];
            DMatrix[k + dst] = block[k];
        }
    }
}
/*
 * Duplexes row rowIn with row rowInOut and updates rowInOut and rowOut.
 *
 * For each of the four column blocks (3 ulong4 each, memshift apart):
 *   1. state[0..2] ^= M[rowIn][col] + M[rowInOut][col]
 *   2. one round_lyra(state)
 *   3. the rowInOut block is XORed with the first 12 64-bit words of the
 *      state rotated by one word (word j of the state goes to word j+1,
 *      word 11 wraps to word 0)
 *   4. if rowInOut and rowOut are the same row, the block is additionally
 *      XORed with state[0..2] and stored once; otherwise the block is stored
 *      to rowInOut and state[0..2] is XORed into M[rowOut][col].
 *
 * rowIn, rowInOut, rowOut : row indices; a row starts at memshift*4*row.
 * state   : sponge state, updated in place (round_lyra touches state[0..3]).
 * DMatrix : matrix in __global memory; memshift is defined elsewhere.
 */
void reduceDuplexRowf(uint rowIn,uint rowInOut,uint rowOut,ulong4 * state, __global ulong4 * DMatrix)
{
    ulong4 ioBlk[3];
    const uint inBase  = (memshift * 4 * rowIn);
    const uint ioBase  = (memshift * 4 * rowInOut);
    const uint outBase = (memshift * 4 * rowOut);

    for (int col = 0; col < 4; col++)
    {
        const uint inOff  = inBase  + col*memshift;
        const uint ioOff  = ioBase  + col*memshift;
        const uint outOff = outBase + col*memshift;

        /* Absorb (rowIn block + rowInOut block) into the state. */
        for (int k = 0; k < 3; k++) {
            ulong4 inVal = DMatrix[k + inOff];
            ioBlk[k] = DMatrix[k + ioOff];
            inVal += ioBlk[k];
            state[k] ^= inVal;
        }

        round_lyra(state);

        /* XOR the rowInOut block with the state rotated by one 64-bit word. */
        ulong *ioWords = (ulong*)ioBlk;
        ulong *stWords = (ulong*)state;
        ioWords[0] ^= stWords[11];
        for (int w = 1; w < 12; w++)
            ioWords[w] ^= stWords[w - 1];

        if (rowInOut == rowOut) {
            /* Same row: fold both updates into a single store. */
            for (int k = 0; k < 3; k++)
                ioBlk[k] ^= state[k];
            for (int k = 0; k < 3; k++)
                DMatrix[k + ioOff] = ioBlk[k];
        }
        else {
            for (int k = 0; k < 3; k++)
                DMatrix[k + ioOff] = ioBlk[k];
            for (int k = 0; k < 3; k++)
                DMatrix[k + outOff] ^= state[k];
        }
    }
}
/*
 * Setup-phase duplexing: fills row rowOut from rowIn and updates rowInOut.
 *
 * For col = 0..3 (blocks of 3 ulong4, memshift apart):
 *   1. state[0..2] ^= M[rowIn][col] + M[rowInOut][col]
 *   2. one round_lyra(state)
 *   3. M[rowOut][3 - col] = M[rowIn][col] ^ state[0..2]
 *      (rowOut is written in descending column order)
 *   4. M[rowInOut][col] is XORed with the first 12 64-bit words of the
 *      state rotated by one word (word 11 wraps to word 0) and stored back.
 *
 * rowIn, rowInOut, rowOut : row indices; a row starts at memshift*4*row.
 * state   : sponge state, updated in place (round_lyra touches state[0..3]).
 * DMatrix : matrix in __global memory; memshift is defined elsewhere.
 */
void reduceDuplexRowSetupf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4* DMatrix) {
    ulong4 inBlk[3], ioBlk[3];
    const uint inBase  = (memshift * 4 * rowIn);
    const uint ioBase  = (memshift * 4 * rowInOut);
    const uint outLast = (memshift * 3 + memshift * 4 * rowOut);

    for (int col = 0; col < 4; col++)
    {
        const uint inOff  = inBase  + col*memshift;
        const uint ioOff  = ioBase  + col*memshift;
        const uint outOff = outLast - col*memshift;

        /* Load both input blocks and absorb their sum into the state. */
        for (int k = 0; k < 3; k++) {
            inBlk[k] = DMatrix[k + inOff];
            ioBlk[k] = DMatrix[k + ioOff];
            state[k] ^= inBlk[k] + ioBlk[k];
        }

        round_lyra(state);

        /* rowOut block = rowIn block XOR updated state. */
        for (int k = 0; k < 3; k++)
            DMatrix[k + outOff] = inBlk[k] ^ state[k];

        /* rowInOut block ^= state rotated by one 64-bit word. */
        ulong *ioWords = (ulong*)ioBlk;
        ulong *stWords = (ulong*)state;
        ioWords[0] ^= stWords[11];
        for (int w = 1; w < 12; w++)
            ioWords[w] ^= stWords[w - 1];

        for (int k = 0; k < 3; k++)
            DMatrix[k + ioOff] = ioBlk[k];
    }
}