/*
 * Lyra2 kernel implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 * Copyright (c) 2014 djm34
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author djm34
 */

/*
 * 64-bit rotations built on the OpenCL rotate() built-in.
 * Arguments are parenthesized so that compound expressions expand safely.
 */
#define ROTL64(x, n) rotate((x), (ulong)(n))
#define ROTR64(x, n) rotate((x), (ulong)(64 - (n)))

/*
 * Rotations by whole 32/24/16 bits expressed as lane shuffles of the
 * reinterpreted value (swap of the two uint halves, or a byte permutation).
 */
#define SWAP32(x) as_ulong(as_uint2(x).s10)
#define SWAP24(x) as_ulong(as_uchar8(x).s34567012)
#define SWAP16(x) as_ulong(as_uchar8(x).s23456701)

/*
 * Add/xor/rotate mixing step over four 64-bit lvalues a, b, c, d.
 * The first rotation (by 32 bits) is done with SWAP32.
 */
#define G(a,b,c,d) \
    do { \
        a += b; d ^= a; d = SWAP32(d); \
        c += d; b ^= c; b = ROTR64(b, 24); \
        a += b; d ^= a; d = ROTR64(d, 16); \
        c += d; b ^= c; b = ROTR64(b, 63); \
    } while (0)

/* Same mixing step as G, with the 32-bit rotation written via ROTR64. */
#define G_old(a,b,c,d) \
    do { \
        a += b; d ^= a; d = ROTR64(d, 32); \
        c += d; b ^= c; b = ROTR64(b, 24); \
        a += b; d ^= a; d = ROTR64(d, 16); \
        c += d; b ^= c; b = ROTR64(b, 63); \
    } while (0)

/*One Round of the Blake2b's compression function*/
/*
 * s must point to (at least) four ulong4 values: G is applied first to the
 * four columns (.x, .y, .z, .w) and then to the four diagonals.
 */
#define round_lyra(s) \
    do { \
        G((s)[0].x, (s)[1].x, (s)[2].x, (s)[3].x); \
        G((s)[0].y, (s)[1].y, (s)[2].y, (s)[3].y); \
        G((s)[0].z, (s)[1].z, (s)[2].z, (s)[3].z); \
        G((s)[0].w, (s)[1].w, (s)[2].w, (s)[3].w); \
        G((s)[0].x, (s)[1].y, (s)[2].z, (s)[3].w); \
        G((s)[0].y, (s)[1].z, (s)[2].w, (s)[3].x); \
        G((s)[0].z, (s)[1].w, (s)[2].x, (s)[3].y); \
        G((s)[0].w, (s)[1].x, (s)[2].y, (s)[3].z); \
    } while (0)

/*
 * Duplexes the four 3-vector groups that start at offset 0 of DMatrix and
 * writes the results to the groups ending at offset 7 * memshift, in reverse
 * order.
 *
 * For i = 0..3: the three ulong4 at offset i * memshift are XORed into
 * state[0..2], one round_lyra is applied to state, and the loaded values XORed
 * with the new state[0..2] are stored at offset (7 - i) * memshift.
 *
 * state   - working state; round_lyra touches state[0..3], updated in place.
 * DMatrix - matrix storage in __global memory.
 *
 * memshift is not defined in this part of the file; it is used as the distance
 * (in ulong4 elements) between consecutive groups.
 */
void reduceDuplexf(ulong4 *state, __global ulong4 *DMatrix)
{
    ulong4 state1[3];
    uint ps1 = 0;
    uint ps2 = (memshift * 3 + memshift * 4);

    /* #pragma unroll 4  (unroll hint left disabled, as in the original) */
    for (int i = 0; i < 4; i++) {
        uint s1 = ps1 + i * memshift;   /* read offset: ascending  */
        uint s2 = ps2 - i * memshift;   /* write offset: descending */

        for (int j = 0; j < 3; j++)
            state1[j] = (DMatrix)[j + s1];

        for (int j = 0; j < 3; j++)
            state[j] ^= state1[j];

        round_lyra(state);

        for (int j = 0; j < 3; j++)
            state1[j] ^= state[j];

        for (int j = 0; j < 3; j++)
            (DMatrix)[j + s2] = state1[j];
    }
}

/*
 * Duplexing step over three rows of DMatrix; each row spans 4 * memshift
 * ulong4 elements and is processed as four 3-vector groups.
 *
 * For each group i = 0..3:
 *   - state[0..2] ^= rowIn[i] + rowInOut[i]; then one round_lyra on state;
 *   - rowInOut[i] is XORed with the first 12 ulong words of state rotated by
 *     one word (word 11 goes to word 0, word j goes to word j + 1);
 *   - rowOut[i] is XORed with state[0..2].
 * When rowInOut == rowOut both XORs are folded into a single store so that
 * neither update is lost.
 *
 * rowIn, rowInOut, rowOut - row indices into DMatrix.
 * state                   - working state (state[0..3]), updated in place.
 * DMatrix                 - matrix storage in __global memory.
 */
void reduceDuplexRowf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4 *DMatrix)
{
    ulong4 state1[3], state2[3];
    uint ps1 = (memshift * 4 * rowIn);
    uint ps2 = (memshift * 4 * rowInOut);
    uint ps3 = (memshift * 4 * rowOut);

    for (int i = 0; i < 4; i++) {
        uint s1 = ps1 + i * memshift;
        uint s2 = ps2 + i * memshift;
        uint s3 = ps3 + i * memshift;

        for (int j = 0; j < 3; j++)
            state1[j] = (DMatrix)[j + s1];

        for (int j = 0; j < 3; j++)
            state2[j] = (DMatrix)[j + s2];

        for (int j = 0; j < 3; j++)
            state1[j] += state2[j];

        for (int j = 0; j < 3; j++)
            state[j] ^= state1[j];

        round_lyra(state);

        /* XOR state2 with the 12 state words rotated by one 64-bit word. */
        ((ulong *)state2)[0] ^= ((ulong *)state)[11];
        for (int j = 0; j < 11; j++)
            ((ulong *)state2)[j + 1] ^= ((ulong *)state)[j];

        if (rowInOut != rowOut) {
            for (int j = 0; j < 3; j++)
                (DMatrix)[j + s2] = state2[j];
            for (int j = 0; j < 3; j++)
                (DMatrix)[j + s3] ^= state[j];
        } else {
            /* Same row: apply the rowOut XOR to the local copy before storing. */
            for (int j = 0; j < 3; j++)
                state2[j] ^= state[j];
            for (int j = 0; j < 3; j++)
                (DMatrix)[j + s2] = state2[j];
        }
    }
}

/*
 * Setup-phase variant of the row duplexing step.
 *
 * For each group i = 0..3:
 *   - state[0..2] ^= rowIn[i] + rowInOut[i]; then one round_lyra on state;
 *   - rowIn[i] XOR state[0..2] is stored into rowOut at group (3 - i), i.e.
 *     rowOut is written (not XORed) in reverse group order;
 *   - rowInOut[i] is XORed with the first 12 ulong words of state rotated by
 *     one word and stored back.
 *
 * rowIn, rowInOut, rowOut - row indices into DMatrix (rows are 4 * memshift
 *                           ulong4 elements apart).
 * state                   - working state (state[0..3]), updated in place.
 * DMatrix                 - matrix storage in __global memory.
 */
void reduceDuplexRowSetupf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4 *DMatrix)
{
    ulong4 state2[3], state1[3];
    uint ps1 = (memshift * 4 * rowIn);
    uint ps2 = (memshift * 4 * rowInOut);
    uint ps3 = (memshift * 3 + memshift * 4 * rowOut);   /* last group of rowOut */

    for (int i = 0; i < 4; i++) {
        uint s1 = ps1 + i * memshift;
        uint s2 = ps2 + i * memshift;
        uint s3 = ps3 - i * memshift;

        for (int j = 0; j < 3; j++)
            state1[j] = (DMatrix)[j + s1];

        for (int j = 0; j < 3; j++)
            state2[j] = (DMatrix)[j + s2];

        for (int j = 0; j < 3; j++) {
            ulong4 tmp = state1[j] + state2[j];
            state[j] ^= tmp;
        }

        round_lyra(state);

        for (int j = 0; j < 3; j++) {
            state1[j] ^= state[j];
            (DMatrix)[j + s3] = state1[j];
        }

        /* XOR state2 with the 12 state words rotated by one 64-bit word. */
        ((ulong *)state2)[0] ^= ((ulong *)state)[11];
        for (int j = 0; j < 11; j++)
            ((ulong *)state2)[j + 1] ^= ((ulong *)state)[j];

        for (int j = 0; j < 3; j++)
            (DMatrix)[j + s2] = state2[j];
    }
}