mirror of https://github.com/GOSTSec/sgminer
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
497 lines
12 KiB
497 lines
12 KiB
/* |
|
* Lyra2RE kernel implementation. |
|
* |
|
* ==========================(LICENSE BEGIN)============================ |
|
* Copyright (c) 2014 djm34 |
|
* Copyright (c) 2014 James Lovejoy |
|
* |
|
* Permission is hereby granted, free of charge, to any person obtaining |
|
* a copy of this software and associated documentation files (the |
|
* "Software"), to deal in the Software without restriction, including |
|
* without limitation the rights to use, copy, modify, merge, publish, |
|
* distribute, sublicense, and/or sell copies of the Software, and to |
|
* permit persons to whom the Software is furnished to do so, subject to |
|
* the following conditions: |
|
* |
|
* The above copyright notice and this permission notice shall be |
|
* included in all copies or substantial portions of the Software. |
|
* |
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
|
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
|
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|
* |
|
* ===========================(LICENSE END)============================= |
|
* |
|
* @author djm34 |
|
*/ |
|
|
|
#pragma OPENCL EXTENSION cl_amd_printf : enable |
|
|
|
#ifndef LYRA2RE_CL |
|
#define LYRA2RE_CL |
|
|
|
#if __ENDIAN_LITTLE__ |
|
#define SPH_LITTLE_ENDIAN 1 |
|
#else |
|
#define SPH_BIG_ENDIAN 1 |
|
#endif |
|
|
|
#define SPH_UPTR sph_u64 |
|
|
|
typedef unsigned int sph_u32; |
|
typedef int sph_s32; |
|
#ifndef __OPENCL_VERSION__ |
|
typedef unsigned long long sph_u64; |
|
typedef long long sph_s64; |
|
#else |
|
typedef unsigned long sph_u64; |
|
typedef long sph_s64; |
|
#endif |
|
|
|
|
|
#define SPH_64 1 |
|
#define SPH_64_TRUE 1 |
|
|
|
#define SPH_C32(x) ((sph_u32)(x ## U)) |
|
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) |
|
|
|
#define SPH_C64(x) ((sph_u64)(x ## UL)) |
|
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) |
|
|
|
//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) |
|
//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) |
|
//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) |
|
//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) |
|
|
|
#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6 |
|
#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n)) |
|
#define SPH_ROTL64(x,n) rotate(x,(ulong)n) |
|
//#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n)) |
|
|
|
/* |
|
inline ulong rol64 (ulong l,ulong n) |
|
{ |
|
if (n<=32) { |
|
uint2 t = rotate(as_uint2(l), (n)); |
|
return as_ulong((uint2)(bitselect(t.s0, t.s1, (uint)(1 << (n)) - 1), bitselect(t.s0, t.s1, (uint)(~((1 << (n)) - 1))))); } |
|
else { |
|
uint2 t = rotate(as_uint2(l), (n - 32)); |
|
return as_ulong((uint2)(bitselect(t.s1, t.s0, (uint)(1 << (n - 32)) - 1), bitselect(t.s1, t.s0, (uint)(~((1 << (n - 32)) - 1))))); |
|
}} |
|
*/ |
|
|
|
|
|
/* |
|
static inline ulong rol64(const ulong vw, unsigned n) { |
|
uint2 result; |
|
uint2 v=as_uint2(vw); |
|
if (n == 32) { return as_ulong((uint2)(v.y,v.x)); } |
|
if (n < 32) { |
|
result.y = ( (v.y << (n)) | (v.x >> (32 - n)) ); |
|
result.x = ( (v.x << (n)) | (v.y >> (32 - n)) ); |
|
} |
|
else { |
|
result.y = ( (v.x << (n - 32)) | (v.y >> (64 - n)) ); |
|
result.x = ( (v.y << (n - 32)) | (v.x >> (64 - n)) ); |
|
} |
|
return as_ulong(result); |
|
} |
|
*/ |
|
|
|
static inline sph_u64 ror64(sph_u64 vw, unsigned a) { |
|
uint2 result; |
|
uint2 v = as_uint2(vw); |
|
unsigned n = (unsigned)(64 - a); |
|
if (n == 32) { return as_ulong((uint2)(v.y,v.x)); } |
|
if (n < 32) { |
|
result.y = ((v.y << (n)) | (v.x >> (32 - n))); |
|
result.x = ((v.x << (n)) | (v.y >> (32 - n))); |
|
} |
|
else { |
|
result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); |
|
result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); |
|
} |
|
return as_ulong(result); |
|
} |
|
|
|
#define SPH_ROTR64(l,n) ror64(l,n) |
|
|
|
|
|
|
|
#include "blake256.cl" |
|
#include "groestl256.cl" |
|
#include "Lyra2.cl" |
|
#include "keccak1600.cl" |
|
#include "skein256.cl" |
|
|
|
#define SWAP4(x) as_uint(as_uchar4(x).wzyx) |
|
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) |
|
|
|
#if SPH_BIG_ENDIAN |
|
#define DEC64E(x) (x) |
|
#define DEC64BE(x) (*(const __global sph_u64 *) (x)); |
|
#define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)); |
|
#define DEC32LE(x) (*(const __global sph_u32 *) (x)); |
|
#else |
|
#define DEC64E(x) SWAP8(x) |
|
#define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); |
|
#define DEC64LE(x) (*(const __global sph_u64 *) (x)); |
|
#define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x)); |
|
#endif |
|
|
|
typedef union { |
|
unsigned char h1[64]; |
|
uint h4[16]; |
|
ulong h8[8]; |
|
} hash_t; |
|
|
|
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) |
|
__kernel void search( |
|
__global hash_t* hashes, |
|
// precalc hash from fisrt part of message |
|
const uint h0, |
|
const uint h1, |
|
const uint h2, |
|
const uint h3, |
|
const uint h4, |
|
const uint h5, |
|
const uint h6, |
|
const uint h7, |
|
// last 12 bytes of original message |
|
const uint in16, |
|
const uint in17, |
|
const uint in18 |
|
) |
|
{ |
|
uint gid = get_global_id(0); |
|
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); |
|
|
|
sph_u32 h[8]; |
|
sph_u32 m[16]; |
|
sph_u32 v[16]; |
|
|
|
|
|
h[0]=h0; |
|
h[1]=h1; |
|
h[2]=h2; |
|
h[3]=h3; |
|
h[4]=h4; |
|
h[5]=h5; |
|
h[6]=h6; |
|
h[7]=h7; |
|
// compress 2nd round |
|
m[0] = in16; |
|
m[1] = in17; |
|
m[2] = in18; |
|
m[3] = SWAP4(gid); |
|
|
|
for (int i = 4; i < 16; i++) {m[i] = c_Padding[i];} |
|
|
|
for (int i = 0; i < 8; i++) {v[i] = h[i];} |
|
|
|
v[8] = c_u256[0]; |
|
v[9] = c_u256[1]; |
|
v[10] = c_u256[2]; |
|
v[11] = c_u256[3]; |
|
v[12] = c_u256[4] ^ 640; |
|
v[13] = c_u256[5] ^ 640; |
|
v[14] = c_u256[6]; |
|
v[15] = c_u256[7]; |
|
|
|
for (int r = 0; r < 14; r++) { |
|
GS(0, 4, 0x8, 0xC, 0x0); |
|
GS(1, 5, 0x9, 0xD, 0x2); |
|
GS(2, 6, 0xA, 0xE, 0x4); |
|
GS(3, 7, 0xB, 0xF, 0x6); |
|
GS(0, 5, 0xA, 0xF, 0x8); |
|
GS(1, 6, 0xB, 0xC, 0xA); |
|
GS(2, 7, 0x8, 0xD, 0xC); |
|
GS(3, 4, 0x9, 0xE, 0xE); |
|
} |
|
|
|
for (int i = 0; i < 16; i++) { |
|
int j = i & 7; |
|
h[j] ^= v[i];} |
|
|
|
for (int i=0;i<8;i++) {hash->h4[i]=SWAP4(h[i]);} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
} |
|
|
|
// keccak256 |
|
|
|
|
|
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) |
|
__kernel void search1(__global hash_t* hashes) |
|
{ |
|
uint gid = get_global_id(0); |
|
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); |
|
|
|
sph_u64 keccak_gpu_state[25]; |
|
|
|
for (int i = 0; i<25; i++) { |
|
if (i<4) { keccak_gpu_state[i] = hash->h8[i]; } |
|
else { keccak_gpu_state[i] = 0; } |
|
} |
|
keccak_gpu_state[4] = 0x0000000000000001; |
|
keccak_gpu_state[16] = 0x8000000000000000; |
|
|
|
keccak_block(keccak_gpu_state); |
|
for (int i = 0; i<4; i++) { hash->h8[i] = keccak_gpu_state[i]; } |
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
} |
|
|
|
/// lyra2 algo |
|
|
|
|
|
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) |
|
__kernel void search2(__global hash_t* hashes) |
|
{ |
|
uint gid = get_global_id(0); |
|
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); |
|
|
|
|
|
|
|
uint2 state[16]; |
|
|
|
for (int i = 0; i<4; i++) { state[i] = as_uint2(hash->h8[i]);} //password |
|
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt |
|
|
|
for (int i = 0; i<8; i++) { state[i + 8] = as_uint2(blake2b_IV[i]); } |
|
|
|
// blake2blyra x2 |
|
|
|
for (int i = 0; i<24; i++) { round_lyra(state); } //because 12 is not enough |
|
|
|
__private uint2 Matrix[96][8]; // very uncool |
|
/// reducedSqueezeRow0 |
|
|
|
for (int i = 0; i < 8; i++) |
|
{ |
|
for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; } |
|
round_lyra(state); |
|
} |
|
|
|
/// reducedSqueezeRow1 |
|
|
|
for (int i = 0; i < 8; i++) |
|
{ |
|
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; } |
|
round_lyra(state); |
|
for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; } |
|
} |
|
|
|
|
|
reduceDuplexRowSetup(1, 0, 2); |
|
reduceDuplexRowSetup(2, 1, 3); |
|
reduceDuplexRowSetup(3, 0, 4); |
|
reduceDuplexRowSetup(4, 3, 5); |
|
reduceDuplexRowSetup(5, 2, 6); |
|
reduceDuplexRowSetup(6, 1, 7); |
|
|
|
sph_u32 rowa; |
|
rowa = state[0].x & 7; |
|
|
|
reduceDuplexRow(7, rowa, 0); |
|
rowa = state[0].x & 7; |
|
reduceDuplexRow(0, rowa, 3); |
|
rowa = state[0].x & 7; |
|
reduceDuplexRow(3, rowa, 6); |
|
rowa = state[0].x & 7; |
|
reduceDuplexRow(6, rowa, 1); |
|
rowa = state[0].x & 7; |
|
reduceDuplexRow(1, rowa, 4); |
|
rowa = state[0].x & 7; |
|
reduceDuplexRow(4, rowa, 7); |
|
rowa = state[0].x & 7; |
|
reduceDuplexRow(7, rowa, 2); |
|
rowa = state[0].x & 7; |
|
reduceDuplexRow(2, rowa, 5); |
|
|
|
absorbblock(rowa); |
|
|
|
for (int i = 0; i<4; i++) {hash->h8[i] = as_ulong(state[i]);} |
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
} |
|
|
|
//skein256 |
|
|
|
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) |
|
__kernel void search3(__global hash_t* hashes) |
|
{ |
|
uint gid = get_global_id(0); |
|
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); |
|
|
|
|
|
sph_u64 h[9]; |
|
sph_u64 t[3]; |
|
sph_u64 dt0,dt1,dt2,dt3; |
|
sph_u64 p0, p1, p2, p3, p4, p5, p6, p7; |
|
h[8] = skein_ks_parity; |
|
|
|
for (int i = 0; i<8; i++) { |
|
h[i] = SKEIN_IV512_256[i]; |
|
h[8] ^= h[i];} |
|
|
|
t[0]=t12[0]; |
|
t[1]=t12[1]; |
|
t[2]=t12[2]; |
|
|
|
dt0=hash->h8[0]; |
|
dt1=hash->h8[1]; |
|
dt2=hash->h8[2]; |
|
dt3=hash->h8[3]; |
|
|
|
p0 = h[0] + dt0; |
|
p1 = h[1] + dt1; |
|
p2 = h[2] + dt2; |
|
p3 = h[3] + dt3; |
|
p4 = h[4]; |
|
p5 = h[5] + t[0]; |
|
p6 = h[6] + t[1]; |
|
p7 = h[7]; |
|
|
|
#pragma unroll |
|
for (int i = 1; i<19; i+=2) {Round_8_512(p0,p1,p2,p3,p4,p5,p6,p7,i);} |
|
p0 ^= dt0; |
|
p1 ^= dt1; |
|
p2 ^= dt2; |
|
p3 ^= dt3; |
|
|
|
h[0] = p0; |
|
h[1] = p1; |
|
h[2] = p2; |
|
h[3] = p3; |
|
h[4] = p4; |
|
h[5] = p5; |
|
h[6] = p6; |
|
h[7] = p7; |
|
h[8] = skein_ks_parity; |
|
|
|
for (int i = 0; i<8; i++) { h[8] ^= h[i]; } |
|
|
|
t[0] = t12[3]; |
|
t[1] = t12[4]; |
|
t[2] = t12[5]; |
|
p5 += t[0]; //p5 already equal h[5] |
|
p6 += t[1]; |
|
|
|
#pragma unroll |
|
for (int i = 1; i<19; i+=2) { Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i); } |
|
|
|
hash->h8[0] = p0; |
|
hash->h8[1] = p1; |
|
hash->h8[2] = p2; |
|
hash->h8[3] = p3; |
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
} |
|
|
|
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) |
|
__kernel void search4(__global hash_t* hashes, __global uint* output, const ulong target) |
|
{ |
|
// __local ulong T0[256], T1[256], T2[256], T3[256], T4[256], T5[256], T6[256], T7[256]; |
|
// uint u = get_local_id(0); |
|
/* |
|
for (uint u = get_local_id(0); u < 256; u += get_local_size(0)) { |
|
|
|
|
|
T0[u] = T0_G[u]; |
|
T1[u] = T1_G[u]; |
|
T2[u] = T2_G[u]; |
|
T3[u] = T3_G[u]; |
|
T4[u] = T4_G[u]; |
|
T5[u] = T5_G[u]; |
|
T6[u] = T6_G[u]; |
|
T7[u] = T7_G[u]; |
|
} |
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
T1[u] = SPH_ROTL64(T0[u], 8UL); |
|
T2[u] = SPH_ROTL64(T0[u], 16UL); |
|
T3[u] = SPH_ROTL64(T0[u], 24UL); |
|
T4[u] = SPH_ROTL64(T0[u], 32UL); |
|
T5[u] = SPH_ROTL64(T0[u], 40UL); |
|
T6[u] = SPH_ROTL64(T0[u], 48UL); |
|
T7[u] = SPH_ROTL64(T0[u], 56UL); |
|
|
|
*/ |
|
uint gid = get_global_id(0); |
|
|
|
__global hash_t *hash = &(hashes[gid - get_global_offset(0)]); |
|
|
|
|
|
|
|
__private ulong message[8], state[8]; |
|
__private ulong t[8]; |
|
|
|
|
|
for (int u = 0; u<4; u++) { message[u] = hash->h8[u]; } |
|
|
|
message[4] = 0x80UL; |
|
message[5] = 0UL; |
|
message[6] = 0UL; |
|
message[7] = 0x0100000000000000UL; |
|
|
|
|
|
for (int u = 0; u<8; u++) { state[u] = message[u]; } |
|
state[7] ^= 0x0001000000000000UL; |
|
|
|
|
|
|
|
for (int r = 0; r < 10; r ++) {ROUND_SMALL_P(state, r); } |
|
|
|
state[7] ^= 0x0001000000000000UL; |
|
|
|
|
|
for (int r = 0; r < 10; r ++) {ROUND_SMALL_Q(message, r); } |
|
|
|
|
|
for (int u = 0; u<8; u++) { state[u] ^= message[u]; } |
|
message[7] = state[7]; |
|
|
|
for (int r = 0; r < 9; r ++) {ROUND_SMALL_P(state, r); } |
|
uchar8 State; |
|
State.s0 =as_uchar8(state[7]^0x79).s0; |
|
State.s1 =as_uchar8(state[0]^0x09).s1; |
|
State.s2 =as_uchar8(state[1]^0x19).s2; |
|
State.s3 =as_uchar8(state[2]^0x29).s3; |
|
State.s4 =as_uchar8(state[3]^0x39).s4; |
|
State.s5 =as_uchar8(state[4]^0x49).s5; |
|
State.s6 =as_uchar8(state[5]^0x59).s6; |
|
State.s7 =as_uchar8(state[6]^0x69).s7; |
|
|
|
|
|
state[7] =T0_G[State.s0] |
|
^ R64(T0_G[State.s1], 8) |
|
^ R64(T0_G[State.s2], 16) |
|
^ R64(T0_G[State.s3], 24) |
|
^ T4_G[State.s4] |
|
^ R64(T4_G[State.s5], 8) |
|
^ R64(T4_G[State.s6], 16) |
|
^ R64(T4_G[State.s7], 24) ^message[7]; |
|
|
|
// t[7] ^= message[7]; |
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
bool result = ( state[7] <= target); |
|
if (result) { |
|
output[atomic_inc(output + 0xFF)] = SWAP4(gid); |
|
} |
|
|
|
} |
|
|
|
|
|
#endif // LYRA2RE_CL |