OpenCL GPU miner
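
This file is the device-side half of an OpenCL GPU miner. Each work-item takes one nonce candidate (its global ID), runs the block header through a fixed chain of four 256-bit hash functions (Fugue-256, SHAvite-3 256, Hamsi-256, Panama), and records the nonce in the output buffer whenever the final digest falls at or below the 64-bit target.

The host side is not part of this file. The sketch below is a minimal, hypothetical dispatch helper (the names enqueue_search, krnl, block_buf, output_buf, and nonce_base are illustrative assumptions, not symbols from this repository) showing how the three arguments of the search kernel defined further down would typically be bound and enqueued:

/* Hypothetical host-side dispatch; illustrative only, not part of the kernel. */
#include <CL/cl.h>

/* Enqueue `global` nonce candidates starting at `nonce_base`. `krnl` is the
 * `search` kernel built from this file; `block_buf` holds the 76 fixed header
 * bytes; `output_buf` is a uint buffer whose slot 0xFF counts found nonces. */
static cl_int enqueue_search(cl_command_queue queue, cl_kernel krnl,
                             cl_mem block_buf, cl_mem output_buf,
                             cl_ulong target, size_t nonce_base, size_t global)
{
    cl_int err;
    if ((err = clSetKernelArg(krnl, 0, sizeof(cl_mem), &block_buf)) != CL_SUCCESS ||
        (err = clSetKernelArg(krnl, 1, sizeof(cl_mem), &output_buf)) != CL_SUCCESS ||
        (err = clSetKernelArg(krnl, 2, sizeof(cl_ulong), &target)) != CL_SUCCESS)
        return err;
    size_t local = 256;  /* work-group size; must divide `global` */
    /* Using the global offset makes get_global_id(0) equal to the nonce value. */
    return clEnqueueNDRangeKernel(queue, krnl, 1, &nonce_base,
                                  &global, &local, 0, NULL, NULL);
}
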
#if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1
#else
#define SPH_BIG_ENDIAN 1
#endif
#define SPH_UPTR sph_u64
typedef unsigned int sph_u32;
typedef int sph_s32;
#ifndef __OPENCL_VERSION__
typedef unsigned long long sph_u64;
typedef long long sph_s64;
#else
typedef unsigned long sph_u64;
typedef long sph_s64;
#endif
#define SPH_64 1
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_HAMSI_EXPAND_SMALL 1
#include "fugue.cl"
#include "shavite.cl"
#include "hamsi.cl"
#include "panama.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
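// Byte-order reversal helpers: SWAP4(0x11223344) == 0x44332211, and SWAP8
// likewise reverses all eight bytes of a ulong.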
#if SPH_BIG_ENDIAN
#define DEC32BE(x) (*(const __global sph_u32 *) (x))
#else
#define DEC32BE(x) SWAP4(*(const __global sph_u32 *) (x))
#endif
#define sph_bswap32(x) SWAP4(x)
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
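// Note: CONST_EXP2 is the BMW-style expansion expression (rotation counts
// 5, 11, 27, 32, 37, 43, 53), but no BMW round code is included in this
// kernel, so the macro appears to be unused here.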
static void sph_enc32be(void *dst, sph_u32 val)
{
#if defined SPH_UPTR
#if SPH_UNALIGNED
#if SPH_LITTLE_ENDIAN
val = sph_bswap32(val);
#endif
*(sph_u32 *)dst = val;
#else
if (((SPH_UPTR)dst & 3) == 0) {
#if SPH_LITTLE_ENDIAN
val = sph_bswap32(val);
#endif
*(sph_u32 *)dst = val;
} else {
((unsigned char *)dst)[0] = (val >> 24);
((unsigned char *)dst)[1] = (val >> 16);
((unsigned char *)dst)[2] = (val >> 8);
((unsigned char *)dst)[3] = val;
}
#endif
#else
((unsigned char *)dst)[0] = (val >> 24);
((unsigned char *)dst)[1] = (val >> 16);
((unsigned char *)dst)[2] = (val >> 8);
((unsigned char *)dst)[3] = val;
#endif
}
static void sph_enc32le(void *dst, sph_u32 val)
{
#if defined SPH_UPTR
#if SPH_UNALIGNED
#if SPH_BIG_ENDIAN
val = sph_bswap32(val);
#endif
*(sph_u32 *)dst = val;
#else
if (((SPH_UPTR)dst & 3) == 0) {
#if SPH_BIG_ENDIAN
val = sph_bswap32(val);
#endif
*(sph_u32 *)dst = val;
} else {
((unsigned char *)dst)[0] = val;
((unsigned char *)dst)[1] = (val >> 8);
((unsigned char *)dst)[2] = (val >> 16);
((unsigned char *)dst)[3] = (val >> 24);
}
#endif
#else
((unsigned char *)dst)[0] = val;
((unsigned char *)dst)[1] = (val >> 8);
((unsigned char *)dst)[2] = (val >> 16);
((unsigned char *)dst)[3] = (val >> 24);
#endif
}
static sph_u32 sph_dec32le_aligned(const void *src)
{
#if SPH_LITTLE_ENDIAN
return *(const sph_u32 *)src;
#elif SPH_BIG_ENDIAN
return sph_bswap32(*(const sph_u32 *)src);
#else
return (sph_u32)(((const unsigned char *)src)[0])
| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
#endif
}
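// sph byte-order helpers: sph_enc32be/sph_enc32le store a 32-bit word in
// big-/little-endian order regardless of device endianness (e.g.
// sph_enc32be(dst, 0x01020304) always stores the bytes 0x01,0x02,0x03,0x04
// in that order), and sph_dec32le_aligned is the matching aligned
// little-endian load.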
__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
{
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
int init = get_local_id(0);
int step = get_local_size(0);
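// Cooperatively stage the four AES lookup tables (defined in the included
// .cl files) into __local memory: work-items stride by the work-group size
// so each table entry is copied exactly once per group.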
for (int i = init; i < 256; i += step)
{
AES0[i] = AES0_C[i];
AES1[i] = AES1_C[i];
AES2[i] = AES2_C[i];
AES3[i] = AES3_C[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
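// 64-byte inter-stage digest buffer. It is zeroed up front so that stages
// producing only a 32-byte digest leave bytes 32..63 as zero padding for
// the next stage, which reads the full 64 bytes.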
unsigned char hash[64];
for(unsigned j = 0; j < 64; j++)
hash[j] = 0;
sph_u32 gid = get_global_id(0);
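// Each work-item's global ID is its nonce candidate for this header.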
// fugue
{
sph_u32 S00 = 0, S01 = 0, S02 = 0, S03 = 0, S04 = 0, S05 = 0, S06 = 0, S07 = 0, S08 = 0, S09 = 0;
sph_u32 S10 = 0, S11 = 0, S12 = 0, S13 = 0, S14 = 0, S15 = 0, S16 = 0, S17 = 0, S18 = 0, S19 = 0;
sph_u32 S20 = 0, S21 = 0, S22 = IV256[0], S23 = IV256[1], S24 = IV256[2], S25 = IV256[3], S26 = IV256[4], S27 = IV256[5], S28 = IV256[6], S29 = IV256[7];
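// Fugue-256 state: 30 words, zero-initialized except the last eight, which
// carry the 256-bit IV (IV256 from fugue.cl).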
FUGUE256_5(DEC32BE(block + 0x0), DEC32BE(block + 0x4), DEC32BE(block + 0x8), DEC32BE(block + 0xc), DEC32BE(block + 0x10));
FUGUE256_5(DEC32BE(block + 0x14), DEC32BE(block + 0x18), DEC32BE(block + 0x1c), DEC32BE(block + 0x20), DEC32BE(block + 0x24));
FUGUE256_5(DEC32BE(block + 0x28), DEC32BE(block + 0x2c), DEC32BE(block + 0x30), DEC32BE(block + 0x34), DEC32BE(block + 0x38));
FUGUE256_4(DEC32BE(block + 0x3c), DEC32BE(block + 0x40), DEC32BE(block + 0x44), DEC32BE(block + 0x48));
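// The 76 fixed header bytes are absorbed above; TIX2(SWAP4(gid), ...) below
// injects the nonce as the 20th and final header word, and the subsequent
// TIX2(0, ...) and TIX2(0x280, ...) rounds absorb the padding: the 64-bit
// message length, 0x280 = 640 bits (80 bytes).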
TIX2(SWAP4(gid), S06, S07, S14, S16, S00);
CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20);
SMIX(S03, S04, S05, S06);
CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17);
SMIX(S00, S01, S02, S03);
TIX2(0, S00, S01, S08, S10, S24);
CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14);
SMIX(S27, S28, S29, S00);
CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11);
SMIX(S24, S25, S26, S27);
TIX2(0x280, S24, S25, S02, S04, S18);
CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08);
SMIX(S21, S22, S23, S24);
CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05);
SMIX(S18, S19, S20, S21);
CMIX30(S15, S16, S17, S19, S20, S21, S00, S01, S02);
SMIX(S15, S16, S17, S18);
CMIX30(S12, S13, S14, S16, S17, S18, S27, S28, S29);
SMIX(S12, S13, S14, S15);
CMIX30(S09, S10, S11, S13, S14, S15, S24, S25, S26);
SMIX(S09, S10, S11, S12);
CMIX30(S06, S07, S08, S10, S11, S12, S21, S22, S23);
SMIX(S06, S07, S08, S09);
CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20);
SMIX(S03, S04, S05, S06);
CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17);
SMIX(S00, S01, S02, S03);
CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14);
SMIX(S27, S28, S29, S00);
CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11);
SMIX(S24, S25, S26, S27);
CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08);
SMIX(S21, S22, S23, S24);
CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05);
SMIX(S18, S19, S20, S21);
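// The remainder of this block is fugue's fixed finalization schedule,
// run before the digest words are extracted.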
S22 ^= S18;
S03 ^= S18;
SMIX(S03, S04, S05, S06);
S07 ^= S03;
S19 ^= S03;
SMIX(S19, S20, S21, S22);
S23 ^= S19;
S04 ^= S19;
SMIX(S04, S05, S06, S07);
S08 ^= S04;
S20 ^= S04;
SMIX(S20, S21, S22, S23);
S24 ^= S20;
S05 ^= S20;
SMIX(S05, S06, S07, S08);
S09 ^= S05;
S21 ^= S05;
SMIX(S21, S22, S23, S24);
S25 ^= S21;
S06 ^= S21;
SMIX(S06, S07, S08, S09);
S10 ^= S06;
S22 ^= S06;
SMIX(S22, S23, S24, S25);
S26 ^= S22;
S07 ^= S22;
SMIX(S07, S08, S09, S10);
S11 ^= S07;
S23 ^= S07;
SMIX(S23, S24, S25, S26);
S27 ^= S23;
S08 ^= S23;
SMIX(S08, S09, S10, S11);
S12 ^= S08;
S24 ^= S08;
SMIX(S24, S25, S26, S27);
S28 ^= S24;
S09 ^= S24;
SMIX(S09, S10, S11, S12);
S13 ^= S09;
S25 ^= S09;
SMIX(S25, S26, S27, S28);
S29 ^= S25;
S10 ^= S25;
SMIX(S10, S11, S12, S13);
S14 ^= S10;
S26 ^= S10;
SMIX(S26, S27, S28, S29);
S00 ^= S26;
S11 ^= S26;
SMIX(S11, S12, S13, S14);
S15 ^= S11;
S27 ^= S11;
SMIX(S27, S28, S29, S00);
S01 ^= S27;
S12 ^= S27;
SMIX(S12, S13, S14, S15);
S16 ^= S12;
S28 ^= S12;
SMIX(S28, S29, S00, S01);
S02 ^= S28;
S13 ^= S28;
SMIX(S13, S14, S15, S16);
S17 ^= S13;
S29 ^= S13;
SMIX(S29, S00, S01, S02);
S03 ^= S29;
S14 ^= S29;
SMIX(S14, S15, S16, S17);
S18 ^= S14;
S00 ^= S14;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S15 ^= S00;
SMIX(S15, S16, S17, S18);
S19 ^= S15;
S01 ^= S15;
SMIX(S01, S02, S03, S04);
S05 ^= S01;
S16 ^= S01;
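// Emit the 256-bit fugue digest big-endian into hash[0..31]; hash[32..63]
// remains zero.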
sph_enc32be((unsigned char*) &hash + 0, S02);
sph_enc32be((unsigned char*) &hash + 4, S03);
sph_enc32be((unsigned char*) &hash + 8, S04);
sph_enc32be((unsigned char*) &hash + 12, S05);
sph_enc32be((unsigned char*) &hash + 16, S16);
sph_enc32be((unsigned char*) &hash + 20, S17);
sph_enc32be((unsigned char*) &hash + 24, S18);
sph_enc32be((unsigned char*) &hash + 28, S19);
}
// shavite
{
sph_u32 h[] = { SPH_C32(0x49BB3E47), SPH_C32(0x2674860D), SPH_C32(0xA8B392AC), SPH_C32(0x021AC4E6), SPH_C32(0x409283CF), SPH_C32(0x620E5D86), SPH_C32(0x6D929DCB), SPH_C32(0x96CC2A8B) };
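// h[] holds the SHAvite-3 256 initial chaining value; c256() (from
// shavite.cl) compresses one 512-bit block into it, with rk0..rkF holding
// the message words and count0/count1 the running bit counter.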
sph_u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7;
sph_u32 rk8, rk9, rkA, rkB, rkC, rkD, rkE, rkF;
sph_u32 count0, count1;
rk0 = sph_dec32le_aligned((const unsigned char *)&hash + 0);
rk1 = sph_dec32le_aligned((const unsigned char *)&hash + 4);
rk2 = sph_dec32le_aligned((const unsigned char *)&hash + 8);
rk3 = sph_dec32le_aligned((const unsigned char *)&hash + 12);
rk4 = sph_dec32le_aligned((const unsigned char *)&hash + 16);
rk5 = sph_dec32le_aligned((const unsigned char *)&hash + 20);
rk6 = sph_dec32le_aligned((const unsigned char *)&hash + 24);
rk7 = sph_dec32le_aligned((const unsigned char *)&hash + 28);
rk8 = sph_dec32le_aligned((const unsigned char *)&hash + 32);
rk9 = sph_dec32le_aligned((const unsigned char *)&hash + 36);
rkA = sph_dec32le_aligned((const unsigned char *)&hash + 40);
rkB = sph_dec32le_aligned((const unsigned char *)&hash + 44);
rkC = sph_dec32le_aligned((const unsigned char *)&hash + 48);
rkD = sph_dec32le_aligned((const unsigned char *)&hash + 52);
rkE = sph_dec32le_aligned((const unsigned char *)&hash + 56);
rkF = sph_dec32le_aligned((const unsigned char *)&hash + 60);
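// First compression: the whole 64-byte hash buffer (fugue digest plus zero
// padding) as one 512-bit block, with the bit counter set to 0x200 = 512.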
count0 = 0x200;
count1 = 0;
c256(buf);
rk0 = 0x80;
rk1 = 0;
rk2 = 0;
rk3 = 0;
rk4 = 0;
rk5 = 0;
rk6 = 0;
rk7 = 0;
rk8 = 0;
rk9 = 0;
rkA = 0;
rkB = 0;
rkC = 0;
rkD = 0x2000000;
rkE = 0;
rkF = 0x1000000;
count0 = 0;
count1 = 0;
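// Second compression: the finalization block, carrying the 0x80 terminator
// byte, the 512-bit message length, and the 256-bit digest size in its
// last words; the counter is zeroed for this message-free padding block.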
c256(buf);
for (unsigned u = 0; u < 8; u ++)
sph_enc32le((unsigned char *)&hash + (u << 2), h[u]);
}
// hamsi
{
sph_u32 c0 = HAMSI_IV256[0], c1 = HAMSI_IV256[1], c2 = HAMSI_IV256[2], c3 = HAMSI_IV256[3];
sph_u32 c4 = HAMSI_IV256[4], c5 = HAMSI_IV256[5], c6 = HAMSI_IV256[6], c7 = HAMSI_IV256[7];
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 h[8] = { c0, c1, c2, c3, c4, c5, c6, c7 };
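// Hamsi-256 absorbs the message four bytes at a time through the buf()
// macro: sixteen 32-bit blocks of hash data, one 0x80 terminator block,
// then the 64-bit message length as two big-endian words (high word 0,
// low word 0x200 = 512 bits); the final block uses PF_SMALL, the longer
// last-round permutation.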
#define buf(u) hash[i + u]
for(int i = 0; i < 64; i += 4) {
INPUT_SMALL;
P_SMALL;
T_SMALL;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_SMALL;
P_SMALL;
T_SMALL;
#undef buf
#define buf(u) 0
INPUT_SMALL;
P_SMALL;
T_SMALL;
#undef buf
#define buf(u) (u == 2 ? 2 : 0)
INPUT_SMALL;
PF_SMALL;
T_SMALL;
for (unsigned u = 0; u < 8; u ++)
sph_enc32be((unsigned char*) &hash + (u << 2), h[u]);
}
// panama
{
sph_u32 buffer[32][8];
sph_u32 state[17];
int i, j;
for(i = 0; i < 32; i++)
for(j = 0; j < 8; j++)
buffer[i][j] = 0;
for(i = 0; i < 17; i++)
state[i] = 0;
LVARS
unsigned ptr0 = 0;
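// Panama keeps a 17-word state plus a 32-stage x 8-word LFSR buffer. Two
// Push steps absorb the 64-byte hamsi output (32 bytes each), a third
// absorbs the padding block (a single 0x01 word), and the 32-iteration
// loop below performs the blank Pull rounds that churn the state before
// the digest is read from it.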
#define INW1(i) sph_dec32le_aligned((unsigned char*) &hash + 4 * (i))
#define INW2(i) INW1(i)
M17(RSTATE);
PANAMA_STEP;
#undef INW1
#undef INW2
#define INW1(i) sph_dec32le_aligned((unsigned char*) &hash + 32 + 4 * (i))
#define INW2(i) INW1(i)
PANAMA_STEP;
M17(WSTATE);
#undef INW1
#undef INW2
#define INW1(i) (sph_u32) (i == 0)
#define INW2(i) INW1(i)
M17(RSTATE);
PANAMA_STEP;
M17(WSTATE);
#undef INW1
#undef INW2
#define INW1(i) INW_H1(INC ## i)
#define INW_H1(i) INW_H2(i)
#define INW_H2(i) a ## i
#define INW2(i) buffer[ptr4][i]
M17(RSTATE);
for(i = 0; i < 32; i++) {
unsigned ptr4 = (ptr0 + 4) & 31;
PANAMA_STEP;
}
M17(WSTATE);
#undef INW1
#undef INW_H1
#undef INW_H2
#undef INW2
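// state[15] and state[16] are the last two words of the panama digest;
// combined as a little-endian 64-bit value they form the most significant
// bits of the hash, compared against the truncated target. Slot 0xFF of
// output serves as a result counter; the unsynchronized read-modify-write
// is tolerable only because valid nonces are rare.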
bool result = ((((sph_u64) state[16] << 32) | state[15]) <= target);
if (result)
output[output[0xFF]++] = SWAP4(gid);
}
}