#if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1
#else
#define SPH_BIG_ENDIAN 1
#endif

#define SPH_UPTR sph_u64

typedef unsigned int sph_u32;
typedef int sph_s32;
#ifndef __OPENCL_VERSION__
typedef unsigned long long sph_u64;
typedef long long sph_s64;
#else
typedef unsigned long sph_u64;
typedef long sph_s64;
#endif

#define SPH_64 1
#define SPH_64_TRUE 1

#define SPH_C32(x)    ((sph_u32)(x ## U))
#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n)   SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))

#define SPH_C64(x)    ((sph_u64)(x ## UL))
#define SPH_T64(x)    ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
#define SPH_ROTL64(x, n)   SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
#define SPH_ROTR64(x, n)   SPH_ROTL64(x, (64 - (n)))

#define SPH_HAMSI_EXPAND_SMALL 1

#include "fugue.cl"
#include "shavite.cl"
#include "hamsi.cl"
#include "panama.cl"

#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)

#if SPH_BIG_ENDIAN
    #define DEC32BE(x) (*(const __global sph_u32 *) (x))
#else
    #define DEC32BE(x) SWAP4(*(const __global sph_u32 *) (x))
#endif

#define sph_bswap32(x) SWAP4(x)

static void sph_enc32be(void *dst, sph_u32 val)
{
#if defined SPH_UPTR
#if SPH_UNALIGNED
#if SPH_LITTLE_ENDIAN
	val = sph_bswap32(val);
#endif
	*(sph_u32 *)dst = val;
#else
	if (((SPH_UPTR)dst & 3) == 0) {
#if SPH_LITTLE_ENDIAN
		val = sph_bswap32(val);
#endif
		*(sph_u32 *)dst = val;
	} else {
		((unsigned char *)dst)[0] = (val >> 24);
		((unsigned char *)dst)[1] = (val >> 16);
		((unsigned char *)dst)[2] = (val >> 8);
		((unsigned char *)dst)[3] = val;
	}
#endif
#else
	((unsigned char *)dst)[0] = (val >> 24);
	((unsigned char *)dst)[1] = (val >> 16);
	((unsigned char *)dst)[2] = (val >> 8);
	((unsigned char *)dst)[3] = val;
#endif
}

static void sph_enc32le(void *dst, sph_u32 val)
{
#if defined SPH_UPTR
#if SPH_UNALIGNED
#if SPH_BIG_ENDIAN
	val = sph_bswap32(val);
#endif
	*(sph_u32 *)dst = val;
#else
	if (((SPH_UPTR)dst & 3) == 0) {
#if SPH_BIG_ENDIAN
		val = sph_bswap32(val);
#endif
		*(sph_u32 *)dst = val;
	} else {
		((unsigned char *)dst)[0] = val;
		((unsigned char *)dst)[1] = (val >> 8);
		((unsigned char *)dst)[2] = (val >> 16);
		((unsigned char *)dst)[3] = (val >> 24);
	}
#endif
#else
	((unsigned char *)dst)[0] = val;
	((unsigned char *)dst)[1] = (val >> 8);
	((unsigned char *)dst)[2] = (val >> 16);
	((unsigned char *)dst)[3] = (val >> 24);
#endif
}

static sph_u32 sph_dec32le_aligned(const void *src)
{
#if SPH_LITTLE_ENDIAN
	return *(const sph_u32 *)src;
#elif SPH_BIG_ENDIAN
	return sph_bswap32(*(const sph_u32 *)src);
#else
	return (sph_u32)(((const unsigned char *)src)[0])
		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
#endif
}


__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
{
    __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
    int init = get_local_id(0);
    int step = get_local_size(0);
    for (int i = init; i < 256; i += step)
    {
        AES0[i] = AES0_C[i];
        AES1[i] = AES1_C[i];
        AES2[i] = AES2_C[i];
        AES3[i] = AES3_C[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    unsigned char hash[64];
    for(unsigned j = 0; j < 64; j++)
        hash[j] = 0;
 
    sph_u32 gid = get_global_id(0);

    // fugue
    {
        sph_u32 S00 = 0, S01 = 0, S02 = 0, S03 = 0, S04 = 0, S05 = 0, S06 = 0, S07 = 0, S08 = 0, S09 = 0; \
        sph_u32 S10 = 0, S11 = 0, S12 = 0, S13 = 0, S14 = 0, S15 = 0, S16 = 0, S17 = 0, S18 = 0, S19 = 0; \
        sph_u32 S20 = 0, S21 = 0, S22 = IV256[0], S23 = IV256[1], S24 = IV256[2], S25 = IV256[3], S26 = IV256[4], S27 = IV256[5], S28 = IV256[6], S29 = IV256[7];
    
        FUGUE256_5(DEC32BE(block + 0x0), DEC32BE(block + 0x4), DEC32BE(block + 0x8), DEC32BE(block + 0xc), DEC32BE(block + 0x10));
        FUGUE256_5(DEC32BE(block + 0x14), DEC32BE(block + 0x18), DEC32BE(block + 0x1c), DEC32BE(block + 0x20), DEC32BE(block + 0x24));
        FUGUE256_5(DEC32BE(block + 0x28), DEC32BE(block + 0x2c), DEC32BE(block + 0x30), DEC32BE(block + 0x34), DEC32BE(block + 0x38));
        FUGUE256_4(DEC32BE(block + 0x3c), DEC32BE(block + 0x40), DEC32BE(block + 0x44), DEC32BE(block + 0x48));
    
        TIX2(SWAP4(gid), S06, S07, S14, S16, S00);
        CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20);
        SMIX(S03, S04, S05, S06);
        CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17);
        SMIX(S00, S01, S02, S03);
    
        TIX2(0, S00, S01, S08, S10, S24);
        CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14);
        SMIX(S27, S28, S29, S00);
        CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11);
        SMIX(S24, S25, S26, S27);
    
        TIX2(0x280, S24, S25, S02, S04, S18);
        CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08);
        SMIX(S21, S22, S23, S24);
        CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05);
        SMIX(S18, S19, S20, S21);
    
        CMIX30(S15, S16, S17, S19, S20, S21, S00, S01, S02);
        SMIX(S15, S16, S17, S18);
        CMIX30(S12, S13, S14, S16, S17, S18, S27, S28, S29);
        SMIX(S12, S13, S14, S15);
        CMIX30(S09, S10, S11, S13, S14, S15, S24, S25, S26);
        SMIX(S09, S10, S11, S12);
        CMIX30(S06, S07, S08, S10, S11, S12, S21, S22, S23);
        SMIX(S06, S07, S08, S09);
        CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20);
        SMIX(S03, S04, S05, S06);
        CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17);
        SMIX(S00, S01, S02, S03);
        CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14);
        SMIX(S27, S28, S29, S00);
        CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11);
        SMIX(S24, S25, S26, S27);
        CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08);
        SMIX(S21, S22, S23, S24);
        CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05);
        SMIX(S18, S19, S20, S21);
        S22 ^= S18;
        S03 ^= S18;
        SMIX(S03, S04, S05, S06);
        S07 ^= S03;
        S19 ^= S03;
        SMIX(S19, S20, S21, S22);
        S23 ^= S19;
        S04 ^= S19;
        SMIX(S04, S05, S06, S07);
        S08 ^= S04;
        S20 ^= S04;
        SMIX(S20, S21, S22, S23);
        S24 ^= S20;
        S05 ^= S20;
        SMIX(S05, S06, S07, S08);
        S09 ^= S05;
        S21 ^= S05;
        SMIX(S21, S22, S23, S24);
        S25 ^= S21;
        S06 ^= S21;
        SMIX(S06, S07, S08, S09);
        S10 ^= S06;
        S22 ^= S06;
        SMIX(S22, S23, S24, S25);
        S26 ^= S22;
        S07 ^= S22;
        SMIX(S07, S08, S09, S10);
        S11 ^= S07;
        S23 ^= S07;
        SMIX(S23, S24, S25, S26);
        S27 ^= S23;
        S08 ^= S23;
        SMIX(S08, S09, S10, S11);
        S12 ^= S08;
        S24 ^= S08;
        SMIX(S24, S25, S26, S27);
        S28 ^= S24;
        S09 ^= S24;
        SMIX(S09, S10, S11, S12);
        S13 ^= S09;
        S25 ^= S09;
        SMIX(S25, S26, S27, S28);
        S29 ^= S25;
        S10 ^= S25;
        SMIX(S10, S11, S12, S13);
        S14 ^= S10;
        S26 ^= S10;
        SMIX(S26, S27, S28, S29);
        S00 ^= S26;
        S11 ^= S26;
        SMIX(S11, S12, S13, S14);
        S15 ^= S11;
        S27 ^= S11;
        SMIX(S27, S28, S29, S00);
        S01 ^= S27;
        S12 ^= S27;
        SMIX(S12, S13, S14, S15);
        S16 ^= S12;
        S28 ^= S12;
        SMIX(S28, S29, S00, S01);
        S02 ^= S28;
        S13 ^= S28;
        SMIX(S13, S14, S15, S16);
        S17 ^= S13;
        S29 ^= S13;
        SMIX(S29, S00, S01, S02);
        S03 ^= S29;
        S14 ^= S29;
        SMIX(S14, S15, S16, S17);
        S18 ^= S14;
        S00 ^= S14;
        SMIX(S00, S01, S02, S03);
        S04 ^= S00;
        S15 ^= S00;
        SMIX(S15, S16, S17, S18);
        S19 ^= S15;
        S01 ^= S15;
        SMIX(S01, S02, S03, S04);
    
        S05 ^= S01;
        S16 ^= S01;
    
        sph_enc32be((unsigned char*) &hash +  0, S02);
        sph_enc32be((unsigned char*) &hash +  4, S03);
        sph_enc32be((unsigned char*) &hash +  8, S04);
        sph_enc32be((unsigned char*) &hash + 12, S05);
        sph_enc32be((unsigned char*) &hash + 16, S16);
        sph_enc32be((unsigned char*) &hash + 20, S17);
        sph_enc32be((unsigned char*) &hash + 24, S18);
        sph_enc32be((unsigned char*) &hash + 28, S19);
    }

    // shavite
    {
	sph_u32 h[] = { SPH_C32(0x49BB3E47), SPH_C32(0x2674860D), SPH_C32(0xA8B392AC), SPH_C32(0x021AC4E6), SPH_C32(0x409283CF), SPH_C32(0x620E5D86), SPH_C32(0x6D929DCB), SPH_C32(0x96CC2A8B) };
	sph_u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7;
	sph_u32 rk8, rk9, rkA, rkB, rkC, rkD, rkE, rkF;
	sph_u32 count0, count1;

	rk0 = sph_dec32le_aligned((const unsigned char *)&hash +  0);
	rk1 = sph_dec32le_aligned((const unsigned char *)&hash +  4);
	rk2 = sph_dec32le_aligned((const unsigned char *)&hash +  8);
	rk3 = sph_dec32le_aligned((const unsigned char *)&hash + 12);
	rk4 = sph_dec32le_aligned((const unsigned char *)&hash + 16);
	rk5 = sph_dec32le_aligned((const unsigned char *)&hash + 20);
	rk6 = sph_dec32le_aligned((const unsigned char *)&hash + 24);
	rk7 = sph_dec32le_aligned((const unsigned char *)&hash + 28);
	rk8 = sph_dec32le_aligned((const unsigned char *)&hash + 32);
	rk9 = sph_dec32le_aligned((const unsigned char *)&hash + 36);
	rkA = sph_dec32le_aligned((const unsigned char *)&hash + 40);
	rkB = sph_dec32le_aligned((const unsigned char *)&hash + 44);
	rkC = sph_dec32le_aligned((const unsigned char *)&hash + 48);
	rkD = sph_dec32le_aligned((const unsigned char *)&hash + 52);
	rkE = sph_dec32le_aligned((const unsigned char *)&hash + 56);
	rkF = sph_dec32le_aligned((const unsigned char *)&hash + 60);
	count0 = 0x200;
	count1 = 0;
	c256(buf);

	rk0 = 0x80;
	rk1 = 0;
	rk2 = 0;
	rk3 = 0;
	rk4 = 0;
	rk5 = 0;
	rk6 = 0;
	rk7 = 0;
	rk8 = 0;
	rk9 = 0;
	rkA = 0;
	rkB = 0;
	rkC = 0;
	rkD = 0x2000000;
	rkE = 0;
	rkF = 0x1000000;
	count0 = 0;
	count1 = 0;
	c256(buf);

	for (unsigned u = 0; u < 8; u ++)
		sph_enc32le((unsigned char *)&hash + (u << 2), h[u]);
    }

    // hamsi
    {
        sph_u32 c0 = HAMSI_IV256[0], c1 = HAMSI_IV256[1], c2 = HAMSI_IV256[2], c3 = HAMSI_IV256[3];
        sph_u32 c4 = HAMSI_IV256[4], c5 = HAMSI_IV256[5], c6 = HAMSI_IV256[6], c7 = HAMSI_IV256[7];
        sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
        sph_u32 h[8] = { c0, c1, c2, c3, c4, c5, c6, c7 };

#define buf(u) hash[i + u]
        for(int i = 0; i < 64; i += 4) {
            INPUT_SMALL;
            P_SMALL;
            T_SMALL;
        }
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
        INPUT_SMALL;
        P_SMALL;
        T_SMALL;
#undef buf
#define buf(u) 0
        INPUT_SMALL;
        P_SMALL;
        T_SMALL;
#undef buf
#define buf(u) (u == 2 ? 2 : 0)
        INPUT_SMALL;
        PF_SMALL;
        T_SMALL;

        for (unsigned u = 0; u < 8; u ++)
            sph_enc32be((unsigned char*) &hash + (u << 2), h[u]);
    }

    // panama
    {
        sph_u32 buffer[32][8];
        sph_u32 state[17];
        int i, j;
        for(i = 0; i < 32; i++)
                for(j = 0; j < 8; j++)
                        buffer[i][j] = 0;
        for(i = 0; i < 17; i++)
                state[i] = 0;

        LVARS
        unsigned ptr0 = 0;
#define INW1(i)   sph_dec32le_aligned((unsigned char*) &hash + 4 * (i))
#define INW2(i)   INW1(i)

        M17(RSTATE);
        PANAMA_STEP;

#undef INW1
#undef INW2
#define INW1(i)   sph_dec32le_aligned((unsigned char*) &hash + 32 + 4 * (i))
#define INW2(i)   INW1(i)
        PANAMA_STEP;
        M17(WSTATE);

#undef INW1
#undef INW2

#define INW1(i)   (sph_u32) (i == 0)
#define INW2(i)   INW1(i)

        M17(RSTATE);
        PANAMA_STEP;
        M17(WSTATE);

#undef INW1
#undef INW2

#define INW1(i)     INW_H1(INC ## i)
#define INW_H1(i)   INW_H2(i)
#define INW_H2(i)   a ## i
#define INW2(i)     buffer[ptr4][i]

        M17(RSTATE);
        for(i = 0; i < 32; i++) {
                unsigned ptr4 = (ptr0 + 4) & 31;
                PANAMA_STEP;
        }
        M17(WSTATE);

#undef INW1
#undef INW_H1
#undef INW_H2
#undef INW2

        bool result = ((((sph_u64) state[16] << 32) | state[15]) <= target);
        if (result)
            output[output[0xFF]++] = SWAP4(gid);
    }
}