typedef unsigned int    uint_t;             /* native unsigned integer */

#define SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */

#define SKEIN_256_STATE_WORDS ( 4)
#define SKEIN_512_STATE_WORDS ( 8)
#define SKEIN1024_STATE_WORDS (16)

#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)

#define SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)

#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)

#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((uint64_t) (hi32)) << 32))
#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)

#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */

#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */

#define SKEIN_T1_FLAG_FIRST     (((uint64_t)  1 ) << SKEIN_T1_POS_FIRST)
#define SKEIN_T1_FLAG_BIT_PAD   (((uint64_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
#define SKEIN_T1_FLAG_FINAL     (((uint64_t)  1 ) << SKEIN_T1_POS_FINAL)

#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */

#define SKEIN_T1_BLK_TYPE(T)   (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)

#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */

#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)

#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}

#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)

#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \
  Skein_Set_T0(ctxPtr,(T0)); \
  Skein_Set_T1(ctxPtr,(T1)); }

#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }

#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }

#define KW_TWK_BASE     (0)
#define KW_KEY_BASE     (3)
#define ks              (kw + KW_KEY_BASE)
#define ts              (kw + KW_TWK_BASE)

#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \
	X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \
	X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \
	X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \
	X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6;

#define I512(R) \
	X0   += ks[((R)+1) % 9]; \
	X1   += ks[((R)+2) % 9]; \
	X2   += ks[((R)+3) % 9]; \
	X3   += ks[((R)+4) % 9]; \
	X4   += ks[((R)+5) % 9]; \
	X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \
	X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \
	X7   += ks[((R)+8) % 9] + (R)+1;


#define R512_8_rounds(R) \
	R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \
	R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \
	R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \
	R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \
	I512(2*(R)); \
	R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \
	R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \
	R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \
	R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \
	I512(2*(R)+1);

typedef struct
{
  size_t  hashBitLen;
  size_t  bCnt;
  uint64_t  T[SKEIN_MODIFIER_WORDS];
} Skein_Ctxt_Hdr_t;

typedef struct {
  Skein_Ctxt_Hdr_t h;
  uint64_t  X[SKEIN_256_STATE_WORDS];
  uint8_t  b[SKEIN_256_BLOCK_BYTES];
} Skein_256_Ctxt_t;

typedef struct {
  Skein_Ctxt_Hdr_t h;
  uint64_t  X[SKEIN_512_STATE_WORDS];
  uint8_t  b[SKEIN_512_BLOCK_BYTES];
} Skein_512_Ctxt_t;

typedef struct {
  uint_t  statebits;
  union {
	Skein_Ctxt_Hdr_t h;
	Skein_256_Ctxt_t ctx_256;
	Skein_512_Ctxt_t ctx_512;
  } u;
} skeinHashState;

__device__
void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
{
	const uint64_t SKEIN_512_IV_256[] =
	{
		SKEIN_MK_64(0xCCD044A1,0x2FDB3E13),
		SKEIN_MK_64(0xE8359030,0x1A79A9EB),
		SKEIN_MK_64(0x55AEA061,0x4F816E6F),
		SKEIN_MK_64(0x2A2767A4,0xAE9B94DB),
		SKEIN_MK_64(0xEC06025E,0x74DD7683),
		SKEIN_MK_64(0xE7A436CD,0xC4746251),
		SKEIN_MK_64(0xC36FBAF9,0x393AD185),
		SKEIN_MK_64(0x3EEDBA18,0x33EDFC13)
	};

	Skein_512_Ctxt_t *ctx = &state->u.ctx_512;

	ctx->h.hashBitLen = hashBitLen;

	memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X));

	Skein_Start_New_Type(ctx, MSG);
}

__device__
void cn_skein_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
{
	enum {
		R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
		R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
		R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
		R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
		R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
		R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
		R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
		R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22
	};
	uint64_t X0,X1,X2,X3,X4,X5,X6,X7;
	uint64_t w[SKEIN_512_STATE_WORDS];
	uint64_t kw[SKEIN_512_STATE_WORDS+4];

	ts[0] = ctx->h.T[0];
	ts[1] = ctx->h.T[1];

	do  {

		ts[0] += byteCntAdd;

		ks[0] = ctx->X[0];
		ks[1] = ctx->X[1];
		ks[2] = ctx->X[2];
		ks[3] = ctx->X[3];
		ks[4] = ctx->X[4];
		ks[5] = ctx->X[5];
		ks[6] = ctx->X[6];
		ks[7] = ctx->X[7];
		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
		ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;

		ts[2] = ts[0] ^ ts[1];

		memcpy(w, blkPtr, SKEIN_512_STATE_WORDS << 3);

		X0 = w[0] + ks[0];
		X1 = w[1] + ks[1];
		X2 = w[2] + ks[2];
		X3 = w[3] + ks[3];
		X4 = w[4] + ks[4];
		X5 = w[5] + ks[5] + ts[0];
		X6 = w[6] + ks[6] + ts[1];
		X7 = w[7] + ks[7];

		blkPtr += SKEIN_512_BLOCK_BYTES;

		R512_8_rounds( 0);
		R512_8_rounds( 1);
		R512_8_rounds( 2);
		R512_8_rounds( 3);
		R512_8_rounds( 4);
		R512_8_rounds( 5);
		R512_8_rounds( 6);
		R512_8_rounds( 7);
		R512_8_rounds( 8);

		ctx->X[0] = X0 ^ w[0];
		ctx->X[1] = X1 ^ w[1];
		ctx->X[2] = X2 ^ w[2];
		ctx->X[3] = X3 ^ w[3];
		ctx->X[4] = X4 ^ w[4];
		ctx->X[5] = X5 ^ w[5];
		ctx->X[6] = X6 ^ w[6];
		ctx->X[7] = X7 ^ w[7];

		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
	} while (--blkCnt);

	ctx->h.T[0] = ts[0];
	ctx->h.T[1] = ts[1];
}

__device__
void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
{
	size_t n;

	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
	{
		if (ctx->h.bCnt) {

			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;

			if (n) {

				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
				msgByteCnt  -= n;
				msg         += n;
				ctx->h.bCnt += n;
			}

			cn_skein_processblock(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES);
			ctx->h.bCnt = 0;
		}

		if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {

			n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES;
			cn_skein_processblock(ctx, msg, n, SKEIN_512_BLOCK_BYTES);
			msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
			msg        += n * SKEIN_512_BLOCK_BYTES;
		}
	}

	if (msgByteCnt) {

		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
		ctx->h.bCnt += msgByteCnt;
	}
}

__device__
void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
{
	if ((databitlen & 7) == 0) {

		cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
	}
	else {

		size_t bCnt = (databitlen >> 3) + 1;
		uint8_t b,mask;

		mask = (uint8_t) (1u << (7 - (databitlen & 7)));
		b    = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask);

		cn_skein_block(&state->u.ctx_512, data, bCnt - 1);
		cn_skein_block(&state->u.ctx_512, &b, 1);

		Skein_Set_Bit_Pad_Flag(state->u.h);
	}
}

__device__
void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
{
	uint64_t X[SKEIN_512_STATE_WORDS];
	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
	const int byteCnt = (ctx->h.hashBitLen + 7) >> 3;

	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;

	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
	{
		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
	}

	cn_skein_processblock(ctx, ctx->b, 1, ctx->h.bCnt);

	memset(ctx->b, 0, sizeof(ctx->b));
	memcpy(X, ctx->X, sizeof(X));

	for (int i = 0; i*SKEIN_512_BLOCK_BYTES < byteCnt; i++)
	{
		int n = byteCnt - i*SKEIN_512_BLOCK_BYTES;
		if (n > SKEIN_512_BLOCK_BYTES) n = SKEIN_512_BLOCK_BYTES;
		((uint64_t *)ctx->b)[0] = (uint64_t)i;
		Skein_Start_New_Type(ctx, OUT_FINAL);
		cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
		memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
	}
}

__device__
void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
{
	int hashbitlen = 256;
	DataLength databitlen = len << 3;
	skeinHashState state;

	state.statebits = 64*SKEIN_512_STATE_WORDS;

	cn_skein256_init(&state, hashbitlen);
	cn_skein256_update(&state, data, databitlen);
	cn_skein256_final(&state, hashval);
}