typedef unsigned int uint_t; /* native unsigned integer */ #define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ #define SKEIN_256_STATE_WORDS ( 4) #define SKEIN_512_STATE_WORDS ( 8) #define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) #define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) #define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) #define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) #define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) #define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) #define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) #define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ #define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ #define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ #define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ #define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ #define SKEIN_T1_FLAG_FIRST (((uint64_t) 1 ) << SKEIN_T1_POS_FIRST) #define SKEIN_T1_FLAG_BIT_PAD (((uint64_t) 1 ) << SKEIN_T1_POS_BIT_PAD) #define SKEIN_T1_FLAG_FINAL (((uint64_t) 1 ) << SKEIN_T1_POS_FINAL) #define SKEIN_BLK_TYPE_MSG (48) /* message processing */ #define SKEIN_BLK_TYPE_OUT (63) /* output stage */ #define SKEIN_T1_BLK_TYPE(T) (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) #define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ #define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ #define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) #define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} #define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) #define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) #define Skein_Set_T0_T1(ctxPtr,T0,T1) { \ Skein_Set_T0(ctxPtr,(T0)); \ Skein_Set_T1(ctxPtr,(T1)); } #define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } #define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } #define KW_TWK_BASE (0) #define KW_KEY_BASE (3) #define ks (kw + KW_KEY_BASE) #define ts (kw + KW_TWK_BASE) #define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \ X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \ X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \ X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \ X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6; #define I512(R) \ X0 += ks[((R)+1) % 9]; \ X1 += ks[((R)+2) % 9]; \ X2 += ks[((R)+3) % 9]; \ X3 += ks[((R)+4) % 9]; \ X4 += ks[((R)+5) % 9]; \ X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ X7 += ks[((R)+8) % 9] + (R)+1; #define R512_8_rounds(R) \ R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ I512(2*(R)); \ R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ I512(2*(R)+1); typedef struct { size_t hashBitLen; size_t bCnt; uint64_t T[SKEIN_MODIFIER_WORDS]; } Skein_Ctxt_Hdr_t; typedef struct { Skein_Ctxt_Hdr_t h; uint64_t X[SKEIN_256_STATE_WORDS]; uint8_t b[SKEIN_256_BLOCK_BYTES]; } Skein_256_Ctxt_t; typedef struct { Skein_Ctxt_Hdr_t h; uint64_t X[SKEIN_512_STATE_WORDS]; uint8_t b[SKEIN_512_BLOCK_BYTES]; } Skein_512_Ctxt_t; typedef struct { uint_t statebits; union { Skein_Ctxt_Hdr_t h; Skein_256_Ctxt_t ctx_256; Skein_512_Ctxt_t ctx_512; } u; } skeinHashState; __device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen) { const uint64_t SKEIN_512_IV_256[] = { SKEIN_MK_64(0xCCD044A1,0x2FDB3E13), SKEIN_MK_64(0xE8359030,0x1A79A9EB), SKEIN_MK_64(0x55AEA061,0x4F816E6F), SKEIN_MK_64(0x2A2767A4,0xAE9B94DB), SKEIN_MK_64(0xEC06025E,0x74DD7683), SKEIN_MK_64(0xE7A436CD,0xC4746251), SKEIN_MK_64(0xC36FBAF9,0x393AD185), SKEIN_MK_64(0x3EEDBA18,0x33EDFC13) }; Skein_512_Ctxt_t *ctx = &state->u.ctx_512; ctx->h.hashBitLen = hashBitLen; memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X)); Skein_Start_New_Type(ctx, MSG); } __device__ void cn_skein_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) { enum { R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22 }; uint64_t X0,X1,X2,X3,X4,X5,X6,X7; uint64_t w[SKEIN_512_STATE_WORDS]; uint64_t kw[SKEIN_512_STATE_WORDS+4]; ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; do { ts[0] += byteCntAdd; ks[0] = ctx->X[0]; ks[1] = ctx->X[1]; ks[2] = ctx->X[2]; ks[3] = ctx->X[3]; ks[4] = ctx->X[4]; ks[5] = ctx->X[5]; ks[6] = ctx->X[6]; ks[7] = ctx->X[7]; ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; ts[2] = ts[0] ^ ts[1]; memcpy(w, blkPtr, SKEIN_512_STATE_WORDS << 3); X0 = w[0] + ks[0]; X1 = w[1] + ks[1]; X2 = w[2] + ks[2]; X3 = w[3] + ks[3]; X4 = w[4] + ks[4]; X5 = w[5] + ks[5] + ts[0]; X6 = w[6] + ks[6] + ts[1]; X7 = w[7] + ks[7]; blkPtr += SKEIN_512_BLOCK_BYTES; R512_8_rounds( 0); R512_8_rounds( 1); R512_8_rounds( 2); R512_8_rounds( 3); R512_8_rounds( 4); R512_8_rounds( 5); R512_8_rounds( 6); R512_8_rounds( 7); R512_8_rounds( 8); ctx->X[0] = X0 ^ w[0]; ctx->X[1] = X1 ^ w[1]; ctx->X[2] = X2 ^ w[2]; ctx->X[3] = X3 ^ w[3]; ctx->X[4] = X4 ^ w[4]; ctx->X[5] = X5 ^ w[5]; ctx->X[6] = X6 ^ w[6]; ctx->X[7] = X7 ^ w[7]; ts[1] &= ~SKEIN_T1_FLAG_FIRST; } while (--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } __device__ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt) { size_t n; if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) { if (ctx->h.bCnt) { n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; if (n) { memcpy(&ctx->b[ctx->h.bCnt],msg,n); msgByteCnt -= n; msg += n; ctx->h.bCnt += n; } cn_skein_processblock(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES); ctx->h.bCnt = 0; } if (msgByteCnt > SKEIN_512_BLOCK_BYTES) { n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; cn_skein_processblock(ctx, msg, n, SKEIN_512_BLOCK_BYTES); msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; msg += n * SKEIN_512_BLOCK_BYTES; } } if (msgByteCnt) { memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); ctx->h.bCnt += msgByteCnt; } } __device__ void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen) { if ((databitlen & 7) == 0) { cn_skein_block(&state->u.ctx_512, data, databitlen >> 3); } else { size_t bCnt = (databitlen >> 3) + 1; uint8_t b,mask; mask = (uint8_t) (1u << (7 - (databitlen & 7))); b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask); cn_skein_block(&state->u.ctx_512, data, bCnt - 1); cn_skein_block(&state->u.ctx_512, &b, 1); Skein_Set_Bit_Pad_Flag(state->u.h); } } __device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) { uint64_t X[SKEIN_512_STATE_WORDS]; Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; const int byteCnt = (ctx->h.hashBitLen + 7) >> 3; ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) { memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); } cn_skein_processblock(ctx, ctx->b, 1, ctx->h.bCnt); memset(ctx->b, 0, sizeof(ctx->b)); memcpy(X, ctx->X, sizeof(X)); for (int i = 0; i*SKEIN_512_BLOCK_BYTES < byteCnt; i++) { int n = byteCnt - i*SKEIN_512_BLOCK_BYTES; if (n > SKEIN_512_BLOCK_BYTES) n = SKEIN_512_BLOCK_BYTES; ((uint64_t *)ctx->b)[0] = (uint64_t)i; Skein_Start_New_Type(ctx, OUT_FINAL); cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t)); memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n); memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time } } __device__ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) { int hashbitlen = 256; DataLength databitlen = len << 3; skeinHashState state; state.statebits = 64*SKEIN_512_STATE_WORDS; cn_skein_init(&state, hashbitlen); cn_skein_update(&state, data, databitlen); cn_skein_final(&state, (uint8_t*) hashval); }