ccminer/x11/simd_functions.cu

__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[1];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[0];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[3];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[2];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[5];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[4];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[7];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[6];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[7];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[4];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[5];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[2];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[3];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[0];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[2];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[3];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[0];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[1];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[6];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[7];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[4];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[3];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[2];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[1];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[0];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[7];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[6];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[5];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[5];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[4];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[7];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[6];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[1];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[0];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[3];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[7];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[6];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[5];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[4];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[3];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[2];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[1];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[4];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[5];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[6];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[7];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[0];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[1];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[2];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[1];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[0];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[3];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[2];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[5];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[4];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[7];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[6];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[7];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[4];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[5];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[2];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[3];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[0];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[2];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[3];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[0];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[1];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[6];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[7];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[4];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[3];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[2];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[1];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[0];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[7];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[6];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[5];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[5];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[4];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[7];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[6];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[1];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[0];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[3];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[7];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[6];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[5];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[4];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[3];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[2];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[1];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[4];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[5];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[6];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[7];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[0];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[1];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[2];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[1];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[0];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[3];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[2];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[5];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[4];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[7];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[6];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[7];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[4];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[5];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[2];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[3];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[0];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[2];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[3];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[0];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[1];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[6];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[7];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[4];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[3];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[2];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[1];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[0];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[7];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[6];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[5];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[5];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[4];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[7];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[6];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[1];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[0];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[3];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[7];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[6];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[5];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[4];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[3];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[2];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[1];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[4];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[5];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[6];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[7];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[0];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[1];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[2];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[1];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[0];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[3];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[2];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[5];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[4];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[7];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[6];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[7];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[4];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[5];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[2];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[3];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[0];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[2];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[3];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[0];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[1];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[6];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[7];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[4];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[3];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[2];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[1];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[0];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[7];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[6];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[5];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[5];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[4];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[7];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[6];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[1];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[0];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[3];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[7];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[6];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[5];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[4];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[3];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[2];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[1];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[4];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[5];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[6];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[7];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[0];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[1];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[2];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[1];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[0];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[3];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[2];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[5];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[4];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[7];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}
__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[6];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[7];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[4];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[5];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[2];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[3];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[0];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}

__device__ __forceinline__
void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[2];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[3];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[0];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[1];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[6];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[7];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[4];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}

__device__ __forceinline__
void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[3];
	temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[2];
	temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[1];
	temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[0];
	temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[7];
	temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[6];
	temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[5];
	temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}

__device__ __forceinline__
void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[5];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[4];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[7];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[6];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[1];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[0];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[3];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}

__device__ __forceinline__
void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[7];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[6];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[5];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[4];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[3];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[2];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[1];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}

__device__ __forceinline__
void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[4];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[5];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[6];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[7];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[0];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[1];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[2];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}

__device__ __forceinline__
void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
	int j;
	uint32_t temp;
	uint32_t R[8];
#pragma unroll 8
	for(j=0; j<8; j++) {
		R[j] = ROTL32(A[j], r);
	}
	temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
	D[0] = ROTL32(temp, s) + R[1];
	temp = D[1] + w[1] + IF(A[1], B[1], C[1]);
	D[1] = ROTL32(temp, s) + R[0];
	temp = D[2] + w[2] + IF(A[2], B[2], C[2]);
	D[2] = ROTL32(temp, s) + R[3];
	temp = D[3] + w[3] + IF(A[3], B[3], C[3]);
	D[3] = ROTL32(temp, s) + R[2];
	temp = D[4] + w[4] + IF(A[4], B[4], C[4]);
	D[4] = ROTL32(temp, s) + R[5];
	temp = D[5] + w[5] + IF(A[5], B[5], C[5]);
	D[5] = ROTL32(temp, s) + R[4];
	temp = D[6] + w[6] + IF(A[6], B[6], C[6]);
	D[6] = ROTL32(temp, s) + R[7];
	temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
	D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
	for(j=0; j<8; j++) {
		A[j] = R[j];
	}
}

__device__ __constant__
static const uint32_t d_cw0[8][8] = {
	0x531B1720, 	0xAC2CDE09, 	0x0B902D87, 	0x2369B1F4, 	0x2931AA01, 	0x02E4B082, 	0xC914C914, 	0xC1DAE1A6, 
	0xF18C2B5C, 	0x08AC306B, 	0x27BFC914, 	0xCEDC548D, 	0xC630C4BE, 	0xF18C4335, 	0xF0D3427C, 	0xBE3DA380, 
	0x143C02E4, 	0xA948C630, 	0xA4F2DE09, 	0xA71D2085, 	0xA439BD84, 	0x109FCD6A, 	0xEEA8EF61, 	0xA5AB1CE8, 
	0x0B90D4A4, 	0x3D6D039D, 	0x25944D53, 	0xBAA0E034, 	0x5BC71E5A, 	0xB1F4F2FE, 	0x12CADE09, 	0x548D41C3, 
	0x3CB4F80D, 	0x36ECEBC4, 	0xA66443EE, 	0x43351ABD, 	0xC7A20C49, 	0xEB0BB366, 	0xF5293F98, 	0x49B6DE09, 
	0x531B29EA, 	0x02E402E4, 	0xDB25C405, 	0x53D4E543, 	0x0AD71720, 	0xE1A61A04, 	0xB87534C1, 	0x3EDF43EE, 
	0x213E50F0, 	0x39173EDF, 	0xA9485B0E, 	0xEEA82EF9, 	0x14F55771, 	0xFAF15546, 	0x3D6DD9B3, 	0xAB73B92E, 
	0x582A48FD, 	0xEEA81892, 	0x4F7EAA01, 	0xAF10A88F, 	0x11581720, 	0x34C124DB, 	0xD1C0AB73, 	0x1E5AF0D3  
};
__device__ __forceinline__
void Round8_0_final(uint32_t *A, int r, int s, int t, int u) {
	STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
	STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]);
	STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]);
	STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A);
	STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]);
	STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]);
	STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]);
	STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
}

__device__ __constant__
static const uint32_t d_cw1[8][8] = {
	0xC34C07F3, 	0xC914143C, 	0x599CBC12, 	0xBCCBE543, 	0x385EF3B7, 	0x14F54C9A, 	0x0AD7C068, 	0xB64A21F7, 
	0xDEC2AF10, 	0xC6E9C121, 	0x56B8A4F2, 	0x1158D107, 	0xEB0BA88F, 	0x050FAABA, 	0xC293264D, 	0x548D46D2, 
	0xACE5E8E0, 	0x53D421F7, 	0xF470D279, 	0xDC974E0C, 	0xD6CF55FF, 	0xFD1C4F7E, 	0x36EC36EC, 	0x3E261E5A, 
	0xEBC4FD1C, 	0x56B839D0, 	0x5B0E21F7, 	0x58E3DF7B, 	0x5BC7427C, 	0xEF613296, 	0x1158109F, 	0x5A55E318, 
	0xA7D6B703, 	0x1158E76E, 	0xB08255FF, 	0x50F05771, 	0xEEA8E8E0, 	0xCB3FDB25, 	0x2E40548D, 	0xE1A60F2D, 
	0xACE5D616, 	0xFD1CFD1C, 	0x24DB3BFB, 	0xAC2C1ABD, 	0xF529E8E0, 	0x1E5AE5FC, 	0x478BCB3F, 	0xC121BC12, 
	0xF4702B5C, 	0xC293FC63, 	0xDA6CB2AD, 	0x45601FCC, 	0xA439E1A6, 	0x4E0C0D02, 	0xED3621F7, 	0xAB73BE3D, 
	0x0E74D4A4, 	0xF754CF95, 	0xD84136EC, 	0x3124AB73, 	0x39D03B42, 	0x0E74BCCB, 	0x0F2DBD84, 	0x41C35C80  
};
__device__ __forceinline__
void Round8_1_final(uint32_t *A, int r, int s, int t, int u) {
	STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
	STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]);
	STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]);
	STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A);
	STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]);
	STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]);
	STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]);
	STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
}

__device__ __constant__
static const uint32_t d_cw2[8][8] = {
	0xA4135BED, 	0xE10E1EF2, 	0x6C4F93B1, 	0x6E2191DF, 	0xE2E01D20, 	0xD1952E6B, 	0x6A7D9583, 	0x131DECE3, 
	0x369CC964, 	0xFB73048D, 	0x9E9D6163, 	0x280CD7F4, 	0xD9C6263A, 	0x1062EF9E, 	0x2AC7D539, 	0xAD2D52D3, 
	0x0A03F5FD, 	0x197CE684, 	0xAA72558E, 	0xDE5321AD, 	0xF0870F79, 	0x607A9F86, 	0xAFE85018, 	0x2AC7D539, 
	0xE2E01D20, 	0x2AC7D539, 	0xC6A93957, 	0x624C9DB4, 	0x6C4F93B1, 	0x641E9BE2, 	0x452CBAD4, 	0x263AD9C6, 
	0xC964369C, 	0xC3053CFB, 	0x452CBAD4, 	0x95836A7D, 	0x4AA2B55E, 	0xAB5B54A5, 	0xAC4453BC, 	0x74808B80, 
	0xCB3634CA, 	0xFC5C03A4, 	0x4B8BB475, 	0x21ADDE53, 	0xE2E01D20, 	0xDF3C20C4, 	0xBD8F4271, 	0xAA72558E, 
	0xFC5C03A4, 	0x48D0B730, 	0x2AC7D539, 	0xD70B28F5, 	0x53BCAC44, 	0x3FB6C04A, 	0x14EFEB11, 	0xDB982468, 
	0x9A1065F0, 	0xB0D14F2F, 	0x8D5272AE, 	0xC4D73B29, 	0x91DF6E21, 	0x949A6B66, 	0x303DCFC3, 	0x5932A6CE  
};
__device__ __forceinline__
void Round8_2_final(uint32_t *A, int r, int s, int t, int u) {
	STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
	STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]);
	STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]);
	STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A);
	STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]);
	STEP8_MAJ_21(d_cw2[5], s, t, &A[24], A, &A[8], &A[16]);
	STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]);
	STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
}

__device__ __constant__
static const uint32_t d_cw3[8][8] = {
	0x1234EDCC, 	0xF5140AEC, 	0xCDF1320F, 	0x3DE4C21C, 	0x48D0B730, 	0x1234EDCC, 	0x131DECE3, 	0x52D3AD2D, 
	0xE684197C, 	0x6D3892C8, 	0x72AE8D52, 	0x6FF3900D, 	0x73978C69, 	0xEB1114EF, 	0x15D8EA28, 	0x71C58E3B, 
	0x90F66F0A, 	0x15D8EA28, 	0x9BE2641E, 	0x65F09A10, 	0xEA2815D8, 	0xBD8F4271, 	0x3A40C5C0, 	0xD9C6263A, 
	0xB38C4C74, 	0xBAD4452C, 	0x70DC8F24, 	0xAB5B54A5, 	0x46FEB902, 	0x1A65E59B, 	0x0DA7F259, 	0xA32A5CD6, 
	0xD62229DE, 	0xB81947E7, 	0x6D3892C8, 	0x15D8EA28, 	0xE59B1A65, 	0x065FF9A1, 	0xB2A34D5D, 	0x6A7D9583, 
	0x975568AB, 	0xFC5C03A4, 	0x2E6BD195, 	0x966C6994, 	0xF2590DA7, 	0x263AD9C6, 	0x5A1BA5E5, 	0xB0D14F2F, 
	0x975568AB, 	0x6994966C, 	0xF1700E90, 	0xD3672C99, 	0xCC1F33E1, 	0xFC5C03A4, 	0x452CBAD4, 	0x4E46B1BA, 
	0xF1700E90, 	0xB2A34D5D, 	0xD0AC2F54, 	0x5760A8A0, 	0x8C697397, 	0x624C9DB4, 	0xE85617AA, 	0x95836A7D  
};
__device__ __forceinline__
void Round8_3_final(uint32_t *A, int r, int s, int t, int u) {
	STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
	STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
	STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
	STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A);
	STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]);
	STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]);
	STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]);
	STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A);
}

#if __CUDA_ARCH__ < 350
#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x))
#else
//#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x))
#define expanded_vector(x) __ldg(&g_fft4[x])
#endif

__device__ __forceinline__
void Round8_0(uint32_t *A, const int thr_offset, int r, int s, int t, int u, uint4 *g_fft4) {
	uint32_t w[8];
    uint4 hv1, hv2;

	int tmp = 0 + thr_offset;
	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]);
	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]);
	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]);
	hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
	hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], &A[16]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A);


}
__device__ __forceinline__ void Round8_1(uint32_t *A, const int thr_offset,
		int r, int s, int t, int u, uint4 *g_fft4) {
	uint32_t w[8];
    uint4 hv1, hv2;

	int tmp = 16 + thr_offset;
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A);


}
__device__ __forceinline__ void Round8_2(uint32_t *A, const int thr_offset,
		int r, int s, int t, int u, uint4 *g_fft4) {
	uint32_t w[8];
    uint4 hv1, hv2;

	int tmp = 32 + thr_offset;
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_19(w, u, r, &A[8], &A[16], &A[24], A);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A);


}
__device__ __forceinline__ void Round8_3(uint32_t *A, const int thr_offset,
		int r, int s, int t, int u, uint4 *g_fft4) {
	uint32_t w[8];
    uint4 hv1, hv2;

	int tmp = 48 + thr_offset;
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]);
    hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w;
    hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w;
	STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A);


}

__device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, const uint32_t *M, uint4 *g_fft4) {
	int i;
	const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente)
#pragma unroll 8
	for(i=0; i<8; i++) {
		A[i] ^= M[i];
		(&A[8])[i] ^= M[8+i];
	}
	Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4);
	Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4);
}

__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) {
	uint32_t A[32];
	int i;
#pragma unroll 32
	for (i=0; i < 32; i++) A[i] = c_IV_512[i];
	uint32_t buffer[16];
#pragma unroll 16
	for (i=0; i < 16; i++) buffer[i] = hashval[i];
	SIMD_Compress1(A, texture_id, buffer, g_fft4);
	uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
#pragma unroll 32
	for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i];
}

__device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, uint4 *g_fft4) {
	uint32_t IV[4][8];
	int i;
	const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente)
#pragma unroll 8
	for(i=0; i<8; i++) {
		IV[0][i] = c_IV_512[i];
		IV[1][i] = c_IV_512[8+i];
		IV[2][i] = c_IV_512[16+i];
		IV[3][i] = c_IV_512[24+i];
	}
	Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4);
	Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4);
	STEP8_IF_32(IV[0],  4, 13, A, &A[8], &A[16], &A[24]);
	STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]);
	STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]);
	STEP8_IF_35(IV[3], 25,  4, &A[8], &A[16], &A[24], A);
}

__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, int *g_state) {
	uint32_t A[32];
	int i;
	uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
#pragma unroll 32
	for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
	SIMD_Compress2(A, texture_id, g_fft4);
#pragma unroll 32
	for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i];
}

__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t *M) {
	uint32_t IV[4][8];
	int i;
#pragma unroll 8
	for(i=0; i<8; i++) {
		IV[0][i] = A[i];
		IV[1][i] = (&A[8])[i];
		IV[2][i] = (&A[16])[i];
		IV[3][i] = (&A[24])[i];
	}
#pragma unroll 8
	for(i=0; i<8; i++) {
		A[i] ^= M[i];
		(&A[8])[i] ^= M[8+i];
	}
	Round8_0_final(A, 3, 23, 17, 27);
	Round8_1_final(A, 28, 19, 22, 7);
	Round8_2_final(A, 29, 9, 15, 5);
	Round8_3_final(A, 4, 13, 10, 25);
	STEP8_IF_32(IV[0],  4, 13, A, &A[8], &A[16], &A[24]);
	STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]);
	STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]);
	STEP8_IF_35(IV[3], 25,  4, &A[8], &A[16], &A[24], A);
}

__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) {
	uint32_t A[32];
	int i;
	uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
#pragma unroll 32
	for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
	uint32_t buffer[16];
	buffer[0] = 512;
#pragma unroll 15
	for (i=1; i < 16; i++) buffer[i] = 0;
	SIMD_Compress_Final(A, buffer);
#pragma unroll 16
	for (i=0; i < 16; i++)
		hashval[i] = A[i];
}