mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-24 21:45:04 +00:00
Faster Simd
On maxwell compress1 and compress2 can be run in one run instead of two.(750TI + 20KHASH)
This commit is contained in:
parent
7ce3fed611
commit
7d88e5cca1
@ -8,16 +8,16 @@
|
|||||||
#define TPB 64
|
#define TPB 64
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
#include <stdio.h>
|
//#include <stdio.h>
|
||||||
|
|
||||||
int *d_state[8];
|
uint32_t *d_state[8];
|
||||||
uint4 *d_temp4[8];
|
uint4 *d_temp4[8];
|
||||||
|
|
||||||
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
|
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
|
||||||
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
|
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
|
||||||
|
|
||||||
__constant__ uint32_t c_perm[8][8];
|
__constant__ uint8_t c_perm[8][8];
|
||||||
const uint32_t h_perm[8][8] = {
|
const uint8_t h_perm[8][8] = {
|
||||||
{ 2, 3, 6, 7, 0, 1, 4, 5 },
|
{ 2, 3, 6, 7, 0, 1, 4, 5 },
|
||||||
{ 6, 7, 2, 3, 4, 5, 0, 1 },
|
{ 6, 7, 2, 3, 4, 5, 0, 1 },
|
||||||
{ 7, 6, 5, 4, 3, 2, 1, 0 },
|
{ 7, 6, 5, 4, 3, 2, 1, 0 },
|
||||||
@ -36,8 +36,8 @@ const uint32_t h_IV_512[32] = {
|
|||||||
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
|
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
|
||||||
};
|
};
|
||||||
|
|
||||||
__constant__ int c_FFT128_8_16_Twiddle[128];
|
__constant__ short c_FFT128_8_16_Twiddle[128];
|
||||||
static const int h_FFT128_8_16_Twiddle[128] = {
|
static const short h_FFT128_8_16_Twiddle[128] = {
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
|
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
|
||||||
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
|
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
|
||||||
@ -48,8 +48,8 @@ static const int h_FFT128_8_16_Twiddle[128] = {
|
|||||||
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79
|
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79
|
||||||
};
|
};
|
||||||
|
|
||||||
__constant__ int c_FFT256_2_128_Twiddle[128];
|
__constant__ short c_FFT256_2_128_Twiddle[128];
|
||||||
static const int h_FFT256_2_128_Twiddle[128] = {
|
static const short h_FFT256_2_128_Twiddle[128] = {
|
||||||
1, 41,-118, 45, 46, 87, -31, 14,
|
1, 41,-118, 45, 46, 87, -31, 14,
|
||||||
60,-110, 116,-127, -67, 80, -61, 69,
|
60,-110, 116,-127, -67, 80, -61, 69,
|
||||||
2, 82, 21, 90, 92, -83, -62, 28,
|
2, 82, 21, 90, 92, -83, -62, 28,
|
||||||
@ -71,9 +71,10 @@ static const int h_FFT256_2_128_Twiddle[128] = {
|
|||||||
|
|
||||||
/************* the round function ****************/
|
/************* the round function ****************/
|
||||||
|
|
||||||
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
|
||||||
|
|
||||||
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
|
#define IF(x, y, z) (((y ^ z) & x) ^ z)
|
||||||
|
#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
|
||||||
|
|
||||||
|
|
||||||
#include "x11/simd_functions.cu"
|
#include "x11/simd_functions.cu"
|
||||||
|
|
||||||
@ -549,8 +550,7 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/***************************************************/
|
/***************************************************/
|
||||||
|
__global__ void __launch_bounds__(TPB, 8)
|
||||||
__global__ void __launch_bounds__(TPB,4)
|
|
||||||
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
|
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
|
||||||
{
|
{
|
||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x)/8;
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x)/8;
|
||||||
@ -576,8 +576,9 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void __launch_bounds__(TPB,4)
|
/*
|
||||||
x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
|
__global__ void __launch_bounds__(TPB, 4)
|
||||||
|
x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
|
||||||
{
|
{
|
||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
@ -590,9 +591,8 @@ x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash
|
|||||||
Compression1(Hash, hashPosition, g_fft4, g_state);
|
Compression1(Hash, hashPosition, g_fft4, g_state);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
__global__ void __launch_bounds__(TPB, 4)
|
||||||
__global__ void __launch_bounds__(TPB,4)
|
x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
|
||||||
x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
|
|
||||||
{
|
{
|
||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
@ -604,9 +604,27 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash
|
|||||||
Compression2(hashPosition, g_fft4, g_state);
|
Compression2(hashPosition, g_fft4, g_state);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
__global__ void __launch_bounds__(TPB,4)
|
__global__ void __launch_bounds__(TPB, 4)
|
||||||
x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
|
x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
|
||||||
|
{
|
||||||
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||||
|
|
||||||
|
int hashPosition = nounce - startNounce;
|
||||||
|
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
|
||||||
|
|
||||||
|
Compression1(Hash, hashPosition, g_fft4, g_state);
|
||||||
|
Compression2(hashPosition, g_fft4, g_state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__global__ void __launch_bounds__(TPB, 4)
|
||||||
|
x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
|
||||||
{
|
{
|
||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
@ -658,9 +676,7 @@ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint
|
|||||||
|
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
|
|
||||||
x11_simd512_gpu_compress1_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
x11_simd512_gpu_compress_64_maxwell << <grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
x11_simd512_gpu_compress2_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
|
||||||
|
|
||||||
x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
|
|
||||||
MyStreamSynchronize(NULL, order, thr_id);
|
MyStreamSynchronize(NULL, order, thr_id);
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for(int j=0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -24,17 +23,16 @@ __device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[6];
|
D[7] = ROTL32(temp, s) + R[6];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for(int j=0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -54,17 +52,16 @@ __device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[1];
|
D[7] = ROTL32(temp, s) + R[1];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -84,17 +81,16 @@ __device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[5];
|
D[7] = ROTL32(temp, s) + R[5];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -114,17 +110,16 @@ __device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[4];
|
D[7] = ROTL32(temp, s) + R[4];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -144,17 +139,16 @@ __device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[2];
|
D[7] = ROTL32(temp, s) + R[2];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -174,17 +168,16 @@ __device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[0];
|
D[7] = ROTL32(temp, s) + R[0];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -204,17 +197,16 @@ __device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[3];
|
D[7] = ROTL32(temp, s) + R[3];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -234,17 +226,16 @@ __device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[6];
|
D[7] = ROTL32(temp, s) + R[6];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -264,17 +255,16 @@ __device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[1];
|
D[7] = ROTL32(temp, s) + R[1];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -294,17 +284,17 @@ __device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[5];
|
D[7] = ROTL32(temp, s) + R[5];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -324,17 +314,16 @@ __device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[4];
|
D[7] = ROTL32(temp, s) + R[4];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -354,17 +343,16 @@ __device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[2];
|
D[7] = ROTL32(temp, s) + R[2];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -384,17 +372,16 @@ __device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[0];
|
D[7] = ROTL32(temp, s) + R[0];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -414,17 +401,16 @@ __device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[3];
|
D[7] = ROTL32(temp, s) + R[3];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -444,17 +430,16 @@ __device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[6];
|
D[7] = ROTL32(temp, s) + R[6];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -474,17 +459,16 @@ __device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[1];
|
D[7] = ROTL32(temp, s) + R[1];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -504,17 +488,16 @@ __device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[5];
|
D[7] = ROTL32(temp, s) + R[5];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -534,17 +517,16 @@ __device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[4];
|
D[7] = ROTL32(temp, s) + R[4];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -564,17 +546,16 @@ __device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[2];
|
D[7] = ROTL32(temp, s) + R[2];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -594,17 +575,16 @@ __device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[0];
|
D[7] = ROTL32(temp, s) + R[0];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -624,17 +604,16 @@ __device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[3];
|
D[7] = ROTL32(temp, s) + R[3];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -654,17 +633,16 @@ __device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[6];
|
D[7] = ROTL32(temp, s) + R[6];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -684,17 +662,16 @@ __device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[1];
|
D[7] = ROTL32(temp, s) + R[1];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -714,17 +691,16 @@ __device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[5];
|
D[7] = ROTL32(temp, s) + R[5];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -744,17 +720,16 @@ __device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[4];
|
D[7] = ROTL32(temp, s) + R[4];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -774,17 +749,16 @@ __device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[2];
|
D[7] = ROTL32(temp, s) + R[2];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -804,17 +778,16 @@ __device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[0];
|
D[7] = ROTL32(temp, s) + R[0];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -834,17 +807,16 @@ __device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[3];
|
D[7] = ROTL32(temp, s) + R[3];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -864,17 +836,16 @@ __device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[6];
|
D[7] = ROTL32(temp, s) + R[6];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -894,17 +865,16 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[1];
|
D[7] = ROTL32(temp, s) + R[1];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -924,17 +894,16 @@ __device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[5];
|
D[7] = ROTL32(temp, s) + R[5];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
|
||||||
@ -954,17 +923,16 @@ __device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, con
|
|||||||
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[4];
|
D[7] = ROTL32(temp, s) + R[4];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -984,17 +952,16 @@ __device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[2];
|
D[7] = ROTL32(temp, s) + R[2];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -1014,17 +981,16 @@ __device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[0];
|
D[7] = ROTL32(temp, s) + R[0];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -1044,17 +1010,16 @@ __device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[3];
|
D[7] = ROTL32(temp, s) + R[3];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
|
||||||
{
|
{
|
||||||
int j;
|
|
||||||
uint32_t temp;
|
uint32_t temp;
|
||||||
uint32_t R[8];
|
uint32_t R[8];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
R[j] = ROTL32(A[j], r);
|
R[j] = ROTL32(A[j], r);
|
||||||
}
|
}
|
||||||
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
|
||||||
@ -1074,7 +1039,7 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
|
|||||||
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
|
||||||
D[7] = ROTL32(temp, s) + R[6];
|
D[7] = ROTL32(temp, s) + R[6];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(j=0; j<8; j++) {
|
for (int j = 0; j<8; j++) {
|
||||||
A[j] = R[j];
|
A[j] = R[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1327,7 +1292,7 @@ __device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, co
|
|||||||
Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4);
|
Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4);
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) {
|
__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
|
||||||
uint32_t A[32];
|
uint32_t A[32];
|
||||||
int i;
|
int i;
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
@ -1360,10 +1325,10 @@ __device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, ui
|
|||||||
STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A);
|
STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A);
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, int *g_state) {
|
__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
|
||||||
uint32_t A[32];
|
uint32_t A[32];
|
||||||
int i;
|
int i;
|
||||||
uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
|
uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)];
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
|
for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
|
||||||
SIMD_Compress2(A, texture_id, g_fft4);
|
SIMD_Compress2(A, texture_id, g_fft4);
|
||||||
@ -1396,10 +1361,10 @@ __device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t
|
|||||||
STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A);
|
STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A);
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) {
|
__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
|
||||||
uint32_t A[32];
|
uint32_t A[32];
|
||||||
int i;
|
int i;
|
||||||
uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
|
uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)];
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
|
for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
|
||||||
uint32_t buffer[16];
|
uint32_t buffer[16];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user