Browse Source

Faster Simd

On maxwell compress1 and compress2 can be run in one run instead of two.(750TI + 20KHASH)
2upstream
sp-hash 10 years ago committed by Tanguy Pruvot
parent
commit
7d88e5cca1
  1. 52
      x11/cuda_x11_simd512.cu
  2. 191
      x11/simd_functions.cu

52
x11/cuda_x11_simd512.cu

@ -8,16 +8,16 @@ @@ -8,16 +8,16 @@
#define TPB 64
#include "cuda_helper.h"
#include <stdio.h>
//#include <stdio.h>
int *d_state[8];
uint32_t *d_state[8];
uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
__constant__ uint32_t c_perm[8][8];
const uint32_t h_perm[8][8] = {
__constant__ uint8_t c_perm[8][8];
const uint8_t h_perm[8][8] = {
{ 2, 3, 6, 7, 0, 1, 4, 5 },
{ 6, 7, 2, 3, 4, 5, 0, 1 },
{ 7, 6, 5, 4, 3, 2, 1, 0 },
@ -36,8 +36,8 @@ const uint32_t h_IV_512[32] = { @@ -36,8 +36,8 @@ const uint32_t h_IV_512[32] = {
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
};
__constant__ int c_FFT128_8_16_Twiddle[128];
static const int h_FFT128_8_16_Twiddle[128] = {
__constant__ short c_FFT128_8_16_Twiddle[128];
static const short h_FFT128_8_16_Twiddle[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
@ -48,8 +48,8 @@ static const int h_FFT128_8_16_Twiddle[128] = { @@ -48,8 +48,8 @@ static const int h_FFT128_8_16_Twiddle[128] = {
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79
};
__constant__ int c_FFT256_2_128_Twiddle[128];
static const int h_FFT256_2_128_Twiddle[128] = {
__constant__ short c_FFT256_2_128_Twiddle[128];
static const short h_FFT256_2_128_Twiddle[128] = {
1, 41,-118, 45, 46, 87, -31, 14,
60,-110, 116,-127, -67, 80, -61, 69,
2, 82, 21, 90, 92, -83, -62, 28,
@ -71,9 +71,10 @@ static const int h_FFT256_2_128_Twiddle[128] = { @@ -71,9 +71,10 @@ static const int h_FFT256_2_128_Twiddle[128] = {
/************* the round function ****************/
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
#define IF(x, y, z) (((y ^ z) & x) ^ z)
#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
#include "x11/simd_functions.cu"
@ -549,8 +550,7 @@ void Expansion(const uint32_t *data, uint4 *g_temp4) @@ -549,8 +550,7 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
}
/***************************************************/
__global__ void __launch_bounds__(TPB,4)
__global__ void __launch_bounds__(TPB, 8)
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x)/8;
@ -576,8 +576,9 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u @@ -576,8 +576,9 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u
}
}
/*
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -590,9 +591,23 @@ x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash @@ -590,9 +591,23 @@ x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash
Compression1(Hash, hashPosition, g_fft4, g_state);
}
}
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
Compression2(hashPosition, g_fft4, g_state);
}
}
*/
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -600,13 +615,16 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash @@ -600,13 +615,16 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
Compression1(Hash, hashPosition, g_fft4, g_state);
Compression2(hashPosition, g_fft4, g_state);
}
}
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -658,9 +676,7 @@ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint @@ -658,9 +676,7 @@ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint
dim3 grid((threads + threadsperblock-1)/threadsperblock);
x11_simd512_gpu_compress1_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
x11_simd512_gpu_compress2_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
x11_simd512_gpu_compress_64_maxwell << <grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
MyStreamSynchronize(NULL, order, thr_id);

191
x11/simd_functions.cu

@ -1,10 +1,9 @@ @@ -1,10 +1,9 @@
__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for(int j=0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -24,17 +23,16 @@ __device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const @@ -24,17 +23,16 @@ __device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
for(j=0; j<8; j++) {
for(int j=0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -54,17 +52,16 @@ __device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const @@ -54,17 +52,16 @@ __device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -84,17 +81,16 @@ __device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const @@ -84,17 +81,16 @@ __device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -114,17 +110,16 @@ __device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const @@ -114,17 +110,16 @@ __device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -144,17 +139,16 @@ __device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, cons @@ -144,17 +139,16 @@ __device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -174,17 +168,16 @@ __device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, cons @@ -174,17 +168,16 @@ __device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -204,17 +197,16 @@ __device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, cons @@ -204,17 +197,16 @@ __device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -234,17 +226,16 @@ __device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, cons @@ -234,17 +226,16 @@ __device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -264,17 +255,16 @@ __device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const @@ -264,17 +255,16 @@ __device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -294,17 +284,17 @@ __device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const @@ -294,17 +284,17 @@ __device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -324,17 +314,16 @@ __device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, cons @@ -324,17 +314,16 @@ __device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -354,17 +343,16 @@ __device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, cons @@ -354,17 +343,16 @@ __device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -384,17 +372,16 @@ __device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, con @@ -384,17 +372,16 @@ __device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -414,17 +401,16 @@ __device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, con @@ -414,17 +401,16 @@ __device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -444,17 +430,16 @@ __device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, con @@ -444,17 +430,16 @@ __device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -474,17 +459,16 @@ __device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, con @@ -474,17 +459,16 @@ __device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -504,17 +488,16 @@ __device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, cons @@ -504,17 +488,16 @@ __device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -534,17 +517,16 @@ __device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, cons @@ -534,17 +517,16 @@ __device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -564,17 +546,16 @@ __device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, cons @@ -564,17 +546,16 @@ __device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -594,17 +575,16 @@ __device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, cons @@ -594,17 +575,16 @@ __device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -624,17 +604,16 @@ __device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, con @@ -624,17 +604,16 @@ __device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -654,17 +633,16 @@ __device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, con @@ -654,17 +633,16 @@ __device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -684,17 +662,16 @@ __device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, con @@ -684,17 +662,16 @@ __device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -714,17 +691,16 @@ __device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, con @@ -714,17 +691,16 @@ __device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -744,17 +720,16 @@ __device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, cons @@ -744,17 +720,16 @@ __device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -774,17 +749,16 @@ __device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, cons @@ -774,17 +749,16 @@ __device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -804,17 +778,16 @@ __device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, cons @@ -804,17 +778,16 @@ __device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -834,17 +807,16 @@ __device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, cons @@ -834,17 +807,16 @@ __device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -864,17 +836,16 @@ __device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, con @@ -864,17 +836,16 @@ __device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -894,17 +865,16 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con @@ -894,17 +865,16 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[1];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -924,17 +894,16 @@ __device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, con @@ -924,17 +894,16 @@ __device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[5];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]);
@ -954,17 +923,16 @@ __device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, con @@ -954,17 +923,16 @@ __device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, con
temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[4];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -984,17 +952,16 @@ __device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, cons @@ -984,17 +952,16 @@ __device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[2];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -1014,17 +981,16 @@ __device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, cons @@ -1014,17 +981,16 @@ __device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[0];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -1044,17 +1010,16 @@ __device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, cons @@ -1044,17 +1010,16 @@ __device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[3];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
temp = D[0] + w[0] + IF(A[0], B[0], C[0]);
@ -1074,7 +1039,7 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons @@ -1074,7 +1039,7 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
temp = D[7] + w[7] + IF(A[7], B[7], C[7]);
D[7] = ROTL32(temp, s) + R[6];
#pragma unroll 8
for(j=0; j<8; j++) {
for (int j = 0; j<8; j++) {
A[j] = R[j];
}
}
@ -1327,7 +1292,7 @@ __device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, co @@ -1327,7 +1292,7 @@ __device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, co
Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4);
}
__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) {
__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
uint32_t A[32];
int i;
#pragma unroll 32
@ -1360,10 +1325,10 @@ __device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, ui @@ -1360,10 +1325,10 @@ __device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, ui
STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A);
}
__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, int *g_state) {
__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
uint32_t A[32];
int i;
uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)];
#pragma unroll 32
for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
SIMD_Compress2(A, texture_id, g_fft4);
@ -1396,10 +1361,10 @@ __device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t @@ -1396,10 +1361,10 @@ __device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t
STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A);
}
__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) {
__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) {
uint32_t A[32];
int i;
uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)];
uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)];
#pragma unroll 32
for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i];
uint32_t buffer[16];

Loading…
Cancel
Save