Browse Source

x11: restore SIMD host-to-device cudaMemcpyToSymbol copies to reduce constant memory (cmem) usage

Remove the attempted #define workarounds for SM 2.1 devices; Fermi is not compatible
master
Tanguy Pruvot 10 years ago
parent
commit
194fda87c1
  1. 45
      x11/cuda_x11_aes.cu
  2. 42
      x11/cuda_x11_simd512.cu
  3. 72
      x11/simd_functions.cu
  4. 4
      x15/x15.cu

45
x11/cuda_x11_aes.cu

@ -2,8 +2,7 @@
/* AES Helper for inline-usage from SPH */ /* AES Helper for inline-usage from SPH */
#define AESx(x) SPH_C32(x) #define AESx(x) SPH_C32(x)
__device__ __constant__ static const uint32_t h_AES0[256] = {
static const uint32_t d_AES0[256] = {
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
@ -70,8 +69,7 @@ static const uint32_t d_AES0[256] = {
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
}; };
__device__ __constant__ static const uint32_t h_AES1[256] = {
static const uint32_t d_AES1[256] = {
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
@ -138,8 +136,7 @@ static const uint32_t d_AES1[256] = {
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
}; };
__device__ __constant__ static const uint32_t h_AES2[256] = {
static const uint32_t d_AES2[256] = {
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
@ -206,8 +203,7 @@ static const uint32_t d_AES2[256] = {
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
}; };
__device__ __constant__ static const uint32_t h_AES3[256] = {
static const uint32_t d_AES3[256] = {
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
@ -274,12 +270,35 @@ static const uint32_t d_AES3[256] = {
AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
}; };
static __constant__ uint32_t d_AES0[256];
static __constant__ uint32_t d_AES1[256];
static __constant__ uint32_t d_AES2[256];
static __constant__ uint32_t d_AES3[256];
static void aes_cpu_init() static void aes_cpu_init()
{ {
cudaMemcpyToSymbol( d_AES0,
h_AES0,
sizeof(h_AES0),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES1,
h_AES1,
sizeof(h_AES1),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES2,
h_AES2,
sizeof(h_AES2),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES3,
h_AES3,
sizeof(h_AES3),
0, cudaMemcpyHostToDevice);
} }
__device__ __forceinline__ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
static void aes_gpu_init(uint32_t *sharedMemory)
{ {
if(threadIdx.x < 256) if(threadIdx.x < 256)
{ {
@ -290,8 +309,7 @@ static void aes_gpu_init(uint32_t *sharedMemory)
} }
} }
__device__ __forceinline__ static __device__ __forceinline__ void aes_round(
static void aes_round(
const uint32_t *sharedMemory, const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t k0, uint32_t k0,
@ -338,8 +356,7 @@ static void aes_round(
sharedMemory[idx3]; // ^k3 sharedMemory[idx3]; // ^k3
} }
__device__ __forceinline__ static __device__ __forceinline__ void aes_round(
static void aes_round(
const uint32_t *sharedMemory, const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)

42
x11/cuda_x11_simd512.cu

@ -18,16 +18,16 @@ uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel // texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128; texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
__device__ __constant__ __constant__ uint32_t c_IV_512[32];
const uint32_t c_IV_512[32] = { const uint32_t h_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
}; };
__device__ __constant__ __constant__ int c_FFT128_8_16_Twiddle[128];
static const int c_FFT128_8_16_Twiddle[128] = { static const int h_FFT128_8_16_Twiddle[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
@ -37,8 +37,9 @@ static const int c_FFT128_8_16_Twiddle[128] = {
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114,
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79}; 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79};
__device__ __constant__
static const int c_FFT256_2_128_Twiddle[128] = { __constant__ int c_FFT256_2_128_Twiddle[128];
static const int h_FFT256_2_128_Twiddle[128] = {
1, 41, -118, 45, 46, 87, -31, 14, 1, 41, -118, 45, 46, 87, -31, 14,
60, -110, 116, -127, -67, 80, -61, 69, 60, -110, 116, -127, -67, 80, -61, 69,
2, 82, 21, 90, 92, -83, -62, 28, 2, 82, 21, 90, 92, -83, -62, 28,
@ -154,23 +155,8 @@ X(j) = (u-v) << (2*n); \
#undef BUTTERFLY #undef BUTTERFLY
} }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
*/
#undef __shfl
#define __shfl(var, srcLane, width) (uint32_t)(var)
#endif
__device__ __forceinline__ void FFT_16(int *y) { __device__ __forceinline__ void FFT_16(int *y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning FFT_16() function is not compatible with SM 2.1 devices!
#endif
#endif
/* /*
* FFT_16 using w=2 as 16th root of unity * FFT_16 using w=2 as 16th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT. * Unrolled decimation in frequency (DIF) radix-2 NTT.
@ -334,11 +320,6 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
{ {
int i; int i;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning Expansion() function is not compatible with SM 2.1 devices
#endif
#endif
/* Message Expansion using Number Theoretical Transform similar to FFT */ /* Message Expansion using Number Theoretical Transform similar to FFT */
int expanded[32]; int expanded[32];
@ -655,6 +636,15 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads)
texRef1D_128.filterMode = cudaFilterModePoint; texRef1D_128.filterMode = cudaFilterModePoint;
texRef1D_128.addressMode[0] = cudaAddressModeClamp; texRef1D_128.addressMode[0] = cudaAddressModeClamp;
cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads); cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads);
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
} }
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

72
x11/simd_functions.cu

@ -898,9 +898,7 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -930,9 +928,7 @@ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, con
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -962,9 +958,7 @@ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, con
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -994,9 +988,7 @@ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -1026,9 +1018,7 @@ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -1058,9 +1048,7 @@ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -1090,9 +1078,8 @@ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j]; A[j] = R[j];
} }
} }
static __constant__ uint32_t d_cw0[8][8];
__device__ __constant__ static const uint32_t h_cw0[8][8] = {
static const uint32_t d_cw0[8][8] = {
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
@ -1102,8 +1089,10 @@ static const uint32_t d_cw0[8][8] = {
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
}; };
__device__ __forceinline__ __device__ __forceinline__ void Round8_0_final(uint32_t *A,
void Round8_0_final(uint32_t *A, int r, int s, int t, int u) { int r, int s, int t, int u) {
STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1113,9 +1102,8 @@ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
} }
static __constant__ uint32_t d_cw1[8][8];
__device__ __constant__ static const uint32_t h_cw1[8][8] = {
static const uint32_t d_cw1[8][8] = {
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
@ -1125,8 +1113,10 @@ static const uint32_t d_cw1[8][8] = {
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
}; };
__device__ __forceinline__ __device__ __forceinline__ void Round8_1_final(uint32_t *A,
void Round8_1_final(uint32_t *A, int r, int s, int t, int u) { int r, int s, int t, int u) {
STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1136,9 +1126,8 @@ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
} }
static __constant__ uint32_t d_cw2[8][8];
__device__ __constant__ static const uint32_t h_cw2[8][8] = {
static const uint32_t d_cw2[8][8] = {
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
@ -1148,8 +1137,10 @@ static const uint32_t d_cw2[8][8] = {
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
}; };
__device__ __forceinline__ __device__ __forceinline__ void Round8_2_final(uint32_t *A,
void Round8_2_final(uint32_t *A, int r, int s, int t, int u) { int r, int s, int t, int u) {
STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1159,9 +1150,8 @@ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
} }
static __constant__ uint32_t d_cw3[8][8];
__device__ __constant__ static const uint32_t h_cw3[8][8] = {
static const uint32_t d_cw3[8][8] = {
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
@ -1171,8 +1161,10 @@ static const uint32_t d_cw3[8][8] = {
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
}; };
__device__ __forceinline__ __device__ __forceinline__ void Round8_3_final(uint32_t *A,
void Round8_3_final(uint32_t *A, int r, int s, int t, int u) { int r, int s, int t, int u) {
STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1190,8 +1182,8 @@ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) {
#define expanded_vector(x) __ldg(&g_fft4[x]) #define expanded_vector(x) __ldg(&g_fft4[x])
#endif #endif
__device__ __forceinline__ __device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset,
void Round8_0(uint32_t *A, const int thr_offset, int r, int s, int t, int u, uint4 *g_fft4) { int r, int s, int t, int u, uint4 *g_fft4) {
uint32_t w[8]; uint32_t w[8];
uint4 hv1, hv2; uint4 hv1, hv2;

4
x15/x15.cu

@ -260,8 +260,8 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
#if NULLTEST #if NULLTEST
uint32_t buf[8]; memset(buf, 0, sizeof buf); uint32_t buf[8]; memset(buf, 0, sizeof buf);
cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost); CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
cudaThreadSynchronize(); CUDA_SAFE_CALL(cudaThreadSynchronize());
print_hash((unsigned char*)buf); printf("\n"); print_hash((unsigned char*)buf); printf("\n");
#endif #endif
/* Scan with GPU */ /* Scan with GPU */

Loading…
Cancel
Save