x11: restore simd host2dev memcpytosymbol to reduce used cmem

Remove define attempts for SM 2.1 devices, fermi is not compatible
This commit is contained in:
Tanguy Pruvot 2014-08-19 18:09:25 +02:00
parent bc2eb75758
commit 194fda87c1
4 changed files with 81 additions and 82 deletions

View File

@ -2,8 +2,7 @@
/* AES Helper for inline-usage from SPH */ /* AES Helper for inline-usage from SPH */
#define AESx(x) SPH_C32(x) #define AESx(x) SPH_C32(x)
__device__ __constant__ static const uint32_t h_AES0[256] = {
static const uint32_t d_AES0[256] = {
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
@ -70,8 +69,7 @@ static const uint32_t d_AES0[256] = {
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
}; };
__device__ __constant__ static const uint32_t h_AES1[256] = {
static const uint32_t d_AES1[256] = {
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
@ -138,8 +136,7 @@ static const uint32_t d_AES1[256] = {
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
}; };
__device__ __constant__ static const uint32_t h_AES2[256] = {
static const uint32_t d_AES2[256] = {
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
@ -206,8 +203,7 @@ static const uint32_t d_AES2[256] = {
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
}; };
__device__ __constant__ static const uint32_t h_AES3[256] = {
static const uint32_t d_AES3[256] = {
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
@ -274,12 +270,35 @@ static const uint32_t d_AES3[256] = {
AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
}; };
static __constant__ uint32_t d_AES0[256];
static __constant__ uint32_t d_AES1[256];
static __constant__ uint32_t d_AES2[256];
static __constant__ uint32_t d_AES3[256];
static void aes_cpu_init() static void aes_cpu_init()
{ {
cudaMemcpyToSymbol( d_AES0,
h_AES0,
sizeof(h_AES0),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES1,
h_AES1,
sizeof(h_AES1),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES2,
h_AES2,
sizeof(h_AES2),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES3,
h_AES3,
sizeof(h_AES3),
0, cudaMemcpyHostToDevice);
} }
__device__ __forceinline__ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
static void aes_gpu_init(uint32_t *sharedMemory)
{ {
if(threadIdx.x < 256) if(threadIdx.x < 256)
{ {
@ -290,8 +309,7 @@ static void aes_gpu_init(uint32_t *sharedMemory)
} }
} }
__device__ __forceinline__ static __device__ __forceinline__ void aes_round(
static void aes_round(
const uint32_t *sharedMemory, const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t k0, uint32_t k0,
@ -338,8 +356,7 @@ static void aes_round(
sharedMemory[idx3]; // ^k3 sharedMemory[idx3]; // ^k3
} }
__device__ __forceinline__ static __device__ __forceinline__ void aes_round(
static void aes_round(
const uint32_t *sharedMemory, const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)

View File

@ -18,16 +18,16 @@ uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel // texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128; texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
__device__ __constant__ __constant__ uint32_t c_IV_512[32];
const uint32_t c_IV_512[32] = { const uint32_t h_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
}; };
__device__ __constant__ __constant__ int c_FFT128_8_16_Twiddle[128];
static const int c_FFT128_8_16_Twiddle[128] = { static const int h_FFT128_8_16_Twiddle[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
@ -37,8 +37,9 @@ static const int c_FFT128_8_16_Twiddle[128] = {
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114,
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79}; 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79};
__device__ __constant__
static const int c_FFT256_2_128_Twiddle[128] = { __constant__ int c_FFT256_2_128_Twiddle[128];
static const int h_FFT256_2_128_Twiddle[128] = {
1, 41, -118, 45, 46, 87, -31, 14, 1, 41, -118, 45, 46, 87, -31, 14,
60, -110, 116, -127, -67, 80, -61, 69, 60, -110, 116, -127, -67, 80, -61, 69,
2, 82, 21, 90, 92, -83, -62, 28, 2, 82, 21, 90, 92, -83, -62, 28,
@ -154,23 +155,8 @@ X(j) = (u-v) << (2*n); \
#undef BUTTERFLY #undef BUTTERFLY
} }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
*/
#undef __shfl
#define __shfl(var, srcLane, width) (uint32_t)(var)
#endif
__device__ __forceinline__ void FFT_16(int *y) { __device__ __forceinline__ void FFT_16(int *y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning FFT_16() function is not compatible with SM 2.1 devices!
#endif
#endif
/* /*
* FFT_16 using w=2 as 16th root of unity * FFT_16 using w=2 as 16th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT. * Unrolled decimation in frequency (DIF) radix-2 NTT.
@ -334,11 +320,6 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
{ {
int i; int i;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning Expansion() function is not compatible with SM 2.1 devices
#endif
#endif
/* Message Expansion using Number Theoretical Transform similar to FFT */ /* Message Expansion using Number Theoretical Transform similar to FFT */
int expanded[32]; int expanded[32];
@ -655,6 +636,15 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads)
texRef1D_128.filterMode = cudaFilterModePoint; texRef1D_128.filterMode = cudaFilterModePoint;
texRef1D_128.addressMode[0] = cudaAddressModeClamp; texRef1D_128.addressMode[0] = cudaAddressModeClamp;
cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads); cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads);
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
} }
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

View File

@ -898,9 +898,7 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -930,9 +928,7 @@ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, con
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -962,9 +958,7 @@ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, con
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -994,9 +988,7 @@ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -1026,9 +1018,7 @@ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -1058,9 +1048,7 @@ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j]; A[j] = R[j];
} }
} }
__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__
void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{ {
int j; int j;
uint32_t temp; uint32_t temp;
@ -1090,9 +1078,8 @@ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j]; A[j] = R[j];
} }
} }
static __constant__ uint32_t d_cw0[8][8];
__device__ __constant__ static const uint32_t h_cw0[8][8] = {
static const uint32_t d_cw0[8][8] = {
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
@ -1102,8 +1089,10 @@ static const uint32_t d_cw0[8][8] = {
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
}; };
__device__ __forceinline__ __device__ __forceinline__ void Round8_0_final(uint32_t *A,
void Round8_0_final(uint32_t *A, int r, int s, int t, int u) { int r, int s, int t, int u) {
STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1113,9 +1102,8 @@ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
} }
static __constant__ uint32_t d_cw1[8][8];
__device__ __constant__ static const uint32_t h_cw1[8][8] = {
static const uint32_t d_cw1[8][8] = {
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
@ -1125,8 +1113,10 @@ static const uint32_t d_cw1[8][8] = {
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
}; };
__device__ __forceinline__ __device__ __forceinline__ void Round8_1_final(uint32_t *A,
void Round8_1_final(uint32_t *A, int r, int s, int t, int u) { int r, int s, int t, int u) {
STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1136,9 +1126,8 @@ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
} }
static __constant__ uint32_t d_cw2[8][8];
__device__ __constant__ static const uint32_t h_cw2[8][8] = {
static const uint32_t d_cw2[8][8] = {
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
@ -1148,8 +1137,10 @@ static const uint32_t d_cw2[8][8] = {
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
}; };
__device__ __forceinline__ __device__ __forceinline__ void Round8_2_final(uint32_t *A,
void Round8_2_final(uint32_t *A, int r, int s, int t, int u) { int r, int s, int t, int u) {
STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1159,9 +1150,8 @@ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
} }
static __constant__ uint32_t d_cw3[8][8];
__device__ __constant__ static const uint32_t h_cw3[8][8] = {
static const uint32_t d_cw3[8][8] = {
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
@ -1171,8 +1161,10 @@ static const uint32_t d_cw3[8][8] = {
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
}; };
__device__ __forceinline__ __device__ __forceinline__ void Round8_3_final(uint32_t *A,
void Round8_3_final(uint32_t *A, int r, int s, int t, int u) { int r, int s, int t, int u) {
STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1190,8 +1182,8 @@ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) {
#define expanded_vector(x) __ldg(&g_fft4[x]) #define expanded_vector(x) __ldg(&g_fft4[x])
#endif #endif
__device__ __forceinline__ __device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset,
void Round8_0(uint32_t *A, const int thr_offset, int r, int s, int t, int u, uint4 *g_fft4) { int r, int s, int t, int u, uint4 *g_fft4) {
uint32_t w[8]; uint32_t w[8];
uint4 hv1, hv2; uint4 hv1, hv2;

View File

@ -260,8 +260,8 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
#if NULLTEST #if NULLTEST
uint32_t buf[8]; memset(buf, 0, sizeof buf); uint32_t buf[8]; memset(buf, 0, sizeof buf);
cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost); CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
cudaThreadSynchronize(); CUDA_SAFE_CALL(cudaThreadSynchronize());
print_hash((unsigned char*)buf); printf("\n"); print_hash((unsigned char*)buf); printf("\n");
#endif #endif
/* Scan with GPU */ /* Scan with GPU */