Browse Source

x11: restore simd host2dev memcpytosymbol to reduce used cmem

Remove the #define workarounds attempted for SM 2.1 devices; Fermi is not compatible.
2upstream
Tanguy Pruvot 11 years ago
parent
commit
194fda87c1
  1. 45
      x11/cuda_x11_aes.cu
  2. 42
      x11/cuda_x11_simd512.cu
  3. 72
      x11/simd_functions.cu
  4. 4
      x15/x15.cu

45
x11/cuda_x11_aes.cu

@ -2,8 +2,7 @@ @@ -2,8 +2,7 @@
/* AES Helper for inline-usage from SPH */
#define AESx(x) SPH_C32(x)
__device__ __constant__
static const uint32_t d_AES0[256] = {
static const uint32_t h_AES0[256] = {
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
@ -70,8 +69,7 @@ static const uint32_t d_AES0[256] = { @@ -70,8 +69,7 @@ static const uint32_t d_AES0[256] = {
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
};
__device__ __constant__
static const uint32_t d_AES1[256] = {
static const uint32_t h_AES1[256] = {
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
@ -138,8 +136,7 @@ static const uint32_t d_AES1[256] = { @@ -138,8 +136,7 @@ static const uint32_t d_AES1[256] = {
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
};
__device__ __constant__
static const uint32_t d_AES2[256] = {
static const uint32_t h_AES2[256] = {
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
@ -206,8 +203,7 @@ static const uint32_t d_AES2[256] = { @@ -206,8 +203,7 @@ static const uint32_t d_AES2[256] = {
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
};
__device__ __constant__
static const uint32_t d_AES3[256] = {
static const uint32_t h_AES3[256] = {
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
@ -274,12 +270,35 @@ static const uint32_t d_AES3[256] = { @@ -274,12 +270,35 @@ static const uint32_t d_AES3[256] = {
AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
};
/*
 * Device-side (__constant__) copies of the four AES lookup tables.
 * They carry no initializer here; aes_cpu_init() fills them from the host
 * tables h_AES0..h_AES3 with cudaMemcpyToSymbol.
 */
static __constant__ uint32_t d_AES0[256];
static __constant__ uint32_t d_AES1[256];
static __constant__ uint32_t d_AES2[256];
static __constant__ uint32_t d_AES3[256];
/*
 * Host-side setup for the AES helper.
 *
 * Copies each host lookup table (h_AES0..h_AES3) into its __constant__
 * device counterpart (d_AES0..d_AES3), starting at offset 0 and covering
 * the full size of the host array.
 *
 * NOTE: the status returned by each cudaMemcpyToSymbol call is not
 * inspected here.
 */
static void aes_cpu_init()
{
    cudaMemcpyToSymbol(d_AES0, h_AES0, sizeof(h_AES0), 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(d_AES1, h_AES1, sizeof(h_AES1), 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(d_AES2, h_AES2, sizeof(h_AES2), 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(d_AES3, h_AES3, sizeof(h_AES3), 0, cudaMemcpyHostToDevice);
}
__device__ __forceinline__
static void aes_gpu_init(uint32_t *sharedMemory)
static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
{
if(threadIdx.x < 256)
{
@ -290,8 +309,7 @@ static void aes_gpu_init(uint32_t *sharedMemory) @@ -290,8 +309,7 @@ static void aes_gpu_init(uint32_t *sharedMemory)
}
}
__device__ __forceinline__
static void aes_round(
static __device__ __forceinline__ void aes_round(
const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t k0,
@ -338,8 +356,7 @@ static void aes_round( @@ -338,8 +356,7 @@ static void aes_round(
sharedMemory[idx3]; // ^k3
}
__device__ __forceinline__
static void aes_round(
static __device__ __forceinline__ void aes_round(
const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)

42
x11/cuda_x11_simd512.cu

@ -18,16 +18,16 @@ uint4 *d_temp4[8]; @@ -18,16 +18,16 @@ uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
__device__ __constant__
const uint32_t c_IV_512[32] = {
__constant__ uint32_t c_IV_512[32];
const uint32_t h_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
};
__device__ __constant__
static const int c_FFT128_8_16_Twiddle[128] = {
__constant__ int c_FFT128_8_16_Twiddle[128];
static const int h_FFT128_8_16_Twiddle[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
@ -37,8 +37,9 @@ static const int c_FFT128_8_16_Twiddle[128] = { @@ -37,8 +37,9 @@ static const int c_FFT128_8_16_Twiddle[128] = {
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114,
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79};
__device__ __constant__
static const int c_FFT256_2_128_Twiddle[128] = {
__constant__ int c_FFT256_2_128_Twiddle[128];
static const int h_FFT256_2_128_Twiddle[128] = {
1, 41, -118, 45, 46, 87, -31, 14,
60, -110, 116, -127, -67, 80, -61, 69,
2, 82, 21, 90, 92, -83, -62, 28,
@ -154,23 +155,8 @@ X(j) = (u-v) << (2*n); \ @@ -154,23 +155,8 @@ X(j) = (u-v) << (2*n); \
#undef BUTTERFLY
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
*/
#undef __shfl
#define __shfl(var, srcLane, width) (uint32_t)(var)
#endif
__device__ __forceinline__ void FFT_16(int *y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning FFT_16() function is not compatible with SM 2.1 devices!
#endif
#endif
/*
* FFT_16 using w=2 as 16th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
@ -334,11 +320,6 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) { @@ -334,11 +320,6 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
{
int i;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning Expansion() function is not compatible with SM 2.1 devices
#endif
#endif
/* Message Expansion using Number Theoretical Transform similar to FFT */
int expanded[32];
@ -655,6 +636,15 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads) @@ -655,6 +636,15 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads)
texRef1D_128.filterMode = cudaFilterModePoint;
texRef1D_128.addressMode[0] = cudaAddressModeClamp;
cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads);
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
}
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

72
x11/simd_functions.cu

@ -898,9 +898,7 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con @@ -898,9 +898,7 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con
A[j] = R[j];
}
}
__device__ __forceinline__
void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
@ -930,9 +928,7 @@ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, con @@ -930,9 +928,7 @@ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, con
A[j] = R[j];
}
}
__device__ __forceinline__
void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
@ -962,9 +958,7 @@ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, con @@ -962,9 +958,7 @@ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, con
A[j] = R[j];
}
}
__device__ __forceinline__
void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
@ -994,9 +988,7 @@ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, cons @@ -994,9 +988,7 @@ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j];
}
}
__device__ __forceinline__
void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
@ -1026,9 +1018,7 @@ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, cons @@ -1026,9 +1018,7 @@ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j];
}
}
__device__ __forceinline__
void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
@ -1058,9 +1048,7 @@ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, cons @@ -1058,9 +1048,7 @@ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j];
}
}
__device__ __forceinline__
void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D)
{
int j;
uint32_t temp;
@ -1090,9 +1078,8 @@ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, cons @@ -1090,9 +1078,8 @@ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, cons
A[j] = R[j];
}
}
__device__ __constant__
static const uint32_t d_cw0[8][8] = {
static __constant__ uint32_t d_cw0[8][8];
static const uint32_t h_cw0[8][8] = {
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
@ -1102,8 +1089,10 @@ static const uint32_t d_cw0[8][8] = { @@ -1102,8 +1089,10 @@ static const uint32_t d_cw0[8][8] = {
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
};
__device__ __forceinline__
void Round8_0_final(uint32_t *A, int r, int s, int t, int u) {
__device__ __forceinline__ void Round8_0_final(uint32_t *A,
int r, int s, int t, int u) {
STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1113,9 +1102,8 @@ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) { @@ -1113,9 +1102,8 @@ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
}
__device__ __constant__
static const uint32_t d_cw1[8][8] = {
static __constant__ uint32_t d_cw1[8][8];
static const uint32_t h_cw1[8][8] = {
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
@ -1125,8 +1113,10 @@ static const uint32_t d_cw1[8][8] = { @@ -1125,8 +1113,10 @@ static const uint32_t d_cw1[8][8] = {
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
};
__device__ __forceinline__
void Round8_1_final(uint32_t *A, int r, int s, int t, int u) {
__device__ __forceinline__ void Round8_1_final(uint32_t *A,
int r, int s, int t, int u) {
STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1136,9 +1126,8 @@ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) { @@ -1136,9 +1126,8 @@ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
}
__device__ __constant__
static const uint32_t d_cw2[8][8] = {
static __constant__ uint32_t d_cw2[8][8];
static const uint32_t h_cw2[8][8] = {
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
@ -1148,8 +1137,10 @@ static const uint32_t d_cw2[8][8] = { @@ -1148,8 +1137,10 @@ static const uint32_t d_cw2[8][8] = {
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
};
__device__ __forceinline__
void Round8_2_final(uint32_t *A, int r, int s, int t, int u) {
__device__ __forceinline__ void Round8_2_final(uint32_t *A,
int r, int s, int t, int u) {
STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1159,9 +1150,8 @@ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) { @@ -1159,9 +1150,8 @@ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) {
STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
}
__device__ __constant__
static const uint32_t d_cw3[8][8] = {
static __constant__ uint32_t d_cw3[8][8];
static const uint32_t h_cw3[8][8] = {
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
@ -1171,8 +1161,10 @@ static const uint32_t d_cw3[8][8] = { @@ -1171,8 +1161,10 @@ static const uint32_t d_cw3[8][8] = {
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
};
__device__ __forceinline__
void Round8_3_final(uint32_t *A, int r, int s, int t, int u) {
__device__ __forceinline__ void Round8_3_final(uint32_t *A,
int r, int s, int t, int u) {
STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]);
@ -1190,8 +1182,8 @@ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) { @@ -1190,8 +1182,8 @@ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) {
#define expanded_vector(x) __ldg(&g_fft4[x])
#endif
__device__ __forceinline__
void Round8_0(uint32_t *A, const int thr_offset, int r, int s, int t, int u, uint4 *g_fft4) {
__device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset,
int r, int s, int t, int u, uint4 *g_fft4) {
uint32_t w[8];
uint4 hv1, hv2;

4
x15/x15.cu

@ -260,8 +260,8 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, @@ -260,8 +260,8 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
#if NULLTEST
uint32_t buf[8]; memset(buf, 0, sizeof buf);
cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaThreadSynchronize());
print_hash((unsigned char*)buf); printf("\n");
#endif
/* Scan with GPU */

Loading…
Cancel
Save