Browse Source

sp echo optimisation + cleanup

Original Commit :
Removed sharedmem and reduced calculations with precalcing (ECHO hash).
750ti + 20KHASH(x11)

tpruvot notes:
Real change is more of 10 KH/s on stock clocks (but real)
launch bounds disabled, no perf increase with 64 registers
2upstream
sp-hash 10 years ago committed by Tanguy Pruvot
parent
commit
e18a54e8fc
  1. 364
      x11/cuda_x11_echo.cu

364
x11/cuda_x11_echo.cu

@ -11,10 +11,10 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
__device__ __forceinline__ void AES_2ROUND( __device__ __forceinline__ void AES_2ROUND(
const uint32_t* __restrict__ sharedMemory, const uint32_t* __restrict__ sharedMemory,
uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) uint32_t &k0)
{ {
uint32_t y0, y1, y2, y3; uint32_t y0, y1, y2, y3;
aes_round(sharedMemory, aes_round(sharedMemory,
x0, x1, x2, x3, x0, x1, x2, x3,
k0, k0,
@ -28,178 +28,266 @@ __device__ __forceinline__ void AES_2ROUND(
k0++; k0++;
} }
__device__ __forceinline__ void cuda_echo_round( __constant__ uint32_t P[48] = {
const uint32_t *sharedMemory, 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
uint32_t *W, int round) //8-12
0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//21-25
0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751,
0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//34-38
0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7,
0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
//58-61
};
__device__ __forceinline__
void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
{ {
// W hat 16*4 als Abmaße uint32_t k0;
uint32_t h[16];
// Big Sub Words #pragma unroll
#pragma unroll 16 for (int i = 0; i < 16; i++)
for(int i=0;i<16;i++)
{ {
int idx = i<<2; // *4 h[i] = hash[i];
AES_2ROUND(sharedMemory,
W[idx+0], W[idx+1], W[idx+2], W[idx+3],
k0, k1, k2, k3);
} }
// Shift Rows k0 = 512 + 8;
#pragma unroll 4
for(int i=0;i<4;i++) #pragma unroll
for (int idx = 0; idx < 16; idx+= 4)
{ {
uint32_t t; AES_2ROUND(sharedMemory,
h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
/// 1, 5, 9, 13
t = W[4 + i];
W[4 + i] = W[20 + i];
W[20 + i] = W[36 + i];
W[36 + i] = W[52 + i];
W[52 + i] = t;
// 2, 6, 10, 14
t = W[8 + i];
W[8 + i] = W[40 + i];
W[40 + i] = t;
t = W[24 + i];
W[24 + i] = W[56 + i];
W[56 + i] = t;
// 15, 11, 7, 3
t = W[60 + i];
W[60 + i] = W[44 + i];
W[44 + i] = W[28 + i];
W[28 + i] = W[12 + i];
W[12 + i] = t;
} }
k0 += 4;
// Mix Columns uint32_t W[64];
#pragma unroll
for (int i = 0; i<4; i++) // Schleife über je 2*uint32_t #pragma unroll 4
for (int i = 0; i < 4; i++)
{ {
#pragma unroll 64 uint32_t a = P[i];
for (int idx = 0; idx<64; idx += 16) // Schleife über die elemnte uint32_t b = P[i + 4];
{ uint32_t c = h[i + 8];
uint32_t d = P[i + 8];
uint32_t a = W[idx + i]; uint32_t ab = a ^ b;
uint32_t b = W[idx + i + 4]; uint32_t bc = b ^ c;
uint32_t c = W[idx + i + 8]; uint32_t cd = c ^ d;
uint32_t d = W[idx + i + 12];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t, t2, t3; uint32_t t = (ab & 0x80808080);
t = (ab & 0x80808080); uint32_t t2 = (bc & 0x80808080);
t2 = (bc & 0x80808080); uint32_t t3 = (cd & 0x80808080);
t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
W[idx + i] = abx ^ bc ^ d; W[0 + i] = abx ^ bc ^ d;
W[idx + i + 4] = bcx ^ a ^ cd; W[0 + i + 4] = bcx ^ a ^ cd;
W[idx + i + 8] = cdx ^ ab ^ d; W[0 + i + 8] = cdx ^ ab ^ d;
W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; W[0 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
}
}
} a = P[12 + i];
b = h[i + 4];
c = P[12 + i + 4];
d = P[12 + i + 8];
__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) ab = a ^ b;
{ bc = b ^ c;
__shared__ uint32_t sharedMemory[1024]; cd = c ^ d;
aes_gpu_init(sharedMemory);
int thread = (blockDim.x * blockIdx.x + threadIdx.x); t = (ab & 0x80808080);
if (thread < threads) t2 = (bc & 0x80808080);
{ t3 = (cd & 0x80808080);
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
abx = (t >> 7) * 27 ^ ((ab^t) << 1);
int hashPosition = nounce - startNounce; bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
uint32_t W[64]; W[16 + i] = abx ^ bc ^ d;
uint32_t k0 = 512, k1 = 0, k2 = 0, k3 = 0; // K0 = bitlen W[16 + i + 4] = bcx ^ a ^ cd;
/* Initialisierung */ W[16 + i + 8] = cdx ^ ab ^ d;
#pragma unroll 8 W[16 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
for(int i=0;i<32;i+=4)
a = h[i];
b = P[24 + i + 0];
c = P[24 + i + 4];
d = P[24 + i + 8];
ab = a ^ b;
bc = b ^ c;
cd = c ^ d;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
abx = (t >> 7) * 27 ^ ((ab^t) << 1);
bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
W[32 + i] = abx ^ bc ^ d;
W[32 + i + 4] = bcx ^ a ^ cd;
W[32 + i + 8] = cdx ^ ab ^ d;
W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
a = P[36 + i ];
b = P[36 + i +4 ];
c = P[36 + i + 8];
d = h[i + 12];
ab = a ^ b;
bc = b ^ c;
cd = c ^ d;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
abx = (t >> 7) * 27 ^ ((ab^t) << 1);
bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
W[48 + i] = abx ^ bc ^ d;
W[48 + i + 4] = bcx ^ a ^ cd;
W[48 + i + 8] = cdx ^ ab ^ d;
W[48 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
}
for (int k = 1; k < 10; k++)
{
// Big Sub Words
#pragma unroll 16
for (int i = 0; i < 16; i++)
{ {
W[i + 0] = 512; int idx = i << 2; // *4
W[i + 1] = 0; AES_2ROUND(sharedMemory,
W[i + 2] = 0; W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3],
W[i + 3] = 0; k0);
} }
// kopiere 32-byte großen hash // Shift Rows
#pragma unroll 16 #pragma unroll 4
for(int i=0;i<16;i++) for (int i = 0; i < 4; i++)
W[i+32] = Hash[i];
W[48] = 0x80; // fest
#pragma unroll 10
for(int i=49;i<59;i++)
W[i] = 0;
W[59] = 0x02000000; // fest
W[60] = k0; // bitlen
W[61] = k1;
W[62] = k2;
W[63] = k3;
for(int i=0;i<10;i++)
{ {
cuda_echo_round(sharedMemory, k0, k1, k2, k3, W, i); uint32_t t;
/// 1, 5, 9, 13
t = W[4 + i];
W[4 + i] = W[20 + i];
W[20 + i] = W[36 + i];
W[36 + i] = W[52 + i];
W[52 + i] = t;
// 2, 6, 10, 14
t = W[8 + i];
W[8 + i] = W[40 + i];
W[40 + i] = t;
t = W[24 + i];
W[24 + i] = W[56 + i];
W[56 + i] = t;
// 15, 11, 7, 3
t = W[60 + i];
W[60 + i] = W[44 + i];
W[44 + i] = W[28 + i];
W[28 + i] = W[12 + i];
W[12 + i] = t;
} }
#pragma unroll 8 // Mix Columns
for(int i=0;i<32;i+=4) #pragma unroll 4
for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t
{ {
W[i ] ^= W[32 + i ] ^ 512; #pragma unroll 64
W[i+1] ^= W[32 + i + 1]; for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte
W[i+2] ^= W[32 + i + 2]; {
W[i+3] ^= W[32 + i + 3]; uint32_t a = W[idx + i];
uint32_t b = W[idx + i + 4];
uint32_t c = W[idx + i + 8];
uint32_t d = W[idx + i + 12];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t, t2, t3;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
W[idx + i] = abx ^ bc ^ d;
W[idx + i + 4] = bcx ^ a ^ cd;
W[idx + i + 8] = cdx ^ ab ^ d;
W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
}
} }
}
#pragma unroll 16 #pragma unroll 8
for(int i=0;i<16;i++) for (int i = 0; i<32; i += 4)
W[i] ^= Hash[i]; {
W[i] ^= W[32 + i] ^ 512;
W[i + 1] ^= W[32 + i + 1];
W[i + 2] ^= W[32 + i + 2];
W[i + 3] ^= W[32 + i + 3];
}
// tsiv: I feel iffy about removing this, but it seems to break the full hash #pragma unroll
// fortunately for X11 the flipped bit lands outside the first 32 bytes used as the final X11 hash for (int i = 0; i<16; i++)
// try chaining more algos after echo (X13) and boom hash[i] ^= W[i];
//W[8] ^= 0x10; }
__global__ /* __launch_bounds__(320, 3) will force 64 registers on the 750 Ti */
void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
__shared__ uint32_t sharedMemory[1024];
aes_gpu_init(sharedMemory);
W[27] ^= 0x02000000; int thread = (blockDim.x * blockIdx.x + threadIdx.x);
W[28] ^= k0; if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
#pragma unroll 16 cuda_echo_round(sharedMemory, Hash);
for(int i=0;i<16;i++) }
Hash[i] = W[i];
}
} }
// Setup-Funktionen __host__
__host__ void x11_echo512_cpu_init(int thr_id, int threads) void x11_echo512_cpu_init(int thr_id, int threads)
{ {
aes_cpu_init(); aes_cpu_init();
} }
__host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__
void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x11_echo512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

Loading…
Cancel
Save