
x11: shavite and echo from sp (now ok on win32)

The previous echo commit only improved Linux performance and reduced Windows
performance compared to 1.4.9; this one seems to give at least 1.4.9
performance on Windows and the same on Linux...

The shavite optimisation seems ok on both (it now uses 64 registers).

__launch_bounds__ now forces the number of registers at compile time, so the
per-file Makefile rules on Linux are removed...

manual "cherry pick" with fixed line endings and some adaptations
Tanguy Pruvot, 10 years ago
commit fdd5d29071 (branch: master)
Changed files:
  1. Makefile.am (7 changed lines)
  2. x11/cuda_x11_aes.cu (2 changed lines)
  3. x11/cuda_x11_echo.cu (54 changed lines)
  4. x11/cuda_x11_shavite512.cu (36 changed lines)

Makefile.am (7 changed lines)

@@ -89,13 +89,6 @@ qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
-x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
-	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
-# Shavite compiles faster with 128 regs
-x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=128 -o $@ -c $<
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
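Dropping these rules works because the register cap now lives in the CUDA sources themselves: a __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) qualifier on the kernel tells nvcc the intended launch shape, and the compiler limits registers per thread so that many blocks can stay resident. On a GPU with a 64K-register SM (such as the 750 Ti mentioned in the echo diff below), 128 threads x 7 blocks leaves at most 65536 / (128 * 7), roughly 73 registers, which lines up with the "will force 72 registers" comment. A minimal sketch of the idea, with a hypothetical kernel rather than the ones touched by this commit:

#include <stdint.h>

/* Hypothetical kernel: shows how __launch_bounds__ replaces --maxrregcount.
 * 128 threads per block and at least 7 resident blocks per SM are requested,
 * so nvcc stays around 72 registers per thread on a 64K-register SM. */
__global__ __launch_bounds__(128, 7)
void demo_hash_kernel(int threads, uint32_t *g_hash)
{
    int thread = blockDim.x * blockIdx.x + threadIdx.x;
    if (thread < threads)
        g_hash[thread] ^= 512;   /* placeholder work */
}

/* Host side: the block size used at launch must not exceed the first
 * __launch_bounds__ argument, otherwise the launch fails. */
void demo_launch(int threads, uint32_t *d_hash)
{
    dim3 grid((threads + 127) / 128);
    dim3 block(128);
    demo_hash_kernel<<<grid, block>>>(threads, d_hash);
}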

x11/cuda_x11_aes.cu (2 changed lines)

@@ -311,7 +311,7 @@ void aes_gpu_init(uint32_t *sharedMemory)
 }
 /* tried with 3 xor.b32 asm, not faster */
-#define xor4_32(a,b,c,d) (a ^ b ^ c ^ d);
+#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d));
 __device__
 static void aes_round(
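The only functional change here is the parenthesization of xor4_32. Both forms compute the same value; written as ((a ^ b) ^ (c ^ d)) the expression is a balanced tree, so the two inner XORs are independent and can be issued back to back, while a ^ b ^ c ^ d parses as a left-to-right chain. Compilers are generally free to reassociate XOR on their own, so treat the snippet below as a statement of intent rather than a guaranteed win; the diff's own comment already notes that a 3-instruction xor.b32 asm variant was not faster.

#include <stdint.h>

/* Illustration only: the two equivalent groupings of a 4-way XOR.
 * (a ^ b ^ c ^ d) parses as ((a ^ b) ^ c) ^ d: a 3-deep dependency chain.
 * ((a ^ b) ^ (c ^ d)): a 2-deep tree, the inner XORs have no dependency. */
#define XOR4_CHAIN(a,b,c,d)    ((a) ^ (b) ^ (c) ^ (d))
#define XOR4_BALANCED(a,b,c,d) (((a) ^ (b)) ^ ((c) ^ (d)))

__device__ uint32_t xor4_demo(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
    return XOR4_BALANCED(a, b, c, d);   /* same result as XOR4_CHAIN */
}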

x11/cuda_x11_echo.cu (54 changed lines)

@@ -28,7 +28,11 @@ __device__ __forceinline__ void AES_2ROUND(
 	k0++;
 }
-__constant__ uint32_t P[48] = {
+__device__ __forceinline__
+void cuda_echo_round(
+	const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
+{
+	const uint32_t P[48] = {
 	0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	//8-12

@@ -46,13 +50,10 @@ __constant__ uint32_t P[48] = {
 	0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
 	//58-61
 	};
-__device__ __forceinline__
-void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
-{
 	uint32_t k0;
 	uint32_t h[16];
-	#pragma unroll
+	#pragma unroll 16
 	for (int i = 0; i < 16; i++)
 	{
 		h[i] = hash[i];

@@ -170,13 +171,21 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 	{
 		// Big Sub Words
-		#pragma unroll 16
-		for (int i = 0; i < 16; i++)
+		#pragma unroll 4
+		for (int idx = 0; idx < 64; idx += 16)
 		{
-			int idx = i << 2; // *4
 			AES_2ROUND(sharedMemory,
 				W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3],
 				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 4], W[idx + 5], W[idx + 6], W[idx + 7],
+				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 8], W[idx + 9], W[idx + 10], W[idx + 11],
+				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 12], W[idx + 13], W[idx + 14], W[idx + 15],
+				k0);
 		}
 		// Shift Rows

@@ -241,8 +250,8 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		}
 	}
-	#pragma unroll 8
-	for (int i = 0; i<32; i += 4)
+	#pragma unroll
+	for (int i = 0; i<16; i += 4)
 	{
 		W[i] ^= W[32 + i] ^ 512;
 		W[i + 1] ^= W[32 + i + 1];

@@ -255,12 +264,29 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		hash[i] ^= W[i];
 }
-__global__ /* __launch_bounds__(320, 3) will force 64 registers on the 750 Ti */
+__device__ __forceinline__
+void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
+__global__ __launch_bounds__(128, 7) /* will force 72 registers */
 void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
-	aes_gpu_init(sharedMemory);
+	echo_gpu_init(sharedMemory);
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)

@@ -283,7 +309,7 @@ void x11_echo512_cpu_init(int thr_id, int threads)
 __host__
 void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-	const int threadsperblock = 256;
+	const int threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
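Two launch-side details change for echo: the block size drops from 256 to 128 threads, and the AES tables are loaded into shared memory by the new echo_gpu_init instead of aes_gpu_init. The load pattern is worth spelling out: the four 256-entry AES tables together are 1024 uint32 words, and under the `if (threadIdx.x < 128)` guard each participating thread writes 8 of them, two per table, at offsets threadIdx.x and threadIdx.x + 128 (written in the diff as threadIdx.x + 64 * 2). Since the kernel is now launched with exactly 128 threads per block, every thread takes part; the guard only matters if the kernel were launched wider. A hedged stand-alone sketch of the same pattern, using one hypothetical stand-in table instead of the project's d_AES0..d_AES3:

#include <stdint.h>

__device__ uint32_t d_table[256];   /* stand-in for one of the d_AESx tables */

/* Hypothetical kernel showing the cooperative load: 128 threads x 8 writes
 * fill all 1024 shared words (4 copies of a 256-entry table here). */
__global__ __launch_bounds__(128, 7)
void echo_like_kernel(int threads, uint32_t *g_out)
{
    __shared__ uint32_t sharedMemory[1024];

    if (threadIdx.x < 128) {
        /* first half of each table */
        sharedMemory[threadIdx.x +   0] = d_table[threadIdx.x];
        sharedMemory[threadIdx.x + 256] = d_table[threadIdx.x];
        sharedMemory[threadIdx.x + 512] = d_table[threadIdx.x];
        sharedMemory[threadIdx.x + 768] = d_table[threadIdx.x];
        /* second half of each table (offset 128) */
        sharedMemory[threadIdx.x + 128 +   0] = d_table[threadIdx.x + 128];
        sharedMemory[threadIdx.x + 128 + 256] = d_table[threadIdx.x + 128];
        sharedMemory[threadIdx.x + 128 + 512] = d_table[threadIdx.x + 128];
        sharedMemory[threadIdx.x + 128 + 768] = d_table[threadIdx.x + 128];
    }
    __syncthreads();   /* added here for safety; the real kernels have their own flow */

    int thread = blockDim.x * blockIdx.x + threadIdx.x;
    if (thread < threads)
        g_out[thread] = sharedMemory[threadIdx.x];   /* placeholder use of the table */
}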

x11/cuda_x11_shavite512.cu (36 changed lines)

@@ -1,9 +1,8 @@
 #include "cuda_helper.h"
+#include <memory.h>
-#define TPB 256
+#define TPB 128
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)

@@ -1294,12 +1293,30 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u
 	state[0xF] ^= p7;
 }
+__device__ __forceinline__
+void shavite_gpu_init(uint32_t *sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
 // GPU Hash
-__global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(TPB, 8) /* 64 registers if TPB 128 (fast), 80 with 92 (medium), 32 if 256 (slow) */
+void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)

@@ -1344,11 +1361,12 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 	}
 }
-__global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+__global__ __launch_bounds__(TPB, 8)
+void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 {
 	__shared__ uint32_t sharedMemory[1024];
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)

@@ -1397,9 +1415,9 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	size_t shared_size = 0;
+	cudaFuncSetCacheConfig(x11_shavite512_gpu_hash_64, cudaFuncCachePreferL1);
-	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
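On the shavite side the same two levers are pulled: TPB goes from 256 to 128 and both kernels get __launch_bounds__(TPB, 8), which per the in-diff comment keeps the compiler at 64 registers per thread at TPB 128, matching the commit message. The host launcher also drops the unused dynamic shared-memory argument and instead calls cudaFuncSetCacheConfig(..., cudaFuncCachePreferL1), which asks the driver to favour a larger L1 split for that kernel on architectures where L1 and shared memory share the same storage; the request is only honoured when the kernel's static __shared__ usage (4 KB here) still fits. A hedged host-side sketch of that launch pattern, with a hypothetical kernel standing in for x11_shavite512_gpu_hash_64:

#include <stdint.h>
#include <cuda_runtime.h>

#define TPB 128

__global__ __launch_bounds__(TPB, 8)
void shavite_demo_kernel(int threads, uint32_t *g_hash)
{
    __shared__ uint32_t sharedMemory[1024];   /* 4 KB of static shared memory */
    if (threadIdx.x < 128)
        sharedMemory[threadIdx.x] = threadIdx.x;   /* placeholder table load */
    __syncthreads();

    int thread = blockDim.x * blockIdx.x + threadIdx.x;
    if (thread < threads)
        g_hash[thread] ^= sharedMemory[threadIdx.x];
}

void demo_hash_64(int threads, uint32_t *d_hash)
{
    dim3 grid((threads + TPB - 1) / TPB);
    dim3 block(TPB);

    /* prefer L1 over shared memory where the split is configurable; no-op elsewhere */
    cudaFuncSetCacheConfig(shavite_demo_kernel, cudaFuncCachePreferL1);

    /* static __shared__ array, so no third <<<>>> argument is needed */
    shavite_demo_kernel<<<grid, block>>>(threads, d_hash);
}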
