diff --git a/Makefile.am b/Makefile.am
index f2af2d1..79f521d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -89,13 +89,6 @@ qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
-x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
-	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
-
-# Shavite compiles faster with 128 regs
-x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=128 -o $@ -c $<
-
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu
index 1d1ae07..51f407c 100644
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@@ -311,7 +311,7 @@ void aes_gpu_init(uint32_t *sharedMemory)
 }
 
 /* tried with 3 xor.b32 asm, not faster */
-#define xor4_32(a,b,c,d) (a ^ b ^ c ^ d);
+#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d));
 
 __device__ static
 void aes_round(
diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu
index 29505da..5fc2a0f 100644
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@@ -28,7 +28,11 @@ __device__ __forceinline__ void AES_2ROUND(
 	k0++;
 }
 
-__constant__ uint32_t P[48] = {
+__device__ __forceinline__
+void cuda_echo_round(
+	const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
+{
+	const uint32_t P[48] = {
 	0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	//8-12
@@ -45,14 +49,11 @@ __constant__ uint32_t P[48] = {
 	0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
 	//58-61
-};
-
-__device__ __forceinline__
-void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
-{
+	};
 	uint32_t k0;
 	uint32_t h[16];
-	#pragma unroll
+
+	#pragma unroll 16
 	for (int i = 0; i < 16; i++)
 	{
 		h[i] = hash[i];
@@ -170,13 +171,21 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 	{
 		// Big Sub Words
-		#pragma unroll 16
-		for (int i = 0; i < 16; i++)
+		#pragma unroll 4
+		for (int idx = 0; idx < 64; idx += 16)
 		{
-			int idx = i << 2; // *4
 			AES_2ROUND(sharedMemory,
 				W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3],
 				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 4], W[idx + 5], W[idx + 6], W[idx + 7],
+				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 8], W[idx + 9], W[idx + 10], W[idx + 11],
+				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 12], W[idx + 13], W[idx + 14], W[idx + 15],
+				k0);
 		}
 
 		// Shift Rows
@@ -241,8 +250,8 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		}
 	}
 
-	#pragma unroll 8
-	for (int i = 0; i<32; i += 4)
+	#pragma unroll
+	for (int i = 0; i<16; i += 4)
 	{
 		W[i] ^= W[32 + i] ^ 512;
 		W[i + 1] ^= W[32 + i + 1];
@@ -255,12 +264,29 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		hash[i] ^= W[i];
 }
 
-__global__ /* __launch_bounds__(320, 3) will force 64 registers on the 750 Ti */
+__device__ __forceinline__
+void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
+__global__ __launch_bounds__(128, 7) /* will force 72 registers */
 void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	echo_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -283,7 +309,7 @@ void x11_echo512_cpu_init(int thr_id, int threads)
 __host__
 void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-	const int threadsperblock = 256;
+	const int threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu
index f87562a..dfe6a1e 100644
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@@ -1,9 +1,8 @@
 #include "cuda_helper.h"
 
-#define TPB 256
+#include
 
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+#define TPB 128
 
 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
@@ -1294,12 +1293,30 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u
 	state[0xF] ^= p7;
 }
 
+__device__ __forceinline__
+void shavite_gpu_init(uint32_t *sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
 // GPU Hash
-__global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(TPB, 8) /* 64 registers if TPB 128 (fast), 80 with 92 (medium), 32 if 256 (slow) */
+void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -1344,11 +1361,12 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 	}
 }
 
-__global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+__global__ __launch_bounds__(TPB, 8)
+void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -1397,9 +1415,9 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	size_t shared_size = 0;
+	cudaFuncSetCacheConfig(x11_shavite512_gpu_hash_64, cudaFuncCachePreferL1);
 
-	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 
 	MyStreamSynchronize(NULL, order, thr_id);
 }
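
Note on the shared-memory AES tables (illustrative sketch, not part of the patch):
echo_gpu_init() and shavite_gpu_init() above have 128 threads of the block
cooperatively copy the four 256-entry AES tables (d_AES0..d_AES3) into a
1024-word shared array, each thread writing two entries per table. A minimal
standalone version of the same pattern is shown below; t0..t3 are placeholder
table names, and the __syncthreads() barrier is added here for clarity (the
patch's init helpers do not contain one).

#include <cuda_runtime.h>
#include <stdint.h>

__device__ uint32_t t0[256], t1[256], t2[256], t3[256]; /* stand-ins for d_AES0..d_AES3 */

__global__ void table_fill_demo(uint32_t *out)
{
	__shared__ uint32_t sharedMemory[1024];

	if (threadIdx.x < 128) {
		/* first 128 entries of each table */
		sharedMemory[threadIdx.x +   0] = t0[threadIdx.x];
		sharedMemory[threadIdx.x + 256] = t1[threadIdx.x];
		sharedMemory[threadIdx.x + 512] = t2[threadIdx.x];
		sharedMemory[threadIdx.x + 768] = t3[threadIdx.x];
		/* remaining 128 entries (the "64 * 2" offset used in the patch) */
		sharedMemory[threadIdx.x + 128 +   0] = t0[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 256] = t1[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 512] = t2[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 768] = t3[threadIdx.x + 128];
	}
	__syncthreads(); /* make the filled tables visible to every warp in the block */

	/* trivial use of the table so the copy is not optimized away */
	out[blockDim.x * blockIdx.x + threadIdx.x] = sharedMemory[threadIdx.x & 1023];
}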
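
Note on the launch configuration (sketch under stated assumptions):
__launch_bounds__(128, 7) tells the compiler to keep register use low enough
for 7 resident blocks of 128 threads, i.e. roughly 65536 / (128 * 7) ≈ 73
registers per thread on a 64K-register SM, rounded down by the allocation
granularity, which is presumably where the "will force 72 registers" comment
comes from. The host side pairs the 128-thread blocks with a 1D grid covering
the full work-item count and an L1-preferred cache split. A simplified sketch
of that pairing follows; my_kernel and launch_my_kernel are placeholder names.

#include <cuda_runtime.h>
#include <stdint.h>

__global__ __launch_bounds__(128, 7) /* cap registers so 7 blocks can be resident per SM */
void my_kernel(int threads, uint32_t startNounce, uint64_t *g_hash)
{
	int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads) {
		/* per-thread hashing work would go here */
	}
}

void launch_my_kernel(int threads, uint32_t startNounce, uint64_t *d_hash)
{
	const int threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	/* prefer L1 cache over shared memory, as the patch does for
	   x11_shavite512_gpu_hash_64 */
	cudaFuncSetCacheConfig(my_kernel, cudaFuncCachePreferL1);

	my_kernel<<<grid, block>>>(threads, startNounce, d_hash);
}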