From 470966899560ea7d97343506a01f5bc4a93ca8b0 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 16 Jun 2015 08:13:25 +0200 Subject: [PATCH] jh512: rewrite and optimize with asm swap 5% improvement by the vshl asm swap functions, mixed shl+add inst., Add also xchg(x, y) func and XCHG(x, y) define in cuda_helper for later use... other jh changes are mainly for the beauty of the code... Signed-off-by: Tanguy Pruvot --- cuda_helper.h | 7 + quark/cuda_jh512.cu | 416 ++++++++++++++++++++++++-------------------- 2 files changed, 237 insertions(+), 186 deletions(-) diff --git a/cuda_helper.h b/cuda_helper.h index b46ef97..3b86e91 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -148,6 +148,13 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) #endif +// swap two uint32_t without extra registers +__device__ __host__ __forceinline__ void xchg(uint32_t &x, uint32_t &y) { + x ^= y; y = x ^ y; x ^= y; +} +// for other types... +#define XCHG(x, y) { x ^= y; y = x ^ y; x ^= y; } + /*********************************************************************/ // Macros to catch CUDA errors in CUDA runtime calls diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index 1444c80..94453e9 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -1,236 +1,279 @@ #include "cuda_helper.h" -__constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { - { 0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40 }, - { 0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31 }, - { 0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc }, - { 0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3 }, - { 0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23 }, - { 0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97 }, - { 0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14 }, - { 0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4 }, - { 0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36 }, - { 0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f }, - { 0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b }, - { 0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62 }, - { 0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5 }, - { 0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f }, - { 0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a }, - { 0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf }, - { 0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0 }, - { 0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a }, - { 0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6 }, - { 0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67 }, - { 0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18 }, - { 0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e }, - { 0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1 }, - { 0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83 }, - { 0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef }, - { 0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65 }, - { 0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c }, - { 0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71 }, - { 0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0 }, - { 0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f }, - { 0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad }, - { 0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6 }, - { 0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63 }, - { 0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f }, - { 0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a }, - { 0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5 }, - { 0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48 }, - { 0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e }, - { 0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7 }, - { 0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde }, - { 0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a }, - { 0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2 } }; - -#define SWAP4(x,y)\ - y = (x & 0xf0f0f0f0UL); \ - x = (x ^ y); \ - y = (y >> 4); \ - x = (x << 4); \ - x= x | y; - -#define SWAP2(x,y)\ - y = (x & 0xccccccccUL); \ - x = (x ^ y); \ - y = (y >> 2); \ - x = (x << 2); \ - x= x | y; - -#define SWAP1(x,y)\ - y = (x & 0xaaaaaaaaUL); \ - x = (x ^ y); \ - y = (y >> 1); \ - x = x + x; \ - x= x | y; -/*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ -//#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); -#define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); +// #include // printf +// #include // sleep + +/* 1344 bytes */ +__constant__ static __align__(16) uint32_t c_E8_bslice32[42][8] = { + // Round 0 (Function0) + { 0xa2ded572, 0x90d6ab81, 0x67f815df, 0xf6875a4d, 0x0a15847b, 0xc54f9f4e, 0x571523b7, 0x402bd1c3 }, + { 0xe03a98ea, 0xb4960266, 0x9cfa455c, 0x8a53bbf2, 0x99d2c503, 0x1a1456b5, 0x9a99b266, 0x31a2db88 }, // 1 + { 0x5c5aa303, 0x8019051c, 0xdb0e199a, 0x1d959e84, 0x0ab23f40, 0xadeb336f, 0x1044c187, 0xdccde75e }, // 2 + { 0x9213ba10, 0x39812c0a, 0x416bbf02, 0x5078aa37, 0x156578dc, 0xd2bf1a3f, 0xd027bbf7, 0xd3910041 }, // 3 + { 0x0d5a2d42, 0x0ba75c18, 0x907eccf6, 0xac442bc7, 0x9c9f62dd, 0xd665dfd1, 0xce97c092, 0x23fcc663 }, // 4 + { 0x036c6e97, 0xbb03f1ee, 0x1ab8e09e, 0xfa618e5d, 0x7e450521, 0xb29796fd, 0xa8ec6c44, 0x97818394 }, // 5 + { 0x37858e4a, 0x8173fe8a, 0x2f3003db, 0x6c69b8f8, 0x2d8d672a, 0x4672c78a, 0x956a9ffb, 0x14427fc0 }, // 6 + // Round 7 (Function0) + { 0x8f15f4c5, 0xb775de52, 0xc45ec7bd, 0xbc88e4ae, 0xa76f4475, 0x1e00b882, 0x80bb118f, 0xf4a3a698 }, + { 0x338ff48e, 0x20edf1b6, 0x1563a3a9, 0xfde05a7c, 0x24565faa, 0x5ae9ca36, 0x89f9b7d5, 0x362c4206 }, + { 0x433529ce, 0x591ff5d0, 0x3d98fe4e, 0x86814e6f, 0x74f93a53, 0x81ad9d0e, 0xa74b9a73, 0x9f5ad8af }, + { 0x670605a7, 0x26077447, 0x6a6234ee, 0x3f1080c6, 0xbe280b8b, 0x6f7ea0e0, 0x2717b96e, 0x7b487ec6 }, + { 0xa50a550d, 0x81727686, 0xc0a4f84a, 0xd48d6050, 0x9fe7e391, 0x415a9e7e, 0x9ef18e97, 0x62b0e5f3 }, + { 0xec1f9ffc, 0xf594d74f, 0x7a205440, 0xd895fa9d, 0x001ae4e3, 0x117e2e55, 0x84c9f4ce, 0xa554c324 }, + { 0x2872df5b, 0xef7c8905, 0x286efebd, 0x2ed349ee, 0xe27ff578, 0x85937e44, 0xb2c4a50f, 0x7f5928eb }, + // Round 14 (Function0) + { 0x37695f70, 0x04771bc7, 0x4a3124b3, 0xe720b951, 0xf128865e, 0xe843fe74, 0x65e4d61d, 0x8a87d423 }, + { 0xa3e8297d, 0xfb301b1d, 0xf2947692, 0xe01bdc5b, 0x097acbdd, 0x4f4924da, 0xc1d9309b, 0xbf829cf2 }, + { 0x31bae7a4, 0x32fcae3b, 0xffbf70b4, 0x39d3bb53, 0x0544320d, 0xc1c39f45, 0x48bcf8de, 0xa08b29e0 }, + { 0xfd05c9e5, 0x01b771a2, 0x0f09aef7, 0x95ed44e3, 0x12347094, 0x368e3be9, 0x34f19042, 0x4a982f4f }, + { 0x631d4088, 0xf14abb7e, 0x15f66ca0, 0x30c60ae2, 0x4b44c147, 0xc5b67046, 0xffaf5287, 0xe68c6ecc }, + { 0x56a4d5a4, 0x45ce5773, 0x00ca4fbd, 0xadd16430, 0x4b849dda, 0x68cea6e8, 0xae183ec8, 0x67255c14 }, + { 0xf28cdaa3, 0x20b2601f, 0x16e10ecb, 0x7b846fc2, 0x5806e933, 0x7facced1, 0x9a99949a, 0x1885d1a0 }, + // Round 21 (Function0) + { 0xa15b5932, 0x67633d9f, 0xd319dd8d, 0xba6b04e4, 0xc01c9a50, 0xab19caf6, 0x46b4a5aa, 0x7eee560b }, + { 0xea79b11f, 0x5aac571d, 0x742128a9, 0x76d35075, 0x35f7bde9, 0xfec2463a, 0xee51363b, 0x01707da3 }, + { 0xafc135f7, 0x15638341, 0x42d8a498, 0xa8db3aea, 0x20eced78, 0x4d3bc3fa, 0x79676b9e, 0x832c8332 }, + { 0x1f3b40a7, 0x6c4e3ee7, 0xf347271c, 0xfd4f21d2, 0x34f04059, 0x398dfdb8, 0x9a762db7, 0xef5957dc }, + { 0x490c9b8d, 0xd0ae3b7d, 0xdaeb492b, 0x84558d7a, 0x49d7a25b, 0xf0e9a5f5, 0x0d70f368, 0x658ef8e4 }, + { 0xf4a2b8a0, 0x92946891, 0x533b1036, 0x4f88e856, 0x9e07a80c, 0x555cb05b, 0x5aec3e75, 0x4cbcbaf8 }, + { 0x993bbbe3, 0x28acae64, 0x7b9487f3, 0x6db334dc, 0xd6f4da75, 0x50a5346c, 0x5d1c6b72, 0x71db28b8 }, + // Round 28 (Function0) + { 0xf2e261f8, 0xf1bcac1c, 0x2a518d10, 0xa23fce43, 0x3364dbe3, 0x3cd1bb67, 0xfc75dd59, 0xb043e802 }, + { 0xca5b0a33, 0xc3943b92, 0x75a12988, 0x1e4d790e, 0x4d19347f, 0xd7757479, 0x5c5316b4, 0x3fafeeb6 }, + { 0xf7d4a8ea, 0x5324a326, 0x21391abe, 0xd23c32ba, 0x097ef45c, 0x4a17a344, 0x5127234c, 0xadd5a66d }, + { 0xa63e1db5, 0xa17cf84c, 0x08c9f2af, 0x4d608672, 0x983d5983, 0xcc3ee246, 0x563c6b91, 0xf6c76e08 }, + { 0xb333982f, 0xe8b6f406, 0x5e76bcb1, 0x36d4c1be, 0xa566d62b, 0x1582ee74, 0x2ae6c4ef, 0x6321efbc }, + { 0x0d4ec1fd, 0x1614c17e, 0x69c953f4, 0x16fae006, 0xc45a7da7, 0x3daf907e, 0x26585806, 0x3f9d6328 }, + { 0xe3f2c9d2, 0x16512a74, 0x0cd29b00, 0x9832e0f2, 0x30ceaa5f, 0xd830eb0d, 0x300cd4b7, 0x9af8cee3 }, + // Round 35 (Function0) + { 0x7b9ec54b, 0x574d239b, 0x9279f1b5, 0x316796e6, 0x6ee651ff, 0xf3a6e6cc, 0xd3688604, 0x05750a17 }, + { 0xd98176b1, 0xb3cb2bf4, 0xce6c3213, 0x47154778, 0x8452173c, 0x825446ff, 0x62a205f8, 0x486a9323 }, + { 0x0758df38, 0x442e7031, 0x65655e4e, 0x86ca0bd0, 0x897cfcf2, 0xa20940f0, 0x8e5086fc, 0x4e477830 }, + { 0x39eea065, 0x26b29721, 0x8338f7d1, 0x6ff81301, 0x37e95ef7, 0xd1ed44a3, 0xbd3a2ce4, 0xe7de9fef }, + { 0x15dfa08b, 0x7ceca7d8, 0xd9922576, 0x7eb027ab, 0xf6f7853c, 0xda7d8d53, 0xbe42dc12, 0xdea83eaa }, + { 0x93ce25aa, 0xdaef5fc0, 0xd86902bd, 0xa5194a17, 0xfd43f65a, 0x33664d97, 0xf908731a, 0x6a21fd4c }, + { 0x3198b435, 0xa163d09a, 0x701541db, 0x72409751, 0xbb0f1eea, 0xbf9d75f6, 0x9b54cded, 0xe26f4791 } + // 42 rounds... +}; + /*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 32-bit x*/ //#define SWAP16(x) (x) = ((((x) & 0x0000ffffUL) << 16) | (((x) & 0xffff0000UL) >> 16)); #define SWAP16(x) (x) = __byte_perm(x, x, 0x1032); -/*The MDS transform*/ +/*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ +//#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); +#define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); + +/* +__device__ __forceinline__ +static void SWAP4(uint32_t &x) { + uint32_t y = x & 0xF0F0F0F0; + x = (x ^ y) << 4; + x |= y >> 4; +} +__device__ __forceinline__ +static void SWAP2(uint32_t &x) { + uint32_t y = (x & 0xCCCCCCCC); + x = (x ^ y) << 2; + x |= y >> 2; +} +__device__ __forceinline__ +static void SWAP1(uint32_t &x) { + uint32_t y = (x & 0xAAAAAAAA); + x = (x ^ y) << 1; + x |= y >> 1; +} +*/ + +__device__ __forceinline__ +static void SWAP4x4(uint32_t *x) { + #pragma nounroll + // y is used as tmp register too + for (uint32_t y=0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xF0F0F0F0;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 4;" + "vshl.u32.u32.u32.clamp.add %0, %0, 4, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP2x4(uint32_t *x) { + #pragma nounroll + // y is used as tmp register too + for (uint32_t y=0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xCCCCCCCC;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 2;" + "vshl.u32.u32.u32.clamp.add %0, %0, 2, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP1x4(uint32_t *x) { + #pragma nounroll + // y is used as tmp register too + for (uint32_t y=0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xAAAAAAAA;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 1;" + "vshl.u32.u32.u32.clamp.add %0, %0, 1, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +/* The MDS transform */ #define L(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) ^= (m1); \ - (m5) ^= (m2); \ - (m6) ^= (m0) ^ (m3); \ - (m7) ^= (m0); \ - (m0) ^= (m5); \ - (m1) ^= (m6); \ - (m2) ^= (m4) ^ (m7); \ - (m3) ^= (m4); - -/*The Sbox*/ -#define Sbox(m0,m1,m2,m3,cc) \ + m4 ^= m1; \ + m5 ^= m2; \ + m6 ^= m0 ^ m3; \ + m7 ^= m0; \ + m0 ^= m5; \ + m1 ^= m6; \ + m2 ^= m4 ^ m7; \ + m3 ^= m4; + +/* The Sbox */ +#define Sbox(m0, m1, m2, m3, cc) \ m3 = ~(m3); \ - m0 ^= ((~(m2)) & (cc)); \ - temp0 = (cc) ^ ((m0) & (m1));\ - m0 ^= ((m2) & (m3)); \ - m3 ^= ((~(m1)) & (m2)); \ - m1 ^= ((m0) & (m2)); \ - m2 ^= ((m0) & (~(m3))); \ - m0 ^= ((m1) | (m3)); \ - m3 ^= ((m1) & (m2)); \ - m1 ^= (temp0 & (m0)); \ + m0 ^= (~(m2)) & cc; \ + temp0 = cc ^ (m0 & m1); \ + m0 ^= m2 & m3; \ + m3 ^= (~(m1)) & m2; \ + m1 ^= m0 & m2; \ + m2 ^= m0 & (~(m3)); \ + m0 ^= m1 | m3; \ + m3 ^= m1 & m2; \ + m1 ^= temp0 & m0; \ m2 ^= temp0; -static __device__ __forceinline__ void Sbox_and_MDS_layer(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void Sbox_and_MDS_layer(uint32_t x[8][4], const int rnd) { - uint32_t temp0; - uint32_t cc0, cc1; - //Sbox and MDS layer -#pragma unroll 4 - for (int i = 0; i < 4; i++) { - cc0 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i]; - cc1 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i + 4]; - Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc0); - Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc1); + uint2* cc = (uint2*) &c_E8_bslice32[rnd]; + + // Sbox and MDS layer + #pragma unroll + for (int i = 0; i < 4; i++, ++cc) { + uint32_t temp0; + Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc->x); + Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc->y); L(x[0][i], x[2][i], x[4][i], x[6][i], x[1][i], x[3][i], x[5][i], x[7][i]); } } -static __device__ __forceinline__ void RoundFunction0(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void RoundFunction0(uint32_t x[8][4], const int rnd) { - Sbox_and_MDS_layer(x, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j + 2) - { - uint32_t y; - SWAP1(x[j][0], y); - SWAP1(x[j][1], y); - SWAP1(x[j][2], y); - SWAP1(x[j][3], y); + Sbox_and_MDS_layer(x, rnd + 0); // 0, 7, 14 .. 35 + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { // 1, 3, 5, 7 (Even) + SWAP1x4(x[j]); + // SWAP1(x[j][0]); SWAP1(x[j][1]); SWAP1(x[j][2]); SWAP1(x[j][3]); } } -static __device__ __forceinline__ void RoundFunction1(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void RoundFunction1(uint32_t x[8][4], const int rnd) { - Sbox_and_MDS_layer(x, roundnumber); + Sbox_and_MDS_layer(x, rnd + 1); -#pragma unroll 4 - for (int j = 1; j < 8; j = j + 2) - { - uint32_t y; - SWAP2(x[j][0], y); - SWAP2(x[j][1], y); - SWAP2(x[j][2], y); - SWAP2(x[j][3], y); + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { + SWAP2x4(x[j]); + // SWAP2(x[j][0]); SWAP2(x[j][1]); SWAP2(x[j][2]); SWAP2(x[j][3]); } } -static __device__ __forceinline__ void RoundFunction2(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void RoundFunction2(uint32_t x[8][4], const int rnd) { - Sbox_and_MDS_layer(x, roundnumber); + Sbox_and_MDS_layer(x, rnd + 2); -#pragma unroll 4 - for (int j = 1; j < 8; j = j + 2) - { - uint32_t y; - SWAP4(x[j][0], y); - SWAP4(x[j][1], y); - SWAP4(x[j][2], y); - SWAP4(x[j][3], y); + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { + SWAP4x4(x[j]); + // SWAP4(x[j][0]); SWAP4(x[j][1]); SWAP4(x[j][2]); SWAP4(x[j][3]); } } -static __device__ __forceinline__ void RoundFunction3(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void RoundFunction3(uint32_t x[8][4], const int rnd) { - Sbox_and_MDS_layer(x, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j + 2) - { -#pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP8(x[j][i]); + Sbox_and_MDS_layer(x, rnd + 3); + + //uint32_t* xj = x[j]; + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { + SWAP8(x[j][0]); + SWAP8(x[j][1]); + SWAP8(x[j][2]); + SWAP8(x[j][3]); } } -static __device__ __forceinline__ void RoundFunction4(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void RoundFunction4(uint32_t x[8][4], const int rnd) { - Sbox_and_MDS_layer(x, roundnumber); + Sbox_and_MDS_layer(x, rnd + 4); -#pragma unroll 4 - for (int j = 1; j < 8; j = j + 2) + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { -#pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP16(x[j][i]); + //uint32_t* xj = x[j]; + #pragma unroll + for (int i = 0; i < 4; i++) + SWAP16(x[j][i]); } } -static __device__ __forceinline__ void RoundFunction5(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void RoundFunction5(uint32_t x[8][4], const int rnd) { - uint32_t temp0; - - Sbox_and_MDS_layer(x, roundnumber); + Sbox_and_MDS_layer(x, rnd + 5); -#pragma unroll 4 - for (int j = 1; j < 8; j = j + 2) + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { -#pragma unroll 2 - for (int i = 0; i < 4; i = i + 2) { - temp0 = x[j][i]; x[j][i] = x[j][i + 1]; x[j][i + 1] = temp0; - } + xchg(x[j][0], x[j][1]); + xchg(x[j][2], x[j][3]); } } -static __device__ __forceinline__ void RoundFunction6(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void RoundFunction6(uint32_t x[8][4], const int rnd) { - uint32_t temp0; + Sbox_and_MDS_layer(x, rnd + 6); - Sbox_and_MDS_layer(x, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j + 2) + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { -#pragma unroll 2 - for (int i = 0; i < 2; i++) { - temp0 = x[j][i]; x[j][i] = x[j][i + 2]; x[j][i + 2] = temp0; - } + xchg(x[j][0], x[j][2]); + xchg(x[j][1], x[j][3]); } } -/*The bijective function E8, in bitslice form */ -static __device__ __forceinline__ void E8(uint32_t x[8][4]) +/* The bijective function E8, in bitslice form */ +__device__ +static void E8(uint32_t x[8][4]) { - /*perform 6 rounds*/ - //#pragma unroll 6 - for (int i = 0; i < 42; i += 7) + /* perform 6 loops of 7 rounds */ + for (int r = 0; r < 42; r += 7) { - RoundFunction0(x, i); - RoundFunction1(x, i + 1); - RoundFunction2(x, i + 2); - RoundFunction3(x, i + 3); - RoundFunction4(x, i + 4); - RoundFunction5(x, i + 5); - RoundFunction6(x, i + 6); + RoundFunction0(x, r); + RoundFunction1(x, r); + RoundFunction2(x, r); + RoundFunction3(x, r); + RoundFunction4(x, r); + RoundFunction5(x, r); + RoundFunction6(x, r); } } __global__ __launch_bounds__(256, 4) -void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t* g_hash, const uint32_t *const __restrict__ g_nonceVector) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -238,7 +281,7 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *c const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t hashPosition = nounce - startNounce; uint32_t *Hash = &g_hash[hashPosition * 16U]; - uint32_t x[8][4] = { + uint32_t x[8][4] = { /* init */ { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, @@ -249,21 +292,22 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *c { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; - #pragma unroll 16 + #pragma unroll for (int i = 0; i < 16; i++) - x[i >> 2][i & 3] ^= Hash[i]; + x[i/4][i & 3] ^= Hash[i]; E8(x); - #pragma unroll 16 - for (int i = 0; i < 16; i++) - x[(16 + i) >> 2][(16 + i) & 3] ^= Hash[i]; + #pragma unroll + for (uint8_t i = 0; i < 16; i++) + x[(i+16)/4][(i+16) & 3] ^= Hash[i]; - x[0 >> 2][0 & 3] ^= 0x80; - x[15 >> 2][15 & 3] ^= 0x00020000; + x[0][0] ^= 0x80U; + x[3][3] ^= 0x00020000U; E8(x); - x[(16 + 0) >> 2][(16 + 0) & 3] ^= 0x80; - x[(16 + 15) >> 2][(16 + 15) & 3] ^= 0x00020000; + + x[4][0] ^= 0x80U; + x[7][3] ^= 0x00020000U; Hash[0] = x[4][0]; Hash[1] = x[4][1];