diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu index 43ec8c3..362dd9c 100644 --- a/x11/cuda_x11_aes.cu +++ b/x11/cuda_x11_aes.cu @@ -1,8 +1,14 @@ /* AES Helper for inline-usage from SPH */ -#define AESx(x) SPH_C32(x) +#define AESx(x) (x ##UL) /* SPH_C32(x) */ +//#define DEVICE_DIRECT_CONSTANTS + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES0[256] = { +#else static const uint32_t h_AES0[256] = { +#endif AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), @@ -69,7 +75,11 @@ static const uint32_t h_AES0[256] = { AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) }; +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES1[256] = { +#else static const uint32_t h_AES1[256] = { +#endif AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), @@ -136,7 +146,11 @@ static const uint32_t h_AES1[256] = { AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) }; +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES2[256] = { +#else static const uint32_t h_AES2[256] = { +#endif AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), @@ -203,7 +217,11 @@ static const uint32_t h_AES2[256] = { AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) }; +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES3[256] = { +#else static const uint32_t h_AES3[256] = { +#endif AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), AESx(0xFF0DF2F2), 
AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), @@ -270,10 +288,11 @@ static const uint32_t h_AES3[256] = { AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) }; -static __constant__ uint32_t d_AES0[256]; -static __constant__ uint32_t d_AES1[256]; -static __constant__ uint32_t d_AES2[256]; -static __constant__ uint32_t d_AES3[256]; +#ifndef DEVICE_DIRECT_CONSTANTS +static __constant__ __align__(64) uint32_t d_AES0[256]; +static __constant__ __align__(64) uint32_t d_AES1[256]; +static __constant__ __align__(64) uint32_t d_AES2[256]; +static __constant__ __align__(64) uint32_t d_AES3[256]; static void aes_cpu_init(int thr_id) { @@ -297,6 +316,9 @@ static void aes_cpu_init(int thr_id) sizeof(h_AES3), 0, cudaMemcpyHostToDevice)); } +#else +static void aes_cpu_init(int thr_id) {} +#endif __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory) @@ -319,7 +341,6 @@ static void aes_round( uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - y0 = xor4_32( sharedMemory[__byte_perm(x0, 0, 0x4440)], sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], @@ -350,7 +371,7 @@ static void aes_round( __device__ static void aes_round( const uint32_t *sharedMemory, - uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, + uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { y0 = xor4_32( diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index 0131b6c..7d4e8de 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -1,19 +1,11 @@ -#include "cuda_helper.h" +#include <memory.h> // memcpy() -#include <memory.h> +#include "cuda_helper.h" #define TPB 128 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) -__device__ __constant__ -static const uint32_t d_ShaviteInitVector[16] = { - SPH_C32(0x72FCCDD8),
SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) -}; - #include "cuda_x11_aes.cu" __device__ __forceinline__ @@ -48,8 +40,8 @@ static void KEY_EXPAND_ELT( k3 = y0; } -__device__ -static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, uint32_t count) +__device__ __forceinline__ +static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, const uint32_t count) { uint32_t p0, p1, p2, p3, p4, p5, p6, p7; uint32_t p8, p9, pA, pB, pC, pD, pE, pF; @@ -76,82 +68,114 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u pD = state[0xD]; pE = state[0xE]; pF = state[0xF]; + /* round 0 */ rk00 = msg[0]; - x0 = p4 ^ rk00; + x0 = p4 ^ msg[0]; rk01 = msg[1]; - x1 = p5 ^ rk01; + x1 = p5 ^ msg[1]; rk02 = msg[2]; - x2 = p6 ^ rk02; + x2 = p6 ^ msg[2]; rk03 = msg[3]; - x3 = p7 ^ rk03; + x3 = p7 ^ msg[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk04 = msg[4]; - x0 ^= rk04; + x0 ^= msg[4]; rk05 = msg[5]; - x1 ^= rk05; + x1 ^= msg[5]; rk06 = msg[6]; - x2 ^= rk06; + x2 ^= msg[6]; rk07 = msg[7]; - x3 ^= rk07; + x3 ^= msg[7]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk08 = msg[8]; - x0 ^= rk08; + x0 ^= msg[8]; rk09 = msg[9]; - x1 ^= rk09; + x1 ^= msg[9]; rk0A = msg[10]; - x2 ^= rk0A; + x2 ^= msg[10]; rk0B = msg[11]; - x3 ^= rk0B; + x3 ^= msg[11]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk0C = msg[12]; - x0 ^= rk0C; + x0 ^= msg[12]; rk0D = msg[13]; - x1 ^= rk0D; + x1 ^= msg[13]; rk0E = msg[14]; - x2 ^= rk0E; + x2 ^= msg[14]; rk0F = msg[15]; - x3 ^= rk0F; + x3 ^= msg[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); p0 ^= x0; p1 ^= x1; p2 ^= x2; p3 ^= x3; - rk10 = msg[16]; - x0 = pC ^ rk10; - rk11 = msg[17]; - x1 = pD ^ rk11; - rk12 
= msg[18]; - x2 = pE ^ rk12; - rk13 = msg[19]; - x3 = pF ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 = msg[20]; - x0 ^= rk14; - rk15 = msg[21]; - x1 ^= rk15; - rk16 = msg[22]; - x2 ^= rk16; - rk17 = msg[23]; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 = msg[24]; - x0 ^= rk18; - rk19 = msg[25]; - x1 ^= rk19; - rk1A = msg[26]; - x2 ^= rk1A; - rk1B = msg[27]; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C = msg[28]; - x0 ^= rk1C; - rk1D = msg[29]; - x1 ^= rk1D; - rk1E = msg[30]; - x2 ^= rk1E; - rk1F = msg[31]; - x3 ^= rk1F; + if (count == 512) + { + rk10 = 0x80U; + x0 = pC ^ 0x80U; + rk11 = 0; + x1 = pD; + rk12 = 0; + x2 = pE; + rk13 = 0; + x3 = pF; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 = 0; + rk15 = 0; + rk16 = 0; + rk17 = 0; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 = 0; + rk19 = 0; + rk1A = 0; + rk1B = 0x02000000U; + x3 ^= 0x02000000U; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C = 0; + rk1D = 0; + rk1E = 0; + rk1F = 0x02000000; + x3 ^= 0x02000000; + } + else + { + rk10 = msg[16]; + x0 = pC ^ msg[16]; + rk11 = msg[17]; + x1 = pD ^ msg[17]; + rk12 = msg[18]; + x2 = pE ^ msg[18]; + rk13 = msg[19]; + x3 = pF ^ msg[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 = msg[20]; + x0 ^= msg[20]; + rk15 = msg[21]; + x1 ^= msg[21]; + rk16 = msg[22]; + x2 ^= msg[22]; + rk17 = msg[23]; + x3 ^= msg[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 = msg[24]; + x0 ^= msg[24]; + rk19 = msg[25]; + x1 ^= msg[25]; + rk1A = msg[26]; + x2 ^= msg[26]; + rk1B = msg[27]; + x3 ^= msg[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C = msg[28]; + x0 ^= msg[28]; + rk1D = msg[29]; + x1 ^= msg[29]; + rk1E = msg[30]; + x2 ^= msg[30]; + rk1F = msg[31]; + x3 ^= msg[31]; + } AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); p8 ^= x0; p9 ^= x1; @@ -249,7 +273,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u p5 ^= x1; p6 
^= x2; p7 ^= x3; - + rk00 ^= rk19; x0 = pC ^ rk00; rk01 ^= rk1A; @@ -330,6 +354,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u p1 ^= x1; p2 ^= x2; p3 ^= x3; + /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); rk00 ^= rk1C; @@ -419,6 +444,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u pD ^= x1; pE ^= x2; pF ^= x3; + /* round 4, 8, 12 */ rk00 ^= rk19; x0 = p4 ^ rk00; @@ -516,7 +542,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u rk04 ^= rk00; rk05 ^= rk01; rk06 ^= rk02; - rk07 ^= rk03; + rk07 ^= rk03; rk07 ^= SPH_T32(~counter); x0 ^= rk04; x1 ^= rk05; @@ -591,7 +617,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u p5 ^= x1; p6 ^= x2; p7 ^= x3; - + rk00 ^= rk19; x0 = pC ^ rk00; rk01 ^= rk1A; @@ -672,6 +698,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u p1 ^= x1; p2 ^= x2; p3 ^= x3; + /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); rk00 ^= rk1C; @@ -761,6 +788,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u pD ^= x1; pE ^= x2; pF ^= x3; + /* round 4, 8, 12 */ rk00 ^= rk19; x0 = p4 ^ rk00; @@ -934,7 +962,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u p5 ^= x1; p6 ^= x2; p7 ^= x3; - + rk00 ^= rk19; x0 = pC ^ rk00; rk01 ^= rk1A; @@ -1015,6 +1043,7 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u p1 ^= x1; p2 ^= x2; p3 ^= x3; + /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); rk00 ^= rk1C; @@ -1311,7 +1340,7 @@ void shavite_gpu_init(uint32_t *sharedMemory) } // GPU Hash -__global__ __launch_bounds__(TPB, 8) /* 64 registers if TPB 128 (fast), 80 with 92 (medium), 32 if 256 (slow) */ +__global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */ void 
x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { __shared__ uint32_t sharedMemory[1024]; @@ -1327,11 +1356,12 @@ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_h uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; // kopiere init-state - uint32_t state[16]; - - #pragma unroll 16 - for(int i=0;i<16;i++) - state[i] = d_ShaviteInitVector[i]; + uint32_t state[16] = { + SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), + SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), + SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), + SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) + }; // nachricht laden uint32_t msg[32]; @@ -1361,7 +1391,7 @@ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_h } } -__global__ __launch_bounds__(TPB, 8) +__global__ __launch_bounds__(TPB, 7) void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) { __shared__ uint32_t sharedMemory[1024]; @@ -1374,11 +1404,12 @@ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputH const uint32_t nounce = startNounce + thread; // kopiere init-state - uint32_t state[16]; - - #pragma unroll 16 - for(int i=0;i<16;i++) { - state[i] = d_ShaviteInitVector[i];} + uint32_t state[16] = { + SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), + SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), + SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), + SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) + }; uint32_t msg[32]; @@ -1402,40 +1433,38 @@ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputH } //thread < threads } -__host__ void x11_shavite512_cpu_init(int 
thr_id, int threads) -{ - aes_cpu_init(thr_id); -} - -__host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const int threadsperblock = TPB; - // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - cudaFuncSetCacheConfig(x11_shavite512_gpu_hash_64, cudaFuncCachePreferL1); - x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } -__host__ void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +__host__ +void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) { const int threadsperblock = TPB; - // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - size_t shared_size = 0; - - x11_shavite512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); + x11_shavite512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash); MyStreamSynchronize(NULL, order, thr_id); } -__host__ void x11_shavite512_setBlock_80(void *pdata) +__host__ +void x11_shavite512_cpu_init(int thr_id, int threads) +{ + aes_cpu_init(thr_id); +} + +__host__ +void x11_shavite512_setBlock_80(void *pdata) { // Message mit Padding bereitstellen // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. @@ -1445,4 +1474,3 @@ __host__ void x11_shavite512_setBlock_80(void *pdata) cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); } -