From 26b9fe3586d1fce3bfe9b2cb887ae63b56ddd5ac Mon Sep 17 00:00:00 2001
From: sp-hash
Date: Tue, 18 Nov 2014 23:58:19 +0100
Subject: [PATCH] faster x15, +23KH or 4ms on whirlpool (30ms vs 34ms)

tpruvot: I didn't pick the asm replace_hiword, slower on linux
---
 cuda_helper.h             |  2 +-
 x15/cuda_x15_whirlpool.cu | 43 ++++++++++++++++++++++-----------------
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/cuda_helper.h b/cuda_helper.h
index 249599f..d94e72a 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -61,7 +61,7 @@ __device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
 
 // das Hi Word in einem 64 Bit Typen ersetzen
 __device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
+	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U);
 }
 
 // das Lo Word in einem 64 Bit Typen ersetzen
diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu
index 0bdd4c3..f15133d 100644
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@@ -5,7 +5,7 @@
 #include 
 #include 
 
-#define threadsperblock 512
+#define threadsperblock 256
 
 #define USE_SHARED 1
 
@@ -37,7 +37,7 @@ __constant__ static uint64_t mixTob7Tox[256];
  *
  * ==========================(LICENSE BEGIN)============================
  *
- * Copyright (c) 2014 djm34 & tpruvot
+ * Copyright (c) 2014 djm34 & tpruvot & SP
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
@@ -61,6 +61,7 @@ __constant__ static uint64_t mixTob7Tox[256];
  * ===========================(LICENSE END)=============================
  * @author djm34
 `* @author tpruvot
+ * @author SP
  */
 
 static const uint64_t old1_T0[256] = {
@@ -2193,7 +2194,6 @@ static const uint64_t plain_RC[10] = {
 
 /* ====================================================================== */
 
-#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)
 
 #define TRANSFER(dst, src) { \
 	dst[0] = src ## 0; \
@@ -2207,6 +2207,8 @@ static const uint64_t plain_RC[10] = {
 }
 
 #if !USE_ALL_TABLES
+#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)
+
 /* method disabled to reduce code size */
 __device__ __forceinline__
 static uint64_t table_skew(uint64_t val, int num) {
@@ -2243,22 +2245,14 @@ static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ i
 
 __device__ __forceinline__
 static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in,
-	int i0,int i1,int i2,int i3,int i4,int i5,int i6,int i7)
+const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7)
 {
-	uint32_t idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7;
-	idx0 = BYTE(in[i0], 0);
-	idx1 = BYTE(in[i1], 1) + 256;
-	idx2 = BYTE(in[i2], 2) + 512;
-	idx3 = BYTE(in[i3], 3) + 768;
-	idx4 = BYTE(in[i4], 4) + 1024;
-	idx5 = BYTE(in[i5], 5) + 1280;
-	idx6 = BYTE(in[i6], 6) + 1536;
-	idx7 = BYTE(in[i7], 7) + 1792;
-
-	return xor8(sharedMemory[idx0],sharedMemory[idx1],sharedMemory[idx2],sharedMemory[idx3],
-		sharedMemory[idx4],sharedMemory[idx5],sharedMemory[idx6],sharedMemory[idx7]);
+	uint32_t* in32 = (uint32_t*)in;
+	return (sharedMemory[__byte_perm(in32[(i0 << 1)], 0, 0x4440)] ^ sharedMemory[__byte_perm(in32[(i1 << 1)], 0, 0x4441) + 256] ^
+		sharedMemory[__byte_perm(in32[(i2 << 1)], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(in32[(i3 << 1)], 0, 0x4443) + 768] ^
+		sharedMemory[__byte_perm(in32[(i4 << 1) + 1], 0, 0x4440) + 1024] ^ sharedMemory[__byte_perm(in32[(i5 << 1) + 1], 0, 0x4441) + 1280] ^
+		sharedMemory[__byte_perm(in32[(i6 << 1) + 1], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(in32[(i7 << 1) + 1], 0, 0x4443) + 1792]);
 }
-
 #endif /* USE_ALL_TABLES */
 
 #define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) { \
@@ -2270,10 +2264,21 @@ static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ i
 	out ## 5 = xor1(ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6), c5); \
 	out ## 6 = xor1(ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7), c6); \
 	out ## 7 = xor1(ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0), c7); \
-}
+}
+
+#define ROUND1(table, in, out, c) { \
+	out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1), c); \
+	out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2); \
+	out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3); \
+	out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4); \
+	out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5); \
+	out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6); \
+	out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7); \
+	out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0); \
+}
 
 #define ROUND_KSCHED(table, in, out, c) \
-	ROUND(table, in, out, c, 0, 0, 0, 0, 0, 0, 0) \
+	ROUND1(table, in, out, c) \
 	TRANSFER(in, out)
 
 #define ROUND_WENC(table, in, key, out) \