/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */
/*
 * CubeHash implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin
 */

/*
 * Some tests were conducted on an Intel Core2 Q6600 (32-bit and 64-bit
 * mode), a PowerPC G3, and a MIPS-compatible CPU (Broadcom BCM3302).
 * It appears that the optimal settings are:
 *   -- full unroll, no state copy on the "big" systems (x86, PowerPC)
 *   -- unroll to 4 or 8, state copy on the "small" system (MIPS)
 */

#if !defined SPH_CUBEHASH_UNROLL
#define SPH_CUBEHASH_UNROLL   0
#endif
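/*
 * A note on selecting the unroll factor (illustrative; the exact build
 * command depends on your toolchain): the setting can be overridden at
 * compile time instead of editing this file, e.g.
 *
 *     cc -DSPH_CUBEHASH_UNROLL=4 -c cubehash.c
 *
 * Only the values 2, 4 and 8 select the partially unrolled variants
 * defined below; 0 (the default) or any other value selects the fully
 * unrolled SIXTEEN_ROUNDS.
 */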
__constant static const sph_u32 CUBEHASH_IV512[] = {
	SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B),
	SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C),
	SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787),
	SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537),
	SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33),
	SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485),
	SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159),
	SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9),
	SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456),
	SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1),
	SPH_C32(0x7795D246), SPH_C32(0xD43E3B44)
};

#define T32      SPH_T32
#define ROTL32   SPH_ROTL32

#define ROUND_EVEN   do { \
		xg = T32(x0 + xg); x0 = ROTL32(x0, 7); \
		xh = T32(x1 + xh); x1 = ROTL32(x1, 7); \
		xi = T32(x2 + xi); x2 = ROTL32(x2, 7); \
		xj = T32(x3 + xj); x3 = ROTL32(x3, 7); \
		xk = T32(x4 + xk); x4 = ROTL32(x4, 7); \
		xl = T32(x5 + xl); x5 = ROTL32(x5, 7); \
		xm = T32(x6 + xm); x6 = ROTL32(x6, 7); \
		xn = T32(x7 + xn); x7 = ROTL32(x7, 7); \
		xo = T32(x8 + xo); x8 = ROTL32(x8, 7); \
		xp = T32(x9 + xp); x9 = ROTL32(x9, 7); \
		xq = T32(xa + xq); xa = ROTL32(xa, 7); \
		xr = T32(xb + xr); xb = ROTL32(xb, 7); \
		xs = T32(xc + xs); xc = ROTL32(xc, 7); \
		xt = T32(xd + xt); xd = ROTL32(xd, 7); \
		xu = T32(xe + xu); xe = ROTL32(xe, 7); \
		xv = T32(xf + xv); xf = ROTL32(xf, 7); \
		x8 ^= xg; x9 ^= xh; xa ^= xi; xb ^= xj; \
		xc ^= xk; xd ^= xl; xe ^= xm; xf ^= xn; \
		x0 ^= xo; x1 ^= xp; x2 ^= xq; x3 ^= xr; \
		x4 ^= xs; x5 ^= xt; x6 ^= xu; x7 ^= xv; \
		xi = T32(x8 + xi); x8 = ROTL32(x8, 11); \
		xj = T32(x9 + xj); x9 = ROTL32(x9, 11); \
		xg = T32(xa + xg); xa = ROTL32(xa, 11); \
		xh = T32(xb + xh); xb = ROTL32(xb, 11); \
		xm = T32(xc + xm); xc = ROTL32(xc, 11); \
		xn = T32(xd + xn); xd = ROTL32(xd, 11); \
		xk = T32(xe + xk); xe = ROTL32(xe, 11); \
		xl = T32(xf + xl); xf = ROTL32(xf, 11); \
		xq = T32(x0 + xq); x0 = ROTL32(x0, 11); \
		xr = T32(x1 + xr); x1 = ROTL32(x1, 11); \
		xo = T32(x2 + xo); x2 = ROTL32(x2, 11); \
		xp = T32(x3 + xp); x3 = ROTL32(x3, 11); \
		xu = T32(x4 + xu); x4 = ROTL32(x4, 11); \
		xv = T32(x5 + xv); x5 = ROTL32(x5, 11); \
		xs = T32(x6 + xs); x6 = ROTL32(x6, 11); \
		xt = T32(x7 + xt); x7 = ROTL32(x7, 11); \
		xc ^= xi; xd ^= xj; xe ^= xg; xf ^= xh; \
		x8 ^= xm; x9 ^= xn; xa ^= xk; xb ^= xl; \
		x4 ^= xq; x5 ^= xr; x6 ^= xo; x7 ^= xp; \
		x0 ^= xu; x1 ^= xv; x2 ^= xs; x3 ^= xt; \
	} while (0)

#define ROUND_ODD   do { \
		xj = T32(xc + xj); xc = ROTL32(xc, 7); \
		xi = T32(xd + xi); xd = ROTL32(xd, 7); \
		xh = T32(xe + xh); xe = ROTL32(xe, 7); \
		xg = T32(xf + xg); xf = ROTL32(xf, 7); \
		xn = T32(x8 + xn); x8 = ROTL32(x8, 7); \
		xm = T32(x9 + xm); x9 = ROTL32(x9, 7); \
		xl = T32(xa + xl); xa = ROTL32(xa, 7); \
		xk = T32(xb + xk); xb = ROTL32(xb, 7); \
		xr = T32(x4 + xr); x4 = ROTL32(x4, 7); \
		xq = T32(x5 + xq); x5 = ROTL32(x5, 7); \
		xp = T32(x6 + xp); x6 = ROTL32(x6, 7); \
		xo = T32(x7 + xo); x7 = ROTL32(x7, 7); \
		xv = T32(x0 + xv); x0 = ROTL32(x0, 7); \
		xu = T32(x1 + xu); x1 = ROTL32(x1, 7); \
		xt = T32(x2 + xt); x2 = ROTL32(x2, 7); \
		xs = T32(x3 + xs); x3 = ROTL32(x3, 7); \
		x4 ^= xj; x5 ^= xi; x6 ^= xh; x7 ^= xg; \
		x0 ^= xn; x1 ^= xm; x2 ^= xl; x3 ^= xk; \
		xc ^= xr; xd ^= xq; xe ^= xp; xf ^= xo; \
		x8 ^= xv; x9 ^= xu; xa ^= xt; xb ^= xs; \
		xh = T32(x4 + xh); x4 = ROTL32(x4, 11); \
		xg = T32(x5 + xg); x5 = ROTL32(x5, 11); \
		xj = T32(x6 + xj); x6 = ROTL32(x6, 11); \
		xi = T32(x7 + xi); x7 = ROTL32(x7, 11); \
		xl = T32(x0 + xl); x0 = ROTL32(x0, 11); \
		xk = T32(x1 + xk); x1 = ROTL32(x1, 11); \
		xn = T32(x2 + xn); x2 = ROTL32(x2, 11); \
		xm = T32(x3 + xm); x3 = ROTL32(x3, 11); \
		xp = T32(xc + xp); xc = ROTL32(xc, 11); \
		xo = T32(xd + xo); xd = ROTL32(xd, 11); \
		xr = T32(xe + xr); xe = ROTL32(xe, 11); \
		xq = T32(xf + xq); xf = ROTL32(xf, 11); \
		xt = T32(x8 + xt); x8 = ROTL32(x8, 11); \
		xs = T32(x9 + xs); x9 = ROTL32(x9, 11); \
		xv = T32(xa + xv); xa = ROTL32(xa, 11); \
		xu = T32(xb + xu); xb = ROTL32(xb, 11); \
		x0 ^= xh; x1 ^= xg; x2 ^= xj; x3 ^= xi; \
		x4 ^= xl; x5 ^= xk; x6 ^= xn; x7 ^= xm; \
		x8 ^= xp; x9 ^= xo; xa ^= xr; xb ^= xq; \
		xc ^= xt; xd ^= xs; xe ^= xv; xf ^= xu; \
	} while (0)

/*
 * There is no need to unroll all 16 rounds. The word-swapping permutation
 * is an involution, so we need to unroll an even number of rounds. On
 * "big" systems, unrolling 4 rounds yields about 97% of the speed
 * achieved with full unrolling; and it keeps the code more compact
 * for small architectures.
 */
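/*
 * For reference: ROUND_EVEN and ROUND_ODD each perform one CubeHash
 * round, with the word-swapping steps of the specification folded into
 * a renaming of the 32 state variables (x0..xf play the role of state
 * words 0..15 and xg..xv of words 16..31); the renaming returns to the
 * identity after two rounds, hence the even/odd pair. The sketch below
 * is a plain-C illustration of the same round on an indexed state
 * array, following the public CubeHash specification; it is not used
 * by this code (hence the #if 0 guard) and its names are illustrative.
 */
#if 0
#include <stdint.h>

static uint32_t rotl32_ref(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One CubeHash round on the 32-word state x[0..31]. */
static void cubehash_round_ref(uint32_t x[32])
{
	uint32_t y[16];
	int i;

	for (i = 0; i < 16; i ++)    /* add x[i] into x[i + 16] */
		x[i + 16] += x[i];
	for (i = 0; i < 16; i ++)    /* swap halves of x[0..15]... */
		y[i ^ 8] = x[i];
	for (i = 0; i < 16; i ++)    /* ...and rotate each word by 7 */
		x[i] = rotl32_ref(y[i], 7);
	for (i = 0; i < 16; i ++)    /* xor x[i + 16] into x[i] */
		x[i] ^= x[i + 16];
	for (i = 0; i < 16; i ++)    /* swap word pairs within x[16..31] */
		y[i ^ 2] = x[i + 16];
	for (i = 0; i < 16; i ++)
		x[i + 16] = y[i];
	for (i = 0; i < 16; i ++)    /* add x[i] into x[i + 16] again */
		x[i + 16] += x[i];
	for (i = 0; i < 16; i ++)    /* swap quads of x[0..15]... */
		y[i ^ 4] = x[i];
	for (i = 0; i < 16; i ++)    /* ...and rotate each word by 11 */
		x[i] = rotl32_ref(y[i], 11);
	for (i = 0; i < 16; i ++)    /* xor x[i + 16] into x[i] */
		x[i] ^= x[i + 16];
	for (i = 0; i < 16; i ++)    /* swap adjacent words of x[16..31] */
		y[i ^ 1] = x[i + 16];
	for (i = 0; i < 16; i ++)
		x[i + 16] = y[i];
}
#endif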
#if SPH_CUBEHASH_UNROLL == 2

#define SIXTEEN_ROUNDS   do { \
		int j; \
		for (j = 0; j < 8; j ++) { \
			ROUND_EVEN; \
			ROUND_ODD; \
		} \
	} while (0)

#elif SPH_CUBEHASH_UNROLL == 4

#define SIXTEEN_ROUNDS   do { \
		int j; \
		for (j = 0; j < 4; j ++) { \
			ROUND_EVEN; \
			ROUND_ODD; \
			ROUND_EVEN; \
			ROUND_ODD; \
		} \
	} while (0)

#elif SPH_CUBEHASH_UNROLL == 8

#define SIXTEEN_ROUNDS   do { \
		int j; \
		for (j = 0; j < 2; j ++) { \
			ROUND_EVEN; \
			ROUND_ODD; \
			ROUND_EVEN; \
			ROUND_ODD; \
			ROUND_EVEN; \
			ROUND_ODD; \
			ROUND_EVEN; \
			ROUND_ODD; \
		} \
	} while (0)

#else

#define SIXTEEN_ROUNDS   do { \
		ROUND_EVEN; \
		ROUND_ODD; \
		ROUND_EVEN; \
		ROUND_ODD; \
		ROUND_EVEN; \
		ROUND_ODD; \
		ROUND_EVEN; \
		ROUND_ODD; \
		ROUND_EVEN; \
		ROUND_ODD; \
		ROUND_EVEN; \
		ROUND_ODD; \
		ROUND_EVEN; \
		ROUND_ODD; \
		ROUND_EVEN; \
		ROUND_ODD; \
	} while (0)

#endif
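/*
 * For context, a minimal sketch of how SIXTEEN_ROUNDS is typically
 * driven in CubeHash16/32 (the CubeHash-512 parameterization: 16 rounds
 * per 32-byte block). This is illustrative only and not part of this
 * file's interface: the variables m[] (a little-endian-decoded message
 * block) and k are hypothetical, and the surrounding code that declares
 * x0..xv is not shown in this excerpt.
 */
#if 0
	/* Absorb one 32-byte message block into the first 8 state words,
	   then apply the 16 rounds. The CUBEHASH_IV512 table above is the
	   state after CubeHash's initialization rounds, precomputed once. */
	x0 ^= m[0]; x1 ^= m[1]; x2 ^= m[2]; x3 ^= m[3];
	x4 ^= m[4]; x5 ^= m[5]; x6 ^= m[6]; x7 ^= m[7];
	SIXTEEN_ROUNDS;

	/* Finalization, after the padded final block has been absorbed:
	   flip one bit in the last state word, then run 10 more batches
	   of 16 rounds, per the specification. */
	xv ^= SPH_C32(1);
	for (k = 0; k < 10; k ++)
		SIXTEEN_ROUNDS;
#endif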