OpenCL GPU miner
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

1792 lines
48 KiB

/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
/*
* SIMD implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "sph_simd.h"
/*
 * Unless the build sets SPH_SMALL_FOOTPRINT_SIMD explicitly, it is turned
 * on whenever the generic SPH_SMALL_FOOTPRINT switch is non-zero.
 */
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
#define SPH_SMALL_FOOTPRINT_SIMD 1
#endif
/*
 * MSVC only: silence warning C4146.
 * NOTE(review): per the MSVC documentation C4146 is "unary minus operator
 * applied to unsigned type" -- confirm which expression triggers it.
 */
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/* Short local aliases for the sph integer types and helper macros. */
typedef sph_u32 u32;
typedef sph_s32 s32;
#define C32 SPH_C32
#define T32 SPH_T32
#define ROL32 SPH_ROTL32
/*
 * Token pasting with prior macro expansion: XCAT() lets both arguments be
 * macro-expanded before XCAT_() glues the results together with ##.
 */
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
/*
 * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive:
 * alpha_tab[k] = 41^k mod 257.
 *
 * FFT_LOOP indexes this table to obtain its butterfly multipliers.
 * Entry 128 is 256, i.e. -1 modulo 257, so 41^256 = 1 (mod 257).
 */
static const s32 alpha_tab[] = {
1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130,
190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28,
120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180,
184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19,
8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12,
235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224,
189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155,
187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152,
64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96,
81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250,
227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212,
211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188,
255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254,
134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201,
17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154,
146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219,
241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233,
44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66,
136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204,
140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210,
129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65,
95, 40, 98, 163
};
/*
 * Partial reductions modulo 257. Since 2^8 = -1 and 2^16 = 1 (mod 257),
 * folding the upper bits onto the lower ones keeps the value congruent
 * while narrowing its range; the result is NOT fully reduced.
 *
 *   REDS1: input in -32768..98302     ->  output in -383..383
 *   REDS2: input in -2^31..2^31-1     ->  output in -32768..98302
 *
 * The argument is evaluated twice, so it must have no side effects.
 * For negative inputs the macros rely on '>>' being an arithmetic
 * (sign-propagating) shift.
 */
#define REDS1(x)   (-((x) >> 8) + ((x) & 0xFF))
#define REDS2(x)   (((x) >> 16) + ((x) & 0xFFFF))
/*
 * One butterfly pass over q[rb .. rb + 2*hk - 1]. For every u in
 * 0..hk-1, with m = q[rb + u] and n = q[rb + u + hk]:
 *
 *     t = REDS2(n * alpha_tab[u * as]);
 *     q[rb + u]      = m + t;
 *     q[rb + u + hk] = m - t;
 *
 * Parameters:
 *   rb   index of the first element in q[] (q comes from the caller's scope)
 *   hk   half-length of the pass; the loop advances four elements at a time
 *   as   step in alpha_tab[] between consecutive elements
 *   id   label name; it is defined inside the expansion, so it must be
 *        unique within the enclosing function
 *
 * Element u = 0 is handled before the loop without multiplication or
 * reduction (alpha_tab[0] is 1). "goto id" then enters the loop body at
 * the code for element 1, skipping the generic code for element 0 on the
 * first iteration only.
 *
 * If, upon entry, the values of q[] are all in the -N..N range (where
 * N >= 98302) then the new values of q[] are in the -2N..2N range.
 *
 * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
 */
#define FFT_LOOP(rb, hk, as, id) do { \
size_t u, v; \
s32 m = q[(rb)]; \
s32 n = q[(rb) + (hk)]; \
q[(rb)] = m + n; \
q[(rb) + (hk)] = m - n; \
u = v = 0; \
goto id; \
for (; u < (hk); u += 4, v += 4 * (as)) { \
s32 t; \
m = q[(rb) + u + 0]; \
n = q[(rb) + u + 0 + (hk)]; \
t = REDS2(n * alpha_tab[v + 0 * (as)]); \
q[(rb) + u + 0] = m + t; \
q[(rb) + u + 0 + (hk)] = m - t; \
id: \
m = q[(rb) + u + 1]; \
n = q[(rb) + u + 1 + (hk)]; \
t = REDS2(n * alpha_tab[v + 1 * (as)]); \
q[(rb) + u + 1] = m + t; \
q[(rb) + u + 1 + (hk)] = m - t; \
m = q[(rb) + u + 2]; \
n = q[(rb) + u + 2 + (hk)]; \
t = REDS2(n * alpha_tab[v + 2 * (as)]); \
q[(rb) + u + 2] = m + t; \
q[(rb) + u + 2 + (hk)] = m - t; \
m = q[(rb) + u + 3]; \
n = q[(rb) + u + 3 + (hk)]; \
t = REDS2(n * alpha_tab[v + 3 * (as)]); \
q[(rb) + u + 3] = m + t; \
q[(rb) + u + 3 + (hk)] = m - t; \
} \
} while (0)
/*
 * 8-point transform of four input values. Reads x[xb + k*xs] for
 * k = 0..3 (x comes from the caller's scope) and stores eight results
 * into the caller-declared variables <d>0 .. <d>7 (d is a name prefix
 * that gets token-pasted with the digit).
 *
 * Modulo 257 the results satisfy
 *     <d>j = x0 + x1 * 4^j + x2 * 4^(2j) + x3 * 4^(3j)
 * The multiplications are shifts (4 = 1 << 2, 16 = 1 << 4, 64 = 1 << 6)
 * and the identities 4^4 = -1, 16^3 = -16, 64^2 = -16, 64^3 = 4 (mod 257)
 * account for the signs and shift counts below. Only b1 and b3 are
 * partially reduced (REDS1).
 *
 * Output ranges:
 * d0: min= 0 max= 1020
 * d1: min= -67 max= 4587
 * d2: min=-4335 max= 4335
 * d3: min=-4147 max= 507
 * d4: min= -510 max= 510
 * d5: min= -252 max= 4402
 * d6: min=-4335 max= 4335
 * d7: min=-4332 max= 322
 */
#define FFT8(xb, xs, d) do { \
s32 x0 = x[(xb)]; \
s32 x1 = x[(xb) + (xs)]; \
s32 x2 = x[(xb) + 2 * (xs)]; \
s32 x3 = x[(xb) + 3 * (xs)]; \
s32 a0 = x0 + x2; \
s32 a1 = x0 + (x2 << 4); \
s32 a2 = x0 - x2; \
s32 a3 = x0 - (x2 << 4); \
s32 b0 = x1 + x3; \
s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
s32 b2 = (x1 << 4) - (x3 << 4); \
s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
d ## 0 = a0 + b0; \
d ## 1 = a1 + b1; \
d ## 2 = a2 + b2; \
d ## 3 = a3 + b3; \
d ## 4 = a0 - b0; \
d ## 5 = a1 - b1; \
d ## 6 = a2 - b2; \
d ## 7 = a3 - b3; \
} while (0)
/*
 * 16-point transform: two FFT8 passes (inputs x[xb + k*xs] for even and
 * odd k, k = 0..7) combined by one butterfly pass, written to
 * q[rb .. rb + 15] (x and q come from the caller's scope).
 *
 * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
 * to some shifting.
 *
 * NOTE(review): the FFT8 output ranges include negative values, so some
 * of the "d2_k << k" operands can be negative; ISO C leaves left shifts
 * of negative values undefined. The code relies on the usual
 * two's-complement behaviour.
 *
 * Output: within -591471..591723
 */
#define FFT16(xb, xs, rb) do { \
s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
FFT8(xb, (xs) << 1, d1_); \
FFT8((xb) + (xs), (xs) << 1, d2_); \
q[(rb) + 0] = d1_0 + d2_0; \
q[(rb) + 1] = d1_1 + (d2_1 << 1); \
q[(rb) + 2] = d1_2 + (d2_2 << 2); \
q[(rb) + 3] = d1_3 + (d2_3 << 3); \
q[(rb) + 4] = d1_4 + (d2_4 << 4); \
q[(rb) + 5] = d1_5 + (d2_5 << 5); \
q[(rb) + 6] = d1_6 + (d2_6 << 6); \
q[(rb) + 7] = d1_7 + (d2_7 << 7); \
q[(rb) + 8] = d1_0 - d2_0; \
q[(rb) + 9] = d1_1 - (d2_1 << 1); \
q[(rb) + 10] = d1_2 - (d2_2 << 2); \
q[(rb) + 11] = d1_3 - (d2_3 << 3); \
q[(rb) + 12] = d1_4 - (d2_4 << 4); \
q[(rb) + 13] = d1_5 - (d2_5 << 5); \
q[(rb) + 14] = d1_6 - (d2_6 << 6); \
q[(rb) + 15] = d1_7 - (d2_7 << 7); \
} while (0)
/*
 * 32-point transform: FFT16 over the even-numbered inputs into
 * q[rb .. rb+15], FFT16 over the odd-numbered inputs into
 * q[rb+16 .. rb+31], then one FFT_LOOP pass (half-length 16,
 * alpha_tab step 8). Reads x[xb + k*xs] for k = 0..15.
 * id is the label name handed to FFT_LOOP and must be unique within
 * the enclosing function.
 *
 * Output range: |q| <= 1183446
 */
#define FFT32(xb, xs, rb, id) do { \
FFT16(xb, (xs) << 1, rb); \
FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
FFT_LOOP(rb, 16, 8, id); \
} while (0)
/*
 * 64-point transform: two FFT32 halves (even / odd inputs) followed by
 * one FFT_LOOP pass (half-length 32, alpha_tab step 4). Reads
 * x[xb + k*xs] for k = 0..31 and writes q[rb .. rb+63].
 * The labels of the nested passes are derived from id by appending
 * "a" and "b", so one unique id per invocation is sufficient.
 *
 * Output range: |q| <= 2366892
 */
#define FFT64(xb, xs, rb, id) do { \
FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
FFT_LOOP(rb, 32, 4, id); \
} while (0)
#if SPH_SMALL_FOOTPRINT_SIMD
/*
 * Out-of-line 32-point transform used by the small-footprint FFT128.
 *
 *   x    input bytes; x[k * xs] is read for k = 0..15 and never written,
 *        hence the const qualifier
 *   xs   distance (in bytes) between consecutive inputs
 *   q    receives the 32 output values q[0..31]
 *
 * Same computation as FFT32(0, xs, 0, ...), but as a real function so
 * that the code is emitted only once.
 */
static void
fft32(const unsigned char *x, size_t xs, s32 *q)
{
	size_t xd;

	xd = xs << 1;
	/* even-numbered inputs -> q[0..15], odd-numbered -> q[16..31] */
	FFT16(0, xd, 0);
	FFT16(xs, xd, 16);
	/* combine both halves */
	FFT_LOOP(0, 16, 8, label_);
}
/*
 * 128-point transform, small-footprint flavour: four calls to fft32()
 * (inputs taken at offsets 0, 2, 1 and 3 times xs, with stride 4*xs)
 * fill q[rb .. rb+127] in four chunks of 32; two FFT_LOOP passes merge
 * the chunks pairwise and a last pass merges the two halves.
 * Labels are derived from id by appending "aa", "ab" and "a".
 */
#define FFT128(xb, xs, rb, id) do { \
fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
} while (0)
#else
/*
 * 128-point transform, fully inlined flavour: two FFT64 halves (even /
 * odd inputs) followed by one FFT_LOOP pass (half-length 64, alpha_tab
 * step 2). Reads x[xb + k*xs] for k = 0..63, writes q[rb .. rb+127].
 *
 * Output range: |q| <= 4733784
 */
#define FFT128(xb, xs, rb, id) do { \
FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
FFT_LOOP(rb, 64, 2, id); \
} while (0)
#endif
/*
 * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
 * function which does not fit in the 32 kB L1 cache of a typical x86
 * Intel. We therefore add a function call layer at the FFT64 level.
 *
 *   x    input bytes; x[k * xs] is read for k = 0..31 and never written,
 *        hence the const qualifier
 *   xs   distance (in bytes) between consecutive inputs
 *   q    receives the 64 output values q[0..63]
 */
static void
fft64(const unsigned char *x, size_t xs, s32 *q)
{
	size_t xd;

	xd = xs << 1;
	/* even-numbered inputs -> q[0..31], odd-numbered -> q[32..63] */
	FFT32(0, xd, 0, label_a);
	FFT32(xs, xd, 32, label_b);
	/* combine both halves */
	FFT_LOOP(0, 32, 4, label_);
}
/*
 * 256-point transform: four calls to fft64() (inputs taken at offsets
 * 0, 2, 1 and 3 times xs, with stride 4*xs) fill q[rb .. rb+255] in four
 * chunks of 64; two FFT_LOOP passes merge the chunks pairwise and a last
 * pass (alpha_tab step 1) merges the two halves.
 * Labels are derived from id by appending "aa", "ab" and "a".
 *
 * Output range: |q| <= 9467568
 */
#define FFT256(xb, xs, rb, id) do { \
fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \
FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
} while (0)
/*
 * alpha^(127*i) mod 257, for i = 0..127.
 *
 * Here "alpha" is 139 (= 41^2 mod 257, i.e. alpha_tab[2]), not the 41 of
 * alpha_tab: entry 1 is 98 and 98 * 139 = 1 (mod 257).
 *
 * compress_small() adds entry i to q[i] after the FFT for every block
 * except the final one (last == 0).
 */
static const unsigned short yoff_s_n[] = {
1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29,
15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178,
225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100,
34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215,
253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141,
197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59,
128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114,
121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168,
16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207,
240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21,
2, 196, 190, 116, 60, 226, 46, 139
};
/*
 * alpha^(127*i) + alpha^(125*i) mod 257, for i = 0..127, with the same
 * alpha (139) as yoff_s_n. Example: entry 1 is 98 + 98^3 = 98 + 58 = 156
 * (mod 257).
 *
 * compress_small() adds entry i to q[i] after the FFT when processing
 * the final block (last != 0).
 */
static const unsigned short yoff_s_f[] = {
2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3,
49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65,
96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113,
17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143,
189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6,
77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95,
160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53,
181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150,
0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109,
210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30,
10, 146, 117, 251, 180, 247, 236, 108
};
/*
 * beta^(255*i) mod 257, for i = 0..255.
 *
 * "beta" is 41, the generator tabulated in alpha_tab: entry 1 is 163 and
 * 163 * 41 = 1 (mod 257).
 *
 * compress_big() adds entry i to q[i] after the FFT for every block
 * except the final one (last == 0).
 */
static const unsigned short yoff_b_n[] = {
1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172,
23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101,
15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10,
88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230,
225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150,
35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109,
34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194,
11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93,
253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83,
165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110,
197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217,
162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108,
128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171,
117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78,
121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252,
213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142,
16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182,
111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74,
240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160,
123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82,
2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87,
46, 45, 139, 41
};
/*
 * beta^(255*i) + beta^(253*i) mod 257, for i = 0..255, with the same
 * beta (41) as yoff_b_n. Example: entry 1 is 163 + 163^3 = 163 + 40 = 203
 * (mod 257).
 *
 * compress_big() adds entry i to q[i] after the FFT when processing the
 * final block (last != 0).
 */
static const unsigned short yoff_b_f[] = {
2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20,
111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89,
49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239,
253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79,
96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226,
248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115,
17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208,
57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40,
189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45,
187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107,
77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210,
139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6,
160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190,
106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208,
181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127,
96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193,
0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44,
245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9,
210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94,
53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185,
10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156,
236, 192, 108, 86
};
/*
 * Build one 32-bit message word from two values: the low 16 bits of
 * l * mm go to bits 0..15 and h * mm is shifted into bits 16 and above.
 * The products are computed in the operands' own (signed) type before
 * the conversion to u32.
 */
#define INNER(l, h, mm)   (((u32)((h) * (mm)) << 16) \
	+ (0xFFFFU & (u32)((l) * (mm))))
/*
 * Message words for one step of the 4-lane (small) compression function.
 *
 * W_SMALL(sb, o1, o2, mm) expands to an OPENING parenthesis followed by
 * four comma-separated INNER() words; word j (0..3) combines
 * q[8*sb + 2*j + o1] (low half) and q[8*sb + 2*j + o2] (high half), both
 * multiplied by mm. The closing parenthesis is missing on purpose:
 * STEP_SMALL_() appends the remaining arguments and the ')' so that the
 * whole thing becomes a complete STEP_SMALL(...) invocation.
 *
 * WS_r_s is the word group for step s (0..7) of round r (0..3); the
 * names are assembled by token pasting in ONE_ROUND_SMALL.
 */
#define W_SMALL(sb, o1, o2, mm) \
(INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)
/* Round 0: adjacent pairs (offsets 0 and 1), multiplier 185. */
#define WS_0_0 W_SMALL( 4, 0, 1, 185)
#define WS_0_1 W_SMALL( 6, 0, 1, 185)
#define WS_0_2 W_SMALL( 0, 0, 1, 185)
#define WS_0_3 W_SMALL( 2, 0, 1, 185)
#define WS_0_4 W_SMALL( 7, 0, 1, 185)
#define WS_0_5 W_SMALL( 5, 0, 1, 185)
#define WS_0_6 W_SMALL( 3, 0, 1, 185)
#define WS_0_7 W_SMALL( 1, 0, 1, 185)
/* Round 1: adjacent pairs (offsets 0 and 1), multiplier 185. */
#define WS_1_0 W_SMALL(15, 0, 1, 185)
#define WS_1_1 W_SMALL(11, 0, 1, 185)
#define WS_1_2 W_SMALL(12, 0, 1, 185)
#define WS_1_3 W_SMALL( 8, 0, 1, 185)
#define WS_1_4 W_SMALL( 9, 0, 1, 185)
#define WS_1_5 W_SMALL(13, 0, 1, 185)
#define WS_1_6 W_SMALL(10, 0, 1, 185)
#define WS_1_7 W_SMALL(14, 0, 1, 185)
/* Round 2: pairs 64 entries apart (offsets -128 and -64), multiplier 233. */
#define WS_2_0 W_SMALL(17, -128, -64, 233)
#define WS_2_1 W_SMALL(18, -128, -64, 233)
#define WS_2_2 W_SMALL(23, -128, -64, 233)
#define WS_2_3 W_SMALL(20, -128, -64, 233)
#define WS_2_4 W_SMALL(22, -128, -64, 233)
#define WS_2_5 W_SMALL(21, -128, -64, 233)
#define WS_2_6 W_SMALL(16, -128, -64, 233)
#define WS_2_7 W_SMALL(19, -128, -64, 233)
/* Round 3: pairs 64 entries apart (offsets -191 and -127), multiplier 233. */
#define WS_3_0 W_SMALL(30, -191, -127, 233)
#define WS_3_1 W_SMALL(24, -191, -127, 233)
#define WS_3_2 W_SMALL(25, -191, -127, 233)
#define WS_3_3 W_SMALL(31, -191, -127, 233)
#define WS_3_4 W_SMALL(27, -191, -127, 233)
#define WS_3_5 W_SMALL(29, -191, -127, 233)
#define WS_3_6 W_SMALL(28, -191, -127, 233)
#define WS_3_7 W_SMALL(26, -191, -127, 233)
/*
 * Message words for one step of the 8-lane (big) compression function.
 *
 * W_BIG(sb, o1, o2, mm) expands to an OPENING parenthesis followed by
 * eight comma-separated INNER() words; word j (0..7) combines
 * q[16*sb + 2*j + o1] (low half) and q[16*sb + 2*j + o2] (high half),
 * both multiplied by mm. The closing parenthesis is missing on purpose:
 * STEP_BIG_() appends the remaining arguments and the ')' so that the
 * whole thing becomes a complete STEP_BIG(...) invocation.
 *
 * WB_r_s is the word group for step s (0..7) of round r (0..3); the
 * names are assembled by token pasting in ONE_ROUND_BIG.
 */
#define W_BIG(sb, o1, o2, mm) \
(INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)
/* Round 0: adjacent pairs (offsets 0 and 1), multiplier 185. */
#define WB_0_0 W_BIG( 4, 0, 1, 185)
#define WB_0_1 W_BIG( 6, 0, 1, 185)
#define WB_0_2 W_BIG( 0, 0, 1, 185)
#define WB_0_3 W_BIG( 2, 0, 1, 185)
#define WB_0_4 W_BIG( 7, 0, 1, 185)
#define WB_0_5 W_BIG( 5, 0, 1, 185)
#define WB_0_6 W_BIG( 3, 0, 1, 185)
#define WB_0_7 W_BIG( 1, 0, 1, 185)
/* Round 1: adjacent pairs (offsets 0 and 1), multiplier 185. */
#define WB_1_0 W_BIG(15, 0, 1, 185)
#define WB_1_1 W_BIG(11, 0, 1, 185)
#define WB_1_2 W_BIG(12, 0, 1, 185)
#define WB_1_3 W_BIG( 8, 0, 1, 185)
#define WB_1_4 W_BIG( 9, 0, 1, 185)
#define WB_1_5 W_BIG(13, 0, 1, 185)
#define WB_1_6 W_BIG(10, 0, 1, 185)
#define WB_1_7 W_BIG(14, 0, 1, 185)
/* Round 2: pairs 128 entries apart (offsets -256 and -128), multiplier 233. */
#define WB_2_0 W_BIG(17, -256, -128, 233)
#define WB_2_1 W_BIG(18, -256, -128, 233)
#define WB_2_2 W_BIG(23, -256, -128, 233)
#define WB_2_3 W_BIG(20, -256, -128, 233)
#define WB_2_4 W_BIG(22, -256, -128, 233)
#define WB_2_5 W_BIG(21, -256, -128, 233)
#define WB_2_6 W_BIG(16, -256, -128, 233)
#define WB_2_7 W_BIG(19, -256, -128, 233)
/* Round 3: pairs 128 entries apart (offsets -383 and -255), multiplier 233. */
#define WB_3_0 W_BIG(30, -383, -255, 233)
#define WB_3_1 W_BIG(24, -383, -255, 233)
#define WB_3_2 W_BIG(25, -383, -255, 233)
#define WB_3_3 W_BIG(31, -383, -255, 233)
#define WB_3_4 W_BIG(27, -383, -255, 233)
#define WB_3_5 W_BIG(29, -383, -255, 233)
#define WB_3_6 W_BIG(28, -383, -255, 233)
#define WB_3_7 W_BIG(26, -383, -255, 233)
/*
 * Bitwise Boolean functions used by the step macros.
 *   IF(x, y, z):  for each bit, y where x is 1, z where x is 0.
 *   MAJ(x, y, z): for each bit, the value held by at least two inputs.
 * Arguments are evaluated more than once; pass side-effect-free values.
 */
#define IF(x, y, z)    ((z) ^ ((x) & ((z) ^ (y))))
#define MAJ(x, y, z)   (((z) & ((y) | (x))) | ((y) & (x)))
/*
 * Lane permutations for the 4-lane (small) variant: PP4_j_n equals
 * n XOR (j + 1). STEP_ELT pastes the value onto "tA" to pick which
 * rotated A word is added into lane n, so every definition must stay a
 * single digit token. The same XOR masks appear as pp4k[] in
 * one_round_small().
 */
#define PP4_0_0 1
#define PP4_0_1 0
#define PP4_0_2 3
#define PP4_0_3 2
#define PP4_1_0 2
#define PP4_1_1 3
#define PP4_1_2 0
#define PP4_1_3 1
#define PP4_2_0 3
#define PP4_2_1 2
#define PP4_2_2 1
#define PP4_2_3 0
/*
 * Lane permutations for the 8-lane (big) variant: PP8_j_n equals
 * n XOR m_j with m = { 1, 6, 2, 3, 5, 7, 4 } for j = 0..6 (the same masks
 * appear as pp8k[] in one_round_big()). STEP_ELT pastes the value onto
 * "tA" to pick which rotated A word is added into lane n, so every
 * definition must stay a single digit token.
 */
#define PP8_0_0 1
#define PP8_0_1 0
#define PP8_0_2 3
#define PP8_0_3 2
#define PP8_0_4 5
#define PP8_0_5 4
#define PP8_0_6 7
#define PP8_0_7 6
#define PP8_1_0 6
#define PP8_1_1 7
#define PP8_1_2 4
#define PP8_1_3 5
#define PP8_1_4 2
#define PP8_1_5 3
#define PP8_1_6 0
#define PP8_1_7 1
#define PP8_2_0 2
#define PP8_2_1 3
#define PP8_2_2 0
#define PP8_2_3 1
#define PP8_2_4 6
#define PP8_2_5 7
#define PP8_2_6 4
#define PP8_2_7 5
#define PP8_3_0 3
#define PP8_3_1 2
#define PP8_3_2 1
#define PP8_3_3 0
#define PP8_3_4 7
#define PP8_3_5 6
#define PP8_3_6 5
#define PP8_3_7 4
#define PP8_4_0 5
#define PP8_4_1 4
#define PP8_4_2 7
#define PP8_4_3 6
#define PP8_4_4 1
#define PP8_4_5 0
#define PP8_4_6 3
#define PP8_4_7 2
#define PP8_5_0 7
#define PP8_5_1 6
#define PP8_5_2 5
#define PP8_5_3 4
#define PP8_5_4 3
#define PP8_5_5 2
#define PP8_5_6 1
#define PP8_5_7 0
#define PP8_6_0 4
#define PP8_6_1 5
#define PP8_6_2 6
#define PP8_6_3 7
#define PP8_6_4 0
#define PP8_6_5 1
#define PP8_6_6 2
#define PP8_6_7 3
#if SPH_SIMD_NOCOPY
/*
 * SPH_SIMD_NOCOPY build: no local copy of the state is declared, loaded
 * or stored, so all six helpers expand to nothing. In that build the
 * unrolled compress_small() accesses the state words in place through
 * the A0.. aliases for sc->state[] defined just before it.
 */
#define DECL_STATE_SMALL
#define READ_STATE_SMALL(sc)
#define WRITE_STATE_SMALL(sc)
#define DECL_STATE_BIG
#define READ_STATE_BIG(sc)
#define WRITE_STATE_BIG(sc)
#else
/*
 * Copying build, 4-lane variant: the 16 state words live in local
 * variables A0..A3, B0..B3, C0..C3, D0..D3 (state[0..3] is A,
 * state[4..7] is B, state[8..11] is C, state[12..15] is D).
 *
 * DECL_STATE_SMALL declares the locals; note that it already ends with
 * a semicolon.
 */
#define DECL_STATE_SMALL \
u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;
/* Load the locals from (sc)->state[0..15]. */
#define READ_STATE_SMALL(sc) do { \
A0 = (sc)->state[ 0]; \
A1 = (sc)->state[ 1]; \
A2 = (sc)->state[ 2]; \
A3 = (sc)->state[ 3]; \
B0 = (sc)->state[ 4]; \
B1 = (sc)->state[ 5]; \
B2 = (sc)->state[ 6]; \
B3 = (sc)->state[ 7]; \
C0 = (sc)->state[ 8]; \
C1 = (sc)->state[ 9]; \
C2 = (sc)->state[10]; \
C3 = (sc)->state[11]; \
D0 = (sc)->state[12]; \
D1 = (sc)->state[13]; \
D2 = (sc)->state[14]; \
D3 = (sc)->state[15]; \
} while (0)
/* Store the locals back into (sc)->state[0..15]. */
#define WRITE_STATE_SMALL(sc) do { \
(sc)->state[ 0] = A0; \
(sc)->state[ 1] = A1; \
(sc)->state[ 2] = A2; \
(sc)->state[ 3] = A3; \
(sc)->state[ 4] = B0; \
(sc)->state[ 5] = B1; \
(sc)->state[ 6] = B2; \
(sc)->state[ 7] = B3; \
(sc)->state[ 8] = C0; \
(sc)->state[ 9] = C1; \
(sc)->state[10] = C2; \
(sc)->state[11] = C3; \
(sc)->state[12] = D0; \
(sc)->state[13] = D1; \
(sc)->state[14] = D2; \
(sc)->state[15] = D3; \
} while (0)
/*
 * Copying build, 8-lane variant: the 32 state words live in local
 * variables A0..A7, B0..B7, C0..C7, D0..D7 (state[0..7] is A,
 * state[8..15] is B, state[16..23] is C, state[24..31] is D).
 *
 * DECL_STATE_BIG declares the locals; note that it already ends with a
 * semicolon.
 */
#define DECL_STATE_BIG \
u32 A0, A1, A2, A3, A4, A5, A6, A7; \
u32 B0, B1, B2, B3, B4, B5, B6, B7; \
u32 C0, C1, C2, C3, C4, C5, C6, C7; \
u32 D0, D1, D2, D3, D4, D5, D6, D7;
/* Load the locals from (sc)->state[0..31]. */
#define READ_STATE_BIG(sc) do { \
A0 = (sc)->state[ 0]; \
A1 = (sc)->state[ 1]; \
A2 = (sc)->state[ 2]; \
A3 = (sc)->state[ 3]; \
A4 = (sc)->state[ 4]; \
A5 = (sc)->state[ 5]; \
A6 = (sc)->state[ 6]; \
A7 = (sc)->state[ 7]; \
B0 = (sc)->state[ 8]; \
B1 = (sc)->state[ 9]; \
B2 = (sc)->state[10]; \
B3 = (sc)->state[11]; \
B4 = (sc)->state[12]; \
B5 = (sc)->state[13]; \
B6 = (sc)->state[14]; \
B7 = (sc)->state[15]; \
C0 = (sc)->state[16]; \
C1 = (sc)->state[17]; \
C2 = (sc)->state[18]; \
C3 = (sc)->state[19]; \
C4 = (sc)->state[20]; \
C5 = (sc)->state[21]; \
C6 = (sc)->state[22]; \
C7 = (sc)->state[23]; \
D0 = (sc)->state[24]; \
D1 = (sc)->state[25]; \
D2 = (sc)->state[26]; \
D3 = (sc)->state[27]; \
D4 = (sc)->state[28]; \
D5 = (sc)->state[29]; \
D6 = (sc)->state[30]; \
D7 = (sc)->state[31]; \
} while (0)
/* Store the locals back into (sc)->state[0..31]. */
#define WRITE_STATE_BIG(sc) do { \
(sc)->state[ 0] = A0; \
(sc)->state[ 1] = A1; \
(sc)->state[ 2] = A2; \
(sc)->state[ 3] = A3; \
(sc)->state[ 4] = A4; \
(sc)->state[ 5] = A5; \
(sc)->state[ 6] = A6; \
(sc)->state[ 7] = A7; \
(sc)->state[ 8] = B0; \
(sc)->state[ 9] = B1; \
(sc)->state[10] = B2; \
(sc)->state[11] = B3; \
(sc)->state[12] = B4; \
(sc)->state[13] = B5; \
(sc)->state[14] = B6; \
(sc)->state[15] = B7; \
(sc)->state[16] = C0; \
(sc)->state[17] = C1; \
(sc)->state[18] = C2; \
(sc)->state[19] = C3; \
(sc)->state[20] = C4; \
(sc)->state[21] = C5; \
(sc)->state[22] = C6; \
(sc)->state[23] = C7; \
(sc)->state[24] = D0; \
(sc)->state[25] = D1; \
(sc)->state[26] = D2; \
(sc)->state[27] = D3; \
(sc)->state[28] = D4; \
(sc)->state[29] = D5; \
(sc)->state[30] = D6; \
(sc)->state[31] = D7; \
} while (0)
#endif
/*
 * One step for a single lane n (a literal digit, used for token pasting):
 *
 *     tt  = D_n + w + fun(A_n, B_n, C_n)            (truncated to 32 bits)
 *     A_n = ROL32(tt, s) + tA<p>                    (truncated to 32 bits)
 *     D_n = C_n;  C_n = B_n;  B_n = tA<n>
 *
 * tA0, tA1, ... are the pre-rotated A words declared by the enclosing
 * STEP_SMALL / STEP_BIG. ppb is a permutation prefix such as PP4_0_ or
 * PP8_3_; pasted with n it names the constant p that selects the partner
 * lane.
 */
#define STEP_ELT(n, w, fun, s, ppb) do { \
u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
D ## n = C ## n; \
C ## n = B ## n; \
B ## n = tA ## n; \
} while (0)
/*
 * One step of the 4-lane variant. All four A words are first rotated
 * left by r into tA0..tA3 (so every lane sees the pre-step values), then
 * STEP_ELT runs on lanes 0..3 with message words w0..w3, Boolean
 * function fun (IF or MAJ), rotation s and permutation prefix pp4b.
 */
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
u32 tA0 = ROL32(A0, r); \
u32 tA1 = ROL32(A1, r); \
u32 tA2 = ROL32(A2, r); \
u32 tA3 = ROL32(A3, r); \
STEP_ELT(0, w0, fun, s, pp4b); \
STEP_ELT(1, w1, fun, s, pp4b); \
STEP_ELT(2, w2, fun, s, pp4b); \
STEP_ELT(3, w3, fun, s, pp4b); \
} while (0)
/*
 * One step of the 8-lane variant. All eight A words are first rotated
 * left by r into tA0..tA7 (so every lane sees the pre-step values), then
 * STEP_ELT runs on lanes 0..7 with message words w0..w7, Boolean
 * function fun (IF or MAJ), rotation s and permutation prefix pp8b.
 */
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
u32 tA0 = ROL32(A0, r); \
u32 tA1 = ROL32(A1, r); \
u32 tA2 = ROL32(A2, r); \
u32 tA3 = ROL32(A3, r); \
u32 tA4 = ROL32(A4, r); \
u32 tA5 = ROL32(A5, r); \
u32 tA6 = ROL32(A6, r); \
u32 tA7 = ROL32(A7, r); \
STEP_ELT(0, w0, fun, s, pp8b); \
STEP_ELT(1, w1, fun, s, pp8b); \
STEP_ELT(2, w2, fun, s, pp8b); \
STEP_ELT(3, w3, fun, s, pp8b); \
STEP_ELT(4, w4, fun, s, pp8b); \
STEP_ELT(5, w5, fun, s, pp8b); \
STEP_ELT(6, w6, fun, s, pp8b); \
STEP_ELT(7, w7, fun, s, pp8b); \
} while (0)
/*
 * M3_s_i expands to the token "k_" with k = (s + i) mod 3: the index of
 * the PP4_k_ permutation used by step s (0..7) of a round whose starting
 * permutation index is i (0..2).
 */
#define M3_0_0 0_
#define M3_1_0 1_
#define M3_2_0 2_
#define M3_3_0 0_
#define M3_4_0 1_
#define M3_5_0 2_
#define M3_6_0 0_
#define M3_7_0 1_
#define M3_0_1 1_
#define M3_1_1 2_
#define M3_2_1 0_
#define M3_3_1 1_
#define M3_4_1 2_
#define M3_5_1 0_
#define M3_6_1 1_
#define M3_7_1 2_
#define M3_0_2 2_
#define M3_1_2 0_
#define M3_2_2 1_
#define M3_3_2 2_
#define M3_4_2 0_
#define M3_5_2 1_
#define M3_6_2 2_
#define M3_7_2 0_
/*
 * w is a WS_r_s macro, which expands to "(w0, w1, w2, w3" WITHOUT the
 * closing parenthesis; this helper supplies the remaining arguments and
 * the ")" to complete a STEP_SMALL(...) invocation.
 */
#define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b)
/*
 * One full round (eight steps) of the 4-lane variant.
 *   ri      round index WITH a trailing underscore (0_, 1_, 2_ or 3_);
 *           it is pasted between "WS_" and the step digit
 *   isp     starting permutation index (0..2), pasted onto "M3_s_"
 *   p0..p3  rotation amounts; step s uses the pair (p[s mod 4],
 *           p[(s+1) mod 4]) as (r, s)
 * Steps 0..3 use IF, steps 4..7 use MAJ.
 */
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \
STEP_SMALL_(WS_ ## ri ## 0, \
IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
STEP_SMALL_(WS_ ## ri ## 1, \
IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
STEP_SMALL_(WS_ ## ri ## 2, \
IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
STEP_SMALL_(WS_ ## ri ## 3, \
IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
STEP_SMALL_(WS_ ## ri ## 4, \
MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
STEP_SMALL_(WS_ ## ri ## 5, \
MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
STEP_SMALL_(WS_ ## ri ## 6, \
MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
STEP_SMALL_(WS_ ## ri ## 7, \
MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
} while (0)
/*
 * M7_s_i expands to the token "k_" with k = (s + i) mod 7: the index of
 * the PP8_k_ permutation used by step s (0..7) of a round whose starting
 * permutation index is i (0..3).
 */
#define M7_0_0 0_
#define M7_1_0 1_
#define M7_2_0 2_
#define M7_3_0 3_
#define M7_4_0 4_
#define M7_5_0 5_
#define M7_6_0 6_
#define M7_7_0 0_
#define M7_0_1 1_
#define M7_1_1 2_
#define M7_2_1 3_
#define M7_3_1 4_
#define M7_4_1 5_
#define M7_5_1 6_
#define M7_6_1 0_
#define M7_7_1 1_
#define M7_0_2 2_
#define M7_1_2 3_
#define M7_2_2 4_
#define M7_3_2 5_
#define M7_4_2 6_
#define M7_5_2 0_
#define M7_6_2 1_
#define M7_7_2 2_
#define M7_0_3 3_
#define M7_1_3 4_
#define M7_2_3 5_
#define M7_3_3 6_
#define M7_4_3 0_
#define M7_5_3 1_
#define M7_6_3 2_
#define M7_7_3 3_
/*
 * w is a WB_r_s macro, which expands to "(w0, ..., w7" WITHOUT the
 * closing parenthesis; this helper supplies the remaining arguments and
 * the ")" to complete a STEP_BIG(...) invocation.
 */
#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b)
/*
 * One full round (eight steps) of the 8-lane variant.
 *   ri      round index WITH a trailing underscore (e.g. 0_); it is
 *           pasted between "WB_" and the step digit
 *   isp     starting permutation index (0..3), pasted onto "M7_s_"
 *   p0..p3  rotation amounts; step s uses the pair (p[s mod 4],
 *           p[(s+1) mod 4]) as (r, s)
 * Steps 0..3 use IF, steps 4..7 use MAJ.
 */
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \
STEP_BIG_(WB_ ## ri ## 0, \
IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
STEP_BIG_(WB_ ## ri ## 1, \
IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
STEP_BIG_(WB_ ## ri ## 2, \
IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
STEP_BIG_(WB_ ## ri ## 3, \
IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
STEP_BIG_(WB_ ## ri ## 4, \
MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
STEP_BIG_(WB_ ## ri ## 5, \
MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
STEP_BIG_(WB_ ## ri ## 6, \
MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
STEP_BIG_(WB_ ## ri ## 7, \
MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
} while (0)
#if SPH_SMALL_FOOTPRINT_SIMD
/*
 * Small-footprint 4-lane code: the step macros' lane names A0..D3 are
 * mapped onto a 16-word array that must be visible as "state" at the
 * point of use (a parameter in one_round_small(), a local in
 * compress_small()). They are #undef'ed again after compress_small().
 */
#define A0 state[ 0]
#define A1 state[ 1]
#define A2 state[ 2]
#define A3 state[ 3]
#define B0 state[ 4]
#define B1 state[ 5]
#define B2 state[ 6]
#define B3 state[ 7]
#define C0 state[ 8]
#define C1 state[ 9]
#define C2 state[10]
#define C3 state[11]
#define D0 state[12]
#define D1 state[13]
#define D2 state[14]
#define D3 state[15]
/*
 * Run-time flavour of STEP_ELT: the rotated A words are kept in an
 * array tA[] and the partner lane is selected with an XOR mask (ppb is
 * an integer expression, partner index = ppb ^ n) instead of token
 * pasting. n must still be a literal digit because it is pasted onto
 * A, B, C and D.
 */
#define STEP2_ELT(n, w, fun, s, ppb) do { \
u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
D ## n = C ## n; \
C ## n = B ## n; \
B ## n = tA[n]; \
} while (0)
/*
 * Run-time flavour of STEP_SMALL: rotates A0..A3 left by r into tA[0..3],
 * then applies STEP2_ELT to lanes 0..3. pp4b is the XOR mask (1, 2 or 3
 * in one_round_small()).
 */
#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
u32 tA[4]; \
tA[0] = ROL32(A0, r); \
tA[1] = ROL32(A1, r); \
tA[2] = ROL32(A2, r); \
tA[3] = ROL32(A3, r); \
STEP2_ELT(0, w0, fun, s, pp4b); \
STEP2_ELT(1, w1, fun, s, pp4b); \
STEP2_ELT(2, w2, fun, s, pp4b); \
STEP2_ELT(3, w3, fun, s, pp4b); \
} while (0)
/*
 * One round (eight steps) of the 4-lane variant, small-footprint code.
 *
 *   state   the 16 state words, updated in place (accessed through the
 *           A0..D3 aliases)
 *   w       32 message words, four per step
 *   isp     index of the first permutation mask in pp4k[] (0..3)
 *   p0..p3  rotation amounts; step k uses p[k mod 4] and p[(k+1) mod 4]
 *
 * Steps 0..3 use the IF function, steps 4..7 use MAJ.
 */
static void
one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
{
	static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };
	int rot[4];
	int k;

	rot[0] = p0;
	rot[1] = p1;
	rot[2] = p2;
	rot[3] = p3;
	for (k = 0; k < 4; k ++) {
		u32 *wk = w + 4 * k;

		STEP2_SMALL(wk[0], wk[1], wk[2], wk[3],
			IF, rot[k], rot[(k + 1) & 3], pp4k[isp + k]);
	}
	for (k = 0; k < 4; k ++) {
		u32 *wk = w + 16 + 4 * k;

		STEP2_SMALL(wk[0], wk[1], wk[2], wk[3],
			MAJ, rot[k], rot[(k + 1) & 3], pp4k[isp + 4 + k]);
	}
}
/*
 * Compression function of the 4-lane variant, small-footprint code.
 *
 *   sc     context; sc->buf holds the 64-byte message block and
 *          sc->state[0..15] the chaining value, which is replaced
 *   last   non-zero for the final block (selects the yoff_s_f offsets
 *          instead of yoff_s_n)
 *
 * Outline: 128-point FFT of the block, offset addition and reduction
 * modulo 257, XOR of the block into a working copy of the state, four
 * rounds driven by the expanded message, then four extra steps that use
 * the previous chaining value as message words.
 */
static void
compress_small(sph_simd_small_context *sc, int last)
{
	/* Start index into q[] (group number times 8) for each step. */
	static const size_t wsp[32] = {
		 4 << 3,  6 << 3,  0 << 3,  2 << 3,
		 7 << 3,  5 << 3,  3 << 3,  1 << 3,
		15 << 3, 11 << 3, 12 << 3,  8 << 3,
		 9 << 3, 13 << 3, 10 << 3, 14 << 3,
		17 << 3, 18 << 3, 23 << 3, 20 << 3,
		22 << 3, 21 << 3, 16 << 3, 19 << 3,
		30 << 3, 24 << 3, 25 << 3, 31 << 3,
		27 << 3, 29 << 3, 28 << 3, 26 << 3
	};
	const unsigned short *yoff;
	unsigned char *x;
	s32 q[128];
	u32 w[32];
	u32 state[16];
	size_t u;
	int i;

	x = sc->buf;
	FFT128(0, 1, 0, ll);

	/* Add the per-index offset, then bring each value near zero mod 257. */
	yoff = last ? yoff_s_f : yoff_s_n;
	for (i = 0; i < 128; i ++) {
		s32 tq = q[i] + yoff[i];

		tq = REDS2(tq);
		tq = REDS1(tq);
		tq = REDS1(tq);
		if (tq > 128)
			tq -= 257;
		q[i] = tq;
	}

	/* Working state = chaining value XOR message block (little-endian). */
	for (i = 0; i < 16; i ++)
		state[i] = sc->state[i] ^ sph_dec32le_aligned(x + 4 * i);

	/* Fill w[0..31] with the message words of the eight steps of a round. */
#define WSREAD(sb, o1, o2, mm)   do { \
		for (u = 0; u < 32; u += 4) { \
			size_t v = wsp[(u >> 2) + (sb)]; \
			size_t j; \
			for (j = 0; j < 4; j ++) \
				w[u + j] = INNER(q[v + 2 * j + (o1)], \
					q[v + 2 * j + (o2)], mm); \
		} \
	} while (0)

	WSREAD( 0,    0,    1, 185);
	one_round_small(state, w, 0,  3, 23, 17, 27);
	WSREAD( 8,    0,    1, 185);
	one_round_small(state, w, 2, 28, 19, 22,  7);
	WSREAD(16, -128,  -64, 233);
	one_round_small(state, w, 1, 29,  9, 15,  5);
	WSREAD(24, -191, -127, 233);
	one_round_small(state, w, 0,  4, 13, 10, 25);

#undef WSREAD

	/*
	 * Final four steps: sc->state[] still holds the previous chaining
	 * value (the rounds worked on the local copy) and is used as the
	 * message words.
	 */
	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
		IF,  4, 13, PP4_2_);
	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
		IF, 13, 10, PP4_0_);
	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
		IF, 10, 25, PP4_1_);
	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
		IF, 25,  4, PP4_2_);

	memcpy(sc->state, state, sizeof state);
}
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef D0
#undef D1
#undef D2
#undef D3
#else
#if SPH_SIMD_NOCOPY
/*
 * SPH_SIMD_NOCOPY build of the unrolled 4-lane code: the lane names
 * A0..D3 alias the context's state words directly, so compress_small()
 * updates sc->state[] in place. A variable named "sc" must be in scope
 * wherever these are used; they are #undef'ed after compress_small().
 */
#define A0 (sc->state[ 0])
#define A1 (sc->state[ 1])
#define A2 (sc->state[ 2])
#define A3 (sc->state[ 3])
#define B0 (sc->state[ 4])
#define B1 (sc->state[ 5])
#define B2 (sc->state[ 6])
#define B3 (sc->state[ 7])
#define C0 (sc->state[ 8])
#define C1 (sc->state[ 9])
#define C2 (sc->state[10])
#define C3 (sc->state[11])
#define D0 (sc->state[12])
#define D1 (sc->state[13])
#define D2 (sc->state[14])
#define D3 (sc->state[15])
#endif
/*
 * Compression function of the 4-lane variant, unrolled code.
 *
 *   sc     context; sc->buf holds the 64-byte message block and
 *          sc->state[0..15] the chaining value, which is replaced
 *   last   non-zero for the final block (selects the yoff_s_f offsets
 *          instead of yoff_s_n)
 *
 * Outline: 128-point FFT of the block, offset addition and reduction
 * modulo 257, XOR of the block into the state, four rounds driven by the
 * expanded message, then four extra steps that use the previous chaining
 * value as message words.
 *
 * With SPH_SIMD_NOCOPY the state is updated in place (A0.. alias
 * sc->state[]), so the previous chaining value is saved first; otherwise
 * the state lives in locals and sc->state[] stays untouched until the
 * final write-back.
 */
static void
compress_small(sph_simd_small_context *sc, int last)
{
	const unsigned short *yoff;
	unsigned char *x;
	s32 q[128];
	int i;
	DECL_STATE_SMALL
#if SPH_SIMD_NOCOPY
	sph_u32 saved[16];

	memcpy(saved, sc->state, sizeof saved);
#define SIMD_PREV_STATE(k)   saved[k]
#else
#define SIMD_PREV_STATE(k)   sc->state[k]
#endif

	x = sc->buf;
	FFT128(0, 1, 0, ll);

	/* Add the per-index offset, then bring each value near zero mod 257. */
	yoff = last ? yoff_s_f : yoff_s_n;
	for (i = 0; i < 128; i ++) {
		s32 tq = q[i] + yoff[i];

		tq = REDS2(tq);
		tq = REDS1(tq);
		tq = REDS1(tq);
		if (tq > 128)
			tq -= 257;
		q[i] = tq;
	}

	/* State XOR message block (little-endian 32-bit words). */
	READ_STATE_SMALL(sc);
	A0 ^= sph_dec32le_aligned(x +  0);
	A1 ^= sph_dec32le_aligned(x +  4);
	A2 ^= sph_dec32le_aligned(x +  8);
	A3 ^= sph_dec32le_aligned(x + 12);
	B0 ^= sph_dec32le_aligned(x + 16);
	B1 ^= sph_dec32le_aligned(x + 20);
	B2 ^= sph_dec32le_aligned(x + 24);
	B3 ^= sph_dec32le_aligned(x + 28);
	C0 ^= sph_dec32le_aligned(x + 32);
	C1 ^= sph_dec32le_aligned(x + 36);
	C2 ^= sph_dec32le_aligned(x + 40);
	C3 ^= sph_dec32le_aligned(x + 44);
	D0 ^= sph_dec32le_aligned(x + 48);
	D1 ^= sph_dec32le_aligned(x + 52);
	D2 ^= sph_dec32le_aligned(x + 56);
	D3 ^= sph_dec32le_aligned(x + 60);

	ONE_ROUND_SMALL(0_, 0,  3, 23, 17, 27);
	ONE_ROUND_SMALL(1_, 2, 28, 19, 22,  7);
	ONE_ROUND_SMALL(2_, 1, 29,  9, 15,  5);
	ONE_ROUND_SMALL(3_, 0,  4, 13, 10, 25);

	/* Final four steps, keyed by the previous chaining value. */
	STEP_SMALL(SIMD_PREV_STATE( 0), SIMD_PREV_STATE( 1),
		SIMD_PREV_STATE( 2), SIMD_PREV_STATE( 3),
		IF,  4, 13, PP4_2_);
	STEP_SMALL(SIMD_PREV_STATE( 4), SIMD_PREV_STATE( 5),
		SIMD_PREV_STATE( 6), SIMD_PREV_STATE( 7),
		IF, 13, 10, PP4_0_);
	STEP_SMALL(SIMD_PREV_STATE( 8), SIMD_PREV_STATE( 9),
		SIMD_PREV_STATE(10), SIMD_PREV_STATE(11),
		IF, 10, 25, PP4_1_);
	STEP_SMALL(SIMD_PREV_STATE(12), SIMD_PREV_STATE(13),
		SIMD_PREV_STATE(14), SIMD_PREV_STATE(15),
		IF, 25,  4, PP4_2_);

	/* Expands to nothing when SPH_SIMD_NOCOPY is set. */
	WRITE_STATE_SMALL(sc);
#undef SIMD_PREV_STATE
}
#if SPH_SIMD_NOCOPY
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef D0
#undef D1
#undef D2
#undef D3
#endif
#endif
#if SPH_SMALL_FOOTPRINT_SIMD
/*
 * Small-footprint 8-lane code: the step macros' lane names A0..D7 are
 * mapped onto a 32-word array that must be visible as "state" at the
 * point of use (state[0..7] is A, [8..15] is B, [16..23] is C,
 * [24..31] is D).
 */
#define A0 state[ 0]
#define A1 state[ 1]
#define A2 state[ 2]
#define A3 state[ 3]
#define A4 state[ 4]
#define A5 state[ 5]
#define A6 state[ 6]
#define A7 state[ 7]
#define B0 state[ 8]
#define B1 state[ 9]
#define B2 state[10]
#define B3 state[11]
#define B4 state[12]
#define B5 state[13]
#define B6 state[14]
#define B7 state[15]
#define C0 state[16]
#define C1 state[17]
#define C2 state[18]
#define C3 state[19]
#define C4 state[20]
#define C5 state[21]
#define C6 state[22]
#define C7 state[23]
#define D0 state[24]
#define D1 state[25]
#define D2 state[26]
#define D3 state[27]
#define D4 state[28]
#define D5 state[29]
#define D6 state[30]
#define D7 state[31]
/*
* Not needed -- already defined for SIMD-224 / SIMD-256
*
#define STEP2_ELT(n, w, fun, s, ppb) do { \
u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
D ## n = C ## n; \
C ## n = B ## n; \
B ## n = tA[n]; \
} while (0)
*/
/*
 * Run-time flavour of STEP_BIG: rotates A0..A7 left by r into tA[0..7],
 * then applies STEP2_ELT to lanes 0..7 with message words w0..w7.
 * pp8b is an integer XOR mask selecting the partner lane (partner index
 * = pp8b ^ n), taken from pp8k[] in one_round_big().
 */
#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
u32 tA[8]; \
tA[0] = ROL32(A0, r); \
tA[1] = ROL32(A1, r); \
tA[2] = ROL32(A2, r); \
tA[3] = ROL32(A3, r); \
tA[4] = ROL32(A4, r); \
tA[5] = ROL32(A5, r); \
tA[6] = ROL32(A6, r); \
tA[7] = ROL32(A7, r); \
STEP2_ELT(0, w0, fun, s, pp8b); \
STEP2_ELT(1, w1, fun, s, pp8b); \
STEP2_ELT(2, w2, fun, s, pp8b); \
STEP2_ELT(3, w3, fun, s, pp8b); \
STEP2_ELT(4, w4, fun, s, pp8b); \
STEP2_ELT(5, w5, fun, s, pp8b); \
STEP2_ELT(6, w6, fun, s, pp8b); \
STEP2_ELT(7, w7, fun, s, pp8b); \
} while (0)
/*
 * One round (eight steps) of the 8-lane variant, small-footprint code.
 *
 *   state   the 32 state words, updated in place (accessed through the
 *           A0..D7 aliases)
 *   w       64 message words, eight per step
 *   isp     index of the first permutation mask in pp8k[] (0..3)
 *   p0..p3  rotation amounts; step k uses p[k mod 4] and p[(k+1) mod 4]
 *
 * Steps 0..3 use the IF function, steps 4..7 use MAJ.
 */
static void
one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
{
	static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };
	int rot[4];
	int k;

	rot[0] = p0;
	rot[1] = p1;
	rot[2] = p2;
	rot[3] = p3;
	for (k = 0; k < 4; k ++) {
		u32 *wk = w + 8 * k;

		STEP2_BIG(wk[0], wk[1], wk[2], wk[3],
			wk[4], wk[5], wk[6], wk[7],
			IF, rot[k], rot[(k + 1) & 3], pp8k[isp + k]);
	}
	for (k = 0; k < 4; k ++) {
		u32 *wk = w + 32 + 8 * k;

		STEP2_BIG(wk[0], wk[1], wk[2], wk[3],
			wk[4], wk[5], wk[6], wk[7],
			MAJ, rot[k], rot[(k + 1) & 3], pp8k[isp + 4 + k]);
	}
}
/*
 * Compression function used by the SIMD-384 / SIMD-512 code paths
 * (small-footprint variant).
 *
 *   sc    context; sc->buf supplies the block to absorb and sc->state is
 *         replaced by the new chaining value on return
 *   last  non-zero selects the yoff_b_f offset table instead of
 *         yoff_b_n; finalize_big() passes 1 for the block that carries
 *         the encoded message length, every other caller passes 0
 */
static void
compress_big(sph_simd_big_context *sc, int last)
{
unsigned char *x;
s32 q[256];
int i;
u32 w[64];
u32 state[32];
size_t u;
/*
 * Base offsets into q[] (already multiplied by 16) used by WBREAD;
 * each round reads eight consecutive entries starting at index "sb".
 */
static const size_t wbp[32] = {
4 << 4, 6 << 4, 0 << 4, 2 << 4,
7 << 4, 5 << 4, 3 << 4, 1 << 4,
15 << 4, 11 << 4, 12 << 4, 8 << 4,
9 << 4, 13 << 4, 10 << 4, 14 << 4,
17 << 4, 18 << 4, 23 << 4, 20 << 4,
22 << 4, 21 << 4, 16 << 4, 19 << 4,
30 << 4, 24 << 4, 25 << 4, 31 << 4,
27 << 4, 29 << 4, 28 << 4, 26 << 4
};
x = sc->buf;
/*
 * FFT256 is a macro defined earlier in the file; presumably it reads
 * the block through the local "x" and fills the local "q" -- confirm
 * against its definition.
 */
FFT256(0, 1, 0, ll);
/*
 * Add the per-index offset (final-block table or normal table), reduce
 * with the REDS2/REDS1 helpers, then map results above 128 to tq - 257
 * so that q[] holds a signed representative (this assumes tq is within
 * 0..256 after the reductions).
 */
if (last) {
for (i = 0; i < 256; i ++) {
s32 tq;
tq = q[i] + yoff_b_f[i];
tq = REDS2(tq);
tq = REDS1(tq);
tq = REDS1(tq);
q[i] = (tq <= 128 ? tq : tq - 257);
}
} else {
for (i = 0; i < 256; i ++) {
s32 tq;
tq = q[i] + yoff_b_n[i];
tq = REDS2(tq);
tq = REDS1(tq);
tq = REDS1(tq);
q[i] = (tq <= 128 ? tq : tq - 257);
}
}
/*
 * Local working state = current chaining value XOR the block, taken as
 * 32 words decoded with sph_dec32le_aligned. sc->state itself is left
 * untouched until the final memcpy.
 */
for (i = 0; i < 32; i += 8) {
state[i + 0] = sc->state[i + 0]
^ sph_dec32le_aligned(x + 4 * (i + 0));
state[i + 1] = sc->state[i + 1]
^ sph_dec32le_aligned(x + 4 * (i + 1));
state[i + 2] = sc->state[i + 2]
^ sph_dec32le_aligned(x + 4 * (i + 2));
state[i + 3] = sc->state[i + 3]
^ sph_dec32le_aligned(x + 4 * (i + 3));
state[i + 4] = sc->state[i + 4]
^ sph_dec32le_aligned(x + 4 * (i + 4));
state[i + 5] = sc->state[i + 5]
^ sph_dec32le_aligned(x + 4 * (i + 5));
state[i + 6] = sc->state[i + 6]
^ sph_dec32le_aligned(x + 4 * (i + 6));
state[i + 7] = sc->state[i + 7]
^ sph_dec32le_aligned(x + 4 * (i + 7));
}
/*
 * WBREAD(sb, o1, o2, mm) fills w[0..63] for one round. For each group
 * of eight words, v = wbp[(u >> 3) + sb] is the base index into q[],
 * and word k combines q[v + 2k + o1] with q[v + 2k + o2] through
 * INNER(..., mm) (INNER is defined elsewhere in the file).
 */
#define WBREAD(sb, o1, o2, mm) do { \
for (u = 0; u < 64; u += 8) { \
size_t v = wbp[(u >> 3) + (sb)]; \
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
q[v + 2 * 0 + (o2)], mm); \
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
q[v + 2 * 1 + (o2)], mm); \
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
q[v + 2 * 2 + (o2)], mm); \
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
q[v + 2 * 3 + (o2)], mm); \
w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
q[v + 2 * 4 + (o2)], mm); \
w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
q[v + 2 * 5 + (o2)], mm); \
w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
q[v + 2 * 6 + (o2)], mm); \
w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
q[v + 2 * 7 + (o2)], mm); \
} \
} while (0)
/* Four rounds: expand 64 words, then run eight steps on the local state. */
WBREAD( 0, 0, 1, 185);
one_round_big(state, w, 0, 3, 23, 17, 27);
WBREAD( 8, 0, 1, 185);
one_round_big(state, w, 1, 28, 19, 22, 7);
WBREAD(16, -256, -128, 233);
one_round_big(state, w, 2, 29, 9, 15, 5);
WBREAD(24, -383, -255, 233);
one_round_big(state, w, 3, 4, 13, 10, 25);
#undef WBREAD
/*
 * Four closing steps fed with the previous chaining value (sc->state is
 * still unmodified here). STEP_BIG is defined earlier in the file;
 * presumably its first eight arguments are the per-lane words, as in
 * STEP2_BIG -- confirm against its definition.
 */
STEP_BIG(
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
IF, 4, 13, PP8_4_);
STEP_BIG(
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
IF, 13, 10, PP8_5_);
STEP_BIG(
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
IF, 10, 25, PP8_6_);
STEP_BIG(
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
IF, 25, 4, PP8_0_);
/* Publish the new chaining value. */
memcpy(sc->state, state, sizeof state);
}
/*
 * End of the small-footprint compress_big: remove the A0..D7 aliases of
 * state[] so that they do not leak into the rest of the file.
 */
#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#undef D0
#undef D1
#undef D2
#undef D3
#undef D4
#undef D5
#undef D6
#undef D7
#else
/*
 * SPH_SIMD_NOCOPY build: A0..D7 name the 32 context state words
 * directly, so the step macros work on sc->state in place. A variable
 * called "sc" must be in scope wherever these names are used. (In the
 * other build the names are presumably supplied by DECL_STATE_BIG --
 * confirm against its definition.)
 */
#if SPH_SIMD_NOCOPY
#define A0 (sc->state[ 0])
#define A1 (sc->state[ 1])
#define A2 (sc->state[ 2])
#define A3 (sc->state[ 3])
#define A4 (sc->state[ 4])
#define A5 (sc->state[ 5])
#define A6 (sc->state[ 6])
#define A7 (sc->state[ 7])
#define B0 (sc->state[ 8])
#define B1 (sc->state[ 9])
#define B2 (sc->state[10])
#define B3 (sc->state[11])
#define B4 (sc->state[12])
#define B5 (sc->state[13])
#define B6 (sc->state[14])
#define B7 (sc->state[15])
#define C0 (sc->state[16])
#define C1 (sc->state[17])
#define C2 (sc->state[18])
#define C3 (sc->state[19])
#define C4 (sc->state[20])
#define C5 (sc->state[21])
#define C6 (sc->state[22])
#define C7 (sc->state[23])
#define D0 (sc->state[24])
#define D1 (sc->state[25])
#define D2 (sc->state[26])
#define D3 (sc->state[27])
#define D4 (sc->state[28])
#define D5 (sc->state[29])
#define D6 (sc->state[30])
#define D7 (sc->state[31])
#endif
/*
 * Compression function used by the SIMD-384 / SIMD-512 code paths
 * (unrolled variant, used when the small-footprint build is off).
 *
 *   sc    context; sc->buf supplies the block to absorb and sc->state
 *         holds the chaining value before and after the call
 *   last  non-zero selects the yoff_b_f offset table instead of
 *         yoff_b_n; finalize_big() passes 1 for the block that carries
 *         the encoded message length, every other caller passes 0
 */
static void
compress_big(sph_simd_big_context *sc, int last)
{
unsigned char *x;
s32 q[256];
int i;
DECL_STATE_BIG
#if SPH_SIMD_NOCOPY
sph_u32 saved[32];
#endif
/*
 * In the NOCOPY build A0..D7 alias sc->state directly, so the incoming
 * chaining value is copied aside: the closing steps below still need it
 * after the state has been modified in place.
 */
#if SPH_SIMD_NOCOPY
memcpy(saved, sc->state, sizeof saved);
#endif
x = sc->buf;
/*
 * FFT256 is a macro defined earlier in the file; presumably it reads
 * the block through the local "x" and fills the local "q" -- confirm
 * against its definition.
 */
FFT256(0, 1, 0, ll);
/*
 * Add the per-index offset (final-block table or normal table), reduce
 * with the REDS2/REDS1 helpers, then map results above 128 to tq - 257
 * so that q[] holds a signed representative (this assumes tq is within
 * 0..256 after the reductions).
 */
if (last) {
for (i = 0; i < 256; i ++) {
s32 tq;
tq = q[i] + yoff_b_f[i];
tq = REDS2(tq);
tq = REDS1(tq);
tq = REDS1(tq);
q[i] = (tq <= 128 ? tq : tq - 257);
}
} else {
for (i = 0; i < 256; i ++) {
s32 tq;
tq = q[i] + yoff_b_n[i];
tq = REDS2(tq);
tq = REDS1(tq);
tq = REDS1(tq);
q[i] = (tq <= 128 ? tq : tq - 257);
}
}
/*
 * READ_STATE_BIG is defined elsewhere (presumably it loads sc->state
 * into A0..D7 when they are locals). The block is then XORed into the
 * state, one 32-bit word per 4 bytes of the buffer.
 */
READ_STATE_BIG(sc);
A0 ^= sph_dec32le_aligned(x + 0);
A1 ^= sph_dec32le_aligned(x + 4);
A2 ^= sph_dec32le_aligned(x + 8);
A3 ^= sph_dec32le_aligned(x + 12);
A4 ^= sph_dec32le_aligned(x + 16);
A5 ^= sph_dec32le_aligned(x + 20);
A6 ^= sph_dec32le_aligned(x + 24);
A7 ^= sph_dec32le_aligned(x + 28);
B0 ^= sph_dec32le_aligned(x + 32);
B1 ^= sph_dec32le_aligned(x + 36);
B2 ^= sph_dec32le_aligned(x + 40);
B3 ^= sph_dec32le_aligned(x + 44);
B4 ^= sph_dec32le_aligned(x + 48);
B5 ^= sph_dec32le_aligned(x + 52);
B6 ^= sph_dec32le_aligned(x + 56);
B7 ^= sph_dec32le_aligned(x + 60);
C0 ^= sph_dec32le_aligned(x + 64);
C1 ^= sph_dec32le_aligned(x + 68);
C2 ^= sph_dec32le_aligned(x + 72);
C3 ^= sph_dec32le_aligned(x + 76);
C4 ^= sph_dec32le_aligned(x + 80);
C5 ^= sph_dec32le_aligned(x + 84);
C6 ^= sph_dec32le_aligned(x + 88);
C7 ^= sph_dec32le_aligned(x + 92);
D0 ^= sph_dec32le_aligned(x + 96);
D1 ^= sph_dec32le_aligned(x + 100);
D2 ^= sph_dec32le_aligned(x + 104);
D3 ^= sph_dec32le_aligned(x + 108);
D4 ^= sph_dec32le_aligned(x + 112);
D5 ^= sph_dec32le_aligned(x + 116);
D6 ^= sph_dec32le_aligned(x + 120);
D7 ^= sph_dec32le_aligned(x + 124);
/*
 * Four rounds; the trailing four numbers are the same rotation
 * constants the small-footprint variant passes to one_round_big().
 */
ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
/*
 * Four closing steps fed with the previous chaining value: the saved
 * copy in the NOCOPY build, otherwise sc->state, which is only written
 * back by WRITE_STATE_BIG afterwards.
 */
#if SPH_SIMD_NOCOPY
STEP_BIG(
saved[ 0], saved[ 1], saved[ 2], saved[ 3],
saved[ 4], saved[ 5], saved[ 6], saved[ 7],
IF, 4, 13, PP8_4_);
STEP_BIG(
saved[ 8], saved[ 9], saved[10], saved[11],
saved[12], saved[13], saved[14], saved[15],
IF, 13, 10, PP8_5_);
STEP_BIG(
saved[16], saved[17], saved[18], saved[19],
saved[20], saved[21], saved[22], saved[23],
IF, 10, 25, PP8_6_);
STEP_BIG(
saved[24], saved[25], saved[26], saved[27],
saved[28], saved[29], saved[30], saved[31],
IF, 25, 4, PP8_0_);
#else
STEP_BIG(
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
IF, 4, 13, PP8_4_);
STEP_BIG(
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
IF, 13, 10, PP8_5_);
STEP_BIG(
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
IF, 10, 25, PP8_6_);
STEP_BIG(
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
IF, 25, 4, PP8_0_);
WRITE_STATE_BIG(sc);
#endif
}
/*
 * SPH_SIMD_NOCOPY build only: remove the A0..D7 aliases of sc->state[]
 * now that compress_big is complete.
 */
#if SPH_SIMD_NOCOPY
#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#undef D0
#undef D1
#undef D2
#undef D3
#undef D4
#undef D5
#undef D6
#undef D7
#endif
#endif
/* Initial state loaded by sph_simd224_init() (16 words). */
static const u32 IV224[] = {
C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
};
/* Initial state loaded by sph_simd256_init() (16 words). */
static const u32 IV256[] = {
C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
};
/* Initial state loaded by sph_simd384_init() (32 words). */
static const u32 IV384[] = {
C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
};
/* Initial state loaded by sph_simd512_init() (32 words). */
static const u32 IV512[] = {
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
};
/*
 * Reset a small (SIMD-224 / SIMD-256) context.
 *
 *   cc  pointer to a sph_simd_small_context
 *   iv  initial value; sizeof sc->state bytes are copied from it
 *
 * The block counter and the buffer fill index are cleared.
 */
static void
init_small(void *cc, const u32 *iv)
{
	sph_simd_small_context *ctx = (sph_simd_small_context *)cc;

	ctx->ptr = 0;
	ctx->count_high = 0;
	ctx->count_low = 0;
	memcpy(ctx->state, iv, sizeof ctx->state);
}
/*
 * Reset a big (SIMD-384 / SIMD-512) context.
 *
 *   cc  pointer to a sph_simd_big_context
 *   iv  initial value; sizeof sc->state bytes are copied from it
 *
 * The block counter and the buffer fill index are cleared.
 */
static void
init_big(void *cc, const u32 *iv)
{
	sph_simd_big_context *ctx = (sph_simd_big_context *)cc;

	ctx->ptr = 0;
	ctx->count_high = 0;
	ctx->count_low = 0;
	memcpy(ctx->state, iv, sizeof ctx->state);
}
/*
 * Absorb len bytes of input into a small (SIMD-224 / SIMD-256) context.
 *
 * Input is staged in sc->buf; each time the buffer becomes full it is
 * compressed, the fill index is reset and the 64-bit block counter
 * (count_high:count_low) is incremented.
 */
static void
update_small(void *cc, const void *data, size_t len)
{
	sph_simd_small_context *ctx = (sph_simd_small_context *)cc;
	const unsigned char *src = (const unsigned char *)data;

	while (len > 0) {
		size_t room = (sizeof ctx->buf) - ctx->ptr;
		size_t take = (len < room) ? len : room;

		memcpy(ctx->buf + ctx->ptr, src, take);
		src += take;
		len -= take;
		ctx->ptr += take;
		if (ctx->ptr != sizeof ctx->buf)
			continue;

		/* Buffer full: compress it and count one more block. */
		compress_small(ctx, 0);
		ctx->ptr = 0;
		ctx->count_low = T32(ctx->count_low + 1);
		if (ctx->count_low == 0)
			ctx->count_high ++;
	}
}
/*
 * Absorb len bytes of input into a big (SIMD-384 / SIMD-512) context.
 *
 * Input is staged in sc->buf; each time the buffer becomes full it is
 * compressed, the fill index is reset and the 64-bit block counter
 * (count_high:count_low) is incremented.
 */
static void
update_big(void *cc, const void *data, size_t len)
{
	sph_simd_big_context *ctx = (sph_simd_big_context *)cc;
	const unsigned char *src = (const unsigned char *)data;

	while (len > 0) {
		size_t room = (sizeof ctx->buf) - ctx->ptr;
		size_t take = (len < room) ? len : room;

		memcpy(ctx->buf + ctx->ptr, src, take);
		src += take;
		len -= take;
		ctx->ptr += take;
		if (ctx->ptr != sizeof ctx->buf)
			continue;

		/* Buffer full: compress it and count one more block. */
		compress_big(ctx, 0);
		ctx->ptr = 0;
		ctx->count_low = T32(ctx->count_low + 1);
		if (ctx->count_low == 0)
			ctx->count_high ++;
	}
}
/*
 * Write the total message length in bits to dst as two 32-bit words
 * (low word at dst, high word at dst + 4), for the small variants.
 *
 *   low, high  64-bit count of full blocks already compressed
 *   ptr        number of whole bytes in the trailing partial block
 *   n          number of extra bits (0..7) beyond those bytes
 *
 * One block is 2^9 bits, so the block count is shifted left by 9 across
 * the two words. The nine bits that move from the low word into the
 * high word must be taken from the ORIGINAL low word; the previous code
 * read them after "low" had already been shifted, which yielded a wrong
 * high word as soon as 2^14 blocks (1 MiB) had been processed.
 */
static void
encode_count_small(unsigned char *dst,
	u32 low, u32 high, size_t ptr, unsigned n)
{
	u32 carry;

	carry = T32(low) >> 23;
	low = T32(low << 9);
	high = T32(high << 9) + carry;
	/* The low 9 bits are zero here and (ptr << 3) + n < 512: no carry. */
	low += (ptr << 3) + n;
	sph_enc32le(dst, low);
	sph_enc32le(dst + 4, high);
}
/*
 * Write the total message length in bits to dst as two 32-bit words
 * (low word at dst, high word at dst + 4), for the big variants.
 *
 *   low, high  64-bit count of full blocks already compressed
 *   ptr        number of whole bytes in the trailing partial block
 *   n          number of extra bits (0..7) beyond those bytes
 *
 * One block is 2^10 bits, so the block count is shifted left by 10
 * across the two words. The ten bits that move from the low word into
 * the high word must be taken from the ORIGINAL low word; the previous
 * code read them after "low" had already been shifted, which yielded a
 * wrong high word as soon as 2^12 blocks (512 KiB) had been processed.
 */
static void
encode_count_big(unsigned char *dst,
	u32 low, u32 high, size_t ptr, unsigned n)
{
	u32 carry;

	carry = T32(low) >> 22;
	low = T32(low << 10);
	high = T32(high << 10) + carry;
	/* The low 10 bits are zero here and (ptr << 3) + n < 1024: no carry. */
	low += (ptr << 3) + n;
	sph_enc32le(dst, low);
	sph_enc32le(dst + 4, high);
}
/*
 * Finish a small (SIMD-224 / SIMD-256) computation.
 *
 *   cc       pointer to a sph_simd_small_context
 *   ub, n    n extra bits (0..7), taken from the top bits of ub
 *   dst      output buffer
 *   dst_len  number of 32-bit state words to write to dst
 *
 * A pending partial block (buffered bytes and/or extra bits) is
 * zero-padded and compressed as a normal block. A block holding only
 * the encoded message length is then compressed with the "last" flag
 * set, and the leading state words are written out.
 */
static void
finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
{
	sph_simd_small_context *ctx = (sph_simd_small_context *)cc;
	unsigned char *out = (unsigned char *)dst;
	size_t word;

	if (ctx->ptr > 0 || n > 0) {
		memset(ctx->buf + ctx->ptr, 0, (sizeof ctx->buf) - ctx->ptr);
		ctx->buf[ctx->ptr] = ub & (0xFF << (8 - n));
		compress_small(ctx, 0);
	}

	/* ptr is deliberately not reset above: the length block needs it. */
	memset(ctx->buf, 0, sizeof ctx->buf);
	encode_count_small(ctx->buf,
		ctx->count_low, ctx->count_high, ctx->ptr, n);
	compress_small(ctx, 1);

	for (word = 0; word < dst_len; word ++, out += 4)
		sph_enc32le(out, ctx->state[word]);
}
/*
 * Finish a big (SIMD-384 / SIMD-512) computation.
 *
 *   cc       pointer to a sph_simd_big_context
 *   ub, n    n extra bits (0..7), taken from the top bits of ub
 *   dst      output buffer
 *   dst_len  number of 32-bit state words to write to dst
 *
 * A pending partial block (buffered bytes and/or extra bits) is
 * zero-padded and compressed as a normal block. A block holding only
 * the encoded message length is then compressed with the "last" flag
 * set, and the leading state words are written out.
 */
static void
finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
{
	sph_simd_big_context *ctx = (sph_simd_big_context *)cc;
	unsigned char *out = (unsigned char *)dst;
	size_t word;

	if (ctx->ptr > 0 || n > 0) {
		memset(ctx->buf + ctx->ptr, 0, (sizeof ctx->buf) - ctx->ptr);
		ctx->buf[ctx->ptr] = ub & (0xFF << (8 - n));
		compress_big(ctx, 0);
	}

	/* ptr is deliberately not reset above: the length block needs it. */
	memset(ctx->buf, 0, sizeof ctx->buf);
	encode_count_big(ctx->buf,
		ctx->count_low, ctx->count_high, ctx->ptr, n);
	compress_big(ctx, 1);

	for (word = 0; word < dst_len; word ++, out += 4)
		sph_enc32le(out, ctx->state[word]);
}
/* Initialize a SIMD-224 context (cc points to a small context). */
void
sph_simd224_init(void *cc)
{
init_small(cc, IV224);
}
/* Feed len bytes from data into a SIMD-224 computation. */
void
sph_simd224(void *cc, const void *data, size_t len)
{
update_small(cc, data, len);
}
/* Finish SIMD-224 with no extra bits; see sph_simd224_addbits_and_close. */
void
sph_simd224_close(void *cc, void *dst)
{
sph_simd224_addbits_and_close(cc, 0, 0, dst);
}
/*
 * Append the n top bits of ub (n in 0..7), finish the computation and
 * write 7 state words (28 bytes) to dst. The context is re-initialized
 * afterwards and can be used for a new SIMD-224 computation.
 */
void
sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
finalize_small(cc, ub, n, dst, 7);
sph_simd224_init(cc);
}
/* Initialize a SIMD-256 context (cc points to a small context). */
void
sph_simd256_init(void *cc)
{
init_small(cc, IV256);
}
/* Feed len bytes from data into a SIMD-256 computation. */
void
sph_simd256(void *cc, const void *data, size_t len)
{
update_small(cc, data, len);
}
/* Finish SIMD-256 with no extra bits; see sph_simd256_addbits_and_close. */
void
sph_simd256_close(void *cc, void *dst)
{
sph_simd256_addbits_and_close(cc, 0, 0, dst);
}
/*
 * Append the n top bits of ub (n in 0..7), finish the computation and
 * write 8 state words (32 bytes) to dst. The context is re-initialized
 * afterwards and can be used for a new SIMD-256 computation.
 */
void
sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
finalize_small(cc, ub, n, dst, 8);
sph_simd256_init(cc);
}
/* Initialize a SIMD-384 context (cc points to a big context). */
void
sph_simd384_init(void *cc)
{
init_big(cc, IV384);
}
/* Feed len bytes from data into a SIMD-384 computation. */
void
sph_simd384(void *cc, const void *data, size_t len)
{
update_big(cc, data, len);
}
/* Finish SIMD-384 with no extra bits; see sph_simd384_addbits_and_close. */
void
sph_simd384_close(void *cc, void *dst)
{
sph_simd384_addbits_and_close(cc, 0, 0, dst);
}
/*
 * Append the n top bits of ub (n in 0..7), finish the computation and
 * write 12 state words (48 bytes) to dst. The context is re-initialized
 * afterwards and can be used for a new SIMD-384 computation.
 */
void
sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
finalize_big(cc, ub, n, dst, 12);
sph_simd384_init(cc);
}
/* Initialize a SIMD-512 context (cc points to a big context). */
void
sph_simd512_init(void *cc)
{
init_big(cc, IV512);
}
/* Feed len bytes from data into a SIMD-512 computation. */
void
sph_simd512(void *cc, const void *data, size_t len)
{
update_big(cc, data, len);
}
/* Finish SIMD-512 with no extra bits; see sph_simd512_addbits_and_close. */
void
sph_simd512_close(void *cc, void *dst)
{
sph_simd512_addbits_and_close(cc, 0, 0, dst);
}
/*
 * Append the n top bits of ub (n in 0..7), finish the computation and
 * write 16 state words (64 bytes) to dst. The context is re-initialized
 * afterwards and can be used for a new SIMD-512 computation.
 */
void
sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
finalize_big(cc, ub, n, dst, 16);
sph_simd512_init(cc);
}