You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1799 lines
48 KiB
1799 lines
48 KiB
/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
/*
 * SIMD implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010 Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author Thomas Pornin <thomas.pornin@cryptolog.com>
 */
|
|
|
#include <stddef.h> |
|
#include <string.h> |
|
#include <limits.h> |
|
|
|
#include "sph_simd.h" |
|
|
|
#ifdef __cplusplus |
|
extern "C"{ |
|
#endif |
|
|
|
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD |
|
#define SPH_SMALL_FOOTPRINT_SIMD 1 |
|
#endif |
|
|
|
#ifdef _MSC_VER |
|
#pragma warning (disable: 4146) |
|
#endif |
|
|
|
typedef sph_u32 u32; |
|
typedef sph_s32 s32; |
|
#define C32 SPH_C32 |
|
#define T32 SPH_T32 |
|
#define ROL32 SPH_ROTL32 |
|
|
|
/*
 * Token pasting with prior macro expansion: XCAT() lets the preprocessor
 * expand both arguments, then XCAT_() glues the results into one token.
 */
#define XCAT(x, y)     XCAT_(x, y)
#define XCAT_(x, y)    x ## y
|
|
|
/* |
|
* The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive. |
|
*/ |
|
static const s32 alpha_tab[] = { |
|
1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130, |
|
190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28, |
|
120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180, |
|
184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19, |
|
8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12, |
|
235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224, |
|
189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155, |
|
187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152, |
|
64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96, |
|
81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250, |
|
227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212, |
|
211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188, |
|
255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254, |
|
134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201, |
|
17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154, |
|
146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219, |
|
241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233, |
|
44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66, |
|
136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204, |
|
140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210, |
|
129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65, |
|
95, 40, 98, 163 |
|
}; |
|
|
|
/*
 * Partial reductions modulo 257. They rely on 256 = -1 mod 257 (REDS1)
 * and 65536 = 1 mod 257 (REDS2), so the result is congruent to the input
 * but not necessarily fully reduced.
 *
 * Ranges:
 *   REDS1: from -32768..98302 to -383..383
 *   REDS2: from -2^31..2^31-1 to -32768..98302
 *
 * Both macros evaluate their argument twice. For negative inputs they
 * rely on ">>" being an arithmetic (sign-propagating) shift.
 */
#define REDS1(x)    (((x) & 0xFF) - ((x) >> 8))
#define REDS2(x)    (((x) & 0xFFFF) + ((x) >> 16))
|
|
|
/*
 * One butterfly layer over q[rb .. rb + 2*hk - 1]: each pair
 * (q[rb + u], q[rb + u + hk]) becomes (m + t, m - t), where t is
 * q[rb + u + hk] multiplied by alpha_tab[u * as] and then reduced
 * with REDS2.
 *
 *   rb   index of the first element of the block in q[]
 *   hk   half size of the block (distance between paired elements)
 *   as   stride into alpha_tab[] between consecutive pairs
 *   id   label name, must be unique within the enclosing function
 *
 * The pair at u = 0 uses alpha_tab[0] = 1, so it is handled without a
 * multiplication before the loop; the "goto id" then enters the 4-way
 * unrolled loop body right after its first pair.
 *
 * If, upon entry, the values of q[] are all in the -N..N range (where
 * N >= 98302) then the new values of q[] are in the -2N..2N range.
 *
 * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
 *
 * Expects an array q[] of s32 in scope.
 */
#define FFT_LOOP(rb, hk, as, id)   do { \
		size_t u, v; \
		s32 m = q[(rb)]; \
		s32 n = q[(rb) + (hk)]; \
		q[(rb)] = m + n; \
		q[(rb) + (hk)] = m - n; \
		u = v = 0; \
		goto id; \
		for (; u < (hk); u += 4, v += 4 * (as)) { \
			s32 t; \
			m = q[(rb) + u + 0]; \
			n = q[(rb) + u + 0 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 0 * (as)]); \
			q[(rb) + u + 0] = m + t; \
			q[(rb) + u + 0 + (hk)] = m - t; \
		id: \
			m = q[(rb) + u + 1]; \
			n = q[(rb) + u + 1 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 1 * (as)]); \
			q[(rb) + u + 1] = m + t; \
			q[(rb) + u + 1 + (hk)] = m - t; \
			m = q[(rb) + u + 2]; \
			n = q[(rb) + u + 2 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 2 * (as)]); \
			q[(rb) + u + 2] = m + t; \
			q[(rb) + u + 2 + (hk)] = m - t; \
			m = q[(rb) + u + 3]; \
			n = q[(rb) + u + 3 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 3 * (as)]); \
			q[(rb) + u + 3] = m + t; \
			q[(rb) + u + 3 + (hk)] = m - t; \
		} \
	} while (0)
|
|
|
/*
 * 8-point transform of the four inputs x[xb], x[xb + xs], x[xb + 2*xs]
 * and x[xb + 3*xs]. The eight results are stored in the variables
 * d0 .. d7, where "d" is the name prefix given as third argument; these
 * variables must already be declared by the caller.
 *
 * Expects an array (or pointer) x[] in scope.
 *
 * Output ranges:
 *   d0: min=    0   max= 1020
 *   d1: min=  -67   max= 4587
 *   d2: min=-4335   max= 4335
 *   d3: min=-4147   max=  507
 *   d4: min= -510   max=  510
 *   d5: min= -252   max= 4402
 *   d6: min=-4335   max= 4335
 *   d7: min=-4332   max=  322
 */
#define FFT8(xb, xs, d)   do { \
		s32 x0 = x[(xb)]; \
		s32 x1 = x[(xb) + (xs)]; \
		s32 x2 = x[(xb) + 2 * (xs)]; \
		s32 x3 = x[(xb) + 3 * (xs)]; \
		s32 a0 = x0 + x2; \
		s32 a1 = x0 + (x2 << 4); \
		s32 a2 = x0 - x2; \
		s32 a3 = x0 - (x2 << 4); \
		s32 b0 = x1 + x3; \
		s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
		s32 b2 = (x1 << 4) - (x3 << 4); \
		s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
		d ## 0 = a0 + b0; \
		d ## 1 = a1 + b1; \
		d ## 2 = a2 + b2; \
		d ## 3 = a3 + b3; \
		d ## 4 = a0 - b0; \
		d ## 5 = a1 - b1; \
		d ## 6 = a2 - b2; \
		d ## 7 = a3 - b3; \
	} while (0)
|
|
|
/*
 * 16-point transform: two FFT8 over the even and odd inputs, combined
 * into q[rb .. rb + 15].
 *
 * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
 * to a multiplication by a power of two. The d2_* values may be negative
 * (see the FFT8 output ranges) and left-shifting a negative signed value
 * is undefined behaviour in C, so explicit multiplications are used; the
 * products stay well within 32 bits.
 *
 * Expects x[] (input) and q[] (output, s32) in scope.
 *
 * Output: within -591471..591723
 */
#define FFT16(xb, xs, rb)   do { \
		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
		FFT8(xb, (xs) << 1, d1_); \
		FFT8((xb) + (xs), (xs) << 1, d2_); \
		q[(rb) +  0] = d1_0 + d2_0; \
		q[(rb) +  1] = d1_1 + (d2_1 * 2); \
		q[(rb) +  2] = d1_2 + (d2_2 * 4); \
		q[(rb) +  3] = d1_3 + (d2_3 * 8); \
		q[(rb) +  4] = d1_4 + (d2_4 * 16); \
		q[(rb) +  5] = d1_5 + (d2_5 * 32); \
		q[(rb) +  6] = d1_6 + (d2_6 * 64); \
		q[(rb) +  7] = d1_7 + (d2_7 * 128); \
		q[(rb) +  8] = d1_0 - d2_0; \
		q[(rb) +  9] = d1_1 - (d2_1 * 2); \
		q[(rb) + 10] = d1_2 - (d2_2 * 4); \
		q[(rb) + 11] = d1_3 - (d2_3 * 8); \
		q[(rb) + 12] = d1_4 - (d2_4 * 16); \
		q[(rb) + 13] = d1_5 - (d2_5 * 32); \
		q[(rb) + 14] = d1_6 - (d2_6 * 64); \
		q[(rb) + 15] = d1_7 - (d2_7 * 128); \
	} while (0)
|
|
|
/*
 * 32-point transform into q[rb .. rb + 31]: two FFT16 over the even and
 * odd inputs, then one FFT_LOOP layer. "id" is the label name handed to
 * FFT_LOOP and must be unique within the enclosing function.
 *
 * Output range: |q| <= 1183446
 */
#define FFT32(xb, xs, rb, id)   do { \
		FFT16(xb, (xs) << 1, rb); \
		FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
		FFT_LOOP(rb, 16, 8, id); \
	} while (0)

/*
 * 64-point transform into q[rb .. rb + 63]: two FFT32 (using the labels
 * "id"a and "id"b), then one FFT_LOOP layer using label "id".
 *
 * Output range: |q| <= 2366892
 */
#define FFT64(xb, xs, rb, id)   do { \
		FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
		FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
		FFT_LOOP(rb, 32, 4, id); \
	} while (0)
|
|
|
#if SPH_SMALL_FOOTPRINT_SIMD |
|
|
|
static void |
|
fft32(unsigned char *x, size_t xs, s32 *q) |
|
{ |
|
size_t xd; |
|
|
|
xd = xs << 1; |
|
FFT16(0, xd, 0); |
|
FFT16(xs, xd, 16); |
|
FFT_LOOP(0, 16, 8, label_); |
|
} |
|
|
|
/*
 * 128-point transform into q[rb .. rb + 127], small-footprint variant:
 * four calls to fft32() (inputs taken with stride 4*xs, starting at
 * offsets 0, 2, 1 and 3 times xs), combined with three FFT_LOOP layers.
 * Labels "id"aa, "id"ab and "id"a are used.
 *
 * Expects x (unsigned char pointer) and q[] (s32) in scope.
 */
#define FFT128(xb, xs, rb, id)   do { \
		fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +  0]); \
		fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
		FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
		fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
		fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
		FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
		FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
	} while (0)
|
|
|
#else |
|
|
|
/*
 * 128-point transform into q[rb .. rb + 127], fully unrolled variant:
 * two FFT64 (labels "id"a and "id"b) followed by one FFT_LOOP layer
 * (label "id").
 *
 * Output range: |q| <= 4733784
 */
#define FFT128(xb, xs, rb, id)   do { \
		FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
		FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
		FFT_LOOP(rb, 64, 2, id); \
	} while (0)
|
|
|
#endif |
|
|
|
/* |
|
* For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression |
|
* function which does not fit in the 32 kB L1 cache of a typical x86 |
|
* Intel. We therefore add a function call layer at the FFT64 level. |
|
*/ |
|
|
|
static void |
|
fft64(unsigned char *x, size_t xs, s32 *q) |
|
{ |
|
size_t xd; |
|
|
|
xd = xs << 1; |
|
FFT32(0, xd, 0, label_a); |
|
FFT32(xs, xd, 32, label_b); |
|
FFT_LOOP(0, 32, 4, label_); |
|
} |
|
|
|
/*
 * 256-point transform into q[rb .. rb + 255]: four calls to fft64()
 * (inputs taken with stride 4*xs, starting at offsets 0, 2, 1 and 3
 * times xs), combined with three FFT_LOOP layers. Labels "id"aa, "id"ab
 * and "id"a are used.
 *
 * Expects x (unsigned char pointer) and q[] (s32) in scope.
 *
 * Output range: |q| <= 9467568
 */
#define FFT256(xb, xs, rb, id)   do { \
		fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +   0]); \
		fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) +  64]); \
		FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
		fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
		fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
		FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
		FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
	} while (0)
|
|
|
/*
 * alpha^(127*i) mod 257, for i = 0 to 127. Consecutive entries differ by
 * a factor of 98 modulo 257. Added to q[] by the small compression
 * function for non-final blocks.
 */
static const unsigned short yoff_s_n[] = {
	1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29,
	15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178,
	225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100,
	34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215,
	253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141,
	197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59,
	128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114,
	121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168,
	16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207,
	240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21,
	2, 196, 190, 116, 60, 226, 46, 139
};
|
|
|
/*
 * alpha^(127*i) + alpha^(125*i) mod 257, for i = 0 to 127. Added to q[]
 * by the small compression function for the final block.
 */
static const unsigned short yoff_s_f[] = {
	2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3,
	49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65,
	96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113,
	17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143,
	189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6,
	77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95,
	160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53,
	181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150,
	0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109,
	210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30,
	10, 146, 117, 251, 180, 247, 236, 108
};
|
|
|
/*
 * beta^(255*i) mod 257, for i = 0 to 255. Consecutive entries differ by
 * a factor of 163 modulo 257.
 */
static const unsigned short yoff_b_n[] = {
	1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172,
	23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101,
	15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10,
	88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230,
	225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150,
	35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109,
	34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194,
	11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93,
	253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83,
	165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110,
	197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217,
	162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108,
	128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171,
	117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78,
	121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252,
	213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142,
	16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182,
	111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74,
	240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160,
	123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82,
	2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87,
	46, 45, 139, 41
};
|
|
|
/*
 * beta^(255*i) + beta^(253*i) mod 257, for i = 0 to 255.
 */
static const unsigned short yoff_b_f[] = {
	2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20,
	111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89,
	49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239,
	253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79,
	96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226,
	248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115,
	17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208,
	57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40,
	189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45,
	187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107,
	77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210,
	139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6,
	160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190,
	106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208,
	181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127,
	96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193,
	0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44,
	245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9,
	210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94,
	53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185,
	10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156,
	236, 192, 108, 86
};
|
|
|
/*
 * Build one 32-bit message word from two q[] values: the low 16 bits are
 * those of l * mm, the high 16 bits are the low 16 bits of h * mm (the
 * left shift of the u32 value drops the upper half).
 */
#define INNER(l, h, mm)   (((u32)((l) * (mm)) & 0xFFFFU) \
                          + ((u32)((h) * (mm)) << 16))
|
|
|
/*
 * Four message words for one step of the small compression function,
 * taken from the 8-element group of q[] number "sb" at offsets o1 (low
 * half) and o2 (high half), with multiplier mm.
 *
 * The expansion deliberately starts with an opening parenthesis and has
 * no closing one: it is meant to be placed right after the macro name
 * STEP_SMALL by STEP_SMALL_(), which provides the remaining arguments
 * and the closing parenthesis.
 */
#define W_SMALL(sb, o1, o2, mm) \
	(INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)

/* WS_r_k: message words for step k (0..7) of round r (0..3). */
#define WS_0_0   W_SMALL( 4,    0,    1, 185)
#define WS_0_1   W_SMALL( 6,    0,    1, 185)
#define WS_0_2   W_SMALL( 0,    0,    1, 185)
#define WS_0_3   W_SMALL( 2,    0,    1, 185)
#define WS_0_4   W_SMALL( 7,    0,    1, 185)
#define WS_0_5   W_SMALL( 5,    0,    1, 185)
#define WS_0_6   W_SMALL( 3,    0,    1, 185)
#define WS_0_7   W_SMALL( 1,    0,    1, 185)
#define WS_1_0   W_SMALL(15,    0,    1, 185)
#define WS_1_1   W_SMALL(11,    0,    1, 185)
#define WS_1_2   W_SMALL(12,    0,    1, 185)
#define WS_1_3   W_SMALL( 8,    0,    1, 185)
#define WS_1_4   W_SMALL( 9,    0,    1, 185)
#define WS_1_5   W_SMALL(13,    0,    1, 185)
#define WS_1_6   W_SMALL(10,    0,    1, 185)
#define WS_1_7   W_SMALL(14,    0,    1, 185)
#define WS_2_0   W_SMALL(17, -128,  -64, 233)
#define WS_2_1   W_SMALL(18, -128,  -64, 233)
#define WS_2_2   W_SMALL(23, -128,  -64, 233)
#define WS_2_3   W_SMALL(20, -128,  -64, 233)
#define WS_2_4   W_SMALL(22, -128,  -64, 233)
#define WS_2_5   W_SMALL(21, -128,  -64, 233)
#define WS_2_6   W_SMALL(16, -128,  -64, 233)
#define WS_2_7   W_SMALL(19, -128,  -64, 233)
#define WS_3_0   W_SMALL(30, -191, -127, 233)
#define WS_3_1   W_SMALL(24, -191, -127, 233)
#define WS_3_2   W_SMALL(25, -191, -127, 233)
#define WS_3_3   W_SMALL(31, -191, -127, 233)
#define WS_3_4   W_SMALL(27, -191, -127, 233)
#define WS_3_5   W_SMALL(29, -191, -127, 233)
#define WS_3_6   W_SMALL(28, -191, -127, 233)
#define WS_3_7   W_SMALL(26, -191, -127, 233)
|
|
|
/*
 * Eight message words for one step of the big compression function,
 * taken from the 16-element group of q[] number "sb" at offsets o1 (low
 * half) and o2 (high half), with multiplier mm.
 *
 * The expansion deliberately starts with an opening parenthesis and has
 * no closing one: it is meant to be placed right after the macro name
 * STEP_BIG by STEP_BIG_(), which provides the remaining arguments and
 * the closing parenthesis.
 */
#define W_BIG(sb, o1, o2, mm) \
	(INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)

/* WB_r_k: message words for step k (0..7) of round r (0..3). */
#define WB_0_0   W_BIG( 4,    0,    1, 185)
#define WB_0_1   W_BIG( 6,    0,    1, 185)
#define WB_0_2   W_BIG( 0,    0,    1, 185)
#define WB_0_3   W_BIG( 2,    0,    1, 185)
#define WB_0_4   W_BIG( 7,    0,    1, 185)
#define WB_0_5   W_BIG( 5,    0,    1, 185)
#define WB_0_6   W_BIG( 3,    0,    1, 185)
#define WB_0_7   W_BIG( 1,    0,    1, 185)
#define WB_1_0   W_BIG(15,    0,    1, 185)
#define WB_1_1   W_BIG(11,    0,    1, 185)
#define WB_1_2   W_BIG(12,    0,    1, 185)
#define WB_1_3   W_BIG( 8,    0,    1, 185)
#define WB_1_4   W_BIG( 9,    0,    1, 185)
#define WB_1_5   W_BIG(13,    0,    1, 185)
#define WB_1_6   W_BIG(10,    0,    1, 185)
#define WB_1_7   W_BIG(14,    0,    1, 185)
#define WB_2_0   W_BIG(17, -256, -128, 233)
#define WB_2_1   W_BIG(18, -256, -128, 233)
#define WB_2_2   W_BIG(23, -256, -128, 233)
#define WB_2_3   W_BIG(20, -256, -128, 233)
#define WB_2_4   W_BIG(22, -256, -128, 233)
#define WB_2_5   W_BIG(21, -256, -128, 233)
#define WB_2_6   W_BIG(16, -256, -128, 233)
#define WB_2_7   W_BIG(19, -256, -128, 233)
#define WB_3_0   W_BIG(30, -383, -255, 233)
#define WB_3_1   W_BIG(24, -383, -255, 233)
#define WB_3_2   W_BIG(25, -383, -255, 233)
#define WB_3_3   W_BIG(31, -383, -255, 233)
#define WB_3_4   W_BIG(27, -383, -255, 233)
#define WB_3_5   W_BIG(29, -383, -255, 233)
#define WB_3_6   W_BIG(28, -383, -255, 233)
#define WB_3_7   W_BIG(26, -383, -255, 233)
|
|
|
/*
 * Bitwise Boolean functions used by the steps:
 *   IF(x, y, z)    each result bit is the bit of y where x has a 1,
 *                  and the bit of z where x has a 0
 *   MAJ(x, y, z)   each result bit is the majority of the three bits
 */
#define IF(x, y, z)    ((((y) ^ (z)) & (x)) ^ (z))
#define MAJ(x, y, z)   (((x) & (y)) | (((x) | (y)) & (z)))
|
|
|
/*
 * Lane permutations for the small (4-lane) state: PP4_j_n is the lane
 * whose rotated A value is used for lane n under permutation j. The
 * values are n XOR 1, n XOR 2 and n XOR 3 for j = 0, 1 and 2.
 */
#define PP4_0_0   1
#define PP4_0_1   0
#define PP4_0_2   3
#define PP4_0_3   2

#define PP4_1_0   2
#define PP4_1_1   3
#define PP4_1_2   0
#define PP4_1_3   1

#define PP4_2_0   3
#define PP4_2_1   2
#define PP4_2_2   1
#define PP4_2_3   0

/*
 * Lane permutations for the big (8-lane) state: PP8_j_n is n XOR m,
 * with m = 1, 6, 2, 3, 5, 7, 4 for j = 0 to 6.
 */
#define PP8_0_0   1
#define PP8_0_1   0
#define PP8_0_2   3
#define PP8_0_3   2
#define PP8_0_4   5
#define PP8_0_5   4
#define PP8_0_6   7
#define PP8_0_7   6

#define PP8_1_0   6
#define PP8_1_1   7
#define PP8_1_2   4
#define PP8_1_3   5
#define PP8_1_4   2
#define PP8_1_5   3
#define PP8_1_6   0
#define PP8_1_7   1

#define PP8_2_0   2
#define PP8_2_1   3
#define PP8_2_2   0
#define PP8_2_3   1
#define PP8_2_4   6
#define PP8_2_5   7
#define PP8_2_6   4
#define PP8_2_7   5

#define PP8_3_0   3
#define PP8_3_1   2
#define PP8_3_2   1
#define PP8_3_3   0
#define PP8_3_4   7
#define PP8_3_5   6
#define PP8_3_6   5
#define PP8_3_7   4

#define PP8_4_0   5
#define PP8_4_1   4
#define PP8_4_2   7
#define PP8_4_3   6
#define PP8_4_4   1
#define PP8_4_5   0
#define PP8_4_6   3
#define PP8_4_7   2

#define PP8_5_0   7
#define PP8_5_1   6
#define PP8_5_2   5
#define PP8_5_3   4
#define PP8_5_4   3
#define PP8_5_5   2
#define PP8_5_6   1
#define PP8_5_7   0

#define PP8_6_0   4
#define PP8_6_1   5
#define PP8_6_2   6
#define PP8_6_3   7
#define PP8_6_4   0
#define PP8_6_5   1
#define PP8_6_6   2
#define PP8_6_7   3
|
|
|
/*
 * State access helpers. With SPH_SIMD_NOCOPY these macros expand to
 * nothing. Otherwise the state words are copied from (sc)->state[] into
 * local variables A0.., B0.., C0.., D0.. (declared by DECL_STATE_*) and
 * copied back afterwards:
 *   small state: 16 words, A0-A3 / B0-B3 / C0-C3 / D0-D3
 *   big state:   32 words, A0-A7 / B0-B7 / C0-C7 / D0-D7
 * The DECL_STATE_* macros carry their own trailing semicolon.
 */
#if SPH_SIMD_NOCOPY

#define DECL_STATE_SMALL
#define READ_STATE_SMALL(sc)
#define WRITE_STATE_SMALL(sc)
#define DECL_STATE_BIG
#define READ_STATE_BIG(sc)
#define WRITE_STATE_BIG(sc)

#else

#define DECL_STATE_SMALL \
	u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;

#define READ_STATE_SMALL(sc)   do { \
		A0 = (sc)->state[ 0]; \
		A1 = (sc)->state[ 1]; \
		A2 = (sc)->state[ 2]; \
		A3 = (sc)->state[ 3]; \
		B0 = (sc)->state[ 4]; \
		B1 = (sc)->state[ 5]; \
		B2 = (sc)->state[ 6]; \
		B3 = (sc)->state[ 7]; \
		C0 = (sc)->state[ 8]; \
		C1 = (sc)->state[ 9]; \
		C2 = (sc)->state[10]; \
		C3 = (sc)->state[11]; \
		D0 = (sc)->state[12]; \
		D1 = (sc)->state[13]; \
		D2 = (sc)->state[14]; \
		D3 = (sc)->state[15]; \
	} while (0)

#define WRITE_STATE_SMALL(sc)   do { \
		(sc)->state[ 0] = A0; \
		(sc)->state[ 1] = A1; \
		(sc)->state[ 2] = A2; \
		(sc)->state[ 3] = A3; \
		(sc)->state[ 4] = B0; \
		(sc)->state[ 5] = B1; \
		(sc)->state[ 6] = B2; \
		(sc)->state[ 7] = B3; \
		(sc)->state[ 8] = C0; \
		(sc)->state[ 9] = C1; \
		(sc)->state[10] = C2; \
		(sc)->state[11] = C3; \
		(sc)->state[12] = D0; \
		(sc)->state[13] = D1; \
		(sc)->state[14] = D2; \
		(sc)->state[15] = D3; \
	} while (0)

#define DECL_STATE_BIG \
	u32 A0, A1, A2, A3, A4, A5, A6, A7; \
	u32 B0, B1, B2, B3, B4, B5, B6, B7; \
	u32 C0, C1, C2, C3, C4, C5, C6, C7; \
	u32 D0, D1, D2, D3, D4, D5, D6, D7;

#define READ_STATE_BIG(sc)   do { \
		A0 = (sc)->state[ 0]; \
		A1 = (sc)->state[ 1]; \
		A2 = (sc)->state[ 2]; \
		A3 = (sc)->state[ 3]; \
		A4 = (sc)->state[ 4]; \
		A5 = (sc)->state[ 5]; \
		A6 = (sc)->state[ 6]; \
		A7 = (sc)->state[ 7]; \
		B0 = (sc)->state[ 8]; \
		B1 = (sc)->state[ 9]; \
		B2 = (sc)->state[10]; \
		B3 = (sc)->state[11]; \
		B4 = (sc)->state[12]; \
		B5 = (sc)->state[13]; \
		B6 = (sc)->state[14]; \
		B7 = (sc)->state[15]; \
		C0 = (sc)->state[16]; \
		C1 = (sc)->state[17]; \
		C2 = (sc)->state[18]; \
		C3 = (sc)->state[19]; \
		C4 = (sc)->state[20]; \
		C5 = (sc)->state[21]; \
		C6 = (sc)->state[22]; \
		C7 = (sc)->state[23]; \
		D0 = (sc)->state[24]; \
		D1 = (sc)->state[25]; \
		D2 = (sc)->state[26]; \
		D3 = (sc)->state[27]; \
		D4 = (sc)->state[28]; \
		D5 = (sc)->state[29]; \
		D6 = (sc)->state[30]; \
		D7 = (sc)->state[31]; \
	} while (0)

#define WRITE_STATE_BIG(sc)   do { \
		(sc)->state[ 0] = A0; \
		(sc)->state[ 1] = A1; \
		(sc)->state[ 2] = A2; \
		(sc)->state[ 3] = A3; \
		(sc)->state[ 4] = A4; \
		(sc)->state[ 5] = A5; \
		(sc)->state[ 6] = A6; \
		(sc)->state[ 7] = A7; \
		(sc)->state[ 8] = B0; \
		(sc)->state[ 9] = B1; \
		(sc)->state[10] = B2; \
		(sc)->state[11] = B3; \
		(sc)->state[12] = B4; \
		(sc)->state[13] = B5; \
		(sc)->state[14] = B6; \
		(sc)->state[15] = B7; \
		(sc)->state[16] = C0; \
		(sc)->state[17] = C1; \
		(sc)->state[18] = C2; \
		(sc)->state[19] = C3; \
		(sc)->state[20] = C4; \
		(sc)->state[21] = C5; \
		(sc)->state[22] = C6; \
		(sc)->state[23] = C7; \
		(sc)->state[24] = D0; \
		(sc)->state[25] = D1; \
		(sc)->state[26] = D2; \
		(sc)->state[27] = D3; \
		(sc)->state[28] = D4; \
		(sc)->state[29] = D5; \
		(sc)->state[30] = D6; \
		(sc)->state[31] = D7; \
	} while (0)

#endif
|
|
|
/*
 * One step on lane n of the state (variables An, Bn, Cn, Dn):
 *   tt = Dn + w + fun(An, Bn, Cn)
 *   An = rotl(tt, s) + tA<p>, with p = <ppb><n> (a PP4_* / PP8_* entry)
 *   Dn = Cn, Cn = Bn, Bn = tAn
 * The tA<k> variables (the rotated old A values) must have been
 * declared by the caller; see STEP_SMALL and STEP_BIG. "ppb" is a
 * permutation name prefix such as PP4_0_.
 */
#define STEP_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA ## n; \
	} while (0)
|
|
|
/*
 * One step of the small compression function over the four lanes.
 *
 *   w0..w3   message word for each lane
 *   fun      Boolean function (IF or MAJ)
 *   r        rotation count applied to the old A values
 *   s        rotation count applied inside STEP_ELT
 *   pp4b     lane permutation prefix (PP4_0_, PP4_1_ or PP4_2_)
 *
 * All four rotated A values are computed first, since every lane update
 * reads the rotated value of another lane.
 */
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		STEP_ELT(0, w0, fun, s, pp4b); \
		STEP_ELT(1, w1, fun, s, pp4b); \
		STEP_ELT(2, w2, fun, s, pp4b); \
		STEP_ELT(3, w3, fun, s, pp4b); \
	} while (0)
|
|
|
/*
 * One step of the big compression function over the eight lanes.
 *
 *   w0..w7   message word for each lane
 *   fun      Boolean function (IF or MAJ)
 *   r        rotation count applied to the old A values
 *   s        rotation count applied inside STEP_ELT
 *   pp8b     lane permutation prefix (PP8_0_ .. PP8_6_)
 *
 * All eight rotated A values are computed first, since every lane update
 * reads the rotated value of another lane.
 */
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		u32 tA4 = ROL32(A4, r); \
		u32 tA5 = ROL32(A5, r); \
		u32 tA6 = ROL32(A6, r); \
		u32 tA7 = ROL32(A7, r); \
		STEP_ELT(0, w0, fun, s, pp8b); \
		STEP_ELT(1, w1, fun, s, pp8b); \
		STEP_ELT(2, w2, fun, s, pp8b); \
		STEP_ELT(3, w3, fun, s, pp8b); \
		STEP_ELT(4, w4, fun, s, pp8b); \
		STEP_ELT(5, w5, fun, s, pp8b); \
		STEP_ELT(6, w6, fun, s, pp8b); \
		STEP_ELT(7, w7, fun, s, pp8b); \
	} while (0)
|
|
|
/*
 * M3_k_i is (k + i) mod 3 followed by an underscore; pasted after
 * "PP4_" it selects the lane permutation of step k in a round whose
 * starting permutation index is i.
 */
#define M3_0_0   0_
#define M3_1_0   1_
#define M3_2_0   2_
#define M3_3_0   0_
#define M3_4_0   1_
#define M3_5_0   2_
#define M3_6_0   0_
#define M3_7_0   1_

#define M3_0_1   1_
#define M3_1_1   2_
#define M3_2_1   0_
#define M3_3_1   1_
#define M3_4_1   2_
#define M3_5_1   0_
#define M3_6_1   1_
#define M3_7_1   2_

#define M3_0_2   2_
#define M3_1_2   0_
#define M3_2_2   1_
#define M3_3_2   2_
#define M3_4_2   0_
#define M3_5_2   1_
#define M3_6_2   2_
#define M3_7_2   0_

/*
 * "w" is a WS_* macro, whose expansion is an opening parenthesis plus
 * four message words; this macro appends the remaining STEP_SMALL
 * arguments and the closing parenthesis. The unbalanced parentheses are
 * intentional.
 */
#define STEP_SMALL_(w, fun, r, s, pp4b)   STEP_SMALL w, fun, r, s, pp4b)

/*
 * One full round (8 steps) of the small compression function: four IF
 * steps followed by four MAJ steps.
 *
 *   ri       round prefix with trailing underscore (e.g. 0_), selects
 *            the message words WS_<ri>0 .. WS_<ri>7
 *   isp      starting lane permutation index (0, 1 or 2)
 *   p0..p3   rotation counts, used as the (r, s) pairs (p0,p1),
 *            (p1,p2), (p2,p3), (p3,p0), twice
 */
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)   do { \
		STEP_SMALL_(WS_ ## ri ## 0, \
			IF,  p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 1, \
			IF,  p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 2, \
			IF,  p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 3, \
			IF,  p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
	} while (0)
|
|
|
/*
 * M7_k_i is (k + i) mod 7 followed by an underscore; pasted after
 * "PP8_" it selects the lane permutation of step k in a round whose
 * starting permutation index is i.
 */
#define M7_0_0   0_
#define M7_1_0   1_
#define M7_2_0   2_
#define M7_3_0   3_
#define M7_4_0   4_
#define M7_5_0   5_
#define M7_6_0   6_
#define M7_7_0   0_

#define M7_0_1   1_
#define M7_1_1   2_
#define M7_2_1   3_
#define M7_3_1   4_
#define M7_4_1   5_
#define M7_5_1   6_
#define M7_6_1   0_
#define M7_7_1   1_

#define M7_0_2   2_
#define M7_1_2   3_
#define M7_2_2   4_
#define M7_3_2   5_
#define M7_4_2   6_
#define M7_5_2   0_
#define M7_6_2   1_
#define M7_7_2   2_

#define M7_0_3   3_
#define M7_1_3   4_
#define M7_2_3   5_
#define M7_3_3   6_
#define M7_4_3   0_
#define M7_5_3   1_
#define M7_6_3   2_
#define M7_7_3   3_

/*
 * "w" is a WB_* macro, whose expansion is an opening parenthesis plus
 * eight message words; this macro appends the remaining STEP_BIG
 * arguments and the closing parenthesis. The unbalanced parentheses are
 * intentional.
 */
#define STEP_BIG_(w, fun, r, s, pp8b)   STEP_BIG w, fun, r, s, pp8b)

/*
 * One full round (8 steps) of the big compression function: four IF
 * steps followed by four MAJ steps.
 *
 *   ri       round prefix with trailing underscore (e.g. 0_), selects
 *            the message words WB_<ri>0 .. WB_<ri>7
 *   isp      starting lane permutation index (0 to 3)
 *   p0..p3   rotation counts, used as the (r, s) pairs (p0,p1),
 *            (p1,p2), (p2,p3), (p3,p0), twice
 */
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)   do { \
		STEP_BIG_(WB_ ## ri ## 0, \
			IF,  p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 1, \
			IF,  p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 2, \
			IF,  p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 3, \
			IF,  p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
	} while (0)
|
|
|
#if SPH_SMALL_FOOTPRINT_SIMD |
|
|
|
/*
 * Small-footprint variant: the state words are elements of an array
 * named "state" that must be in scope wherever the STEP macros are used.
 */
#define A0   state[ 0]
#define A1   state[ 1]
#define A2   state[ 2]
#define A3   state[ 3]
#define B0   state[ 4]
#define B1   state[ 5]
#define B2   state[ 6]
#define B3   state[ 7]
#define C0   state[ 8]
#define C1   state[ 9]
#define C2   state[10]
#define C3   state[11]
#define D0   state[12]
#define D1   state[13]
#define D2   state[14]
#define D3   state[15]
|
|
|
/*
 * Same lane update as STEP_ELT, but the rotated A values are held in an
 * array tA[] and the partner lane is selected at run time as tA[ppb ^ n],
 * where ppb is an XOR mask.
 */
#define STEP2_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA[n]; \
	} while (0)
|
|
|
/*
 * Same as STEP_SMALL, except that pp4b is a run-time XOR mask on the
 * lane index (handled by STEP2_ELT) instead of a PP4_* name prefix.
 */
#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
		u32 tA[4]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		STEP2_ELT(0, w0, fun, s, pp4b); \
		STEP2_ELT(1, w1, fun, s, pp4b); \
		STEP2_ELT(2, w2, fun, s, pp4b); \
		STEP2_ELT(3, w3, fun, s, pp4b); \
	} while (0)
|
|
|
static void |
|
one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) |
|
{ |
|
static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 }; |
|
|
|
STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF, p0, p1, pp4k[isp + 0]); |
|
STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]); |
|
STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]); |
|
STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]); |
|
STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]); |
|
STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]); |
|
STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]); |
|
STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]); |
|
} |
|
|
|
static void |
|
compress_small(sph_simd_small_context *sc, int last) |
|
{ |
|
unsigned char *x; |
|
s32 q[128]; |
|
int i; |
|
u32 w[32]; |
|
u32 state[16]; |
|
size_t u; |
|
|
|
static const size_t wsp[32] = { |
|
4 << 3, 6 << 3, 0 << 3, 2 << 3, |
|
7 << 3, 5 << 3, 3 << 3, 1 << 3, |
|
15 << 3, 11 << 3, 12 << 3, 8 << 3, |
|
9 << 3, 13 << 3, 10 << 3, 14 << 3, |
|
17 << 3, 18 << 3, 23 << 3, 20 << 3, |
|
22 << 3, 21 << 3, 16 << 3, 19 << 3, |
|
30 << 3, 24 << 3, 25 << 3, 31 << 3, |
|
27 << 3, 29 << 3, 28 << 3, 26 << 3 |
|
}; |
|
|
|
x = sc->buf; |
|
FFT128(0, 1, 0, ll); |
|
if (last) { |
|
for (i = 0; i < 128; i ++) { |
|
s32 tq; |
|
|
|
tq = q[i] + yoff_s_f[i]; |
|
tq = REDS2(tq); |
|
tq = REDS1(tq); |
|
tq = REDS1(tq); |
|
q[i] = (tq <= 128 ? tq : tq - 257); |
|
} |
|
} else { |
|
for (i = 0; i < 128; i ++) { |
|
s32 tq; |
|
|
|
tq = q[i] + yoff_s_n[i]; |
|
tq = REDS2(tq); |
|
tq = REDS1(tq); |
|
tq = REDS1(tq); |
|
q[i] = (tq <= 128 ? tq : tq - 257); |
|
} |
|
} |
|
|
|
for (i = 0; i < 16; i += 4) { |
|
state[i + 0] = sc->state[i + 0] |
|
^ sph_dec32le_aligned(x + 4 * (i + 0)); |
|
state[i + 1] = sc->state[i + 1] |
|
^ sph_dec32le_aligned(x + 4 * (i + 1)); |
|
state[i + 2] = sc->state[i + 2] |
|
^ sph_dec32le_aligned(x + 4 * (i + 2)); |
|
state[i + 3] = sc->state[i + 3] |
|
^ sph_dec32le_aligned(x + 4 * (i + 3)); |
|
} |
|
|
|
#define WSREAD(sb, o1, o2, mm) do { \ |
|
for (u = 0; u < 32; u += 4) { \ |
|
size_t v = wsp[(u >> 2) + (sb)]; \ |
|
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ |
|
q[v + 2 * 0 + (o2)], mm); \ |
|
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ |
|
q[v + 2 * 1 + (o2)], mm); \ |
|
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ |
|
q[v + 2 * 2 + (o2)], mm); \ |
|
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ |
|
q[v + 2 * 3 + (o2)], mm); \ |
|
} \ |
|
} while (0) |
|
|
|
WSREAD( 0, 0, 1, 185); |
|
one_round_small(state, w, 0, 3, 23, 17, 27); |
|
WSREAD( 8, 0, 1, 185); |
|
one_round_small(state, w, 2, 28, 19, 22, 7); |
|
WSREAD(16, -128, -64, 233); |
|
one_round_small(state, w, 1, 29, 9, 15, 5); |
|
WSREAD(24, -191, -127, 233); |
|
one_round_small(state, w, 0, 4, 13, 10, 25); |
|
|
|
#undef WSREAD |
|
|
|
STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], |
|
IF, 4, 13, PP4_2_); |
|
STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], |
|
IF, 13, 10, PP4_0_); |
|
STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], |
|
IF, 10, 25, PP4_1_); |
|
STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], |
|
IF, 25, 4, PP4_2_); |
|
|
|
memcpy(sc->state, state, sizeof state); |
|
} |
|
|
|
/* Remove the small-footprint state word aliases. */
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef D0
#undef D1
#undef D2
#undef D3
|
|
|
#else |
|
|
|
/*
 * With SPH_SIMD_NOCOPY, the state words are not copied into local
 * variables: the names used by the STEP macros refer directly to the
 * state array of the context, which must be reachable as "sc".
 */
#if SPH_SIMD_NOCOPY
#define A0   (sc->state[ 0])
#define A1   (sc->state[ 1])
#define A2   (sc->state[ 2])
#define A3   (sc->state[ 3])
#define B0   (sc->state[ 4])
#define B1   (sc->state[ 5])
#define B2   (sc->state[ 6])
#define B3   (sc->state[ 7])
#define C0   (sc->state[ 8])
#define C1   (sc->state[ 9])
#define C2   (sc->state[10])
#define C3   (sc->state[11])
#define D0   (sc->state[12])
#define D1   (sc->state[13])
#define D2   (sc->state[14])
#define D3   (sc->state[15])
#endif
|
|
|
static void |
|
compress_small(sph_simd_small_context *sc, int last) |
|
{ |
|
unsigned char *x; |
|
s32 q[128]; |
|
int i; |
|
DECL_STATE_SMALL |
|
#if SPH_SIMD_NOCOPY |
|
sph_u32 saved[16]; |
|
#endif |
|
|
|
#if SPH_SIMD_NOCOPY |
|
memcpy(saved, sc->state, sizeof saved); |
|
#endif |
|
x = sc->buf; |
|
FFT128(0, 1, 0, ll); |
|
if (last) { |
|
for (i = 0; i < 128; i ++) { |
|
s32 tq; |
|
|
|
tq = q[i] + yoff_s_f[i]; |
|
tq = REDS2(tq); |
|
tq = REDS1(tq); |
|
tq = REDS1(tq); |
|
q[i] = (tq <= 128 ? tq : tq - 257); |
|
} |
|
} else { |
|
for (i = 0; i < 128; i ++) { |
|
s32 tq; |
|
|
|
tq = q[i] + yoff_s_n[i]; |
|
tq = REDS2(tq); |
|
tq = REDS1(tq); |
|
tq = REDS1(tq); |
|
q[i] = (tq <= 128 ? tq : tq - 257); |
|
} |
|
} |
|
READ_STATE_SMALL(sc); |
|
A0 ^= sph_dec32le_aligned(x + 0); |
|
A1 ^= sph_dec32le_aligned(x + 4); |
|
A2 ^= sph_dec32le_aligned(x + 8); |
|
A3 ^= sph_dec32le_aligned(x + 12); |
|
B0 ^= sph_dec32le_aligned(x + 16); |
|
B1 ^= sph_dec32le_aligned(x + 20); |
|
B2 ^= sph_dec32le_aligned(x + 24); |
|
B3 ^= sph_dec32le_aligned(x + 28); |
|
C0 ^= sph_dec32le_aligned(x + 32); |
|
C1 ^= sph_dec32le_aligned(x + 36); |
|
C2 ^= sph_dec32le_aligned(x + 40); |
|
C3 ^= sph_dec32le_aligned(x + 44); |
|
D0 ^= sph_dec32le_aligned(x + 48); |
|
D1 ^= sph_dec32le_aligned(x + 52); |
|
D2 ^= sph_dec32le_aligned(x + 56); |
|
D3 ^= sph_dec32le_aligned(x + 60); |
|
ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27); |
|
ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7); |
|
ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5); |
|
ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25); |
|
#if SPH_SIMD_NOCOPY |
|
STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3], |
|
IF, 4, 13, PP4_2_); |
|
STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7], |
|
IF, 13, 10, PP4_0_); |
|
STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11], |
|
IF, 10, 25, PP4_1_); |
|
STEP_SMALL(saved[12], saved[13], saved[14], saved[15], |
|
IF, 25, 4, PP4_2_); |
|
#else |
|
STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], |
|
IF, 4, 13, PP4_2_); |
|
STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], |
|
IF, 13, 10, PP4_0_); |
|
STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], |
|
IF, 10, 25, PP4_1_); |
|
STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], |
|
IF, 25, 4, PP4_2_); |
|
WRITE_STATE_SMALL(sc); |
|
#endif |
|
} |
|
|
|
#if SPH_SIMD_NOCOPY |
|
#undef A0 |
|
#undef A1 |
|
#undef A2 |
|
#undef A3 |
|
#undef B0 |
|
#undef B1 |
|
#undef B2 |
|
#undef B3 |
|
#undef C0 |
|
#undef C1 |
|
#undef C2 |
|
#undef C3 |
|
#undef D0 |
|
#undef D1 |
|
#undef D2 |
|
#undef D3 |
|
#endif |
|
|
|
#endif |
|
|
|
#if SPH_SMALL_FOOTPRINT_SIMD |
|
|
|
/*
 * Small-footprint mode, big variants: the names A0..A7, B0..B7, C0..C7
 * and D0..D7 expand to elements 0..31 of an array called "state", which
 * must be in scope wherever they are used (see one_round_big() and
 * compress_big() below). They are #undef'ed after compress_big().
 */
#define A0 state[ 0] |
#define A1 state[ 1] |
#define A2 state[ 2] |
#define A3 state[ 3] |
#define A4 state[ 4] |
#define A5 state[ 5] |
#define A6 state[ 6] |
#define A7 state[ 7] |
#define B0 state[ 8] |
#define B1 state[ 9] |
#define B2 state[10] |
#define B3 state[11] |
#define B4 state[12] |
#define B5 state[13] |
#define B6 state[14] |
#define B7 state[15] |
#define C0 state[16] |
#define C1 state[17] |
#define C2 state[18] |
#define C3 state[19] |
#define C4 state[20] |
#define C5 state[21] |
#define C6 state[22] |
#define C7 state[23] |
#define D0 state[24] |
#define D1 state[25] |
#define D2 state[26] |
#define D3 state[27] |
#define D4 state[28] |
#define D5 state[29] |
#define D6 state[30] |
#define D7 state[31] |
|
|
|
/* |
|
* Not needed -- already defined for SIMD-224 / SIMD-256 |
|
* |
|
#define STEP2_ELT(n, w, fun, s, ppb) do { \ |
|
u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ |
|
A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ |
|
D ## n = C ## n; \ |
|
C ## n = B ## n; \ |
|
B ## n = tA[n]; \ |
|
} while (0) |
|
*/ |
|
|
|
/*
 * One step over the eight lanes of the big state.
 *
 * w0..w7  per-lane input words
 * fun     name of the boolean function macro handed to STEP2_ELT
 * r, s    rotation counts: r is applied to A0..A7 here, s is passed on
 * pp8b    lane permutation value handed to STEP2_ELT
 *
 * The rotated copies of A0..A7 are all captured in tA[] before any lane
 * is modified, and STEP2_ELT (see the commented-out definition above)
 * refers to tA by name.
 */
#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \ |
u32 tA[8]; \ |
tA[0] = ROL32(A0, r); \ |
tA[1] = ROL32(A1, r); \ |
tA[2] = ROL32(A2, r); \ |
tA[3] = ROL32(A3, r); \ |
tA[4] = ROL32(A4, r); \ |
tA[5] = ROL32(A5, r); \ |
tA[6] = ROL32(A6, r); \ |
tA[7] = ROL32(A7, r); \ |
STEP2_ELT(0, w0, fun, s, pp8b); \ |
STEP2_ELT(1, w1, fun, s, pp8b); \ |
STEP2_ELT(2, w2, fun, s, pp8b); \ |
STEP2_ELT(3, w3, fun, s, pp8b); \ |
STEP2_ELT(4, w4, fun, s, pp8b); \ |
STEP2_ELT(5, w5, fun, s, pp8b); \ |
STEP2_ELT(6, w6, fun, s, pp8b); \ |
STEP2_ELT(7, w7, fun, s, pp8b); \ |
} while (0) |
|
|
|
static void |
|
one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) |
|
{ |
|
static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 }; |
|
|
|
STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7], |
|
IF, p0, p1, pp8k[isp + 0]); |
|
STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15], |
|
IF, p1, p2, pp8k[isp + 1]); |
|
STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23], |
|
IF, p2, p3, pp8k[isp + 2]); |
|
STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31], |
|
IF, p3, p0, pp8k[isp + 3]); |
|
STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39], |
|
MAJ, p0, p1, pp8k[isp + 4]); |
|
STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47], |
|
MAJ, p1, p2, pp8k[isp + 5]); |
|
STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55], |
|
MAJ, p2, p3, pp8k[isp + 6]); |
|
STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63], |
|
MAJ, p3, p0, pp8k[isp + 7]); |
|
} |
|
|
|
static void |
|
compress_big(sph_simd_big_context *sc, int last) |
|
{ |
|
unsigned char *x; |
|
s32 q[256]; |
|
int i; |
|
u32 w[64]; |
|
u32 state[32]; |
|
size_t u; |
|
|
|
static const size_t wbp[32] = { |
|
4 << 4, 6 << 4, 0 << 4, 2 << 4, |
|
7 << 4, 5 << 4, 3 << 4, 1 << 4, |
|
15 << 4, 11 << 4, 12 << 4, 8 << 4, |
|
9 << 4, 13 << 4, 10 << 4, 14 << 4, |
|
17 << 4, 18 << 4, 23 << 4, 20 << 4, |
|
22 << 4, 21 << 4, 16 << 4, 19 << 4, |
|
30 << 4, 24 << 4, 25 << 4, 31 << 4, |
|
27 << 4, 29 << 4, 28 << 4, 26 << 4 |
|
}; |
|
|
|
x = sc->buf; |
|
FFT256(0, 1, 0, ll); |
|
if (last) { |
|
for (i = 0; i < 256; i ++) { |
|
s32 tq; |
|
|
|
tq = q[i] + yoff_b_f[i]; |
|
tq = REDS2(tq); |
|
tq = REDS1(tq); |
|
tq = REDS1(tq); |
|
q[i] = (tq <= 128 ? tq : tq - 257); |
|
} |
|
} else { |
|
for (i = 0; i < 256; i ++) { |
|
s32 tq; |
|
|
|
tq = q[i] + yoff_b_n[i]; |
|
tq = REDS2(tq); |
|
tq = REDS1(tq); |
|
tq = REDS1(tq); |
|
q[i] = (tq <= 128 ? tq : tq - 257); |
|
} |
|
} |
|
|
|
for (i = 0; i < 32; i += 8) { |
|
state[i + 0] = sc->state[i + 0] |
|
^ sph_dec32le_aligned(x + 4 * (i + 0)); |
|
state[i + 1] = sc->state[i + 1] |
|
^ sph_dec32le_aligned(x + 4 * (i + 1)); |
|
state[i + 2] = sc->state[i + 2] |
|
^ sph_dec32le_aligned(x + 4 * (i + 2)); |
|
state[i + 3] = sc->state[i + 3] |
|
^ sph_dec32le_aligned(x + 4 * (i + 3)); |
|
state[i + 4] = sc->state[i + 4] |
|
^ sph_dec32le_aligned(x + 4 * (i + 4)); |
|
state[i + 5] = sc->state[i + 5] |
|
^ sph_dec32le_aligned(x + 4 * (i + 5)); |
|
state[i + 6] = sc->state[i + 6] |
|
^ sph_dec32le_aligned(x + 4 * (i + 6)); |
|
state[i + 7] = sc->state[i + 7] |
|
^ sph_dec32le_aligned(x + 4 * (i + 7)); |
|
} |
|
|
|
#define WBREAD(sb, o1, o2, mm) do { \ |
|
for (u = 0; u < 64; u += 8) { \ |
|
size_t v = wbp[(u >> 3) + (sb)]; \ |
|
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ |
|
q[v + 2 * 0 + (o2)], mm); \ |
|
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ |
|
q[v + 2 * 1 + (o2)], mm); \ |
|
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ |
|
q[v + 2 * 2 + (o2)], mm); \ |
|
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ |
|
q[v + 2 * 3 + (o2)], mm); \ |
|
w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \ |
|
q[v + 2 * 4 + (o2)], mm); \ |
|
w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \ |
|
q[v + 2 * 5 + (o2)], mm); \ |
|
w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \ |
|
q[v + 2 * 6 + (o2)], mm); \ |
|
w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \ |
|
q[v + 2 * 7 + (o2)], mm); \ |
|
} \ |
|
} while (0) |
|
|
|
WBREAD( 0, 0, 1, 185); |
|
one_round_big(state, w, 0, 3, 23, 17, 27); |
|
WBREAD( 8, 0, 1, 185); |
|
one_round_big(state, w, 1, 28, 19, 22, 7); |
|
WBREAD(16, -256, -128, 233); |
|
one_round_big(state, w, 2, 29, 9, 15, 5); |
|
WBREAD(24, -383, -255, 233); |
|
one_round_big(state, w, 3, 4, 13, 10, 25); |
|
|
|
#undef WBREAD |
|
|
|
STEP_BIG( |
|
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], |
|
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], |
|
IF, 4, 13, PP8_4_); |
|
STEP_BIG( |
|
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], |
|
sc->state[12], sc->state[13], sc->state[14], sc->state[15], |
|
IF, 13, 10, PP8_5_); |
|
STEP_BIG( |
|
sc->state[16], sc->state[17], sc->state[18], sc->state[19], |
|
sc->state[20], sc->state[21], sc->state[22], sc->state[23], |
|
IF, 10, 25, PP8_6_); |
|
STEP_BIG( |
|
sc->state[24], sc->state[25], sc->state[26], sc->state[27], |
|
sc->state[28], sc->state[29], sc->state[30], sc->state[31], |
|
IF, 25, 4, PP8_0_); |
|
|
|
memcpy(sc->state, state, sizeof state); |
|
} |
|
|
|
#undef A0 |
|
#undef A1 |
|
#undef A2 |
|
#undef A3 |
|
#undef A4 |
|
#undef A5 |
|
#undef A6 |
|
#undef A7 |
|
#undef B0 |
|
#undef B1 |
|
#undef B2 |
|
#undef B3 |
|
#undef B4 |
|
#undef B5 |
|
#undef B6 |
|
#undef B7 |
|
#undef C0 |
|
#undef C1 |
|
#undef C2 |
|
#undef C3 |
|
#undef C4 |
|
#undef C5 |
|
#undef C6 |
|
#undef C7 |
|
#undef D0 |
|
#undef D1 |
|
#undef D2 |
|
#undef D3 |
|
#undef D4 |
|
#undef D5 |
|
#undef D6 |
|
#undef D7 |
|
|
|
#else |
|
|
|
#if SPH_SIMD_NOCOPY |
/*
 * No-copy mode: the names A0..A7, B0..B7, C0..C7 and D0..D7 used by
 * compress_big() below expand directly to the 32 words of sc->state, so
 * a variable named "sc" must be in scope wherever they are used. They
 * are #undef'ed again right after compress_big().
 */
#define A0 (sc->state[ 0]) |
#define A1 (sc->state[ 1]) |
#define A2 (sc->state[ 2]) |
#define A3 (sc->state[ 3]) |
#define A4 (sc->state[ 4]) |
#define A5 (sc->state[ 5]) |
#define A6 (sc->state[ 6]) |
#define A7 (sc->state[ 7]) |
#define B0 (sc->state[ 8]) |
#define B1 (sc->state[ 9]) |
#define B2 (sc->state[10]) |
#define B3 (sc->state[11]) |
#define B4 (sc->state[12]) |
#define B5 (sc->state[13]) |
#define B6 (sc->state[14]) |
#define B7 (sc->state[15]) |
#define C0 (sc->state[16]) |
#define C1 (sc->state[17]) |
#define C2 (sc->state[18]) |
#define C3 (sc->state[19]) |
#define C4 (sc->state[20]) |
#define C5 (sc->state[21]) |
#define C6 (sc->state[22]) |
#define C7 (sc->state[23]) |
#define D0 (sc->state[24]) |
#define D1 (sc->state[25]) |
#define D2 (sc->state[26]) |
#define D3 (sc->state[27]) |
#define D4 (sc->state[28]) |
#define D5 (sc->state[29]) |
#define D6 (sc->state[30]) |
#define D7 (sc->state[31]) |
#endif |
|
|
|
static void |
|
compress_big(sph_simd_big_context *sc, int last) |
|
{ |
|
unsigned char *x; |
|
s32 q[256]; |
|
int i; |
|
DECL_STATE_BIG |
|
#if SPH_SIMD_NOCOPY |
|
sph_u32 saved[32]; |
|
#endif |
|
|
|
#if SPH_SIMD_NOCOPY |
|
memcpy(saved, sc->state, sizeof saved); |
|
#endif |
|
|
|
x = sc->buf; |
|
FFT256(0, 1, 0, ll); |
|
if (last) { |
|
for (i = 0; i < 256; i ++) { |
|
s32 tq; |
|
|
|
tq = q[i] + yoff_b_f[i]; |
|
tq = REDS2(tq); |
|
tq = REDS1(tq); |
|
tq = REDS1(tq); |
|
q[i] = (tq <= 128 ? tq : tq - 257); |
|
} |
|
} else { |
|
for (i = 0; i < 256; i ++) { |
|
s32 tq; |
|
|
|
tq = q[i] + yoff_b_n[i]; |
|
tq = REDS2(tq); |
|
tq = REDS1(tq); |
|
tq = REDS1(tq); |
|
q[i] = (tq <= 128 ? tq : tq - 257); |
|
} |
|
} |
|
READ_STATE_BIG(sc); |
|
A0 ^= sph_dec32le_aligned(x + 0); |
|
A1 ^= sph_dec32le_aligned(x + 4); |
|
A2 ^= sph_dec32le_aligned(x + 8); |
|
A3 ^= sph_dec32le_aligned(x + 12); |
|
A4 ^= sph_dec32le_aligned(x + 16); |
|
A5 ^= sph_dec32le_aligned(x + 20); |
|
A6 ^= sph_dec32le_aligned(x + 24); |
|
A7 ^= sph_dec32le_aligned(x + 28); |
|
B0 ^= sph_dec32le_aligned(x + 32); |
|
B1 ^= sph_dec32le_aligned(x + 36); |
|
B2 ^= sph_dec32le_aligned(x + 40); |
|
B3 ^= sph_dec32le_aligned(x + 44); |
|
B4 ^= sph_dec32le_aligned(x + 48); |
|
B5 ^= sph_dec32le_aligned(x + 52); |
|
B6 ^= sph_dec32le_aligned(x + 56); |
|
B7 ^= sph_dec32le_aligned(x + 60); |
|
C0 ^= sph_dec32le_aligned(x + 64); |
|
C1 ^= sph_dec32le_aligned(x + 68); |
|
C2 ^= sph_dec32le_aligned(x + 72); |
|
C3 ^= sph_dec32le_aligned(x + 76); |
|
C4 ^= sph_dec32le_aligned(x + 80); |
|
C5 ^= sph_dec32le_aligned(x + 84); |
|
C6 ^= sph_dec32le_aligned(x + 88); |
|
C7 ^= sph_dec32le_aligned(x + 92); |
|
D0 ^= sph_dec32le_aligned(x + 96); |
|
D1 ^= sph_dec32le_aligned(x + 100); |
|
D2 ^= sph_dec32le_aligned(x + 104); |
|
D3 ^= sph_dec32le_aligned(x + 108); |
|
D4 ^= sph_dec32le_aligned(x + 112); |
|
D5 ^= sph_dec32le_aligned(x + 116); |
|
D6 ^= sph_dec32le_aligned(x + 120); |
|
D7 ^= sph_dec32le_aligned(x + 124); |
|
|
|
ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); |
|
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); |
|
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); |
|
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); |
|
#if SPH_SIMD_NOCOPY |
|
STEP_BIG( |
|
saved[ 0], saved[ 1], saved[ 2], saved[ 3], |
|
saved[ 4], saved[ 5], saved[ 6], saved[ 7], |
|
IF, 4, 13, PP8_4_); |
|
STEP_BIG( |
|
saved[ 8], saved[ 9], saved[10], saved[11], |
|
saved[12], saved[13], saved[14], saved[15], |
|
IF, 13, 10, PP8_5_); |
|
STEP_BIG( |
|
saved[16], saved[17], saved[18], saved[19], |
|
saved[20], saved[21], saved[22], saved[23], |
|
IF, 10, 25, PP8_6_); |
|
STEP_BIG( |
|
saved[24], saved[25], saved[26], saved[27], |
|
saved[28], saved[29], saved[30], saved[31], |
|
IF, 25, 4, PP8_0_); |
|
#else |
|
STEP_BIG( |
|
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], |
|
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], |
|
IF, 4, 13, PP8_4_); |
|
STEP_BIG( |
|
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], |
|
sc->state[12], sc->state[13], sc->state[14], sc->state[15], |
|
IF, 13, 10, PP8_5_); |
|
STEP_BIG( |
|
sc->state[16], sc->state[17], sc->state[18], sc->state[19], |
|
sc->state[20], sc->state[21], sc->state[22], sc->state[23], |
|
IF, 10, 25, PP8_6_); |
|
STEP_BIG( |
|
sc->state[24], sc->state[25], sc->state[26], sc->state[27], |
|
sc->state[28], sc->state[29], sc->state[30], sc->state[31], |
|
IF, 25, 4, PP8_0_); |
|
WRITE_STATE_BIG(sc); |
|
#endif |
|
} |
|
|
|
#if SPH_SIMD_NOCOPY |
|
#undef A0 |
|
#undef A1 |
|
#undef A2 |
|
#undef A3 |
|
#undef A4 |
|
#undef A5 |
|
#undef A6 |
|
#undef A7 |
|
#undef B0 |
|
#undef B1 |
|
#undef B2 |
|
#undef B3 |
|
#undef B4 |
|
#undef B5 |
|
#undef B6 |
|
#undef B7 |
|
#undef C0 |
|
#undef C1 |
|
#undef C2 |
|
#undef C3 |
|
#undef C4 |
|
#undef C5 |
|
#undef C6 |
|
#undef C7 |
|
#undef D0 |
|
#undef D1 |
|
#undef D2 |
|
#undef D3 |
|
#undef D4 |
|
#undef D5 |
|
#undef D6 |
|
#undef D7 |
|
#endif |
|
|
|
#endif |
|
|
|
/*
 * Initial state for SIMD-224: sixteen words that sph_simd224_init()
 * hands to init_small(), which copies them into the context state.
 */
static const u32 IV224[] = { |
C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53), |
C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96), |
C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6), |
C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8) |
}; |
|
|
|
/*
 * Initial state for SIMD-256: sixteen words that sph_simd256_init()
 * hands to init_small(), which copies them into the context state.
 */
static const u32 IV256[] = { |
C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9), |
C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3), |
C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9), |
C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1) |
}; |
|
|
|
/*
 * Initial state for SIMD-384: thirty-two words that sph_simd384_init()
 * hands to init_big(), which copies them into the context state.
 */
static const u32 IV384[] = { |
C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B), |
C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1), |
C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A), |
C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8), |
C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2), |
C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462), |
C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5), |
C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71) |
}; |
|
|
|
/*
 * Initial state for SIMD-512: thirty-two words that sph_simd512_init()
 * hands to init_big(), which copies them into the context state.
 */
static const u32 IV512[] = { |
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), |
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), |
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), |
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), |
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), |
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), |
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), |
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22) |
}; |
|
|
|
static void |
|
init_small(void *cc, const u32 *iv) |
|
{ |
|
sph_simd_small_context *sc; |
|
|
|
sc = cc; |
|
memcpy(sc->state, iv, sizeof sc->state); |
|
sc->count_low = sc->count_high = 0; |
|
sc->ptr = 0; |
|
} |
|
|
|
static void |
|
init_big(void *cc, const u32 *iv) |
|
{ |
|
sph_simd_big_context *sc; |
|
|
|
sc = cc; |
|
memcpy(sc->state, iv, sizeof sc->state); |
|
sc->count_low = sc->count_high = 0; |
|
sc->ptr = 0; |
|
} |
|
|
|
static void |
|
update_small(void *cc, const void *data, size_t len) |
|
{ |
|
sph_simd_small_context *sc; |
|
|
|
sc = cc; |
|
while (len > 0) { |
|
size_t clen; |
|
|
|
clen = (sizeof sc->buf) - sc->ptr; |
|
if (clen > len) |
|
clen = len; |
|
memcpy(sc->buf + sc->ptr, data, clen); |
|
data = (const unsigned char *)data + clen; |
|
len -= clen; |
|
if ((sc->ptr += clen) == sizeof sc->buf) { |
|
compress_small(sc, 0); |
|
sc->ptr = 0; |
|
sc->count_low = T32(sc->count_low + 1); |
|
if (sc->count_low == 0) |
|
sc->count_high ++; |
|
} |
|
} |
|
} |
|
|
|
static void |
|
update_big(void *cc, const void *data, size_t len) |
|
{ |
|
sph_simd_big_context *sc; |
|
|
|
sc = cc; |
|
while (len > 0) { |
|
size_t clen; |
|
|
|
clen = (sizeof sc->buf) - sc->ptr; |
|
if (clen > len) |
|
clen = len; |
|
memcpy(sc->buf + sc->ptr, data, clen); |
|
data = (const unsigned char *)data + clen; |
|
len -= clen; |
|
if ((sc->ptr += clen) == sizeof sc->buf) { |
|
compress_big(sc, 0); |
|
sc->ptr = 0; |
|
sc->count_low = T32(sc->count_low + 1); |
|
if (sc->count_low == 0) |
|
sc->count_high ++; |
|
} |
|
} |
|
} |
|
|
|
static void |
|
encode_count_small(unsigned char *dst, |
|
u32 low, u32 high, size_t ptr, unsigned n) |
|
{ |
|
low = T32(low << 9); |
|
high = T32(high << 9) + (low >> 23); |
|
low += (ptr << 3) + n; |
|
sph_enc32le(dst, low); |
|
sph_enc32le(dst + 4, high); |
|
} |
|
|
|
static void |
|
encode_count_big(unsigned char *dst, |
|
u32 low, u32 high, size_t ptr, unsigned n) |
|
{ |
|
low = T32(low << 10); |
|
high = T32(high << 10) + (low >> 22); |
|
low += (ptr << 3) + n; |
|
sph_enc32le(dst, low); |
|
sph_enc32le(dst + 4, high); |
|
} |
|
|
|
static void |
|
finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) |
|
{ |
|
sph_simd_small_context *sc; |
|
unsigned char *d; |
|
size_t u; |
|
|
|
sc = cc; |
|
if (sc->ptr > 0 || n > 0) { |
|
memset(sc->buf + sc->ptr, 0, |
|
(sizeof sc->buf) - sc->ptr); |
|
sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); |
|
compress_small(sc, 0); |
|
} |
|
memset(sc->buf, 0, sizeof sc->buf); |
|
encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); |
|
compress_small(sc, 1); |
|
d = dst; |
|
for (d = dst, u = 0; u < dst_len; u ++) |
|
sph_enc32le(d + (u << 2), sc->state[u]); |
|
} |
|
|
|
static void |
|
finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) |
|
{ |
|
sph_simd_big_context *sc; |
|
unsigned char *d; |
|
size_t u; |
|
|
|
sc = cc; |
|
if (sc->ptr > 0 || n > 0) { |
|
memset(sc->buf + sc->ptr, 0, |
|
(sizeof sc->buf) - sc->ptr); |
|
sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); |
|
compress_big(sc, 0); |
|
} |
|
memset(sc->buf, 0, sizeof sc->buf); |
|
encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); |
|
compress_big(sc, 1); |
|
d = dst; |
|
for (d = dst, u = 0; u < dst_len; u ++) |
|
sph_enc32le(d + (u << 2), sc->state[u]); |
|
} |
|
|
|
/*
 * Initialize a SIMD-224 context.
 * cc is passed to init_small() together with the IV224 table, so it is
 * treated as a sph_simd_small_context.
 */
void |
sph_simd224_init(void *cc) |
{ |
init_small(cc, IV224); |
} |
|
|
|
/*
 * Feed len bytes starting at data into a SIMD-224 context.
 * Thin wrapper over update_small().
 */
void |
sph_simd224(void *cc, const void *data, size_t len) |
{ |
update_small(cc, data, len); |
} |
|
|
|
/*
 * Finish a SIMD-224 computation with no extra bits: equivalent to
 * sph_simd224_addbits_and_close(cc, 0, 0, dst).
 */
void |
sph_simd224_close(void *cc, void *dst) |
{ |
sph_simd224_addbits_and_close(cc, 0, 0, dst); |
} |
|
|
|
/*
 * Finish a SIMD-224 computation after appending n extra bits taken from
 * ub (finalize_small() keeps the top n bits of that byte).
 * Seven state words are written to dst, then the context is
 * reinitialized for SIMD-224 so it can be reused.
 */
void |
sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) |
{ |
finalize_small(cc, ub, n, dst, 7); |
sph_simd224_init(cc); |
} |
|
|
|
/*
 * Initialize a SIMD-256 context.
 * cc is passed to init_small() together with the IV256 table, so it is
 * treated as a sph_simd_small_context.
 */
void |
sph_simd256_init(void *cc) |
{ |
init_small(cc, IV256); |
} |
|
|
|
/*
 * Feed len bytes starting at data into a SIMD-256 context.
 * Thin wrapper over update_small().
 */
void |
sph_simd256(void *cc, const void *data, size_t len) |
{ |
update_small(cc, data, len); |
} |
|
|
|
/*
 * Finish a SIMD-256 computation with no extra bits: equivalent to
 * sph_simd256_addbits_and_close(cc, 0, 0, dst).
 */
void |
sph_simd256_close(void *cc, void *dst) |
{ |
sph_simd256_addbits_and_close(cc, 0, 0, dst); |
} |
|
|
|
/*
 * Finish a SIMD-256 computation after appending n extra bits taken from
 * ub (finalize_small() keeps the top n bits of that byte).
 * Eight state words are written to dst, then the context is
 * reinitialized for SIMD-256 so it can be reused.
 */
void |
sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) |
{ |
finalize_small(cc, ub, n, dst, 8); |
sph_simd256_init(cc); |
} |
|
|
|
/*
 * Initialize a SIMD-384 context.
 * cc is passed to init_big() together with the IV384 table, so it is
 * treated as a sph_simd_big_context.
 */
void |
sph_simd384_init(void *cc) |
{ |
init_big(cc, IV384); |
} |
|
|
|
/*
 * Feed len bytes starting at data into a SIMD-384 context.
 * Thin wrapper over update_big().
 */
void |
sph_simd384(void *cc, const void *data, size_t len) |
{ |
update_big(cc, data, len); |
} |
|
|
|
/*
 * Finish a SIMD-384 computation with no extra bits: equivalent to
 * sph_simd384_addbits_and_close(cc, 0, 0, dst).
 */
void |
sph_simd384_close(void *cc, void *dst) |
{ |
sph_simd384_addbits_and_close(cc, 0, 0, dst); |
} |
|
|
|
/*
 * Finish a SIMD-384 computation after appending n extra bits taken from
 * ub (finalize_big() keeps the top n bits of that byte).
 * Twelve state words are written to dst, then the context is
 * reinitialized for SIMD-384 so it can be reused.
 */
void |
sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) |
{ |
finalize_big(cc, ub, n, dst, 12); |
sph_simd384_init(cc); |
} |
|
|
|
/*
 * Initialize a SIMD-512 context.
 * cc is passed to init_big() together with the IV512 table, so it is
 * treated as a sph_simd_big_context.
 */
void |
sph_simd512_init(void *cc) |
{ |
init_big(cc, IV512); |
} |
|
|
|
/*
 * Feed len bytes starting at data into a SIMD-512 context.
 * Thin wrapper over update_big().
 */
void |
sph_simd512(void *cc, const void *data, size_t len) |
{ |
update_big(cc, data, len); |
} |
|
|
|
/*
 * Finish a SIMD-512 computation with no extra bits: equivalent to
 * sph_simd512_addbits_and_close(cc, 0, 0, dst).
 */
void |
sph_simd512_close(void *cc, void *dst) |
{ |
sph_simd512_addbits_and_close(cc, 0, 0, dst); |
} |
|
|
|
/*
 * Finish a SIMD-512 computation after appending n extra bits taken from
 * ub (finalize_big() keeps the top n bits of that byte).
 * Sixteen state words are written to dst, then the context is
 * reinitialized for SIMD-512 so it can be reused.
 */
void |
sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) |
{ |
finalize_big(cc, ub, n, dst, 16); |
sph_simd512_init(cc); |
} |
|
#ifdef __cplusplus |
|
} |
|
#endif |