/* ccminer/sph/simd.c */

/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
/*
 * SIMD implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 * 
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

#include <stddef.h>
#include <string.h>
#include <limits.h>

#include "sph_simd.h"

#ifdef __cplusplus
extern "C"{
#endif

/*
 * The library-wide SPH_SMALL_FOOTPRINT switch enables the compact SIMD
 * code paths, unless SPH_SMALL_FOOTPRINT_SIMD was already defined.
 */
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
#define SPH_SMALL_FOOTPRINT_SIMD   1
#endif

/* Silence MSVC warning 4146 for this file. */
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif

/* Short local aliases for the sph_* integer types and helper macros. */
typedef sph_u32 u32;
typedef sph_s32 s32;
#define C32     SPH_C32
#define T32     SPH_T32
#define ROL32   SPH_ROTL32

/*
 * Token pasting with prior expansion: XCAT() forwards to XCAT_() so that
 * both arguments are macro-expanded before "##" glues them together.
 */
#define XCAT(x, y)    XCAT_(x, y)
#define XCAT_(x, y)   x ## y

/*
 * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive:
 * alpha_tab[i] = 41^i mod 257 (so alpha_tab[0] = 1). The FFT_LOOP() macro
 * reads its multipliers from this table.
 */
static const s32 alpha_tab[] = {
	  1,  41, 139,  45,  46,  87, 226,  14,  60, 147, 116, 130,
	190,  80, 196,  69,   2,  82,  21,  90,  92, 174, 195,  28,
	120,  37, 232,   3, 123, 160, 135, 138,   4, 164,  42, 180,
	184,  91, 133,  56, 240,  74, 207,   6, 246,  63,  13,  19,
	  8,  71,  84, 103, 111, 182,   9, 112, 223, 148, 157,  12,
	235, 126,  26,  38,  16, 142, 168, 206, 222, 107,  18, 224,
	189,  39,  57,  24, 213, 252,  52,  76,  32,  27,  79, 155,
	187, 214,  36, 191, 121,  78, 114,  48, 169, 247, 104, 152,
	 64,  54, 158,  53, 117, 171,  72, 125, 242, 156, 228,  96,
	 81, 237, 208,  47, 128, 108,  59, 106, 234,  85, 144, 250,
	227,  55, 199, 192, 162, 217, 159,  94, 256, 216, 118, 212,
	211, 170,  31, 243, 197, 110, 141, 127,  67, 177,  61, 188,
	255, 175, 236, 167, 165,  83,  62, 229, 137, 220,  25, 254,
	134,  97, 122, 119, 253,  93, 215,  77,  73, 166, 124, 201,
	 17, 183,  50, 251,  11, 194, 244, 238, 249, 186, 173, 154,
	146,  75, 248, 145,  34, 109, 100, 245,  22, 131, 231, 219,
	241, 115,  89,  51,  35, 150, 239,  33,  68, 218, 200, 233,
	 44,   5, 205, 181, 225, 230, 178, 102,  70,  43, 221,  66,
	136, 179, 143, 209,  88,  10, 153, 105, 193, 203,  99, 204,
	140,  86, 185, 132,  15, 101,  29, 161, 176,  20,  49, 210,
	129, 149, 198, 151,  23, 172, 113,   7,  30, 202,  58,  65,
	 95,  40,  98, 163
};

/*
 * Partial reductions modulo 257, based on 256 = -1 and 65536 = 1
 * (mod 257): the high part of the value is folded back onto the low part.
 * Both macros evaluate their argument twice, and negative inputs rely on
 * ">>" keeping the sign (arithmetic shift).
 *
 * Ranges:
 *   REDS1: from -32768..98302 to -383..383
 *   REDS2: from -2^31..2^31-1 to -32768..98302
 */
#define REDS1(x)    (-((x) >> 8) + ((x) & 0xFF))
#define REDS2(x)    (((x) >> 16) + ((x) & 0xFFFF))

/*
 * One butterfly pass over q[(rb) .. (rb) + 2 * (hk) - 1]: for every u in
 * 0..hk-1 the pair (q[rb + u], q[rb + u + hk]) = (m, n) is replaced with
 * (m + t, m - t), where t = REDS2(n * alpha_tab[u * as]).
 *
 *   rb   index in q[] of the first element of the block
 *   hk   half of the block length (distance between paired elements);
 *        the loop body handles four elements per iteration
 *   as   step in alpha_tab[] between two consecutive multipliers
 *   id   label name; it must be unique within the enclosing function
 *
 * The pair at u = 0 has multiplier alpha_tab[0] = 1, so it is computed
 * up front without a multiplication; "goto id" then enters the unrolled
 * loop body right after the slot reserved for element 0.
 *
 * The macro expects an s32 array named "q" to be in scope.
 *
 * If, upon entry, the values of q[] are all in the -N..N range (where
 * N >= 98302) then the new values of q[] are in the -2N..2N range.
 *
 * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
 */
#define FFT_LOOP(rb, hk, as, id)   do { \
		size_t u, v; \
		s32 m = q[(rb)]; \
		s32 n = q[(rb) + (hk)]; \
		q[(rb)] = m + n; \
		q[(rb) + (hk)] = m - n; \
		u = v = 0; \
		goto id; \
		for (; u < (hk); u += 4, v += 4 * (as)) { \
			s32 t; \
			m = q[(rb) + u + 0]; \
			n = q[(rb) + u + 0 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 0 * (as)]); \
			q[(rb) + u + 0] = m + t; \
			q[(rb) + u + 0 + (hk)] = m - t; \
		id: \
			m = q[(rb) + u + 1]; \
			n = q[(rb) + u + 1 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 1 * (as)]); \
			q[(rb) + u + 1] = m + t; \
			q[(rb) + u + 1 + (hk)] = m - t; \
			m = q[(rb) + u + 2]; \
			n = q[(rb) + u + 2 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 2 * (as)]); \
			q[(rb) + u + 2] = m + t; \
			q[(rb) + u + 2 + (hk)] = m - t; \
			m = q[(rb) + u + 3]; \
			n = q[(rb) + u + 3 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 3 * (as)]); \
			q[(rb) + u + 3] = m + t; \
			q[(rb) + u + 3 + (hk)] = m - t; \
		} \
	} while (0)

/*
 * Compute eight values from the four inputs x[(xb) + i * (xs)], i = 0..3,
 * and store them into the variables whose names are the prefix "d"
 * followed by 0..7 (e.g. d = d1_ writes d1_0 .. d1_7). Those variables
 * must be declared by the caller, and an array named "x" must be in scope.
 *
 * NOTE(review): this looks like an 8-point transform whose last four
 * inputs are implicitly zero -- confirm against the SIMD specification.
 *
 * Output ranges (consistent with inputs in the 0..255 range):
 *   d0:   min=    0   max= 1020
 *   d1:   min=  -67   max= 4587
 *   d2:   min=-4335   max= 4335
 *   d3:   min=-4147   max=  507
 *   d4:   min= -510   max=  510
 *   d5:   min= -252   max= 4402
 *   d6:   min=-4335   max= 4335
 *   d7:   min=-4332   max=  322
 */
#define FFT8(xb, xs, d)   do { \
		s32 x0 = x[(xb)]; \
		s32 x1 = x[(xb) + (xs)]; \
		s32 x2 = x[(xb) + 2 * (xs)]; \
		s32 x3 = x[(xb) + 3 * (xs)]; \
		s32 a0 = x0 + x2; \
		s32 a1 = x0 + (x2 << 4); \
		s32 a2 = x0 - x2; \
		s32 a3 = x0 - (x2 << 4); \
		s32 b0 = x1 + x3; \
		s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
		s32 b2 = (x1 << 4) - (x3 << 4); \
		s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
		d ## 0 = a0 + b0; \
		d ## 1 = a1 + b1; \
		d ## 2 = a2 + b2; \
		d ## 3 = a3 + b3; \
		d ## 4 = a0 - b0; \
		d ## 5 = a1 - b1; \
		d ## 6 = a2 - b2; \
		d ## 7 = a3 - b3; \
	} while (0)

/*
 * 16-point stage: combine two FFT8() results into q[(rb) .. (rb) + 15].
 *
 *   xb   index in x[] of the first input
 *   xs   stride in x[] between consecutive inputs
 *   rb   index in q[] of the first output
 *
 * The inputs at xb, xb + 2*xs, ... feed d1_*, the inputs at xb + xs,
 * xb + 3*xs, ... feed d2_*. The outputs are d1_k + 2^k * d2_k for
 * k = 0..7, followed by d1_k - 2^k * d2_k.
 *
 * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
 * to a multiplication by a power of two. The d2_* values may be negative
 * (see the FFT8 output ranges) and left-shifting a negative signed value
 * is undefined behaviour in C, so the scaling is written as a product
 * with a constant; it yields the same value wherever the shift was well
 * defined.
 *
 * The macro expects arrays named "x" (input) and "q" (s32 output) to be
 * in scope.
 *
 * Output: within -591471..591723
 */
#define FFT16(xb, xs, rb)   do { \
		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
		FFT8(xb, (xs) << 1, d1_); \
		FFT8((xb) + (xs), (xs) << 1, d2_); \
		q[(rb) +  0] = d1_0 + d2_0; \
		q[(rb) +  1] = d1_1 + (d2_1 *   2); \
		q[(rb) +  2] = d1_2 + (d2_2 *   4); \
		q[(rb) +  3] = d1_3 + (d2_3 *   8); \
		q[(rb) +  4] = d1_4 + (d2_4 *  16); \
		q[(rb) +  5] = d1_5 + (d2_5 *  32); \
		q[(rb) +  6] = d1_6 + (d2_6 *  64); \
		q[(rb) +  7] = d1_7 + (d2_7 * 128); \
		q[(rb) +  8] = d1_0 - d2_0; \
		q[(rb) +  9] = d1_1 - (d2_1 *   2); \
		q[(rb) + 10] = d1_2 - (d2_2 *   4); \
		q[(rb) + 11] = d1_3 - (d2_3 *   8); \
		q[(rb) + 12] = d1_4 - (d2_4 *  16); \
		q[(rb) + 13] = d1_5 - (d2_5 *  32); \
		q[(rb) + 14] = d1_6 - (d2_6 *  64); \
		q[(rb) + 15] = d1_7 - (d2_7 * 128); \
	} while (0)

/*
 * 32-point stage: two interleaved FFT16() calls fill q[(rb) .. (rb) + 15]
 * (inputs at xb, stride 2 * xs) and q[(rb) + 16 .. (rb) + 31] (inputs at
 * xb + xs, stride 2 * xs); FFT_LOOP() then merges the two halves, stepping
 * through alpha_tab[] by 8. "id" is the label name handed to FFT_LOOP().
 *
 * Output range: |q| <= 1183446
 */
#define FFT32(xb, xs, rb, id)   do { \
		FFT16(xb, (xs) << 1, rb); \
		FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
		FFT_LOOP(rb, 16, 8, id); \
	} while (0)

/*
 * 64-point stage, built from two FFT32() halves in the same way. The
 * nested stages get the label names "id" + a and "id" + b, so that every
 * label inside one expansion is distinct; the final merge steps through
 * alpha_tab[] by 4.
 *
 * Output range: |q| <= 2366892
 */
#define FFT64(xb, xs, rb, id)   do { \
		FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
		FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
		FFT_LOOP(rb, 32, 4, id); \
	} while (0)

#if SPH_SMALL_FOOTPRINT_SIMD

/*
 * Out-of-line 32-point stage used by the small-footprint build. It
 * performs the same operations as FFT32(0, xs, 0, label_), but as a
 * function so that the code is emitted only once.
 *
 *   x    input bytes; element i is read from x[i * xs]
 *   xs   stride between consecutive inputs
 *   q    receives the 32 output values in q[0..31]
 *
 * The parameter names "x" and "q" are required by the FFT macros.
 */
static void
fft32(unsigned char *x, size_t xs, s32 *q)
{
	size_t xd;

	xd = xs << 1;
	FFT16(0, xd, 0);
	FFT16(xs, xd, 16);
	FFT_LOOP(0, 16, 8, label_);
}

/*
 * 128-point stage for the small-footprint build: four calls to fft32()
 * on the inputs at offsets 0, 2, 1 and 3 (times xs), each with stride
 * 4 * xs, merged pairwise into two 64-point halves and then into the full
 * result q[(rb) .. (rb) + 127]. The three FFT_LOOP() expansions use the
 * label names "id" + aa, "id" + ab and "id" + a.
 */
#define FFT128(xb, xs, rb, id)   do { \
		fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +  0]); \
		fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
		FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
		fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
		fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
		FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
		FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
	} while (0)

#else

/*
 * Fully unrolled 128-point stage: two interleaved FFT64() halves written
 * to q[(rb) .. (rb) + 63] and q[(rb) + 64 .. (rb) + 127], merged by
 * FFT_LOOP() with an alpha_tab[] step of 2. Nested stages use the label
 * names "id" + a and "id" + b.
 *
 * Output range: |q| <= 4733784
 */
#define FFT128(xb, xs, rb, id)   do { \
		FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
		FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
		FFT_LOOP(rb, 64, 2, id); \
	} while (0)

#endif

/*
 * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
 * function which does not fit in the 32 kB L1 cache of a typical x86
 * Intel. We therefore add a function call layer at the FFT64 level.
 */

/*
 * Out-of-line 64-point stage; performs the same operations as
 * FFT64(0, xs, 0, label_).
 *
 *   x    input bytes; element i is read from x[i * xs]
 *   xs   stride between consecutive inputs
 *   q    receives the 64 output values in q[0..63]
 *
 * The parameter names "x" and "q" are required by the FFT macros.
 */
static void
fft64(unsigned char *x, size_t xs, s32 *q)
{
	size_t xd;

	xd = xs << 1;
	FFT32(0, xd, 0, label_a);
	FFT32(xs, xd, 32, label_b);
	FFT_LOOP(0, 32, 4, label_);
}

/*
 * 256-point stage: four calls to fft64() on the inputs at offsets 0, 2, 1
 * and 3 (times xs), each with stride 4 * xs, merged pairwise into two
 * 128-point halves and then into the full result q[(rb) .. (rb) + 255].
 * The three FFT_LOOP() expansions use the label names "id" + aa,
 * "id" + ab and "id" + a.
 *
 * Output range: |q| <= 9467568
 */
#define FFT256(xb, xs, rb, id)   do { \
		fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +   0]); \
		fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) +  64]); \
		FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
		fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
		fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
		FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
		FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
	} while (0)

/*
 * alpha^(127*i) mod 257
 *
 * 128 entries; compress_small() adds entry i to q[i] after the transform
 * when its "last" argument is zero.
 */
static const unsigned short yoff_s_n[] = {
	  1,  98,  95,  58,  30, 113,  23, 198, 129,  49, 176,  29,
	 15, 185, 140,  99, 193, 153,  88, 143, 136, 221,  70, 178,
	225, 205,  44, 200,  68, 239,  35,  89, 241, 231,  22, 100,
	 34, 248, 146, 173, 249, 244,  11,  50,  17, 124,  73, 215,
	253, 122, 134,  25, 137,  62, 165, 236, 255,  61,  67, 141,
	197,  31, 211, 118, 256, 159, 162, 199, 227, 144, 234,  59,
	128, 208,  81, 228, 242,  72, 117, 158,  64, 104, 169, 114,
	121,  36, 187,  79,  32,  52, 213,  57, 189,  18, 222, 168,
	 16,  26, 235, 157, 223,   9, 111,  84,   8,  13, 246, 207,
	240, 133, 184,  42,   4, 135, 123, 232, 120, 195,  92,  21,
	  2, 196, 190, 116,  60, 226,  46, 139
};

/*
 * alpha^(127*i) + alpha^(125*i) mod 257
 *
 * 128 entries; compress_small() adds entry i to q[i] after the transform
 * when its "last" argument is non-zero.
 */
static const unsigned short yoff_s_f[] = {
	  2, 156, 118, 107,  45, 212, 111, 162,  97, 249, 211,   3,
	 49, 101, 151, 223, 189, 178, 253, 204,  76,  82, 232,  65,
	 96, 176, 161,  47, 189,  61, 248, 107,   0, 131, 133, 113,
	 17,  33,  12, 111, 251, 103,  57, 148,  47,  65, 249, 143,
	189,   8, 204, 230, 205, 151, 187, 227, 247, 111, 140,   6,
	 77,  10,  21, 149, 255, 101, 139, 150, 212,  45, 146,  95,
	160,   8,  46, 254, 208, 156, 106,  34,  68,  79,   4,  53,
	181, 175,  25, 192, 161,  81,  96, 210,  68, 196,   9, 150,
	  0, 126, 124, 144, 240, 224, 245, 146,   6, 154, 200, 109,
	210, 192,   8, 114,  68, 249,  53,  27,  52, 106,  70,  30,
	 10, 146, 117, 251, 180, 247, 236, 108
};

/*
 * beta^(255*i) mod 257
 *
 * 256 entries. NOTE(review): presumably the big-state counterpart of
 * yoff_s_n; its use is not visible in this part of the file.
 */
static const unsigned short yoff_b_n[] = {
	  1, 163,  98,  40,  95,  65,  58, 202,  30,   7, 113, 172,
	 23, 151, 198, 149, 129, 210,  49,  20, 176, 161,  29, 101,
	 15, 132, 185,  86, 140, 204,  99, 203, 193, 105, 153,  10,
	 88, 209, 143, 179, 136,  66, 221,  43,  70, 102, 178, 230,
	225, 181, 205,   5,  44, 233, 200, 218,  68,  33, 239, 150,
	 35,  51,  89, 115, 241, 219, 231, 131,  22, 245, 100, 109,
	 34, 145, 248,  75, 146, 154, 173, 186, 249, 238, 244, 194,
	 11, 251,  50, 183,  17, 201, 124, 166,  73,  77, 215,  93,
	253, 119, 122,  97, 134, 254,  25, 220, 137, 229,  62,  83,
	165, 167, 236, 175, 255, 188,  61, 177,  67, 127, 141, 110,
	197, 243,  31, 170, 211, 212, 118, 216, 256,  94, 159, 217,
	162, 192, 199,  55, 227, 250, 144,  85, 234, 106,  59, 108,
	128,  47, 208, 237,  81,  96, 228, 156, 242, 125,  72, 171,
	117,  53, 158,  54,  64, 152, 104, 247, 169,  48, 114,  78,
	121, 191,  36, 214, 187, 155,  79,  27,  32,  76,  52, 252,
	213,  24,  57,  39, 189, 224,  18, 107, 222, 206, 168, 142,
	 16,  38,  26, 126, 235,  12, 157, 148, 223, 112,   9, 182,
	111, 103,  84,  71,   8,  19,  13,  63, 246,   6, 207,  74,
	240,  56, 133,  91, 184, 180,  42, 164,   4, 138, 135, 160,
	123,   3, 232,  37, 120,  28, 195, 174,  92,  90,  21,  82,
	  2,  69, 196,  80, 190, 130, 116, 147,  60,  14, 226,  87,
	 46,  45, 139,  41
};

/*
 * beta^(255*i) + beta^(253*i) mod 257
 *
 * 256 entries. NOTE(review): presumably the big-state counterpart of
 * yoff_s_f; its use is not visible in this part of the file.
 */
static const unsigned short yoff_b_f[] = {
	  2, 203, 156,  47, 118, 214, 107, 106,  45,  93, 212,  20,
	111,  73, 162, 251,  97, 215, 249,  53, 211,  19,   3,  89,
	 49, 207, 101,  67, 151, 130, 223,  23, 189, 202, 178, 239,
	253, 127, 204,  49,  76, 236,  82, 137, 232, 157,  65,  79,
	 96, 161, 176, 130, 161,  30,  47,   9, 189, 247,  61, 226,
	248,  90, 107,  64,   0,  88, 131, 243, 133,  59, 113, 115,
	 17, 236,  33, 213,  12, 191, 111,  19, 251,  61, 103, 208,
	 57,  35, 148, 248,  47, 116,  65, 119, 249, 178, 143,  40,
	189, 129,   8, 163, 204, 227, 230, 196, 205, 122, 151,  45,
	187,  19, 227,  72, 247, 125, 111, 121, 140, 220,   6, 107,
	 77,  69,  10, 101,  21,  65, 149, 171, 255,  54, 101, 210,
	139,  43, 150, 151, 212, 164,  45, 237, 146, 184,  95,   6,
	160,  42,   8, 204,  46, 238, 254, 168, 208,  50, 156, 190,
	106, 127,  34, 234,  68,  55,  79,  18,   4, 130,  53, 208,
	181,  21, 175, 120,  25, 100, 192, 178, 161,  96,  81, 127,
	 96, 227, 210, 248,  68,  10, 196,  31,   9, 167, 150, 193,
	  0, 169, 126,  14, 124, 198, 144, 142, 240,  21, 224,  44,
	245,  66, 146, 238,   6, 196, 154,  49, 200, 222, 109,   9,
	210, 141, 192, 138,   8,  79, 114, 217,  68, 128, 249,  94,
	 53,  30,  27,  61,  52, 135, 106, 212,  70, 238,  30, 185,
	 10, 132, 146, 136, 117,  37, 251, 150, 180, 188, 247, 156,
	236, 192, 108,  86
};

/*
 * Pack two products into one word: the low 16 bits of (l) * (mm) go to
 * bits 0..15, and (h) * (mm) is shifted up to start at bit 16. The two
 * parts never overlap, so they can simply be OR-ed together.
 */
#define INNER(l, h, mm)   ((((u32)((h) * (mm))) << 16) \
                          | ((u32)((l) * (mm)) & 0xFFFFU))

/*
 * Four message words for one step of the small variant, as a
 * comma-separated list of INNER() values taken from
 * q[8 * sb + 2 * j + o1] (low half) and q[8 * sb + 2 * j + o2] (high
 * half), j = 0..3, each multiplied by mm.
 *
 * The expansion starts with an opening parenthesis and deliberately has
 * no closing one: it is meant to be used through STEP_SMALL_(), which
 * appends the remaining arguments and the closing parenthesis so that the
 * whole thing becomes a STEP_SMALL(...) invocation.
 */
#define W_SMALL(sb, o1, o2, mm) \
	(INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
	 INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)

/*
 * WS_r_k: message words for step k (0..7) of round r (0..3) of the small
 * variant. The same block order appears in the wsp[] table of the
 * small-footprint compress_small().
 */
#define WS_0_0   W_SMALL( 4,    0,    1, 185)
#define WS_0_1   W_SMALL( 6,    0,    1, 185)
#define WS_0_2   W_SMALL( 0,    0,    1, 185)
#define WS_0_3   W_SMALL( 2,    0,    1, 185)
#define WS_0_4   W_SMALL( 7,    0,    1, 185)
#define WS_0_5   W_SMALL( 5,    0,    1, 185)
#define WS_0_6   W_SMALL( 3,    0,    1, 185)
#define WS_0_7   W_SMALL( 1,    0,    1, 185)
#define WS_1_0   W_SMALL(15,    0,    1, 185)
#define WS_1_1   W_SMALL(11,    0,    1, 185)
#define WS_1_2   W_SMALL(12,    0,    1, 185)
#define WS_1_3   W_SMALL( 8,    0,    1, 185)
#define WS_1_4   W_SMALL( 9,    0,    1, 185)
#define WS_1_5   W_SMALL(13,    0,    1, 185)
#define WS_1_6   W_SMALL(10,    0,    1, 185)
#define WS_1_7   W_SMALL(14,    0,    1, 185)
#define WS_2_0   W_SMALL(17, -128,  -64, 233)
#define WS_2_1   W_SMALL(18, -128,  -64, 233)
#define WS_2_2   W_SMALL(23, -128,  -64, 233)
#define WS_2_3   W_SMALL(20, -128,  -64, 233)
#define WS_2_4   W_SMALL(22, -128,  -64, 233)
#define WS_2_5   W_SMALL(21, -128,  -64, 233)
#define WS_2_6   W_SMALL(16, -128,  -64, 233)
#define WS_2_7   W_SMALL(19, -128,  -64, 233)
#define WS_3_0   W_SMALL(30, -191, -127, 233)
#define WS_3_1   W_SMALL(24, -191, -127, 233)
#define WS_3_2   W_SMALL(25, -191, -127, 233)
#define WS_3_3   W_SMALL(31, -191, -127, 233)
#define WS_3_4   W_SMALL(27, -191, -127, 233)
#define WS_3_5   W_SMALL(29, -191, -127, 233)
#define WS_3_6   W_SMALL(28, -191, -127, 233)
#define WS_3_7   W_SMALL(26, -191, -127, 233)

/*
 * Eight message words for one step of the big variant, as a
 * comma-separated list of INNER() values taken from
 * q[16 * sb + 2 * j + o1] (low half) and q[16 * sb + 2 * j + o2] (high
 * half), j = 0..7, each multiplied by mm.
 *
 * As with W_SMALL(), the expansion opens a parenthesis that is only
 * closed by STEP_BIG_(), which turns it into a STEP_BIG(...) invocation.
 */
#define W_BIG(sb, o1, o2, mm) \
	(INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
	 INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)

/*
 * WB_r_k: message words for step k (0..7) of round r (0..3) of the big
 * variant.
 */
#define WB_0_0   W_BIG( 4,    0,    1, 185)
#define WB_0_1   W_BIG( 6,    0,    1, 185)
#define WB_0_2   W_BIG( 0,    0,    1, 185)
#define WB_0_3   W_BIG( 2,    0,    1, 185)
#define WB_0_4   W_BIG( 7,    0,    1, 185)
#define WB_0_5   W_BIG( 5,    0,    1, 185)
#define WB_0_6   W_BIG( 3,    0,    1, 185)
#define WB_0_7   W_BIG( 1,    0,    1, 185)
#define WB_1_0   W_BIG(15,    0,    1, 185)
#define WB_1_1   W_BIG(11,    0,    1, 185)
#define WB_1_2   W_BIG(12,    0,    1, 185)
#define WB_1_3   W_BIG( 8,    0,    1, 185)
#define WB_1_4   W_BIG( 9,    0,    1, 185)
#define WB_1_5   W_BIG(13,    0,    1, 185)
#define WB_1_6   W_BIG(10,    0,    1, 185)
#define WB_1_7   W_BIG(14,    0,    1, 185)
#define WB_2_0   W_BIG(17, -256, -128, 233)
#define WB_2_1   W_BIG(18, -256, -128, 233)
#define WB_2_2   W_BIG(23, -256, -128, 233)
#define WB_2_3   W_BIG(20, -256, -128, 233)
#define WB_2_4   W_BIG(22, -256, -128, 233)
#define WB_2_5   W_BIG(21, -256, -128, 233)
#define WB_2_6   W_BIG(16, -256, -128, 233)
#define WB_2_7   W_BIG(19, -256, -128, 233)
#define WB_3_0   W_BIG(30, -383, -255, 233)
#define WB_3_1   W_BIG(24, -383, -255, 233)
#define WB_3_2   W_BIG(25, -383, -255, 233)
#define WB_3_3   W_BIG(31, -383, -255, 233)
#define WB_3_4   W_BIG(27, -383, -255, 233)
#define WB_3_5   W_BIG(29, -383, -255, 233)
#define WB_3_6   W_BIG(28, -383, -255, 233)
#define WB_3_7   W_BIG(26, -383, -255, 233)

/*
 * Bitwise Boolean functions used by the step macros.
 *   IF:  for each bit, take y where x is 1 and z where x is 0
 *   MAJ: for each bit, the majority value among x, y and z
 */
#define IF(x, y, z)    ((z) ^ ((x) & ((y) ^ (z))))
#define MAJ(x, y, z)   (((z) & ((x) | (y))) | ((x) & (y)))

/*
 * Lane permutations for the small variant (4 lanes):
 * PP4_k_n = n XOR (k + 1). STEP_ELT() pastes the result onto "tA" to
 * select which rotated A word is added into lane n.
 */
#define PP4_0_0   1
#define PP4_0_1   0
#define PP4_0_2   3
#define PP4_0_3   2
#define PP4_1_0   2
#define PP4_1_1   3
#define PP4_1_2   0
#define PP4_1_3   1
#define PP4_2_0   3
#define PP4_2_1   2
#define PP4_2_2   1
#define PP4_2_3   0

/*
 * Lane permutations for the big variant (8 lanes): PP8_k_n = n XOR c_k
 * with c_0..c_6 = 1, 6, 2, 3, 5, 7, 4.
 */
#define PP8_0_0   1
#define PP8_0_1   0
#define PP8_0_2   3
#define PP8_0_3   2
#define PP8_0_4   5
#define PP8_0_5   4
#define PP8_0_6   7
#define PP8_0_7   6

#define PP8_1_0   6
#define PP8_1_1   7
#define PP8_1_2   4
#define PP8_1_3   5
#define PP8_1_4   2
#define PP8_1_5   3
#define PP8_1_6   0
#define PP8_1_7   1

#define PP8_2_0   2
#define PP8_2_1   3
#define PP8_2_2   0
#define PP8_2_3   1
#define PP8_2_4   6
#define PP8_2_5   7
#define PP8_2_6   4
#define PP8_2_7   5

#define PP8_3_0   3
#define PP8_3_1   2
#define PP8_3_2   1
#define PP8_3_3   0
#define PP8_3_4   7
#define PP8_3_5   6
#define PP8_3_6   5
#define PP8_3_7   4

#define PP8_4_0   5
#define PP8_4_1   4
#define PP8_4_2   7
#define PP8_4_3   6
#define PP8_4_4   1
#define PP8_4_5   0
#define PP8_4_6   3
#define PP8_4_7   2

#define PP8_5_0   7
#define PP8_5_1   6
#define PP8_5_2   5
#define PP8_5_3   4
#define PP8_5_4   3
#define PP8_5_5   2
#define PP8_5_6   1
#define PP8_5_7   0

#define PP8_6_0   4
#define PP8_6_1   5
#define PP8_6_2   6
#define PP8_6_3   7
#define PP8_6_4   0
#define PP8_6_5   1
#define PP8_6_6   2
#define PP8_6_7   3

#if SPH_SIMD_NOCOPY

/*
 * SPH_SIMD_NOCOPY: the state words are accessed in place through the
 * context, so there are no local copies to declare, load or store and
 * all six macros expand to nothing.
 */
#define DECL_STATE_SMALL
#define READ_STATE_SMALL(sc)
#define WRITE_STATE_SMALL(sc)
#define DECL_STATE_BIG
#define READ_STATE_BIG(sc)
#define WRITE_STATE_BIG(sc)

#else

/*
 * Local copies of the 16 state words of the small variant. The macro
 * carries its own terminating semicolon, so it is used without one.
 */
#define DECL_STATE_SMALL   \
	u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;

/*
 * Load the local copies from (sc)->state[0..15], in the order
 * A0..A3, B0..B3, C0..C3, D0..D3.
 */
#define READ_STATE_SMALL(sc)   do { \
		A0 = (sc)->state[ 0]; \
		A1 = (sc)->state[ 1]; \
		A2 = (sc)->state[ 2]; \
		A3 = (sc)->state[ 3]; \
		B0 = (sc)->state[ 4]; \
		B1 = (sc)->state[ 5]; \
		B2 = (sc)->state[ 6]; \
		B3 = (sc)->state[ 7]; \
		C0 = (sc)->state[ 8]; \
		C1 = (sc)->state[ 9]; \
		C2 = (sc)->state[10]; \
		C3 = (sc)->state[11]; \
		D0 = (sc)->state[12]; \
		D1 = (sc)->state[13]; \
		D2 = (sc)->state[14]; \
		D3 = (sc)->state[15]; \
	} while (0)

/* Store the local copies back into (sc)->state[0..15], same order. */
#define WRITE_STATE_SMALL(sc)   do { \
		(sc)->state[ 0] = A0; \
		(sc)->state[ 1] = A1; \
		(sc)->state[ 2] = A2; \
		(sc)->state[ 3] = A3; \
		(sc)->state[ 4] = B0; \
		(sc)->state[ 5] = B1; \
		(sc)->state[ 6] = B2; \
		(sc)->state[ 7] = B3; \
		(sc)->state[ 8] = C0; \
		(sc)->state[ 9] = C1; \
		(sc)->state[10] = C2; \
		(sc)->state[11] = C3; \
		(sc)->state[12] = D0; \
		(sc)->state[13] = D1; \
		(sc)->state[14] = D2; \
		(sc)->state[15] = D3; \
	} while (0)

/*
 * Local copies of the 32 state words of the big variant. The macro
 * carries its own terminating semicolon, so it is used without one.
 */
#define DECL_STATE_BIG   \
	u32 A0, A1, A2, A3, A4, A5, A6, A7; \
	u32 B0, B1, B2, B3, B4, B5, B6, B7; \
	u32 C0, C1, C2, C3, C4, C5, C6, C7; \
	u32 D0, D1, D2, D3, D4, D5, D6, D7;

/*
 * Load the local copies from (sc)->state[0..31], in the order
 * A0..A7, B0..B7, C0..C7, D0..D7.
 */
#define READ_STATE_BIG(sc)   do { \
		A0 = (sc)->state[ 0]; \
		A1 = (sc)->state[ 1]; \
		A2 = (sc)->state[ 2]; \
		A3 = (sc)->state[ 3]; \
		A4 = (sc)->state[ 4]; \
		A5 = (sc)->state[ 5]; \
		A6 = (sc)->state[ 6]; \
		A7 = (sc)->state[ 7]; \
		B0 = (sc)->state[ 8]; \
		B1 = (sc)->state[ 9]; \
		B2 = (sc)->state[10]; \
		B3 = (sc)->state[11]; \
		B4 = (sc)->state[12]; \
		B5 = (sc)->state[13]; \
		B6 = (sc)->state[14]; \
		B7 = (sc)->state[15]; \
		C0 = (sc)->state[16]; \
		C1 = (sc)->state[17]; \
		C2 = (sc)->state[18]; \
		C3 = (sc)->state[19]; \
		C4 = (sc)->state[20]; \
		C5 = (sc)->state[21]; \
		C6 = (sc)->state[22]; \
		C7 = (sc)->state[23]; \
		D0 = (sc)->state[24]; \
		D1 = (sc)->state[25]; \
		D2 = (sc)->state[26]; \
		D3 = (sc)->state[27]; \
		D4 = (sc)->state[28]; \
		D5 = (sc)->state[29]; \
		D6 = (sc)->state[30]; \
		D7 = (sc)->state[31]; \
	} while (0)

/* Store the local copies back into (sc)->state[0..31], same order. */
#define WRITE_STATE_BIG(sc)   do { \
		(sc)->state[ 0] = A0; \
		(sc)->state[ 1] = A1; \
		(sc)->state[ 2] = A2; \
		(sc)->state[ 3] = A3; \
		(sc)->state[ 4] = A4; \
		(sc)->state[ 5] = A5; \
		(sc)->state[ 6] = A6; \
		(sc)->state[ 7] = A7; \
		(sc)->state[ 8] = B0; \
		(sc)->state[ 9] = B1; \
		(sc)->state[10] = B2; \
		(sc)->state[11] = B3; \
		(sc)->state[12] = B4; \
		(sc)->state[13] = B5; \
		(sc)->state[14] = B6; \
		(sc)->state[15] = B7; \
		(sc)->state[16] = C0; \
		(sc)->state[17] = C1; \
		(sc)->state[18] = C2; \
		(sc)->state[19] = C3; \
		(sc)->state[20] = C4; \
		(sc)->state[21] = C5; \
		(sc)->state[22] = C6; \
		(sc)->state[23] = C7; \
		(sc)->state[24] = D0; \
		(sc)->state[25] = D1; \
		(sc)->state[26] = D2; \
		(sc)->state[27] = D3; \
		(sc)->state[28] = D4; \
		(sc)->state[29] = D5; \
		(sc)->state[30] = D6; \
		(sc)->state[31] = D7; \
	} while (0)

#endif

/*
 * One step on lane n:
 *   tt  = D_n + w + fun(A_n, B_n, C_n)
 *   A_n = ROL32(tt, s) + tA_p, with p = XCAT(ppb, n)
 *   D_n = C_n;  C_n = B_n;  B_n = tA_n
 *
 * "ppb" is a macro-name prefix such as PP4_0_: pasted with the lane
 * number it must expand to a digit, which is in turn pasted onto "tA".
 * The tA0, tA1, ... variables (pre-rotated A words) are declared by the
 * enclosing STEP_SMALL() / STEP_BIG().
 */
#define STEP_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA ## n; \
	} while (0)

/*
 * One full step of the small variant (4 lanes). All four A words are
 * rotated by r first, so that every lane sees the values from before the
 * step; then each lane is updated with its own message word.
 *
 *   w0..w3   message words for lanes 0..3
 *   fun      Boolean function macro (IF or MAJ)
 *   r, s     rotation counts
 *   pp4b     permutation prefix (PP4_0_, PP4_1_ or PP4_2_)
 */
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		STEP_ELT(0, w0, fun, s, pp4b); \
		STEP_ELT(1, w1, fun, s, pp4b); \
		STEP_ELT(2, w2, fun, s, pp4b); \
		STEP_ELT(3, w3, fun, s, pp4b); \
	} while (0)

/*
 * One full step of the big variant (8 lanes); same structure as
 * STEP_SMALL(), with message words w0..w7 and a PP8_k_ permutation
 * prefix.
 */
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		u32 tA4 = ROL32(A4, r); \
		u32 tA5 = ROL32(A5, r); \
		u32 tA6 = ROL32(A6, r); \
		u32 tA7 = ROL32(A7, r); \
		STEP_ELT(0, w0, fun, s, pp8b); \
		STEP_ELT(1, w1, fun, s, pp8b); \
		STEP_ELT(2, w2, fun, s, pp8b); \
		STEP_ELT(3, w3, fun, s, pp8b); \
		STEP_ELT(4, w4, fun, s, pp8b); \
		STEP_ELT(5, w5, fun, s, pp8b); \
		STEP_ELT(6, w6, fun, s, pp8b); \
		STEP_ELT(7, w7, fun, s, pp8b); \
	} while (0)

/*
 * M3_i_j = ((i + j) mod 3) followed by an underscore: the index of the
 * PP4 permutation used by step i (0..7) of a round whose starting
 * permutation index is j (0..2). The trailing underscore lets the value
 * be pasted after "PP4_" to form a permutation prefix such as PP4_1_.
 */
#define M3_0_0   0_
#define M3_1_0   1_
#define M3_2_0   2_
#define M3_3_0   0_
#define M3_4_0   1_
#define M3_5_0   2_
#define M3_6_0   0_
#define M3_7_0   1_

#define M3_0_1   1_
#define M3_1_1   2_
#define M3_2_1   0_
#define M3_3_1   1_
#define M3_4_1   2_
#define M3_5_1   0_
#define M3_6_1   1_
#define M3_7_1   2_

#define M3_0_2   2_
#define M3_1_2   0_
#define M3_2_2   1_
#define M3_3_2   2_
#define M3_4_2   0_
#define M3_5_2   1_
#define M3_6_2   2_
#define M3_7_2   0_

/*
 * "w" must be a WS_r_k macro: its expansion opens a parenthesis and
 * lists four message words, and this macro supplies the rest of the
 * argument list plus the closing parenthesis of the STEP_SMALL() call.
 */
#define STEP_SMALL_(w, fun, r, s, pp4b)   STEP_SMALL w, fun, r, s, pp4b)

/*
 * One round (eight steps) of the small variant.
 *
 *   ri       round prefix with trailing underscore (0_, 1_, 2_ or 3_),
 *            pasted to select the WS_r_k message words
 *   isp      starting permutation index (0, 1 or 2), see M3_i_j
 *   p0..p3   rotation counts; step k uses (r, s) =
 *            (p0,p1), (p1,p2), (p2,p3), (p3,p0), and then the same
 *            sequence again
 *
 * Steps 0..3 use IF, steps 4..7 use MAJ.
 */
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)   do { \
		STEP_SMALL_(WS_ ## ri ## 0, \
			IF,  p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 1, \
			IF,  p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 2, \
			IF,  p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 3, \
			IF,  p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
	} while (0)

/*
 * M7_i_j = ((i + j) mod 7) followed by an underscore: the index of the
 * PP8 permutation used by step i (0..7) of a round whose starting
 * permutation index is j (0..3). The trailing underscore lets the value
 * be pasted after "PP8_" to form a permutation prefix such as PP8_4_.
 */
#define M7_0_0   0_
#define M7_1_0   1_
#define M7_2_0   2_
#define M7_3_0   3_
#define M7_4_0   4_
#define M7_5_0   5_
#define M7_6_0   6_
#define M7_7_0   0_

#define M7_0_1   1_
#define M7_1_1   2_
#define M7_2_1   3_
#define M7_3_1   4_
#define M7_4_1   5_
#define M7_5_1   6_
#define M7_6_1   0_
#define M7_7_1   1_

#define M7_0_2   2_
#define M7_1_2   3_
#define M7_2_2   4_
#define M7_3_2   5_
#define M7_4_2   6_
#define M7_5_2   0_
#define M7_6_2   1_
#define M7_7_2   2_

#define M7_0_3   3_
#define M7_1_3   4_
#define M7_2_3   5_
#define M7_3_3   6_
#define M7_4_3   0_
#define M7_5_3   1_
#define M7_6_3   2_
#define M7_7_3   3_

/*
 * "w" must be a WB_r_k macro: its expansion opens a parenthesis and
 * lists eight message words, and this macro supplies the rest of the
 * argument list plus the closing parenthesis of the STEP_BIG() call.
 */
#define STEP_BIG_(w, fun, r, s, pp8b)   STEP_BIG w, fun, r, s, pp8b)

/*
 * One round (eight steps) of the big variant.
 *
 *   ri       round prefix with trailing underscore (0_, 1_, 2_ or 3_),
 *            pasted to select the WB_r_k message words
 *   isp      starting permutation index (0..3), see M7_i_j
 *   p0..p3   rotation counts; step k uses (r, s) =
 *            (p0,p1), (p1,p2), (p2,p3), (p3,p0), and then the same
 *            sequence again
 *
 * Steps 0..3 use IF, steps 4..7 use MAJ.
 */
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)   do { \
		STEP_BIG_(WB_ ## ri ## 0, \
			IF,  p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 1, \
			IF,  p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 2, \
			IF,  p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 3, \
			IF,  p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
	} while (0)

#if SPH_SMALL_FOOTPRINT_SIMD

/*
 * Small-footprint build: the state word names used by the step macros
 * map onto a 16-word array called "state", which must be in scope
 * wherever they are used.
 */
#define A0   state[ 0]
#define A1   state[ 1]
#define A2   state[ 2]
#define A3   state[ 3]
#define B0   state[ 4]
#define B1   state[ 5]
#define B2   state[ 6]
#define B3   state[ 7]
#define C0   state[ 8]
#define C1   state[ 9]
#define C2   state[10]
#define C3   state[11]
#define D0   state[12]
#define D1   state[13]
#define D2   state[14]
#define D3   state[15]

/*
 * Same lane update as STEP_ELT(), except that the permutation is a
 * run-time value: the rotated A words live in an array tA[] (declared by
 * the enclosing macro) and lane n reads tA[ppb XOR n].
 */
#define STEP2_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA[n]; \
	} while (0)

/*
 * One full step of the small variant with a run-time permutation value
 * pp4b (the XOR mask applied to the lane number) instead of a PP4_k_
 * prefix. The other arguments are as for STEP_SMALL().
 */
#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { \
		u32 tA[4]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		STEP2_ELT(0, w0, fun, s, pp4b); \
		STEP2_ELT(1, w1, fun, s, pp4b); \
		STEP2_ELT(2, w2, fun, s, pp4b); \
		STEP2_ELT(3, w3, fun, s, pp4b); \
	} while (0)

/*
 * One round (eight steps) of the small variant, small-footprint build.
 *
 *   state    the 16 working state words (accessed through the A0..D3
 *            aliases, hence the fixed parameter name)
 *   w        the 32 message words of the round, four per step
 *   isp      starting index into the permutation sequence (0..2 as used
 *            by compress_small())
 *   p0..p3   rotation counts; step k uses the pair (p[k mod 4],
 *            p[(k + 1) mod 4])
 *
 * Steps 0..3 use IF, steps 4..7 use MAJ.
 */
static void
one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
{
	static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };
	int rot[5];
	int j;

	/* rot[j], rot[j + 1] is the (r, s) rotation pair of step j. */
	rot[0] = p0;
	rot[1] = p1;
	rot[2] = p2;
	rot[3] = p3;
	rot[4] = p0;

	for (j = 0; j < 4; j ++) {
		STEP2_SMALL(w[4 * j + 0], w[4 * j + 1],
			w[4 * j + 2], w[4 * j + 3],
			IF, rot[j], rot[j + 1], pp4k[isp + j]);
	}
	for (j = 0; j < 4; j ++) {
		STEP2_SMALL(w[16 + 4 * j + 0], w[16 + 4 * j + 1],
			w[16 + 4 * j + 2], w[16 + 4 * j + 3],
			MAJ, rot[j], rot[j + 1], pp4k[isp + 4 + j]);
	}
}

/*
 * Compression function of the small variant, small-footprint build.
 *
 * The 64 bytes at sc->buf are expanded with FFT128() into q[0..127],
 * offset by yoff_s_f (last != 0) or yoff_s_n (last == 0) and reduced to
 * the -128..128 range. The working state starts as sc->state XOR the
 * message block, goes through four rounds, and then through four extra
 * steps that use the previous sc->state words as message words. The
 * result replaces sc->state.
 *
 * The local names x, q, w, u, wsp and state are referenced by the macros
 * expanded below and must not be renamed.
 */
static void
compress_small(sph_simd_small_context *sc, int last)
{
	/* Start index in q[] of the block feeding each step (block << 3). */
	static const size_t wsp[32] = {
		 4 << 3,  6 << 3,  0 << 3,  2 << 3,
		 7 << 3,  5 << 3,  3 << 3,  1 << 3,
		15 << 3, 11 << 3, 12 << 3,  8 << 3,
		 9 << 3, 13 << 3, 10 << 3, 14 << 3,
		17 << 3, 18 << 3, 23 << 3, 20 << 3,
		22 << 3, 21 << 3, 16 << 3, 19 << 3,
		30 << 3, 24 << 3, 25 << 3, 31 << 3,
		27 << 3, 29 << 3, 28 << 3, 26 << 3
	};

	unsigned char *x;
	s32 q[128];
	u32 w[32];
	u32 state[16];
	const unsigned short *yoff;
	size_t u;
	int k;

	/* Message expansion. */
	x = sc->buf;
	FFT128(0, 1, 0, ll);

	/* Add the offset table for this block type, then reduce mod 257. */
	yoff = last ? yoff_s_f : yoff_s_n;
	for (k = 0; k < 128; k ++) {
		s32 tq = q[k] + yoff[k];

		tq = REDS2(tq);
		tq = REDS1(tq);
		tq = REDS1(tq);
		q[k] = (tq > 128) ? tq - 257 : tq;
	}

	/* Working state = chaining value XOR message block. */
	for (k = 0; k < 16; k ++)
		state[k] = sc->state[k] ^ sph_dec32le_aligned(x + 4 * k);

	/* Build the 32 message words of one round, starting at wsp[sb]. */
#define WSREAD(sb, o1, o2, mm)   do { \
		for (u = 0; u < 32; u += 4) { \
			size_t v = wsp[(u >> 2) + (sb)]; \
			w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
				q[v + 2 * 0 + (o2)], mm); \
			w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
				q[v + 2 * 1 + (o2)], mm); \
			w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
				q[v + 2 * 2 + (o2)], mm); \
			w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
				q[v + 2 * 3 + (o2)], mm); \
		} \
	} while (0)

	WSREAD( 0,    0,    1, 185);
	one_round_small(state, w, 0,  3, 23, 17, 27);
	WSREAD( 8,    0,    1, 185);
	one_round_small(state, w, 2, 28, 19, 22,  7);
	WSREAD(16, -128,  -64, 233);
	one_round_small(state, w, 1, 29,  9, 15,  5);
	WSREAD(24, -191, -127, 233);
	one_round_small(state, w, 0,  4, 13, 10, 25);

#undef WSREAD

	/* Final four steps, fed with the previous chaining value. */
	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
		IF,  4, 13, PP4_2_);
	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
		IF, 13, 10, PP4_0_);
	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
		IF, 10, 25, PP4_1_);
	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
		IF, 25,  4, PP4_2_);

	memcpy(sc->state, state, sizeof state);
}

/*
 * Drop the small-variant state aliases; the big variant defines its own
 * set further down.
 */
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef D0
#undef D1
#undef D2
#undef D3

#else

/*
 * SPH_SIMD_NOCOPY: the state word names used by the step macros refer
 * directly to the context, which must be reachable through a pointer
 * named "sc".
 */
#if SPH_SIMD_NOCOPY
#define A0   (sc->state[ 0])
#define A1   (sc->state[ 1])
#define A2   (sc->state[ 2])
#define A3   (sc->state[ 3])
#define B0   (sc->state[ 4])
#define B1   (sc->state[ 5])
#define B2   (sc->state[ 6])
#define B3   (sc->state[ 7])
#define C0   (sc->state[ 8])
#define C1   (sc->state[ 9])
#define C2   (sc->state[10])
#define C3   (sc->state[11])
#define D0   (sc->state[12])
#define D1   (sc->state[13])
#define D2   (sc->state[14])
#define D3   (sc->state[15])
#endif

/*
 * Compression function of the small variant, fully unrolled build.
 *
 * The 64 bytes at sc->buf are expanded with FFT128() into q[0..127],
 * offset by yoff_s_f (last != 0) or yoff_s_n (last == 0) and reduced to
 * the -128..128 range. The state is XORed with the message block, goes
 * through four rounds, and then through four extra steps that use the
 * previous state words as message words.
 *
 * With SPH_SIMD_NOCOPY the A0..D3 names alias sc->state directly, so the
 * previous state is saved first and nothing is written back at the end.
 * Otherwise they are local variables, loaded from and stored back to
 * sc->state.
 *
 * The local names x, q and sc are referenced by the macros expanded
 * below and must not be renamed.
 */
static void
compress_small(sph_simd_small_context *sc, int last)
{
	unsigned char *x;
	s32 q[128];
	const unsigned short *yoff;
	int k;
	DECL_STATE_SMALL
#if SPH_SIMD_NOCOPY
	sph_u32 saved[16];

	/* The rounds overwrite sc->state in place; keep the old value. */
	memcpy(saved, sc->state, sizeof saved);
#endif

	/* Message expansion. */
	x = sc->buf;
	FFT128(0, 1, 0, ll);

	/* Add the offset table for this block type, then reduce mod 257. */
	yoff = last ? yoff_s_f : yoff_s_n;
	for (k = 0; k < 128; k ++) {
		s32 tq = q[k] + yoff[k];

		tq = REDS2(tq);
		tq = REDS1(tq);
		tq = REDS1(tq);
		q[k] = (tq > 128) ? tq - 257 : tq;
	}

	/* Working state = chaining value XOR message block. */
	READ_STATE_SMALL(sc);
	A0 ^= sph_dec32le_aligned(x +  0);
	A1 ^= sph_dec32le_aligned(x +  4);
	A2 ^= sph_dec32le_aligned(x +  8);
	A3 ^= sph_dec32le_aligned(x + 12);
	B0 ^= sph_dec32le_aligned(x + 16);
	B1 ^= sph_dec32le_aligned(x + 20);
	B2 ^= sph_dec32le_aligned(x + 24);
	B3 ^= sph_dec32le_aligned(x + 28);
	C0 ^= sph_dec32le_aligned(x + 32);
	C1 ^= sph_dec32le_aligned(x + 36);
	C2 ^= sph_dec32le_aligned(x + 40);
	C3 ^= sph_dec32le_aligned(x + 44);
	D0 ^= sph_dec32le_aligned(x + 48);
	D1 ^= sph_dec32le_aligned(x + 52);
	D2 ^= sph_dec32le_aligned(x + 56);
	D3 ^= sph_dec32le_aligned(x + 60);

	/* The four rounds. */
	ONE_ROUND_SMALL(0_, 0,  3, 23, 17, 27);
	ONE_ROUND_SMALL(1_, 2, 28, 19, 22,  7);
	ONE_ROUND_SMALL(2_, 1, 29,  9, 15,  5);
	ONE_ROUND_SMALL(3_, 0,  4, 13, 10, 25);

	/* Final four steps, fed with the previous chaining value. */
#if SPH_SIMD_NOCOPY
	STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],
		IF,  4, 13, PP4_2_);
	STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
		IF, 13, 10, PP4_0_);
	STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
		IF, 10, 25, PP4_1_);
	STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
		IF, 25,  4, PP4_2_);
#else
	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
		IF,  4, 13, PP4_2_);
	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
		IF, 13, 10, PP4_0_);
	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
		IF, 10, 25, PP4_1_);
	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
		IF, 25,  4, PP4_2_);
	WRITE_STATE_SMALL(sc);
#endif
}

#if SPH_SIMD_NOCOPY
#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef D0
#undef D1
#undef D2
#undef D3
#endif

#endif

#if SPH_SMALL_FOOTPRINT_SIMD

/*
 * Small-footprint SIMD-384 / SIMD-512: the 32 working words A0..A7,
 * B0..B7, C0..C7 and D0..D7 are aliases for consecutive slots of an
 * array called "state", which must be in scope wherever these names are
 * expanded (one_round_big() receives it as a parameter, compress_big()
 * declares it as a local).
 */
#define A0   state[ 0]
#define A1   state[ 1]
#define A2   state[ 2]
#define A3   state[ 3]
#define A4   state[ 4]
#define A5   state[ 5]
#define A6   state[ 6]
#define A7   state[ 7]
#define B0   state[ 8]
#define B1   state[ 9]
#define B2   state[10]
#define B3   state[11]
#define B4   state[12]
#define B5   state[13]
#define B6   state[14]
#define B7   state[15]
#define C0   state[16]
#define C1   state[17]
#define C2   state[18]
#define C3   state[19]
#define C4   state[20]
#define C5   state[21]
#define C6   state[22]
#define C7   state[23]
#define D0   state[24]
#define D1   state[25]
#define D2   state[26]
#define D3   state[27]
#define D4   state[28]
#define D5   state[29]
#define D6   state[30]
#define D7   state[31]

/*
 * Not needed -- already defined for SIMD-224 / SIMD-256
 *
#define STEP2_ELT(n, w, fun, s, ppb)   do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA[n]; \
	} while (0)
 */

/*
 * One step over the eight lanes of the big state.
 *
 * The eight A words are first rotated left by r and stashed in a local
 * tA[]; STEP2_ELT is then applied to lanes 0..7, in that order, with the
 * message words w0..w7, the boolean function "fun", the rotation count s
 * and the lane-permutation mask pp8b.
 *
 * STEP2_ELT is not defined here (the comment just above says it is shared
 * with SIMD-224 / SIMD-256); it relies on tA[] and on the A/B/C/D names
 * being in scope at the point of expansion.
 */
#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
		u32 tA[8]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		tA[4] = ROL32(A4, r); \
		tA[5] = ROL32(A5, r); \
		tA[6] = ROL32(A6, r); \
		tA[7] = ROL32(A7, r); \
		STEP2_ELT(0, w0, fun, s, pp8b); \
		STEP2_ELT(1, w1, fun, s, pp8b); \
		STEP2_ELT(2, w2, fun, s, pp8b); \
		STEP2_ELT(3, w3, fun, s, pp8b); \
		STEP2_ELT(4, w4, fun, s, pp8b); \
		STEP2_ELT(5, w5, fun, s, pp8b); \
		STEP2_ELT(6, w6, fun, s, pp8b); \
		STEP2_ELT(7, w7, fun, s, pp8b); \
	} while (0)

/*
 * Run one round (eight steps) of the big compression function.
 *
 * state  32 working words, accessed through the A0..D7 aliases
 * w      64 expanded message words; step k consumes w[8k .. 8k+7]
 * isp    starting offset into the pp8k permutation table; the callers
 *        in this file pass 0..3
 * p0..p3 rotation counts; consecutive steps use the pairs (p0,p1),
 *        (p1,p2), (p2,p3), (p3,p0), and that cycle is run twice
 *
 * The first four steps use the IF boolean function, the last four MAJ.
 */
static void
one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
{
	/*
	 * The 7-entry sequence 1,6,2,3,5,7,4 followed by its first four
	 * entries again, so that pp8k[isp + 7] stays in bounds for isp <= 3.
	 */
	static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };

	STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
		IF,  p0, p1, pp8k[isp + 0]);
	STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
		IF,  p1, p2, pp8k[isp + 1]);
	STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
		IF,  p2, p3, pp8k[isp + 2]);
	STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
		IF,  p3, p0, pp8k[isp + 3]);
	STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
		MAJ, p0, p1, pp8k[isp + 4]);
	STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
		MAJ, p1, p2, pp8k[isp + 5]);
	STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
		MAJ, p2, p3, pp8k[isp + 6]);
	STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
		MAJ, p3, p0, pp8k[isp + 7]);
}

/*
 * Compression function for SIMD-384 / SIMD-512, small-footprint variant.
 *
 * Processes the block currently held in sc->buf and updates sc->state.
 * "last" is non-zero only for the final (length) block: it selects the
 * yoff_b_f offset table instead of yoff_b_n. In this file the only call
 * with last == 1 is the one made by finalize_big().
 *
 * FFT256, REDS1, REDS2, INNER and STEP_BIG are macros defined elsewhere
 * in the file; FFT256 is expected to read the block through "x" and to
 * fill q[] (q[] is not written anywhere else before it is read below).
 */
static void
compress_big(sph_simd_big_context *sc, int last)
{
	unsigned char *x;
	s32 q[256];
	int i;
	u32 w[64];
	u32 state[32];
	size_t u;

	/*
	 * Base indices into q[] used by WBREAD: a fixed ordering of the
	 * values 0..31, each multiplied by 16.
	 */
	static const size_t wbp[32] = {
		 4 << 4,  6 << 4,  0 << 4,  2 << 4,
		 7 << 4,  5 << 4,  3 << 4,  1 << 4,
		15 << 4, 11 << 4, 12 << 4,  8 << 4,
		 9 << 4, 13 << 4, 10 << 4, 14 << 4,
		17 << 4, 18 << 4, 23 << 4, 20 << 4,
		22 << 4, 21 << 4, 16 << 4, 19 << 4,
		30 << 4, 24 << 4, 25 << 4, 31 << 4,
		27 << 4, 29 << 4, 28 << 4, 26 << 4
	};

	x = sc->buf;
	FFT256(0, 1, 0, ll);
	/*
	 * Add the per-index offset (final-block or normal table), reduce
	 * with REDS2/REDS1, then fold into a signed representative: any
	 * value above 128 has 257 subtracted.
	 */
	if (last) {
		for (i = 0; i < 256; i ++) {
			s32 tq;

			tq = q[i] + yoff_b_f[i];
			tq = REDS2(tq);
			tq = REDS1(tq);
			tq = REDS1(tq);
			q[i] = (tq <= 128 ? tq : tq - 257);
		}
	} else {
		for (i = 0; i < 256; i ++) {
			s32 tq;

			tq = q[i] + yoff_b_n[i];
			tq = REDS2(tq);
			tq = REDS1(tq);
			tq = REDS1(tq);
			q[i] = (tq <= 128 ? tq : tq - 257);
		}
	}

	/*
	 * Working copy of the state: chaining value XOR the 32 message
	 * words decoded from the block. sc->state itself is left untouched
	 * until the final memcpy() because it is still needed below.
	 */
	for (i = 0; i < 32; i += 8) {
		state[i + 0] = sc->state[i + 0]
			^ sph_dec32le_aligned(x + 4 * (i + 0));
		state[i + 1] = sc->state[i + 1]
			^ sph_dec32le_aligned(x + 4 * (i + 1));
		state[i + 2] = sc->state[i + 2]
			^ sph_dec32le_aligned(x + 4 * (i + 2));
		state[i + 3] = sc->state[i + 3]
			^ sph_dec32le_aligned(x + 4 * (i + 3));
		state[i + 4] = sc->state[i + 4]
			^ sph_dec32le_aligned(x + 4 * (i + 4));
		state[i + 5] = sc->state[i + 5]
			^ sph_dec32le_aligned(x + 4 * (i + 5));
		state[i + 6] = sc->state[i + 6]
			^ sph_dec32le_aligned(x + 4 * (i + 6));
		state[i + 7] = sc->state[i + 7]
			^ sph_dec32le_aligned(x + 4 * (i + 7));
	}

/*
 * Fill w[0..63] for one round. For each of the eight groups of eight
 * words, v = wbp[group + sb] is the base index into q[]; word k of the
 * group is INNER(q[v + 2k + o1], q[v + 2k + o2], mm). o1/o2 may be
 * negative; with the (sb, o1, o2) triples used below the resulting
 * indices stay within 0..255.
 */
#define WBREAD(sb, o1, o2, mm)   do { \
		for (u = 0; u < 64; u += 8) { \
			size_t v = wbp[(u >> 3) + (sb)]; \
			w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
				q[v + 2 * 0 + (o2)], mm); \
			w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
				q[v + 2 * 1 + (o2)], mm); \
			w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
				q[v + 2 * 2 + (o2)], mm); \
			w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
				q[v + 2 * 3 + (o2)], mm); \
			w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
				q[v + 2 * 4 + (o2)], mm); \
			w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
				q[v + 2 * 5 + (o2)], mm); \
			w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
				q[v + 2 * 6 + (o2)], mm); \
			w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
				q[v + 2 * 7 + (o2)], mm); \
		} \
	} while (0)

	/* Four rounds, each preceded by the expansion of its 64 words. */
	WBREAD( 0,    0,    1, 185);
	one_round_big(state, w, 0,  3, 23, 17, 27);
	WBREAD( 8,    0,    1, 185);
	one_round_big(state, w, 1, 28, 19, 22,  7);
	WBREAD(16, -256, -128, 233);
	one_round_big(state, w, 2, 29,  9, 15,  5);
	WBREAD(24, -383, -255, 233);
	one_round_big(state, w, 3,  4, 13, 10, 25);

#undef WBREAD

	/*
	 * Four closing steps fed with the previous chaining value, which
	 * is still intact in sc->state. STEP_BIG is defined elsewhere;
	 * presumably it takes its arguments in the same order as STEP2_BIG
	 * and works on the A0..D7 aliases, i.e. on the local state[].
	 */
	STEP_BIG(
		sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
		sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
		IF,  4, 13, PP8_4_);
	STEP_BIG(
		sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
		sc->state[12], sc->state[13], sc->state[14], sc->state[15],
		IF, 13, 10, PP8_5_);
	STEP_BIG(
		sc->state[16], sc->state[17], sc->state[18], sc->state[19],
		sc->state[20], sc->state[21], sc->state[22], sc->state[23],
		IF, 10, 25, PP8_6_);
	STEP_BIG(
		sc->state[24], sc->state[25], sc->state[26], sc->state[27],
		sc->state[28], sc->state[29], sc->state[30], sc->state[31],
		IF, 25,  4, PP8_0_);

	/* Publish the new chaining value. */
	memcpy(sc->state, state, sizeof state);
}

/*
 * Remove the A0..D7 aliases of state[] so that they do not leak into
 * the rest of the file.
 */
#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#undef D0
#undef D1
#undef D2
#undef D3
#undef D4
#undef D5
#undef D6
#undef D7

#else

/*
 * With SPH_SIMD_NOCOPY the working-word names A0..D7 are aliases for the
 * 32 words of sc->state itself, so compress_big() below updates the
 * context state in place. Without SPH_SIMD_NOCOPY nothing is defined
 * here; the names then come from DECL_STATE_BIG, which is defined
 * elsewhere (presumably as local variables -- confirm there).
 */
#if SPH_SIMD_NOCOPY
#define A0   (sc->state[ 0])
#define A1   (sc->state[ 1])
#define A2   (sc->state[ 2])
#define A3   (sc->state[ 3])
#define A4   (sc->state[ 4])
#define A5   (sc->state[ 5])
#define A6   (sc->state[ 6])
#define A7   (sc->state[ 7])
#define B0   (sc->state[ 8])
#define B1   (sc->state[ 9])
#define B2   (sc->state[10])
#define B3   (sc->state[11])
#define B4   (sc->state[12])
#define B5   (sc->state[13])
#define B6   (sc->state[14])
#define B7   (sc->state[15])
#define C0   (sc->state[16])
#define C1   (sc->state[17])
#define C2   (sc->state[18])
#define C3   (sc->state[19])
#define C4   (sc->state[20])
#define C5   (sc->state[21])
#define C6   (sc->state[22])
#define C7   (sc->state[23])
#define D0   (sc->state[24])
#define D1   (sc->state[25])
#define D2   (sc->state[26])
#define D3   (sc->state[27])
#define D4   (sc->state[28])
#define D5   (sc->state[29])
#define D6   (sc->state[30])
#define D7   (sc->state[31])
#endif

/*
 * Compression function for SIMD-384 / SIMD-512, unrolled variant.
 *
 * Processes the block currently held in sc->buf and updates sc->state.
 * "last" is non-zero only for the final (length) block: it selects the
 * yoff_b_f offset table instead of yoff_b_n.
 *
 * With SPH_SIMD_NOCOPY, A0..D7 alias sc->state directly, so the previous
 * chaining value is first copied to saved[] and the state is updated in
 * place. Otherwise the working words are set up by DECL_STATE_BIG /
 * READ_STATE_BIG and stored back by WRITE_STATE_BIG (macros defined
 * elsewhere in the file), and sc->state keeps the previous chaining value
 * until that final write.
 *
 * FFT256, REDS1, REDS2, ONE_ROUND_BIG and STEP_BIG are also defined
 * elsewhere; FFT256 is expected to read the block through "x" and to
 * fill q[] (q[] is not written anywhere else before it is read below).
 */
static void
compress_big(sph_simd_big_context *sc, int last)
{
	unsigned char *x;
	s32 q[256];
	int i;
	DECL_STATE_BIG
#if SPH_SIMD_NOCOPY
	sph_u32 saved[32];
#endif

#if SPH_SIMD_NOCOPY
	/* Keep the incoming chaining value for the closing steps. */
	memcpy(saved, sc->state, sizeof saved);
#endif

	x = sc->buf;
	FFT256(0, 1, 0, ll);
	/*
	 * Add the per-index offset (final-block or normal table), reduce
	 * with REDS2/REDS1, then fold into a signed representative: any
	 * value above 128 has 257 subtracted.
	 */
	if (last) {
		for (i = 0; i < 256; i ++) {
			s32 tq;

			tq = q[i] + yoff_b_f[i];
			tq = REDS2(tq);
			tq = REDS1(tq);
			tq = REDS1(tq);
			q[i] = (tq <= 128 ? tq : tq - 257);
		}
	} else {
		for (i = 0; i < 256; i ++) {
			s32 tq;

			tq = q[i] + yoff_b_n[i];
			tq = REDS2(tq);
			tq = REDS1(tq);
			tq = REDS1(tq);
			q[i] = (tq <= 128 ? tq : tq - 257);
		}
	}
	READ_STATE_BIG(sc);
	/* XOR the 32 message words of the block into the working state. */
	A0 ^= sph_dec32le_aligned(x +   0);
	A1 ^= sph_dec32le_aligned(x +   4);
	A2 ^= sph_dec32le_aligned(x +   8);
	A3 ^= sph_dec32le_aligned(x +  12);
	A4 ^= sph_dec32le_aligned(x +  16);
	A5 ^= sph_dec32le_aligned(x +  20);
	A6 ^= sph_dec32le_aligned(x +  24);
	A7 ^= sph_dec32le_aligned(x +  28);
	B0 ^= sph_dec32le_aligned(x +  32);
	B1 ^= sph_dec32le_aligned(x +  36);
	B2 ^= sph_dec32le_aligned(x +  40);
	B3 ^= sph_dec32le_aligned(x +  44);
	B4 ^= sph_dec32le_aligned(x +  48);
	B5 ^= sph_dec32le_aligned(x +  52);
	B6 ^= sph_dec32le_aligned(x +  56);
	B7 ^= sph_dec32le_aligned(x +  60);
	C0 ^= sph_dec32le_aligned(x +  64);
	C1 ^= sph_dec32le_aligned(x +  68);
	C2 ^= sph_dec32le_aligned(x +  72);
	C3 ^= sph_dec32le_aligned(x +  76);
	C4 ^= sph_dec32le_aligned(x +  80);
	C5 ^= sph_dec32le_aligned(x +  84);
	C6 ^= sph_dec32le_aligned(x +  88);
	C7 ^= sph_dec32le_aligned(x +  92);
	D0 ^= sph_dec32le_aligned(x +  96);
	D1 ^= sph_dec32le_aligned(x + 100);
	D2 ^= sph_dec32le_aligned(x + 104);
	D3 ^= sph_dec32le_aligned(x + 108);
	D4 ^= sph_dec32le_aligned(x + 112);
	D5 ^= sph_dec32le_aligned(x + 116);
	D6 ^= sph_dec32le_aligned(x + 120);
	D7 ^= sph_dec32le_aligned(x + 124);

	/* Four rounds; the rotation counts match the small-footprint path. */
	ONE_ROUND_BIG(0_, 0,  3, 23, 17, 27);
	ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
	ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
	ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
	/*
	 * Four closing steps fed with the previous chaining value: taken
	 * from saved[] when the state was updated in place, from sc->state
	 * otherwise.
	 */
#if SPH_SIMD_NOCOPY
	STEP_BIG(
		saved[ 0], saved[ 1], saved[ 2], saved[ 3],
		saved[ 4], saved[ 5], saved[ 6], saved[ 7],
		IF,  4, 13, PP8_4_);
	STEP_BIG(
		saved[ 8], saved[ 9], saved[10], saved[11],
		saved[12], saved[13], saved[14], saved[15],
		IF, 13, 10, PP8_5_);
	STEP_BIG(
		saved[16], saved[17], saved[18], saved[19],
		saved[20], saved[21], saved[22], saved[23],
		IF, 10, 25, PP8_6_);
	STEP_BIG(
		saved[24], saved[25], saved[26], saved[27],
		saved[28], saved[29], saved[30], saved[31],
		IF, 25,  4, PP8_0_);
#else
	STEP_BIG(
		sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
		sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
		IF,  4, 13, PP8_4_);
	STEP_BIG(
		sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
		sc->state[12], sc->state[13], sc->state[14], sc->state[15],
		IF, 13, 10, PP8_5_);
	STEP_BIG(
		sc->state[16], sc->state[17], sc->state[18], sc->state[19],
		sc->state[20], sc->state[21], sc->state[22], sc->state[23],
		IF, 10, 25, PP8_6_);
	STEP_BIG(
		sc->state[24], sc->state[25], sc->state[26], sc->state[27],
		sc->state[28], sc->state[29], sc->state[30], sc->state[31],
		IF, 25,  4, PP8_0_);
	WRITE_STATE_BIG(sc);
#endif
}

/*
 * Remove the A0..D7 aliases of sc->state[] that exist only when
 * SPH_SIMD_NOCOPY is set.
 */
#if SPH_SIMD_NOCOPY
#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#undef D0
#undef D1
#undef D2
#undef D3
#undef D4
#undef D5
#undef D6
#undef D7
#endif

#endif

/*
 * Initial state for SIMD-224 (16 words); installed by sph_simd224_init()
 * through init_small().
 */
static const u32 IV224[] = {
	C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
	C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
	C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
	C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
};

/*
 * Initial state for SIMD-256 (16 words); installed by sph_simd256_init()
 * through init_small().
 */
static const u32 IV256[] = {
	C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
	C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
	C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
	C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
};

/*
 * Initial state for SIMD-384 (32 words); installed by sph_simd384_init()
 * through init_big().
 */
static const u32 IV384[] = {
	C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
	C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
	C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
	C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
	C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
	C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
	C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
	C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
};

/*
 * Initial state for SIMD-512 (32 words); installed by sph_simd512_init()
 * through init_big().
 */
static const u32 IV512[] = {
	C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
	C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
	C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
	C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
	C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
	C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
	C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
	C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
};

/*
 * Reset a small (SIMD-224 / SIMD-256) context: empty buffer, zero block
 * counter, and the state loaded from the supplied initial value.
 *
 * cc   pointer to a sph_simd_small_context
 * iv   initial state; sizeof sc->state bytes are read from it
 */
static void
init_small(void *cc, const u32 *iv)
{
	sph_simd_small_context *ctx = cc;

	ctx->ptr = 0;
	ctx->count_high = 0;
	ctx->count_low = 0;
	memcpy(ctx->state, iv, sizeof ctx->state);
}

/*
 * Reset a big (SIMD-384 / SIMD-512) context: empty buffer, zero block
 * counter, and the state loaded from the supplied initial value.
 *
 * cc   pointer to a sph_simd_big_context
 * iv   initial state; sizeof sc->state bytes are read from it
 */
static void
init_big(void *cc, const u32 *iv)
{
	sph_simd_big_context *ctx = cc;

	ctx->ptr = 0;
	ctx->count_high = 0;
	ctx->count_low = 0;
	memcpy(ctx->state, iv, sizeof ctx->state);
}

/*
 * Absorb len bytes of input into a small context.
 *
 * Input is appended to the context buffer; every time the buffer fills
 * up it is compressed as a normal (non-final) block, the buffer position
 * is rewound and the 64-bit block counter (count_high:count_low) is
 * incremented.
 *
 * cc     pointer to a sph_simd_small_context
 * data   input bytes (not accessed when len is 0)
 * len    number of input bytes
 */
static void
update_small(void *cc, const void *data, size_t len)
{
	sph_simd_small_context *ctx = cc;
	const unsigned char *src = data;

	while (len > 0) {
		size_t room = (sizeof ctx->buf) - ctx->ptr;
		size_t take = (room > len) ? len : room;

		memcpy(ctx->buf + ctx->ptr, src, take);
		src += take;
		len -= take;
		ctx->ptr += take;
		if (ctx->ptr != sizeof ctx->buf)
			continue;

		/* Buffer full: compress it and count the block. */
		compress_small(ctx, 0);
		ctx->ptr = 0;
		ctx->count_low = T32(ctx->count_low + 1);
		if (ctx->count_low == 0)
			ctx->count_high ++;
	}
}

/*
 * Absorb len bytes of input into a big context.
 *
 * Input is appended to the context buffer; every time the buffer fills
 * up it is compressed as a normal (non-final) block, the buffer position
 * is rewound and the 64-bit block counter (count_high:count_low) is
 * incremented.
 *
 * cc     pointer to a sph_simd_big_context
 * data   input bytes (not accessed when len is 0)
 * len    number of input bytes
 */
static void
update_big(void *cc, const void *data, size_t len)
{
	sph_simd_big_context *ctx = cc;
	const unsigned char *src = data;

	while (len > 0) {
		size_t room = (sizeof ctx->buf) - ctx->ptr;
		size_t take = (room > len) ? len : room;

		memcpy(ctx->buf + ctx->ptr, src, take);
		src += take;
		len -= take;
		ctx->ptr += take;
		if (ctx->ptr != sizeof ctx->buf)
			continue;

		/* Buffer full: compress it and count the block. */
		compress_big(ctx, 0);
		ctx->ptr = 0;
		ctx->count_low = T32(ctx->count_low + 1);
		if (ctx->count_low == 0)
			ctx->count_high ++;
	}
}

/*
 * Encode the message length, in bits, as a 64-bit little-endian value
 * (two 32-bit words written with sph_enc32le: low word at dst, high word
 * at dst + 4).
 *
 * dst    output, 8 bytes are written
 * low    low word of the count of full 64-byte blocks processed
 * high   high word of that count
 * ptr    number of bytes in the last, partial block
 * n      number of extra bits after those bytes
 *
 * The bit length is blocks * 512 + ptr * 8 + n. Multiplying the 64-bit
 * block count by 512 is a left shift by 9, so the top 9 bits of "low"
 * must move into the bottom of "high". They are captured BEFORE "low" is
 * shifted: taking them from the already shifted value (as an earlier
 * version did) drops the carry and yields a wrong length once 2^23 or
 * more blocks (512 MiB) have been processed.
 */
static void
encode_count_small(unsigned char *dst,
	u32 low, u32 high, size_t ptr, unsigned n)
{
	u32 carry;

	carry = T32(low) >> 23;
	low = T32(low << 9);
	high = T32(high << 9) + carry;
	/* The low 9 bits of "low" are zero here and (ptr << 3) + n < 512. */
	low += (ptr << 3) + n;
	sph_enc32le(dst, low);
	sph_enc32le(dst + 4, high);
}

/*
 * Encode the message length, in bits, as a 64-bit little-endian value
 * (two 32-bit words written with sph_enc32le: low word at dst, high word
 * at dst + 4).
 *
 * dst    output, 8 bytes are written
 * low    low word of the count of full 128-byte blocks processed
 * high   high word of that count
 * ptr    number of bytes in the last, partial block
 * n      number of extra bits after those bytes
 *
 * The bit length is blocks * 1024 + ptr * 8 + n. Multiplying the 64-bit
 * block count by 1024 is a left shift by 10, so the top 10 bits of "low"
 * must move into the bottom of "high". They are captured BEFORE "low" is
 * shifted: taking them from the already shifted value (as an earlier
 * version did) drops the carry and yields a wrong length once 2^22 or
 * more blocks (512 MiB) have been processed.
 */
static void
encode_count_big(unsigned char *dst,
	u32 low, u32 high, size_t ptr, unsigned n)
{
	u32 carry;

	carry = T32(low) >> 22;
	low = T32(low << 10);
	high = T32(high << 10) + carry;
	/* The low 10 bits of "low" are zero here and (ptr << 3) + n < 1024. */
	low += (ptr << 3) + n;
	sph_enc32le(dst, low);
	sph_enc32le(dst + 4, high);
}

/*
 * Finish a SIMD-224 / SIMD-256 computation and write the digest.
 *
 * cc        pointer to a sph_simd_small_context
 * ub, n     n extra message bits, taken from the top bits of ub
 *           (ub is masked with 0xFF << (8 - n))
 * dst       output buffer
 * dst_len   number of 32-bit state words to output (4 bytes each)
 *
 * If any bytes are buffered or any extra bits are supplied, the partial
 * block is zero-padded and compressed as a normal block. A block that
 * holds only the encoded message length is then compressed with the
 * "last" flag set, and the first dst_len state words are written to dst
 * with sph_enc32le. The context is not re-initialized here.
 */
static void
finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
{
	sph_simd_small_context *ctx = cc;
	unsigned char *out = dst;
	size_t k;

	if (ctx->ptr > 0 || n > 0) {
		size_t fill = ctx->ptr;

		memset(ctx->buf + fill, 0, (sizeof ctx->buf) - fill);
		ctx->buf[fill] = ub & (0xFF << (8 - n));
		compress_small(ctx, 0);
	}

	/* Length block: all zero except for the 64-bit bit count. */
	memset(ctx->buf, 0, sizeof ctx->buf);
	encode_count_small(ctx->buf,
		ctx->count_low, ctx->count_high, ctx->ptr, n);
	compress_small(ctx, 1);

	for (k = 0; k < dst_len; k ++)
		sph_enc32le(out + 4 * k, ctx->state[k]);
}

/*
 * Finish a SIMD-384 / SIMD-512 computation and write the digest.
 *
 * cc        pointer to a sph_simd_big_context
 * ub, n     n extra message bits, taken from the top bits of ub
 *           (ub is masked with 0xFF << (8 - n))
 * dst       output buffer
 * dst_len   number of 32-bit state words to output (4 bytes each)
 *
 * If any bytes are buffered or any extra bits are supplied, the partial
 * block is zero-padded and compressed as a normal block. A block that
 * holds only the encoded message length is then compressed with the
 * "last" flag set, and the first dst_len state words are written to dst
 * with sph_enc32le. The context is not re-initialized here.
 */
static void
finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
{
	sph_simd_big_context *ctx = cc;
	unsigned char *out = dst;
	size_t k;

	if (ctx->ptr > 0 || n > 0) {
		size_t fill = ctx->ptr;

		memset(ctx->buf + fill, 0, (sizeof ctx->buf) - fill);
		ctx->buf[fill] = ub & (0xFF << (8 - n));
		compress_big(ctx, 0);
	}

	/* Length block: all zero except for the 64-bit bit count. */
	memset(ctx->buf, 0, sizeof ctx->buf);
	encode_count_big(ctx->buf,
		ctx->count_low, ctx->count_high, ctx->ptr, n);
	compress_big(ctx, 1);

	for (k = 0; k < dst_len; k ++)
		sph_enc32le(out + 4 * k, ctx->state[k]);
}

/*
 * Initialize a SIMD-224 context: load IV224 and reset the block counter
 * and buffer position. cc must point to a sph_simd_small_context.
 */
void
sph_simd224_init(void *cc)
{
	init_small(cc, IV224);
}

/*
 * Feed len bytes from data into a SIMD-224 context (see update_small()).
 */
void
sph_simd224(void *cc, const void *data, size_t len)
{
	update_small(cc, data, len);
}

/*
 * Finish a SIMD-224 computation with no extra bits; same as
 * sph_simd224_addbits_and_close(cc, 0, 0, dst).
 */
void
sph_simd224_close(void *cc, void *dst)
{
	sph_simd224_addbits_and_close(cc, 0, 0, dst);
}

/*
 * Finish a SIMD-224 computation after appending n extra bits (the top n
 * bits of ub), write 7 state words (28 bytes) to dst, then re-initialize
 * the context so that it can be used for a new computation.
 */
void
sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	finalize_small(cc, ub, n, dst, 7);
	sph_simd224_init(cc);
}

/*
 * Initialize a SIMD-256 context: load IV256 and reset the block counter
 * and buffer position. cc must point to a sph_simd_small_context.
 */
void
sph_simd256_init(void *cc)
{
	init_small(cc, IV256);
}

/*
 * Feed len bytes from data into a SIMD-256 context (see update_small()).
 */
void
sph_simd256(void *cc, const void *data, size_t len)
{
	update_small(cc, data, len);
}

/*
 * Finish a SIMD-256 computation with no extra bits; same as
 * sph_simd256_addbits_and_close(cc, 0, 0, dst).
 */
void
sph_simd256_close(void *cc, void *dst)
{
	sph_simd256_addbits_and_close(cc, 0, 0, dst);
}

/*
 * Finish a SIMD-256 computation after appending n extra bits (the top n
 * bits of ub), write 8 state words (32 bytes) to dst, then re-initialize
 * the context so that it can be used for a new computation.
 */
void
sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	finalize_small(cc, ub, n, dst, 8);
	sph_simd256_init(cc);
}

/*
 * Initialize a SIMD-384 context: load IV384 and reset the block counter
 * and buffer position. cc must point to a sph_simd_big_context.
 */
void
sph_simd384_init(void *cc)
{
	init_big(cc, IV384);
}

/*
 * Feed len bytes from data into a SIMD-384 context (see update_big()).
 */
void
sph_simd384(void *cc, const void *data, size_t len)
{
	update_big(cc, data, len);
}

/*
 * Finish a SIMD-384 computation with no extra bits; same as
 * sph_simd384_addbits_and_close(cc, 0, 0, dst).
 */
void
sph_simd384_close(void *cc, void *dst)
{
	sph_simd384_addbits_and_close(cc, 0, 0, dst);
}

/*
 * Finish a SIMD-384 computation after appending n extra bits (the top n
 * bits of ub), write 12 state words (48 bytes) to dst, then re-initialize
 * the context so that it can be used for a new computation.
 */
void
sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	finalize_big(cc, ub, n, dst, 12);
	sph_simd384_init(cc);
}

/*
 * Initialize a SIMD-512 context: load IV512 and reset the block counter
 * and buffer position. cc must point to a sph_simd_big_context.
 */
void
sph_simd512_init(void *cc)
{
	init_big(cc, IV512);
}

/*
 * Feed len bytes from data into a SIMD-512 context (see update_big()).
 */
void
sph_simd512(void *cc, const void *data, size_t len)
{
	update_big(cc, data, len);
}

/*
 * Finish a SIMD-512 computation with no extra bits; same as
 * sph_simd512_addbits_and_close(cc, 0, 0, dst).
 */
void
sph_simd512_close(void *cc, void *dst)
{
	sph_simd512_addbits_and_close(cc, 0, 0, dst);
}

/*
 * Finish a SIMD-512 computation after appending n extra bits (the top n
 * bits of ub), write 16 state words (64 bytes) to dst, then re-initialize
 * the context so that it can be used for a new computation.
 */
void
sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	finalize_big(cc, ub, n, dst, 16);
	sph_simd512_init(cc);
}
#ifdef __cplusplus
}
#endif