You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
184 lines
4.3 KiB
184 lines
4.3 KiB
/* AesOpt.c -- Intel's AES |
|
2013-11-12 : Igor Pavlov : Public domain */ |
|
|
|
#include "Precomp.h" |
|
|
|
#include "CpuArch.h" |
|
|
|
#ifdef MY_CPU_X86_OR_AMD64 |
|
#if _MSC_VER >= 1500 |
|
#define USE_INTEL_AES |
|
#endif |
|
#endif |
|
|
|
#ifdef USE_INTEL_AES |
|
|
|
#include <wmmintrin.h> |
|
|
|
void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks) |
|
{ |
|
__m128i m = *p; |
|
for (; numBlocks != 0; numBlocks--, data++) |
|
{ |
|
UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; |
|
const __m128i *w = p + 3; |
|
m = _mm_xor_si128(m, *data); |
|
m = _mm_xor_si128(m, p[2]); |
|
do |
|
{ |
|
m = _mm_aesenc_si128(m, w[0]); |
|
m = _mm_aesenc_si128(m, w[1]); |
|
w += 2; |
|
} |
|
while (--numRounds2 != 0); |
|
m = _mm_aesenc_si128(m, w[0]); |
|
m = _mm_aesenclast_si128(m, w[1]); |
|
*data = m; |
|
} |
|
*p = m; |
|
} |
|
|
|
#define NUM_WAYS 3 |
|
|
|
#define AES_OP_W(op, n) { \ |
|
const __m128i t = w[n]; \ |
|
m0 = op(m0, t); \ |
|
m1 = op(m1, t); \ |
|
m2 = op(m2, t); \ |
|
} |
|
|
|
#define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n) |
|
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n) |
|
#define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n) |
|
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n) |
|
|
|
void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks) |
|
{ |
|
__m128i iv = *p; |
|
for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS) |
|
{ |
|
UInt32 numRounds2 = *(const UInt32 *)(p + 1); |
|
const __m128i *w = p + numRounds2 * 2; |
|
__m128i m0, m1, m2; |
|
{ |
|
const __m128i t = w[2]; |
|
m0 = _mm_xor_si128(t, data[0]); |
|
m1 = _mm_xor_si128(t, data[1]); |
|
m2 = _mm_xor_si128(t, data[2]); |
|
} |
|
numRounds2--; |
|
do |
|
{ |
|
AES_DEC(1) |
|
AES_DEC(0) |
|
w -= 2; |
|
} |
|
while (--numRounds2 != 0); |
|
AES_DEC(1) |
|
AES_DEC_LAST(0) |
|
|
|
{ |
|
__m128i t; |
|
t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t; |
|
t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t; |
|
t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t; |
|
} |
|
} |
|
for (; numBlocks != 0; numBlocks--, data++) |
|
{ |
|
UInt32 numRounds2 = *(const UInt32 *)(p + 1); |
|
const __m128i *w = p + numRounds2 * 2; |
|
__m128i m = _mm_xor_si128(w[2], *data); |
|
numRounds2--; |
|
do |
|
{ |
|
m = _mm_aesdec_si128(m, w[1]); |
|
m = _mm_aesdec_si128(m, w[0]); |
|
w -= 2; |
|
} |
|
while (--numRounds2 != 0); |
|
m = _mm_aesdec_si128(m, w[1]); |
|
m = _mm_aesdeclast_si128(m, w[0]); |
|
|
|
m = _mm_xor_si128(m, iv); |
|
iv = *data; |
|
*data = m; |
|
} |
|
*p = iv; |
|
} |
|
|
|
void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks) |
|
{ |
|
__m128i ctr = *p; |
|
__m128i one; |
|
one.m128i_u64[0] = 1; |
|
one.m128i_u64[1] = 0; |
|
for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS) |
|
{ |
|
UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; |
|
const __m128i *w = p; |
|
__m128i m0, m1, m2; |
|
{ |
|
const __m128i t = w[2]; |
|
ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t); |
|
ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t); |
|
ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t); |
|
} |
|
w += 3; |
|
do |
|
{ |
|
AES_ENC(0) |
|
AES_ENC(1) |
|
w += 2; |
|
} |
|
while (--numRounds2 != 0); |
|
AES_ENC(0) |
|
AES_ENC_LAST(1) |
|
data[0] = _mm_xor_si128(data[0], m0); |
|
data[1] = _mm_xor_si128(data[1], m1); |
|
data[2] = _mm_xor_si128(data[2], m2); |
|
} |
|
for (; numBlocks != 0; numBlocks--, data++) |
|
{ |
|
UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; |
|
const __m128i *w = p; |
|
__m128i m; |
|
ctr = _mm_add_epi64(ctr, one); |
|
m = _mm_xor_si128(ctr, p[2]); |
|
w += 3; |
|
do |
|
{ |
|
m = _mm_aesenc_si128(m, w[0]); |
|
m = _mm_aesenc_si128(m, w[1]); |
|
w += 2; |
|
} |
|
while (--numRounds2 != 0); |
|
m = _mm_aesenc_si128(m, w[0]); |
|
m = _mm_aesenclast_si128(m, w[1]); |
|
*data = _mm_xor_si128(*data, m); |
|
} |
|
*p = ctr; |
|
} |
|
|
|
#else |
|
|
|
void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks); |
|
void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks); |
|
void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks); |
|
|
|
void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks) |
|
{ |
|
AesCbc_Encode(p, data, numBlocks); |
|
} |
|
|
|
void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks) |
|
{ |
|
AesCbc_Decode(p, data, numBlocks); |
|
} |
|
|
|
void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks) |
|
{ |
|
AesCtr_Code(p, data, numBlocks); |
|
} |
|
|
|
#endif
|
|
|