diff --git a/aes.cpp b/aes.cpp index 48dc8fc2..ee7d455b 100644 --- a/aes.cpp +++ b/aes.cpp @@ -26,7 +26,7 @@ namespace crypto "pslldq $4, %%xmm4 \n" \ "pxor %%xmm4, %%xmm1 \n" \ "pxor %%xmm2, %%xmm1 \n" \ - "movups %%xmm1, (%%rcx) \n" \ + "movaps %%xmm1, (%%rcx) \n" \ "aeskeygenassist $0, %%xmm1, %%xmm4 \n" \ "pshufd $0xaa, %%xmm4, %%xmm2 \n" \ "movaps %%xmm3, %%xmm4 \n" \ @@ -37,7 +37,7 @@ namespace crypto "pslldq $4, %%xmm4 \n" \ "pxor %%xmm4, %%xmm3 \n" \ "pxor %%xmm2, %%xmm3 \n" \ - "movups %%xmm3, 16(%%rcx) \n" \ + "movaps %%xmm3, 16(%%rcx) \n" \ "add $32, %%rcx \n" @@ -45,11 +45,11 @@ namespace crypto { __asm__ ( - "movups (%%rsi), %%xmm1 \n" - "movups 16(%%rsi), %%xmm3 \n" - "movups %%xmm1, (%%rdi) \n" - "movups %%xmm3, 16(%%rdi) \n" - "lea 32(%%rdi), %%rcx \n" + "movups (%[key]), %%xmm1 \n" + "movups 16(%[key]), %%xmm3 \n" + "movaps %%xmm1, (%[shed]) \n" + "movaps %%xmm3, 16(%[shed]) \n" + "lea 32(%[shed]), %%rcx \n" "aeskeygenassist $1, %%xmm3, %%xmm2 \n" KeyExpansion256 "aeskeygenassist $2, %%xmm3, %%xmm2 \n" @@ -75,8 +75,8 @@ namespace crypto "pxor %%xmm2, %%xmm1 \n" "movups %%xmm1, (%%rcx) \n" : // output - : "S" (key), "D" (m_KeySchedule) // input - : "%rcx" // clogged + : [key]"r"(key), [shed]"r"(m_KeySchedule) // input + : "%rcx", "%xmm1", "%xmm2", "%xmm3", "%xmm4" // clogged ); } @@ -84,24 +84,24 @@ namespace crypto { __asm__ ( - "movups (%%rsi), %%xmm0 \n" - "pxor (%%rdx), %%xmm0 \n" - "aesenc 16(%%rdx), %%xmm0 \n" - "aesenc 32(%%rdx), %%xmm0 \n" - "aesenc 48(%%rdx), %%xmm0 \n" - "aesenc 64(%%rdx), %%xmm0 \n" - "aesenc 80(%%rdx), %%xmm0 \n" - "aesenc 96(%%rdx), %%xmm0 \n" - "aesenc 112(%%rdx), %%xmm0 \n" - "aesenc 128(%%rdx), %%xmm0 \n" - "aesenc 144(%%rdx), %%xmm0 \n" - "aesenc 160(%%rdx), %%xmm0 \n" - "aesenc 176(%%rdx), %%xmm0 \n" - "aesenc 192(%%rdx), %%xmm0 \n" - "aesenc 208(%%rdx), %%xmm0 \n" - "aesenclast 224(%%rdx), %%xmm0 \n" - "movups %%xmm0, (%%rdi) \n" - : : "d" (m_KeySchedule), "S" (in), "D" (out) + "movups (%[in]), %%xmm0 \n" + "pxor (%[shed]), %%xmm0 \n" + "aesenc 16(%[shed]), %%xmm0 \n" + "aesenc 32(%[shed]), %%xmm0 \n" + "aesenc 48(%[shed]), %%xmm0 \n" + "aesenc 64(%[shed]), %%xmm0 \n" + "aesenc 80(%[shed]), %%xmm0 \n" + "aesenc 96(%[shed]), %%xmm0 \n" + "aesenc 112(%[shed]), %%xmm0 \n" + "aesenc 128(%[shed]), %%xmm0 \n" + "aesenc 144(%[shed]), %%xmm0 \n" + "aesenc 160(%[shed]), %%xmm0 \n" + "aesenc 176(%[shed]), %%xmm0 \n" + "aesenc 192(%[shed]), %%xmm0 \n" + "aesenc 208(%[shed]), %%xmm0 \n" + "aesenclast 224(%[shed]), %%xmm0 \n" + "movups %%xmm0, (%[out]) \n" + : : [shed]"r"(m_KeySchedule), [in]"r"(in), [out]"r"(out) : "%xmm0" ); } @@ -109,31 +109,31 @@ namespace crypto { __asm__ ( - "movups (%%rsi), %%xmm0 \n" - "pxor 224(%%rdx), %%xmm0 \n" - "aesdec 208(%%rdx), %%xmm0 \n" - "aesdec 192(%%rdx), %%xmm0 \n" - "aesdec 176(%%rdx), %%xmm0 \n" - "aesdec 160(%%rdx), %%xmm0 \n" - "aesdec 144(%%rdx), %%xmm0 \n" - "aesdec 128(%%rdx), %%xmm0 \n" - "aesdec 112(%%rdx), %%xmm0 \n" - "aesdec 96(%%rdx), %%xmm0 \n" - "aesdec 80(%%rdx), %%xmm0 \n" - "aesdec 64(%%rdx), %%xmm0 \n" - "aesdec 48(%%rdx), %%xmm0 \n" - "aesdec 32(%%rdx), %%xmm0 \n" - "aesdec 16(%%rdx), %%xmm0 \n" - "aesdeclast (%%rdx), %%xmm0 \n" - "movups %%xmm0, (%%rdi) \n" - : : "d" (m_KeySchedule), "S" (in), "D" (out) + "movups (%[in]), %%xmm0 \n" + "pxor 224(%[shed]), %%xmm0 \n" + "aesdec 208(%[shed]), %%xmm0 \n" + "aesdec 192(%[shed]), %%xmm0 \n" + "aesdec 176(%[shed]), %%xmm0 \n" + "aesdec 160(%[shed]), %%xmm0 \n" + "aesdec 144(%[shed]), %%xmm0 \n" + "aesdec 128(%[shed]), %%xmm0 \n" + "aesdec 112(%[shed]), %%xmm0 \n" + "aesdec 96(%[shed]), %%xmm0 \n" + "aesdec 80(%[shed]), %%xmm0 \n" + "aesdec 64(%[shed]), %%xmm0 \n" + "aesdec 48(%[shed]), %%xmm0 \n" + "aesdec 32(%[shed]), %%xmm0 \n" + "aesdec 16(%[shed]), %%xmm0 \n" + "aesdeclast (%[shed]), %%xmm0 \n" + "movups %%xmm0, (%[out]) \n" + : : [shed]"r"(m_KeySchedule), [in]"r"(in), [out]"r"(out) : "%xmm0" ); } #define CallAESIMC(offset) \ - "movups "#offset"(%%rdx), %%xmm0 \n" \ + "movaps "#offset"(%[shed]), %%xmm0 \n" \ "aesimc %%xmm0, %%xmm0 \n" \ - "movups %%xmm0, "#offset"(%%rdx) \n" + "movaps %%xmm0, "#offset"(%[shed]) \n" void ECBDecryptionAESNI::SetKey (const uint8_t * key) { @@ -154,7 +154,7 @@ namespace crypto CallAESIMC(176) CallAESIMC(192) CallAESIMC(208) - : : "d" (m_KeySchedule) + : : [shed]"r"(m_KeySchedule) : "%xmm0" ); }