Browse Source
0.13
830ee48 Update Bitcoin for libsecp256k1 API change (Pieter Wuille)
ecae2ac Squashed 'src/secp256k1/' changes from b0210a9..bccaf86 (Pieter Wuille)
Wladimir J. van der Laan
10 years ago
31 changed files with 1202 additions and 849 deletions
@ -1,57 +0,0 @@
@@ -1,57 +0,0 @@
|
||||
#! /bin/sh
# Wrapper that turns a compiler-style command line into one suitable for NASM.
#
# Every argument is inspected in order:
#   * any PIC switch is collapsed into a single -DPIC define,
#   * known NASM output-format switches (-felf64, ...) are passed through,
#   * every other -f* switch is dropped,
#   * include directories are rewritten as -I<dir>/ (with a trailing slash),
#   * everything else is appended unchanged.
# The resulting command is echoed and then exec'ed.

command=""   # command line being assembled
infile=""    # last *.asm argument seen
o_opt=no     # "yes" once an explicit -o option has been seen
pic=no       # "yes" once -DPIC has been emitted

while [ $# -gt 0 ]; do
    case "$1" in
        -DPIC|-fPIC|-fpic|-Kpic|-KPIC)
            # Emit -DPIC only once, however many PIC spellings are given.
            if [ "$pic" != "yes" ] ; then
                command="$command -DPIC"
                pic=yes
            fi
            ;;
        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
        -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
            # it's a file format specifier for nasm.
            command="$command $1"
            ;;
        -f*)
            # maybe a code-generation flag for gcc; deliberately dropped.
            ;;
        -[Ii]*)
            incdir=`echo "$1" | sed 's/^-[Ii]//'`
            # Support the detached form "-I dir": take the next argument
            # unless it looks like another option.
            if [ "x$incdir" = x ] && [ "x$2" != x ] ; then
                case "$2" in
                    -*) ;;
                    *) incdir="$2"; shift;;
                esac
            fi
            if [ "x$incdir" != x ] ; then
                # In the case of NASM, the trailing slash is necessary.
                incdir=`echo "$incdir" | sed 's%/*$%/%'`
                command="$command -I$incdir"
            fi
            ;;
        -o*)
            o_opt=yes
            command="$command $1"
            ;;
        *.asm)
            infile=$1
            command="$command $1"
            ;;
        *)
            command="$command $1"
            ;;
    esac
    shift
done

# Without an explicit -o, name the object after the input file: directory
# part and extension stripped, ".o" appended.  Skipped when no *.asm input
# was given, so that no meaningless "-o .o" is generated.
if [ "$o_opt" != yes ] && [ "x$infile" != x ] ; then
    # "$infile" is quoted so the name is not word-split or glob-expanded
    # before sed sees it.
    base=`echo "$infile" | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`
    outfile="-o ${base}.o"
    command="$command $outfile"
fi

# $command is intentionally unquoted: it is a flat string that must be split
# back into separate arguments.
echo $command
exec $command
@ -0,0 +1,37 @@
@@ -0,0 +1,37 @@
|
||||
/**********************************************************************
|
||||
* Copyright (c) 2014 Pieter Wuille * |
||||
* Distributed under the MIT software license, see the accompanying * |
||||
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
|
||||
**********************************************************************/ |
||||
|
||||
#ifndef _SECP256K1_BENCH_H_ |
||||
#define _SECP256K1_BENCH_H_ |
||||
|
||||
#include <stdio.h> |
||||
#include <math.h> |
||||
#include "sys/time.h" |
||||
|
||||
/* Return the current time of day, as reported by gettimeofday(), in seconds
 * (whole seconds plus the microsecond part as a fraction). */
static double gettimedouble(void) {
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec + now.tv_usec * 0.000001;
}
||||
|
||||
/**
 * Run a benchmark `count` times and print the minimum, average and maximum
 * time per iteration in microseconds.
 *
 * @param benchmark  function being timed; called once per run with `data`
 * @param setup      optional (may be NULL); called with `data` before each
 *                   run, outside the timed region
 * @param teardown   optional (may be NULL); called with `data` after each
 *                   run, outside the timed region
 * @param data       opaque pointer handed to all three callbacks
 * @param count      number of timed runs; must be positive
 * @param iter       divisor applied to each run's time to obtain the
 *                   per-iteration figure; must be positive
 *
 * A non-positive `count` or `iter` would only produce inf/nan figures, so it
 * is reported on stderr and nothing is run.
 */
void run_benchmark(void (*benchmark)(void*), void (*setup)(void*), void (*teardown)(void*), void* data, int count, int iter) {
    double min = HUGE_VAL;
    double sum = 0.0;
    double max = 0.0;
    int i;

    if (count <= 0 || iter <= 0) {
        fprintf(stderr, "run_benchmark: count and iter must be positive\n");
        return;
    }

    for (i = 0; i < count; i++) {
        double begin;
        double total;

        if (setup) {
            setup(data);
        }
        begin = gettimedouble();
        benchmark(data);
        total = gettimedouble() - begin;
        if (teardown) {
            teardown(data);
        }

        if (total < min) {
            min = total;
        }
        if (total > max) {
            max = total;
        }
        sum += total;
    }

    printf("min %.3fus / avg %.3fus / max %.3fus\n", min * 1000000.0 / iter, (sum / count) * 1000000.0 / iter, max * 1000000.0 / iter);
}
||||
|
||||
#endif |
@ -0,0 +1,46 @@
@@ -0,0 +1,46 @@
|
||||
/**********************************************************************
|
||||
* Copyright (c) 2014 Pieter Wuille * |
||||
* Distributed under the MIT software license, see the accompanying * |
||||
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
|
||||
**********************************************************************/ |
||||
|
||||
#include "include/secp256k1.h" |
||||
#include "util.h" |
||||
#include "bench.h" |
||||
|
||||
/* State shared by the recovery benchmark callbacks: the two buffers that
 * bench_recover() passes to secp256k1_ecdsa_recover_compact(). */
typedef struct { |
||||
/* 32-byte buffer passed as the first (message) argument. */
unsigned char msg[32]; |
||||
/* 64-byte buffer passed as the second (signature) argument; bench_recover()
 * refers to bytes 0..31 as R and bytes 32..63 as S. */
unsigned char sig[64]; |
||||
} bench_recover_t; |
||||
|
||||
void bench_recover(void* arg) { |
||||
bench_recover_t *data = (bench_recover_t*)arg; |
||||
|
||||
unsigned char pubkey[33]; |
||||
for (int i=0; i<20000; i++) { |
||||
int pubkeylen = 33; |
||||
CHECK(secp256k1_ecdsa_recover_compact(data->msg, data->sig, pubkey, &pubkeylen, 1, i % 2)); |
||||
for (int j = 0; j < 32; j++) { |
||||
data->sig[j + 32] = data->msg[j]; /* Move former message to S. */ |
||||
data->msg[j] = data->sig[j]; /* Move former R to message. */ |
||||
data->sig[j] = pubkey[j + 1]; /* Move recovered pubkey X coordinate to R (which must be a valid X coordinate). */ |
||||
} |
||||
} |
||||
} |
||||
|
||||
void bench_recover_setup(void* arg) { |
||||
bench_recover_t *data = (bench_recover_t*)arg; |
||||
|
||||
for (int i = 0; i < 32; i++) data->msg[i] = 1 + i; |
||||
for (int i = 0; i < 64; i++) data->sig[i] = 65 + i; |
||||
} |
||||
|
||||
int main(void) { |
||||
secp256k1_start(SECP256K1_START_VERIFY); |
||||
|
||||
bench_recover_t data; |
||||
run_benchmark(bench_recover, bench_recover_setup, NULL, &data, 10, 20000); |
||||
|
||||
secp256k1_stop(); |
||||
return 0; |
||||
} |
@ -1,469 +0,0 @@
@@ -1,469 +0,0 @@
|
||||
;; Added by Diederik Huys, March 2013 |
||||
;; |
||||
;; Provided public procedures: |
||||
;; secp256k1_fe_mul_inner |
||||
;; secp256k1_fe_sqr_inner |
||||
;; |
||||
;; Needed tools: YASM (http://yasm.tortall.net) |
||||
;; |
||||
;; |
||||
|
||||
BITS 64 |
||||
|
||||
%ifidn __OUTPUT_FORMAT__,macho64 |
||||
%define SYM(x) _ %+ x |
||||
%else |
||||
%define SYM(x) x |
||||
%endif |
||||
|
||||
;; Procedure ExSetMult |
||||
;; Register Layout: |
||||
;; INPUT: rdi = a->n |
||||
;; rsi = b->n |
||||
;; rdx = r->a |
||||
;; |
||||
;; INTERNAL: rdx:rax = multiplication accumulator |
||||
;; r9:r8 = c |
||||
;; r10-r13 = t0-t3 |
||||
;; r14 = b.n[0] / t4 |
||||
;; r15 = b.n[1] / t5 |
||||
;; rbx = b.n[2] / t6 |
||||
;; rcx = b.n[3] / t7 |
||||
;; rbp = Constant 0FFFFFFFFFFFFFh / t8 |
||||
;; rsi = b.n / b.n[4] / t9 |
||||
|
||||
;; secp256k1_fe_mul_inner
;; Multiplies the five qwords at [rdi] by the five qwords at [rsi]. Each of the
;; ten product columns is masked with 0FFFFFFFFFFFFFh (52 bits) into t0..t9,
;; then control falls through to common_exit_norm, which folds t5..t9 back
;; onto t0..t4 using the constant 01000003D10h and stores five qwords through
;; the pointer that was passed in rdx (pushed last, popped into rbx).
;; Pushes and restores rbp, rbx and r12-r15.
GLOBAL SYM(secp256k1_fe_mul_inner) |
||||
ALIGN 32 |
||||
SYM(secp256k1_fe_mul_inner): |
||||
push rbp |
||||
push rbx |
||||
push r12 |
||||
push r13 |
||||
push r14 |
||||
push r15 |
||||
push rdx |
||||
mov r14,[rsi+8*0] ; preload b.n[0]. This will be the case until |
||||
; b.n[0] is no longer needed, then we reassign |
||||
; r14 to t4 |
||||
;; c=a.n[0] * b.n[0] |
||||
mov rax,[rdi+0*8] ; load a.n[0] |
||||
mov rbp,0FFFFFFFFFFFFFh |
||||
mul r14 ; rdx:rax=a.n[0]*b.n[0] |
||||
mov r15,[rsi+1*8] |
||||
mov r10,rbp ; load the 52-bit limb mask into the target register for t0 |
||||
mov r8,rax |
||||
and r10,rax ; only need lower qword of c |
||||
shrd r8,rdx,52 |
||||
xor r9,r9 ; c < 2^64, so we ditch the HO part |
||||
|
||||
;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0] |
||||
mov rax,[rdi+0*8] |
||||
mul r15 |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+1*8] |
||||
mul r14 |
||||
mov r11,rbp |
||||
mov rbx,[rsi+2*8] |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r11,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=a.n[0 1 2] * b.n[2 1 0] |
||||
mov rax,[rdi+0*8] |
||||
mul rbx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+1*8] |
||||
mul r15 |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+2*8] |
||||
mul r14 |
||||
mov r12,rbp |
||||
mov rcx,[rsi+3*8] |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r12,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=a.n[0 1 2 3] * b.n[3 2 1 0] |
||||
mov rax,[rdi+0*8] |
||||
mul rcx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+1*8] |
||||
mul rbx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+2*8] |
||||
mul r15 |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+3*8] |
||||
mul r14 |
||||
mov r13,rbp |
||||
mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r13,r8 |
||||
|
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
|
||||
;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0] |
||||
mov rax,[rdi+0*8] |
||||
mul rsi |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+1*8] |
||||
mul rcx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+2*8] |
||||
mul rbx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+3*8] |
||||
mul r15 |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+4*8] |
||||
mul r14 |
||||
mov r14,rbp ; load the limb mask into t4 and destroy b.n[0] |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r14,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=a.n[1 2 3 4] * b.n[4 3 2 1] |
||||
mov rax,[rdi+1*8] |
||||
mul rsi |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+2*8] |
||||
mul rcx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+3*8] |
||||
mul rbx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+4*8] |
||||
mul r15 |
||||
mov r15,rbp |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
and r15,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=a.n[2 3 4] * b.n[4 3 2] |
||||
mov rax,[rdi+2*8] |
||||
mul rsi |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+3*8] |
||||
mul rcx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+4*8] |
||||
mul rbx |
||||
mov rbx,rbp |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
and rbx,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=a.n[3 4] * b.n[4 3] |
||||
mov rax,[rdi+3*8] |
||||
mul rsi |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,[rdi+4*8] |
||||
mul rcx |
||||
mov rcx,rbp |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and rcx,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=a.n[4] * b.n[4] |
||||
mov rax,[rdi+4*8] |
||||
mul rsi |
||||
;; mov rbp,rbp ; modulus already there! |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and rbp,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
mov rsi,r8 ; load c into t9 and destroy b.n[4] |
||||
|
||||
;; ******************************************************* |
||||
;; Shared tail, also entered by a jmp from secp256k1_fe_sqr_inner.
;; Expects t0-t3 in r10-r13, t4 in r14, t5 in r15, t6 in rbx, t7 in rcx,
;; t8 in rbp, t9 in rsi, and the result pointer on top of the stack.
common_exit_norm: |
||||
mov rdi,01000003D10h ; load constant |
||||
|
||||
mov rax,r15 ; get t5 |
||||
mul rdi |
||||
add rax,r10 ; +t0 |
||||
adc rdx,0 |
||||
mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers! |
||||
mov r8,rax ; +c |
||||
and r10,rax |
||||
shrd r8,rdx,52 |
||||
xor r9,r9 |
||||
|
||||
mov rax,rbx ; get t6 |
||||
mul rdi |
||||
add rax,r11 ; +t1 |
||||
adc rdx,0 |
||||
mov r11,0FFFFFFFFFFFFFh ; modulus |
||||
add r8,rax ; +c |
||||
adc r9,rdx |
||||
and r11,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
mov rax,rcx ; get t7 |
||||
mul rdi |
||||
add rax,r12 ; +t2 |
||||
adc rdx,0 |
||||
pop rbx ; retrieve pointer to this.n |
||||
mov r12,0FFFFFFFFFFFFFh ; modulus |
||||
add r8,rax ; +c |
||||
adc r9,rdx |
||||
and r12,r8 |
||||
mov [rbx+2*8],r12 ; mov into this.n[2] |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
mov rax,rbp ; get t8 |
||||
mul rdi |
||||
add rax,r13 ; +t3 |
||||
adc rdx,0 |
||||
mov r13,0FFFFFFFFFFFFFh ; modulus |
||||
add r8,rax ; +c |
||||
adc r9,rdx |
||||
and r13,r8 |
||||
mov [rbx+3*8],r13 ; -> this.n[3] |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
mov rax,rsi ; get t9 |
||||
mul rdi |
||||
add rax,r14 ; +t4 |
||||
adc rdx,0 |
||||
mov r14,0FFFFFFFFFFFFh ; note: 48-bit mask here, the top limb keeps only 48 bits |
||||
add r8,rax ; +c |
||||
adc r9,rdx |
||||
and r14,r8 |
||||
mov [rbx+4*8],r14 ; -> this.n[4] |
||||
shrd r8,r9,48 ; note: shift by 48 (not 52) to match the 48-bit mask above |
||||
xor r9,r9 |
||||
|
||||
mov rax,01000003D1h |
||||
mul r8 |
||||
add rax,r10 |
||||
adc rdx,0 |
||||
mov r10,0FFFFFFFFFFFFFh ; modulus |
||||
mov r8,rax |
||||
and rax,r10 |
||||
shrd r8,rdx,52 |
||||
mov [rbx+0*8],rax ; -> this.n[0] |
||||
add r8,r11 |
||||
mov [rbx+1*8],r8 ; -> this.n[1] |
||||
|
||||
pop r15 |
||||
pop r14 |
||||
pop r13 |
||||
pop r12 |
||||
pop rbx |
||||
pop rbp |
||||
ret |
||||
|
||||
|
||||
;; PROC ExSetSquare |
||||
;; Register Layout: |
||||
;; INPUT: rdi = a.n |
||||
;; rsi = this.a |
||||
;; INTERNAL: rdx:rax = multiplication accumulator |
||||
;; r9:r8 = c |
||||
;; r10-r13 = t0-t3 |
||||
;; r14 = a.n[0] / t4 |
||||
;; r15 = a.n[1] / t5 |
||||
;; rbx = a.n[2] / t6 |
||||
;; rcx = a.n[3] / t7 |
||||
;; rbp = 0FFFFFFFFFFFFFh / t8 |
||||
;; rsi = a.n[4] / t9 |
||||
;; The result pointer (rsi) is pushed last so that common_exit_norm can pop
;; it; the routine ends by jumping into common_exit_norm, which also restores
;; the saved registers and returns.
GLOBAL SYM(secp256k1_fe_sqr_inner) |
||||
ALIGN 32 |
||||
SYM(secp256k1_fe_sqr_inner): |
||||
push rbp |
||||
push rbx |
||||
push r12 |
||||
push r13 |
||||
push r14 |
||||
push r15 |
||||
push rsi |
||||
mov rbp,0FFFFFFFFFFFFFh |
||||
|
||||
;; c=a.n[0] * a.n[0] |
||||
mov r14,[rdi+0*8] ; r14=a.n[0] |
||||
mov r10,rbp ; modulus |
||||
mov rax,r14 |
||||
mul rax |
||||
mov r15,[rdi+1*8] ; a.n[1] |
||||
add r14,r14 ; r14=2*a.n[0] |
||||
mov r8,rax |
||||
and r10,rax ; only need lower qword |
||||
shrd r8,rdx,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=2*a.n[0] * a.n[1] |
||||
mov rax,r14 ; r14=2*a.n[0] |
||||
mul r15 |
||||
mov rbx,[rdi+2*8] ; rbx=a.n[2] |
||||
mov r11,rbp ; modulus |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r11,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1] |
||||
mov rax,r14 |
||||
mul rbx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,r15 |
||||
mov r12,rbp ; modulus |
||||
mul rax |
||||
mov rcx,[rdi+3*8] ; rcx=a.n[3] |
||||
add r15,r15 ; r15=a.n[1]*2 |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r12,r8 ; only need lower qword |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2] |
||||
mov rax,r14 |
||||
mul rcx |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,r15 ; rax=2*a.n[1] |
||||
mov r13,rbp ; modulus |
||||
mul rbx |
||||
mov rsi,[rdi+4*8] ; rsi=a.n[4] |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r13,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2] |
||||
mov rax,r14 ; last time we need 2*a.n[0] |
||||
mul rsi |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,r15 |
||||
mul rcx |
||||
mov r14,rbp ; modulus |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,rbx |
||||
mul rax |
||||
add rbx,rbx ; rbx=2*a.n[2] |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r14,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3] |
||||
mov rax,r15 ; last time we need 2*a.n[1] |
||||
mul rsi |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,rbx |
||||
mul rcx |
||||
mov r15,rbp ; modulus |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and r15,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3] |
||||
mov rax,rbx ; last time we need 2*a.n[2] |
||||
mul rsi |
||||
add r8,rax |
||||
adc r9,rdx |
||||
|
||||
mov rax,rcx ; a.n[3] |
||||
mul rax |
||||
mov rbx,rbp ; modulus |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and rbx,r8 ; only need lower qword |
||||
lea rax,[2*rcx] |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=2*a.n[3]*a.n[4] |
||||
mul rsi |
||||
mov rcx,rbp ; modulus |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and rcx,r8 ; only need lower qword |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
;; c+=a.n[4]*a.n[4] |
||||
mov rax,rsi |
||||
mul rax |
||||
;; mov rbp,rbp ; modulus is already there! |
||||
add r8,rax |
||||
adc r9,rdx |
||||
and rbp,r8 |
||||
shrd r8,r9,52 |
||||
xor r9,r9 |
||||
|
||||
mov rsi,r8 |
||||
|
||||
;; ******************************************************* |
||||
jmp common_exit_norm |
||||
end
||||
|
||||
|
@ -1,13 +1,502 @@
@@ -1,13 +1,502 @@
|
||||
/**********************************************************************
|
||||
* Copyright (c) 2013 Pieter Wuille * |
||||
* Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille * |
||||
* Distributed under the MIT software license, see the accompanying * |
||||
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
|
||||
**********************************************************************/ |
||||
|
||||
/**
|
||||
* Changelog: |
||||
* - March 2013, Diederik Huys: original version |
||||
* - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm |
||||
* - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly |
||||
*/ |
||||
|
||||
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_ |
||||
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_ |
||||
|
||||
void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r); |
||||
void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r); |
||||
/**
 * Multiply the five-limb values at a and b and store the five-limb result at r.
 *
 * Intermediate limbs are masked with M = 0xfffffffffffff (52 bits); the upper
 * partial products (accumulator d) are folded into the lower ones
 * (accumulator c) by multiplying with R = 0x1000003d10.
 * NOTE(review): presumably this is multiplication modulo the secp256k1 field
 * prime - confirm against the portable C implementation.
 *
 * All limbs of a are loaded into registers before the first store to r.
 * b, however, is still read after r[0] has been written, so r must not
 * overlap b (b is declared SECP256K1_RESTRICT).
 *
 * tmp1, tmp2 and tmp3 are memory spill slots for t3, t4 and tx.
 */
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { |
||||
/**
|
||||
* Registers: rdx:rax = multiplication accumulator |
||||
* r9:r8 = c |
||||
* r15:rcx = d |
||||
* r10-r14 = a0-a4 |
||||
* rbx = b |
||||
* rdi = r |
||||
* rsi = a / t? |
||||
*/ |
||||
uint64_t tmp1, tmp2, tmp3; |
||||
__asm__ __volatile__( |
||||
"movq 0(%%rsi),%%r10\n" |
||||
"movq 8(%%rsi),%%r11\n" |
||||
"movq 16(%%rsi),%%r12\n" |
||||
"movq 24(%%rsi),%%r13\n" |
||||
"movq 32(%%rsi),%%r14\n" |
||||
|
||||
/* d = a3 * b0 (initialises d: the product is moved, not added) */ |
||||
"movq 0(%%rbx),%%rax\n" |
||||
"mulq %%r13\n" |
||||
"movq %%rax,%%rcx\n" |
||||
"movq %%rdx,%%r15\n" |
||||
/* d += a2 * b1 */ |
||||
"movq 8(%%rbx),%%rax\n" |
||||
"mulq %%r12\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a1 * b2 */ |
||||
"movq 16(%%rbx),%%rax\n" |
||||
"mulq %%r11\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a0 * b3 */ |
||||
"movq 24(%%rbx),%%rax\n" |
||||
"mulq %%r10\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* c = a4 * b4 */ |
||||
"movq 32(%%rbx),%%rax\n" |
||||
"mulq %%r14\n" |
||||
"movq %%rax,%%r8\n" |
||||
"movq %%rdx,%%r9\n" |
||||
/* d += (c & M) * R */ |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* c >>= 52 (%%r8 only) */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
/* t3 (tmp1) = d & M */ |
||||
"movq %%rcx,%%rsi\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rsi\n" |
||||
"movq %%rsi,%q1\n" |
||||
/* d >>= 52 */ |
||||
"shrdq $52,%%r15,%%rcx\n" |
||||
"xorq %%r15,%%r15\n" |
||||
/* d += a4 * b0 */ |
||||
"movq 0(%%rbx),%%rax\n" |
||||
"mulq %%r14\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a3 * b1 */ |
||||
"movq 8(%%rbx),%%rax\n" |
||||
"mulq %%r13\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a2 * b2 */ |
||||
"movq 16(%%rbx),%%rax\n" |
||||
"mulq %%r12\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a1 * b3 */ |
||||
"movq 24(%%rbx),%%rax\n" |
||||
"mulq %%r11\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a0 * b4 */ |
||||
"movq 32(%%rbx),%%rax\n" |
||||
"mulq %%r10\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += c * R */ |
||||
"movq %%r8,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* t4 = d & M (%%rsi) */ |
||||
"movq %%rcx,%%rsi\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rsi\n" |
||||
/* d >>= 52 */ |
||||
"shrdq $52,%%r15,%%rcx\n" |
||||
"xorq %%r15,%%r15\n" |
||||
/* tx = t4 >> 48 (tmp3) */ |
||||
"movq %%rsi,%%rax\n" |
||||
"shrq $48,%%rax\n" |
||||
"movq %%rax,%q3\n" |
||||
/* t4 &= (M >> 4) (tmp2) */ |
||||
"movq $0xffffffffffff,%%rax\n" |
||||
"andq %%rax,%%rsi\n" |
||||
"movq %%rsi,%q2\n" |
||||
/* c = a0 * b0 */ |
||||
"movq 0(%%rbx),%%rax\n" |
||||
"mulq %%r10\n" |
||||
"movq %%rax,%%r8\n" |
||||
"movq %%rdx,%%r9\n" |
||||
/* d += a4 * b1 */ |
||||
"movq 8(%%rbx),%%rax\n" |
||||
"mulq %%r14\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a3 * b2 */ |
||||
"movq 16(%%rbx),%%rax\n" |
||||
"mulq %%r13\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a2 * b3 */ |
||||
"movq 24(%%rbx),%%rax\n" |
||||
"mulq %%r12\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a1 * b4 */ |
||||
"movq 32(%%rbx),%%rax\n" |
||||
"mulq %%r11\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* u0 = d & M (%%rsi) */ |
||||
"movq %%rcx,%%rsi\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rsi\n" |
||||
/* d >>= 52 */ |
||||
"shrdq $52,%%r15,%%rcx\n" |
||||
"xorq %%r15,%%r15\n" |
||||
/* u0 = (u0 << 4) | tx (%%rsi) */ |
||||
"shlq $4,%%rsi\n" |
||||
"movq %q3,%%rax\n" |
||||
"orq %%rax,%%rsi\n" |
||||
/* c += u0 * (R >> 4) */ |
||||
"movq $0x1000003d1,%%rax\n" |
||||
"mulq %%rsi\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* r[0] = c & M */ |
||||
"movq %%r8,%%rax\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rax\n" |
||||
"movq %%rax,0(%%rdi)\n" |
||||
/* c >>= 52 */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
"xorq %%r9,%%r9\n" |
||||
/* c += a1 * b0 */ |
||||
"movq 0(%%rbx),%%rax\n" |
||||
"mulq %%r11\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* c += a0 * b1 */ |
||||
"movq 8(%%rbx),%%rax\n" |
||||
"mulq %%r10\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* d += a4 * b2 */ |
||||
"movq 16(%%rbx),%%rax\n" |
||||
"mulq %%r14\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a3 * b3 */ |
||||
"movq 24(%%rbx),%%rax\n" |
||||
"mulq %%r13\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a2 * b4 */ |
||||
"movq 32(%%rbx),%%rax\n" |
||||
"mulq %%r12\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* c += (d & M) * R */ |
||||
"movq %%rcx,%%rax\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* d >>= 52 */ |
||||
"shrdq $52,%%r15,%%rcx\n" |
||||
"xorq %%r15,%%r15\n" |
||||
/* r[1] = c & M */ |
||||
"movq %%r8,%%rax\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rax\n" |
||||
"movq %%rax,8(%%rdi)\n" |
||||
/* c >>= 52 */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
"xorq %%r9,%%r9\n" |
||||
/* c += a2 * b0 */ |
||||
"movq 0(%%rbx),%%rax\n" |
||||
"mulq %%r12\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* c += a1 * b1 */ |
||||
"movq 8(%%rbx),%%rax\n" |
||||
"mulq %%r11\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* c += a0 * b2 (last use of %%r10 = a0) */ |
||||
"movq 16(%%rbx),%%rax\n" |
||||
"mulq %%r10\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */ |
||||
"movq %q2,%%rsi\n" |
||||
"movq %q1,%%r10\n" |
||||
/* d += a4 * b3 */ |
||||
"movq 24(%%rbx),%%rax\n" |
||||
"mulq %%r14\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* d += a3 * b4 */ |
||||
"movq 32(%%rbx),%%rax\n" |
||||
"mulq %%r13\n" |
||||
"addq %%rax,%%rcx\n" |
||||
"adcq %%rdx,%%r15\n" |
||||
/* c += (d & M) * R */ |
||||
"movq %%rcx,%%rax\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* d >>= 52 (%%rcx only) */ |
||||
"shrdq $52,%%r15,%%rcx\n" |
||||
/* r[2] = c & M */ |
||||
"movq %%r8,%%rax\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rax\n" |
||||
"movq %%rax,16(%%rdi)\n" |
||||
/* c >>= 52 */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
"xorq %%r9,%%r9\n" |
||||
/* c += t3 */ |
||||
"addq %%r10,%%r8\n" |
||||
/* c += d * R */ |
||||
"movq %%rcx,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* r[3] = c & M */ |
||||
"movq %%r8,%%rax\n" |
||||
"movq $0xfffffffffffff,%%rdx\n" |
||||
"andq %%rdx,%%rax\n" |
||||
"movq %%rax,24(%%rdi)\n" |
||||
/* c >>= 52 (%%r8 only) */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
/* c += t4 (%%r8 only) */ |
||||
"addq %%rsi,%%r8\n" |
||||
/* r[4] = c */ |
||||
"movq %%r8,32(%%rdi)\n" |
||||
/* Operands: %0 = a in rsi, read-write because rsi is reused as scratch once
 * a0-a4 are loaded; %1-%3 = tmp1-tmp3 as memory operands. Inputs: b in rbx,
 * r in rdi. Every other register the code writes is listed as a clobber. */
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) |
||||
: "b"(b), "D"(r) |
||||
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" |
||||
); |
||||
} |
||||
|
||||
/**
 * Square the five-limb value at a and store the five-limb result at r.
 *
 * Same accumulation scheme as secp256k1_fe_mul_inner: limbs are masked with
 * M = 0xfffffffffffff (52 bits, kept in r15 throughout) and the upper partial
 * products (d) are folded into the lower ones (c) by multiplying with
 * R = 0x1000003d10. Symmetric cross products are computed once with one
 * factor doubled.
 * NOTE(review): presumably this is squaring modulo the secp256k1 field
 * prime - confirm against the portable C implementation.
 *
 * All limbs of a are loaded into r10-r14 before the first store to r.
 * tmp1, tmp2 and tmp3 are memory spill slots for t3, t4 and tx.
 */
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { |
||||
/**
|
||||
* Registers: rdx:rax = multiplication accumulator |
||||
* r9:r8 = c |
||||
* rcx:rbx = d |
||||
* r10-r14 = a0-a4 |
||||
* r15 = M (0xfffffffffffff) |
||||
* rdi = r |
||||
* rsi = a / t? |
||||
*/ |
||||
uint64_t tmp1, tmp2, tmp3; |
||||
__asm__ __volatile__( |
||||
"movq 0(%%rsi),%%r10\n" |
||||
"movq 8(%%rsi),%%r11\n" |
||||
"movq 16(%%rsi),%%r12\n" |
||||
"movq 24(%%rsi),%%r13\n" |
||||
"movq 32(%%rsi),%%r14\n" |
||||
"movq $0xfffffffffffff,%%r15\n" |
||||
|
||||
/* d = (a0*2) * a3 */ |
||||
"leaq (%%r10,%%r10,1),%%rax\n" |
||||
"mulq %%r13\n" |
||||
"movq %%rax,%%rbx\n" |
||||
"movq %%rdx,%%rcx\n" |
||||
/* d += (a1*2) * a2 */ |
||||
"leaq (%%r11,%%r11,1),%%rax\n" |
||||
"mulq %%r12\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* c = a4 * a4 */ |
||||
"movq %%r14,%%rax\n" |
||||
"mulq %%r14\n" |
||||
"movq %%rax,%%r8\n" |
||||
"movq %%rdx,%%r9\n" |
||||
/* d += (c & M) * R */ |
||||
"andq %%r15,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* c >>= 52 (%%r8 only) */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
/* t3 (tmp1) = d & M */ |
||||
"movq %%rbx,%%rsi\n" |
||||
"andq %%r15,%%rsi\n" |
||||
"movq %%rsi,%q1\n" |
||||
/* d >>= 52 */ |
||||
"shrdq $52,%%rcx,%%rbx\n" |
||||
"xorq %%rcx,%%rcx\n" |
||||
/* a4 *= 2 (from here on %%r14 holds the doubled limb) */ |
||||
"addq %%r14,%%r14\n" |
||||
/* d += a0 * a4 */ |
||||
"movq %%r10,%%rax\n" |
||||
"mulq %%r14\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* d+= (a1*2) * a3 */ |
||||
"leaq (%%r11,%%r11,1),%%rax\n" |
||||
"mulq %%r13\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* d += a2 * a2 */ |
||||
"movq %%r12,%%rax\n" |
||||
"mulq %%r12\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* d += c * R */ |
||||
"movq %%r8,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* t4 = d & M (%%rsi) */ |
||||
"movq %%rbx,%%rsi\n" |
||||
"andq %%r15,%%rsi\n" |
||||
/* d >>= 52 */ |
||||
"shrdq $52,%%rcx,%%rbx\n" |
||||
"xorq %%rcx,%%rcx\n" |
||||
/* tx = t4 >> 48 (tmp3) */ |
||||
"movq %%rsi,%%rax\n" |
||||
"shrq $48,%%rax\n" |
||||
"movq %%rax,%q3\n" |
||||
/* t4 &= (M >> 4) (tmp2) */ |
||||
"movq $0xffffffffffff,%%rax\n" |
||||
"andq %%rax,%%rsi\n" |
||||
"movq %%rsi,%q2\n" |
||||
/* c = a0 * a0 */ |
||||
"movq %%r10,%%rax\n" |
||||
"mulq %%r10\n" |
||||
"movq %%rax,%%r8\n" |
||||
"movq %%rdx,%%r9\n" |
||||
/* d += a1 * a4 */ |
||||
"movq %%r11,%%rax\n" |
||||
"mulq %%r14\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* d += (a2*2) * a3 */ |
||||
"leaq (%%r12,%%r12,1),%%rax\n" |
||||
"mulq %%r13\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* u0 = d & M (%%rsi) */ |
||||
"movq %%rbx,%%rsi\n" |
||||
"andq %%r15,%%rsi\n" |
||||
/* d >>= 52 */ |
||||
"shrdq $52,%%rcx,%%rbx\n" |
||||
"xorq %%rcx,%%rcx\n" |
||||
/* u0 = (u0 << 4) | tx (%%rsi) */ |
||||
"shlq $4,%%rsi\n" |
||||
"movq %q3,%%rax\n" |
||||
"orq %%rax,%%rsi\n" |
||||
/* c += u0 * (R >> 4) */ |
||||
"movq $0x1000003d1,%%rax\n" |
||||
"mulq %%rsi\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* r[0] = c & M */ |
||||
"movq %%r8,%%rax\n" |
||||
"andq %%r15,%%rax\n" |
||||
"movq %%rax,0(%%rdi)\n" |
||||
/* c >>= 52 */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
"xorq %%r9,%%r9\n" |
||||
/* a0 *= 2 (from here on %%r10 holds the doubled limb) */ |
||||
"addq %%r10,%%r10\n" |
||||
/* c += a0 * a1 */ |
||||
"movq %%r10,%%rax\n" |
||||
"mulq %%r11\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* d += a2 * a4 */ |
||||
"movq %%r12,%%rax\n" |
||||
"mulq %%r14\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* d += a3 * a3 */ |
||||
"movq %%r13,%%rax\n" |
||||
"mulq %%r13\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* c += (d & M) * R */ |
||||
"movq %%rbx,%%rax\n" |
||||
"andq %%r15,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* d >>= 52 */ |
||||
"shrdq $52,%%rcx,%%rbx\n" |
||||
"xorq %%rcx,%%rcx\n" |
||||
/* r[1] = c & M */ |
||||
"movq %%r8,%%rax\n" |
||||
"andq %%r15,%%rax\n" |
||||
"movq %%rax,8(%%rdi)\n" |
||||
/* c >>= 52 */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
"xorq %%r9,%%r9\n" |
||||
/* c += a0 * a2 (last use of %%r10) */ |
||||
"movq %%r10,%%rax\n" |
||||
"mulq %%r12\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */ |
||||
"movq %q2,%%rsi\n" |
||||
"movq %q1,%%r10\n" |
||||
/* c += a1 * a1 */ |
||||
"movq %%r11,%%rax\n" |
||||
"mulq %%r11\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* d += a3 * a4 */ |
||||
"movq %%r13,%%rax\n" |
||||
"mulq %%r14\n" |
||||
"addq %%rax,%%rbx\n" |
||||
"adcq %%rdx,%%rcx\n" |
||||
/* c += (d & M) * R */ |
||||
"movq %%rbx,%%rax\n" |
||||
"andq %%r15,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* d >>= 52 (%%rbx only) */ |
||||
"shrdq $52,%%rcx,%%rbx\n" |
||||
/* r[2] = c & M */ |
||||
"movq %%r8,%%rax\n" |
||||
"andq %%r15,%%rax\n" |
||||
"movq %%rax,16(%%rdi)\n" |
||||
/* c >>= 52 */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
"xorq %%r9,%%r9\n" |
||||
/* c += t3 */ |
||||
"addq %%r10,%%r8\n" |
||||
/* c += d * R */ |
||||
"movq %%rbx,%%rax\n" |
||||
"movq $0x1000003d10,%%rdx\n" |
||||
"mulq %%rdx\n" |
||||
"addq %%rax,%%r8\n" |
||||
"adcq %%rdx,%%r9\n" |
||||
/* r[3] = c & M */ |
||||
"movq %%r8,%%rax\n" |
||||
"andq %%r15,%%rax\n" |
||||
"movq %%rax,24(%%rdi)\n" |
||||
/* c >>= 52 (%%r8 only) */ |
||||
"shrdq $52,%%r9,%%r8\n" |
||||
/* c += t4 (%%r8 only) */ |
||||
"addq %%rsi,%%r8\n" |
||||
/* r[4] = c */ |
||||
"movq %%r8,32(%%rdi)\n" |
||||
/* Operands: %0 = a in rsi, read-write because rsi is reused as scratch once
 * a0-a4 are loaded; %1-%3 = tmp1-tmp3 as memory operands. Input: r in rdi.
 * rbx holds the low half of d here, so it is listed as a clobber. */
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) |
||||
: "D"(r) |
||||
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" |
||||
); |
||||
} |
||||
|
||||
#endif |
||||
|
Loading…
Reference in new issue