Browse Source

Micro-optimisation in sha256_sse2 code courtesy of Guido Ascioti guido.ascioti@gmail.com

nfactor-troky
ckolivas 13 years ago
parent
commit
d356f44d53
  1. 22
      sha256_sse2_i386.c
  2. 122
      x86_32/sha256_xmm.asm

22
sha256_sse2_i386.c

@ -67,12 +67,6 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
work_restart[thr_id].restart = 0; work_restart[thr_id].restart = 0;
/* For debugging */
union {
__m128i m;
uint32_t i[4];
} mi;
/* Message expansion */ /* Message expansion */
memcpy(m_midstate, pmidstate, sizeof(m_midstate)); memcpy(m_midstate, pmidstate, sizeof(m_midstate));
memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */
@ -102,17 +96,12 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init); CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init);
for (j = 0; j < 4; j++) { for (j = 0; j < 4; j++) {
mi.m = m_4hash[7]; if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) {
if (unlikely(mi.i[j] == 0)) /* We found a hit...so check it */
break; /* Use the C version for a check... */
}
/* If j = true, we found a hit...so check it */
/* Use the C version for a check... */
if (unlikely(j != 4)) {
for (i = 0; i < 8; i++) { for (i = 0; i < 8; i++) {
mi.m = m_4hash[i]; *(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j];
*(uint32_t *)&(phash)[i*4] = mi.i[j];
} }
if (fulltest(phash, ptarget)) { if (fulltest(phash, ptarget)) {
@ -120,6 +109,7 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
*nNonce_p = nonce + j; *nNonce_p = nonce + j;
return nonce + j; return nonce + j;
} }
}
} }
nonce += 4; nonce += 4;

122
x86_32/sha256_xmm.asm

@ -1,4 +1,4 @@
;; SHA-256 for X86 for Linux, based off of: ;; SHA-256 for X86 for Linux, based off of:
; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com ; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011 ; Version 2011
@ -15,30 +15,21 @@ BITS 32
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16)) ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
%define LAB_CALC_PARA 2 %define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8 %define LAB_CALC_UNROLL 24
%define LAB_LOOP_UNROLL 8 %define LAB_LOOP_UNROLL 64
extern sha256_consts_m128i extern sha256_consts_m128i
global CalcSha256_x86 global CalcSha256_x86
; CalcSha256 hash(ecx), data(edx), init([esp+4]) ; CalcSha256 hash(ecx), data(edx), init([esp+4])
CalcSha256_x86: CalcSha256_x86:
push esi push esi
push edi push edi
mov init, [esp+12] mov init, [esp+12]
push ebx
LAB_NEXT_NONCE:
mov eax, 64*4 ; 256 - eax is # of SHA-2 rounds
mov ebx, 16*4 ; 64 - ebx is where we expand to
LAB_SHA: LAB_SHA:
push eax lea edi, qword [data+256] ; + 256
lea eax, qword [data+eax*4] ; + 1024
lea edi, qword [data+ebx*4] ; + 256
LAB_CALC: LAB_CALC:
%macro lab_calc_blk 1 %macro lab_calc_blk 1
@ -116,13 +107,6 @@ LAB_CALC:
%assign i i+LAB_CALC_PARA %assign i i+LAB_CALC_PARA
%endrep %endrep
add edi, LAB_CALC_UNROLL*LAB_CALC_PARA*16
cmp edi, eax
jb LAB_CALC
pop eax
mov ebx, 0
; Load the init values of the message into the hash. ; Load the init values of the message into the hash.
movdqa xmm7, [init] movdqa xmm7, [init]
@ -143,14 +127,14 @@ LAB_CALC:
pshufd xmm0, xmm0, 0 ; xmm0 == e pshufd xmm0, xmm0, 0 ; xmm0 == e
LAB_LOOP: LAB_LOOP:
;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j] ;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]
%macro lab_loop_blk 0 %macro lab_loop_blk 1
movdqa xmm6, [data+ebx*4] movdqa xmm6, [data+%1]
paddd xmm6, sha256_consts_m128i[ebx*4] paddd xmm6, sha256_consts_m128i[%1]
add ebx, 4
paddd xmm6, [hash+2*16] ; +h paddd xmm6, [hash+2*16] ; +h
@ -217,68 +201,52 @@ LAB_LOOP:
%assign i 0 %assign i 0
%rep LAB_LOOP_UNROLL %rep LAB_LOOP_UNROLL
lab_loop_blk lab_loop_blk i
%assign i i+1 %assign i i+16
%endrep %endrep
cmp ebx, eax
jb LAB_LOOP
; Finished the 64 rounds, calculate hash and save ; Finished the 64 rounds, calculate hash and save
movdqa xmm1, [init] movdqa xmm1, [init+16]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
movdqa [hash+3*16], xmm6
pshufd xmm6, xmm1, 0xFF
movdqa [hash+4*16], xmm6
pshufd xmm1, xmm1, 0
paddd xmm5, xmm2 pshufd xmm2, xmm1, 0xFF
paddd xmm4, [hash+3*16] movdqa xmm6, [hash+2*16]
paddd xmm3, [hash+4*16] paddd xmm2, xmm6
paddd xmm7, xmm1 movdqa [hash+7*16], xmm2
movdqa xmm1, [init+4*4]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
movdqa [hash+3*16], xmm6
pshufd xmm6, xmm1, 0xFF
movdqa [hash+4*16], xmm6
pshufd xmm1, xmm1, 0
movdqa xmm6, [hash+0*16] pshufd xmm2, xmm1, 0xAA
paddd xmm2, xmm6 movdqa xmm6, [hash+1*16]
movdqa [hash+0*16], xmm2 paddd xmm2, xmm6
movdqa [hash+6*16], xmm2
pshufd xmm2, xmm1, 0x55
movdqa xmm6, [hash+0*16]
paddd xmm2, xmm6
movdqa [hash+5*16], xmm2
movdqa xmm2, [hash+3*16] pshufd xmm1, xmm1, 0
movdqa xmm6, [hash+1*16] paddd xmm0, xmm1
paddd xmm2, xmm6 movdqa [hash+4*16], xmm0
movdqa [hash+1*16], xmm2
movdqa xmm2, [hash+4*16] movdqa xmm1, [init]
movdqa xmm6, [hash+2*16]
paddd xmm2, xmm6
movdqa [hash+2*16], xmm2
paddd xmm0, xmm1 pshufd xmm2, xmm1, 0xFF
paddd xmm3, xmm2
movdqa [hash+3*16], xmm3
movdqa xmm1, [hash+0*16] pshufd xmm2, xmm1, 0xAA
movdqa xmm2, [hash+1*16] paddd xmm4, xmm2
movdqa xmm6, [hash+2*16] movdqa [hash+2*16], xmm4
pshufd xmm2, xmm1, 0x55
paddd xmm5, xmm2
movdqa [hash+1*16], xmm5
pshufd xmm1, xmm1, 0
paddd xmm7, xmm1
movdqa [hash+0*16], xmm7 movdqa [hash+0*16], xmm7
movdqa [hash+1*16], xmm5
movdqa [hash+2*16], xmm4
movdqa [hash+3*16], xmm3
movdqa [hash+4*16], xmm0
movdqa [hash+5*16], xmm1
movdqa [hash+6*16], xmm2
movdqa [hash+7*16], xmm6
LAB_RET: LAB_RET:
pop ebx pop edi
pop edi pop esi
pop esi retn 4
retn 4

Loading…
Cancel
Save