1
0
mirror of https://github.com/GOSTSec/sgminer synced 2025-01-10 14:58:01 +00:00

Micro-optimisation in sha256_sse2 code, courtesy of Guido Ascioti (guido.ascioti@gmail.com).

This commit is contained in:
ckolivas 2012-01-11 11:12:13 +11:00
parent 68c807d755
commit d356f44d53
2 changed files with 54 additions and 96 deletions

View File

@ -67,12 +67,6 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
work_restart[thr_id].restart = 0; work_restart[thr_id].restart = 0;
/* For debugging */
union {
__m128i m;
uint32_t i[4];
} mi;
/* Message expansion */ /* Message expansion */
memcpy(m_midstate, pmidstate, sizeof(m_midstate)); memcpy(m_midstate, pmidstate, sizeof(m_midstate));
memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */
@ -102,17 +96,12 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init); CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init);
for (j = 0; j < 4; j++) { for (j = 0; j < 4; j++) {
mi.m = m_4hash[7]; if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) {
if (unlikely(mi.i[j] == 0)) /* We found a hit...so check it */
break;
}
/* If j = true, we found a hit...so check it */
/* Use the C version for a check... */ /* Use the C version for a check... */
if (unlikely(j != 4)) {
for (i = 0; i < 8; i++) { for (i = 0; i < 8; i++) {
mi.m = m_4hash[i]; *(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j];
*(uint32_t *)&(phash)[i*4] = mi.i[j];
} }
if (fulltest(phash, ptarget)) { if (fulltest(phash, ptarget)) {
@ -121,6 +110,7 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
return nonce + j; return nonce + j;
} }
} }
}
nonce += 4; nonce += 4;

View File

@ -1,4 +1,4 @@
;; SHA-256 for X86 for Linux, based off of: ;; SHA-256 for X86 for Linux, based off of:
; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com ; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011 ; Version 2011
@ -15,9 +15,9 @@ BITS 32
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16)) ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
%define LAB_CALC_PARA 2 %define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8 %define LAB_CALC_UNROLL 24
%define LAB_LOOP_UNROLL 8 %define LAB_LOOP_UNROLL 64
extern sha256_consts_m128i extern sha256_consts_m128i
@ -28,17 +28,8 @@ CalcSha256_x86:
push edi push edi
mov init, [esp+12] mov init, [esp+12]
push ebx
LAB_NEXT_NONCE:
mov eax, 64*4 ; 256 - rcx is # of SHA-2 rounds
mov ebx, 16*4 ; 64 - rax is where we expand to
LAB_SHA: LAB_SHA:
push eax lea edi, qword [data+256] ; + 256
lea eax, qword [data+eax*4] ; + 1024
lea edi, qword [data+ebx*4] ; + 256
LAB_CALC: LAB_CALC:
%macro lab_calc_blk 1 %macro lab_calc_blk 1
@ -116,13 +107,6 @@ LAB_CALC:
%assign i i+LAB_CALC_PARA %assign i i+LAB_CALC_PARA
%endrep %endrep
add edi, LAB_CALC_UNROLL*LAB_CALC_PARA*16
cmp edi, eax
jb LAB_CALC
pop eax
mov ebx, 0
; Load the init values of the message into the hash. ; Load the init values of the message into the hash.
movdqa xmm7, [init] movdqa xmm7, [init]
@ -143,14 +127,14 @@ LAB_CALC:
pshufd xmm0, xmm0, 0 ; xmm0 == e pshufd xmm0, xmm0, 0 ; xmm0 == e
LAB_LOOP: LAB_LOOP:
;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j] ;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]
%macro lab_loop_blk 0 %macro lab_loop_blk 1
movdqa xmm6, [data+ebx*4] movdqa xmm6, [data+%1]
paddd xmm6, sha256_consts_m128i[ebx*4] paddd xmm6, sha256_consts_m128i[%1]
add ebx, 4
paddd xmm6, [hash+2*16] ; +h paddd xmm6, [hash+2*16] ; +h
@ -217,68 +201,52 @@ LAB_LOOP:
%assign i 0 %assign i 0
%rep LAB_LOOP_UNROLL %rep LAB_LOOP_UNROLL
lab_loop_blk lab_loop_blk i
%assign i i+1 %assign i i+16
%endrep %endrep
cmp ebx, eax
jb LAB_LOOP
; Finished the 64 rounds, calculate hash and save ; Finished the 64 rounds, calculate hash and save
movdqa xmm1, [init] movdqa xmm1, [init+16]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
movdqa [hash+3*16], xmm6
pshufd xmm6, xmm1, 0xFF
movdqa [hash+4*16], xmm6
pshufd xmm1, xmm1, 0
paddd xmm5, xmm2 pshufd xmm2, xmm1, 0xFF
paddd xmm4, [hash+3*16] movdqa xmm6, [hash+2*16]
paddd xmm3, [hash+4*16]
paddd xmm7, xmm1
movdqa xmm1, [init+4*4]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
movdqa [hash+3*16], xmm6
pshufd xmm6, xmm1, 0xFF
movdqa [hash+4*16], xmm6
pshufd xmm1, xmm1, 0
movdqa xmm6, [hash+0*16]
paddd xmm2, xmm6 paddd xmm2, xmm6
movdqa [hash+0*16], xmm2 movdqa [hash+7*16], xmm2
pshufd xmm2, xmm1, 0xAA
movdqa xmm2, [hash+3*16]
movdqa xmm6, [hash+1*16] movdqa xmm6, [hash+1*16]
paddd xmm2, xmm6 paddd xmm2, xmm6
movdqa [hash+1*16], xmm2
movdqa xmm2, [hash+4*16]
movdqa xmm6, [hash+2*16]
paddd xmm2, xmm6
movdqa [hash+2*16], xmm2
paddd xmm0, xmm1
movdqa xmm1, [hash+0*16]
movdqa xmm2, [hash+1*16]
movdqa xmm6, [hash+2*16]
movdqa [hash+0*16], xmm7
movdqa [hash+1*16], xmm5
movdqa [hash+2*16], xmm4
movdqa [hash+3*16], xmm3
movdqa [hash+4*16], xmm0
movdqa [hash+5*16], xmm1
movdqa [hash+6*16], xmm2 movdqa [hash+6*16], xmm2
movdqa [hash+7*16], xmm6
pshufd xmm2, xmm1, 0x55
movdqa xmm6, [hash+0*16]
paddd xmm2, xmm6
movdqa [hash+5*16], xmm2
pshufd xmm1, xmm1, 0
paddd xmm0, xmm1
movdqa [hash+4*16], xmm0
movdqa xmm1, [init]
pshufd xmm2, xmm1, 0xFF
paddd xmm3, xmm2
movdqa [hash+3*16], xmm3
pshufd xmm2, xmm1, 0xAA
paddd xmm4, xmm2
movdqa [hash+2*16], xmm4
pshufd xmm2, xmm1, 0x55
paddd xmm5, xmm2
movdqa [hash+1*16], xmm5
pshufd xmm1, xmm1, 0
paddd xmm7, xmm1
movdqa [hash+0*16], xmm7
LAB_RET: LAB_RET:
pop ebx
pop edi pop edi
pop esi pop esi
retn 4 retn 4