From d356f44d5389afaf2a9740b4b27c7d1fa7b8896d Mon Sep 17 00:00:00 2001 From: ckolivas Date: Wed, 11 Jan 2012 11:12:13 +1100 Subject: [PATCH] Micro-optimisation in sha256_sse2 code courtesy of Guido Ascioti guido.ascioti@gmail.com --- sha256_sse2_i386.c | 22 +++----- x86_32/sha256_xmm.asm | 122 ++++++++++++++++-------------------------- 2 files changed, 51 insertions(+), 93 deletions(-) diff --git a/sha256_sse2_i386.c b/sha256_sse2_i386.c index ef3f0ee5..72a90c99 100644 --- a/sha256_sse2_i386.c +++ b/sha256_sse2_i386.c @@ -67,12 +67,6 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate, work_restart[thr_id].restart = 0; - /* For debugging */ - union { - __m128i m; - uint32_t i[4]; - } mi; - /* Message expansion */ memcpy(m_midstate, pmidstate, sizeof(m_midstate)); memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ @@ -102,17 +96,12 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate, CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init); for (j = 0; j < 4; j++) { - mi.m = m_4hash[7]; - if (unlikely(mi.i[j] == 0)) - break; - } - - /* If j = true, we found a hit...so check it */ - /* Use the C version for a check... */ - if (unlikely(j != 4)) { + if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) { + /* We found a hit...so check it */ + /* Use the C version for a check... 
*/ + for (i = 0; i < 8; i++) { - mi.m = m_4hash[i]; - *(uint32_t *)&(phash)[i*4] = mi.i[j]; + *(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j]; } if (fulltest(phash, ptarget)) { @@ -120,6 +109,7 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate, *nNonce_p = nonce + j; return nonce + j; } + } } nonce += 4; diff --git a/x86_32/sha256_xmm.asm b/x86_32/sha256_xmm.asm index b2a8fbb4..601cf2bb 100644 --- a/x86_32/sha256_xmm.asm +++ b/x86_32/sha256_xmm.asm @@ -1,4 +1,4 @@ -;; SHA-256 for X86 for Linux, based off of: +;; SHA-256 for X86 for Linux, based off of: ; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com ; Version 2011 @@ -15,30 +15,21 @@ BITS 32 ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16)) %define LAB_CALC_PARA 2 -%define LAB_CALC_UNROLL 8 +%define LAB_CALC_UNROLL 24 -%define LAB_LOOP_UNROLL 8 +%define LAB_LOOP_UNROLL 64 extern sha256_consts_m128i global CalcSha256_x86 ; CalcSha256 hash(ecx), data(edx), init([esp+4]) CalcSha256_x86: - push esi - push edi - mov init, [esp+12] - - push ebx - -LAB_NEXT_NONCE: - - mov eax, 64*4 ; 256 - rcx is # of SHA-2 rounds - mov ebx, 16*4 ; 64 - rax is where we expand to + push esi + push edi + mov init, [esp+12] LAB_SHA: - push eax - lea eax, qword [data+eax*4] ; + 1024 - lea edi, qword [data+ebx*4] ; + 256 + lea edi, qword [data+256] ; + 256 LAB_CALC: %macro lab_calc_blk 1 @@ -116,13 +107,6 @@ LAB_CALC: %assign i i+LAB_CALC_PARA %endrep - add edi, LAB_CALC_UNROLL*LAB_CALC_PARA*16 - cmp edi, eax - jb LAB_CALC - - pop eax - mov ebx, 0 - ; Load the init values of the message into the hash. 
movdqa xmm7, [init] @@ -143,14 +127,14 @@ LAB_CALC: pshufd xmm0, xmm0, 0 ; xmm0 == e + LAB_LOOP: ;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32(g_sha256_k[j]) + w[j] -%macro lab_loop_blk 0 - movdqa xmm6, [data+ebx*4] - paddd xmm6, sha256_consts_m128i[ebx*4] - add ebx, 4 +%macro lab_loop_blk 1 + movdqa xmm6, [data+%1] + paddd xmm6, sha256_consts_m128i[%1] paddd xmm6, [hash+2*16] ; +h @@ -217,68 +201,52 @@ LAB_LOOP: %assign i 0 %rep LAB_LOOP_UNROLL - lab_loop_blk -%assign i i+1 + lab_loop_blk i +%assign i i+16 %endrep - cmp ebx, eax - jb LAB_LOOP - ; Finished the 64 rounds, calculate hash and save - movdqa xmm1, [init] - pshufd xmm2, xmm1, 0x55 - pshufd xmm6, xmm1, 0xAA - movdqa [hash+3*16], xmm6 - pshufd xmm6, xmm1, 0xFF - movdqa [hash+4*16], xmm6 - pshufd xmm1, xmm1, 0 + movdqa xmm1, [init+16] - paddd xmm5, xmm2 - paddd xmm4, [hash+3*16] - paddd xmm3, [hash+4*16] - paddd xmm7, xmm1 - - movdqa xmm1, [init+4*4] - pshufd xmm2, xmm1, 0x55 - pshufd xmm6, xmm1, 0xAA - movdqa [hash+3*16], xmm6 - pshufd xmm6, xmm1, 0xFF - movdqa [hash+4*16], xmm6 - pshufd xmm1, xmm1, 0 + pshufd xmm2, xmm1, 0xFF + movdqa xmm6, [hash+2*16] + paddd xmm2, xmm6 + movdqa [hash+7*16], xmm2 - movdqa xmm6, [hash+0*16] - paddd xmm2, xmm6 - movdqa [hash+0*16], xmm2 + pshufd xmm2, xmm1, 0xAA + movdqa xmm6, [hash+1*16] + paddd xmm2, xmm6 + movdqa [hash+6*16], xmm2 + pshufd xmm2, xmm1, 0x55 + movdqa xmm6, [hash+0*16] + paddd xmm2, xmm6 + movdqa [hash+5*16], xmm2 - movdqa xmm2, [hash+3*16] - movdqa xmm6, [hash+1*16] - paddd xmm2, xmm6 - movdqa [hash+1*16], xmm2 + pshufd xmm1, xmm1, 0 + paddd xmm0, xmm1 + movdqa [hash+4*16], xmm0 - movdqa xmm2, [hash+4*16] - movdqa xmm6, [hash+2*16] - paddd xmm2, xmm6 - movdqa [hash+2*16], xmm2 + movdqa xmm1, [init] - paddd xmm0, xmm1 + pshufd xmm2, xmm1, 0xFF + paddd xmm3, xmm2 + movdqa [hash+3*16], xmm3 - movdqa xmm1, [hash+0*16] - movdqa xmm2, [hash+1*16] - movdqa xmm6, [hash+2*16] + pshufd xmm2, xmm1, 0xAA + paddd 
xmm4, xmm2 + movdqa [hash+2*16], xmm4 + + pshufd xmm2, xmm1, 0x55 + paddd xmm5, xmm2 + movdqa [hash+1*16], xmm5 + pshufd xmm1, xmm1, 0 + paddd xmm7, xmm1 movdqa [hash+0*16], xmm7 - movdqa [hash+1*16], xmm5 - movdqa [hash+2*16], xmm4 - movdqa [hash+3*16], xmm3 - movdqa [hash+4*16], xmm0 - movdqa [hash+5*16], xmm1 - movdqa [hash+6*16], xmm2 - movdqa [hash+7*16], xmm6 LAB_RET: - pop ebx - pop edi - pop esi - retn 4 + pop edi + pop esi + retn 4