Browse Source

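Minor performance improvement: merge the two GOST_FS loops into a single loop and raise gostd_hash_80 threads per block from 128 to 256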

2upstream
orignal 8 years ago
parent
commit
a924af5185
  1. 45
      gost/cuda_gosthash.cu

45
gost/cuda_gosthash.cu

@ -673,32 +673,27 @@ __device__ __forceinline__
void GOST_FS(uint64_t* const state64, uint64_t* return_state) void GOST_FS(uint64_t* const state64, uint64_t* return_state)
{ {
uint32_t * state32 = (uint32_t *)state64; uint32_t * state32 = (uint32_t *)state64;
uint64_t r;
#pragma unroll 4 #pragma unroll 4
for (int b=0; b<4; b++) { for (int b=0; b<4; b++)
r = T0[EXTRACT_BYTE(state32[14], b)]; {
r ^= T1[EXTRACT_BYTE(state32[12], b)]; return_state[b] = T0[EXTRACT_BYTE(state32[14], b)]
r ^= T2[EXTRACT_BYTE(state32[10], b)]; ^ T1[EXTRACT_BYTE(state32[12], b)]
r ^= T3[EXTRACT_BYTE(state32[8], b)]; ^ T2[EXTRACT_BYTE(state32[10], b)]
r ^= T4[EXTRACT_BYTE(state32[6], b)]; ^ T3[EXTRACT_BYTE(state32[8], b)]
r ^= T5[EXTRACT_BYTE(state32[4], b)]; ^ T4[EXTRACT_BYTE(state32[6], b)]
r ^= T6[EXTRACT_BYTE(state32[2], b)]; ^ T5[EXTRACT_BYTE(state32[4], b)]
r ^= T7[EXTRACT_BYTE(state32[0], b)]; ^ T6[EXTRACT_BYTE(state32[2], b)]
return_state[b] = r; ^ T7[EXTRACT_BYTE(state32[0], b)];
}
return_state[b+4] = T0[EXTRACT_BYTE(state32[15], b)]
#pragma unroll 4 ^ T1[EXTRACT_BYTE(state32[13], b)]
for (int b=0; b<4; b++) { ^ T2[EXTRACT_BYTE(state32[11], b)]
r = T0[EXTRACT_BYTE(state32[15], b)]; ^ T3[EXTRACT_BYTE(state32[9], b)]
r ^= T1[EXTRACT_BYTE(state32[13], b)]; ^ T4[EXTRACT_BYTE(state32[7], b)]
r ^= T2[EXTRACT_BYTE(state32[11], b)]; ^ T5[EXTRACT_BYTE(state32[5], b)]
r ^= T3[EXTRACT_BYTE(state32[9], b)]; ^ T6[EXTRACT_BYTE(state32[3], b)]
r ^= T4[EXTRACT_BYTE(state32[7], b)]; ^ T7[EXTRACT_BYTE(state32[1], b)];
r ^= T5[EXTRACT_BYTE(state32[5], b)];
r ^= T6[EXTRACT_BYTE(state32[3], b)];
r ^= T7[EXTRACT_BYTE(state32[1], b)];
return_state[b+4] = r;
} }
} }
@ -853,7 +848,7 @@ void gostd_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
__host__ __host__
void gostd_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces) void gostd_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
{ {
const uint32_t threadsperblock = 128; const uint32_t threadsperblock = 256;
dim3 grid(threads/threadsperblock); dim3 grid(threads/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

Loading…
Cancel
Save