|
|
@ -555,6 +555,16 @@ __device__ static uint64_t T7[256] = { |
|
|
|
0x717E7067AF4F499A, 0x938290A9ECD1DBB3, 0x88E3B293344DD172, 0x2734158C250FA3D6 |
|
|
|
0x717E7067AF4F499A, 0x938290A9ECD1DBB3, 0x88E3B293344DD172, 0x2734158C250FA3D6 |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// local copy of T0..T7 for each block |
|
|
|
|
|
|
|
__shared__ static uint64_t T0S[256]; |
|
|
|
|
|
|
|
__shared__ static uint64_t T1S[256]; |
|
|
|
|
|
|
|
__shared__ static uint64_t T2S[256]; |
|
|
|
|
|
|
|
__shared__ static uint64_t T3S[256]; |
|
|
|
|
|
|
|
__shared__ static uint64_t T4S[256]; |
|
|
|
|
|
|
|
__shared__ static uint64_t T5S[256]; |
|
|
|
|
|
|
|
__shared__ static uint64_t T6S[256]; |
|
|
|
|
|
|
|
__shared__ static uint64_t T7S[256]; |
|
|
|
|
|
|
|
|
|
|
|
// KeySchedule |
|
|
|
// KeySchedule |
|
|
|
__constant__ static uint64_t CC[12][8] = {{ |
|
|
|
__constant__ static uint64_t CC[12][8] = {{ |
|
|
|
0xe9daca1eda5b08b1, 0x1f7c65c0812fcbeb, 0x16d0452e43766a2f, 0xfcc485758db84e71, |
|
|
|
0xe9daca1eda5b08b1, 0x1f7c65c0812fcbeb, 0x16d0452e43766a2f, 0xfcc485758db84e71, |
|
|
@ -677,23 +687,23 @@ void GOST_FS(uint64_t* const state64, uint64_t* return_state) |
|
|
|
#pragma unroll 4 |
|
|
|
#pragma unroll 4 |
|
|
|
for (int b=0; b<4; b++) |
|
|
|
for (int b=0; b<4; b++) |
|
|
|
{ |
|
|
|
{ |
|
|
|
return_state[b] = T0[EXTRACT_BYTE(state32[14], b)] |
|
|
|
return_state[b] = T0S[EXTRACT_BYTE(state32[14], b)] |
|
|
|
^ T1[EXTRACT_BYTE(state32[12], b)] |
|
|
|
^ T1S[EXTRACT_BYTE(state32[12], b)] |
|
|
|
^ T2[EXTRACT_BYTE(state32[10], b)] |
|
|
|
^ T2S[EXTRACT_BYTE(state32[10], b)] |
|
|
|
^ T3[EXTRACT_BYTE(state32[8], b)] |
|
|
|
^ T3S[EXTRACT_BYTE(state32[8], b)] |
|
|
|
^ T4[EXTRACT_BYTE(state32[6], b)] |
|
|
|
^ T4S[EXTRACT_BYTE(state32[6], b)] |
|
|
|
^ T5[EXTRACT_BYTE(state32[4], b)] |
|
|
|
^ T5S[EXTRACT_BYTE(state32[4], b)] |
|
|
|
^ T6[EXTRACT_BYTE(state32[2], b)] |
|
|
|
^ T6S[EXTRACT_BYTE(state32[2], b)] |
|
|
|
^ T7[EXTRACT_BYTE(state32[0], b)]; |
|
|
|
^ T7S[EXTRACT_BYTE(state32[0], b)]; |
|
|
|
|
|
|
|
|
|
|
|
return_state[b+4] = T0[EXTRACT_BYTE(state32[15], b)] |
|
|
|
return_state[b+4] = T0S[EXTRACT_BYTE(state32[15], b)] |
|
|
|
^ T1[EXTRACT_BYTE(state32[13], b)] |
|
|
|
^ T1S[EXTRACT_BYTE(state32[13], b)] |
|
|
|
^ T2[EXTRACT_BYTE(state32[11], b)] |
|
|
|
^ T2S[EXTRACT_BYTE(state32[11], b)] |
|
|
|
^ T3[EXTRACT_BYTE(state32[9], b)] |
|
|
|
^ T3S[EXTRACT_BYTE(state32[9], b)] |
|
|
|
^ T4[EXTRACT_BYTE(state32[7], b)] |
|
|
|
^ T4S[EXTRACT_BYTE(state32[7], b)] |
|
|
|
^ T5[EXTRACT_BYTE(state32[5], b)] |
|
|
|
^ T5S[EXTRACT_BYTE(state32[5], b)] |
|
|
|
^ T6[EXTRACT_BYTE(state32[3], b)] |
|
|
|
^ T6S[EXTRACT_BYTE(state32[3], b)] |
|
|
|
^ T7[EXTRACT_BYTE(state32[1], b)]; |
|
|
|
^ T7S[EXTRACT_BYTE(state32[1], b)]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -782,6 +792,15 @@ __global__ |
|
|
|
void gostd_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces) |
|
|
|
void gostd_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces) |
|
|
|
{ |
|
|
|
{ |
|
|
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
|
|
|
|
// copy table to shared memory, we assume 256 threads per block |
|
|
|
|
|
|
|
T0S[threadIdx.x] = T0[threadIdx.x]; |
|
|
|
|
|
|
|
T1S[threadIdx.x] = T1[threadIdx.x]; |
|
|
|
|
|
|
|
T2S[threadIdx.x] = T2[threadIdx.x]; |
|
|
|
|
|
|
|
T3S[threadIdx.x] = T3[threadIdx.x]; |
|
|
|
|
|
|
|
T4S[threadIdx.x] = T4[threadIdx.x]; |
|
|
|
|
|
|
|
T5S[threadIdx.x] = T5[threadIdx.x]; |
|
|
|
|
|
|
|
T6S[threadIdx.x] = T6[threadIdx.x]; |
|
|
|
|
|
|
|
T7S[threadIdx.x] = T7[threadIdx.x]; |
|
|
|
if (thread < threads) |
|
|
|
if (thread < threads) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const uint32_t nonce = startNonce + thread; |
|
|
|
const uint32_t nonce = startNonce + thread; |
|
|
|