@@ -743,140 +743,3 @@ void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounc
	x15_whirlpool_cpu_hash_64(thr_id, threads, d_hash);
}

#if 0
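/*
 * Disabled alternative: a dedicated Whirlpool-512 final-hash kernel that,
 * instead of writing the digest back to global memory, compares word 3 of
 * the result against a 64-bit target and records up to two candidate thread
 * indices in resNonce[0..1].
 */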
__global__ __launch_bounds__(TPB64,2)
void x15_whirlpool_gpu_hash_64_final(uint32_t threads, const uint64_t* __restrict__ g_hash, uint32_t* resNonce, const uint64_t target)
{
	__shared__ uint2 sharedMemory[7][256];

	// Build the seven byte-rotated copies of the 256-entry base table b0 in shared memory.
	if (threadIdx.x < 256) {
		const uint2 tmp = __ldg((uint2*)&b0[threadIdx.x]);
		sharedMemory[0][threadIdx.x] = tmp;
		sharedMemory[1][threadIdx.x] = ROL8(tmp);
		sharedMemory[2][threadIdx.x] = ROL16(tmp);
		sharedMemory[3][threadIdx.x] = ROL24(tmp);
		sharedMemory[4][threadIdx.x] = SWAPUINT2(tmp);
		sharedMemory[5][threadIdx.x] = ROR24(tmp);
		sharedMemory[6][threadIdx.x] = ROR16(tmp);
	}
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads){

		uint2 hash[8], n[8], h[ 8], backup;
		// First round key of the key schedule, precomputed for the all-zero initial Whirlpool state.
		uint2 tmp[8] = {
			{0xC0EE0B30,0x672990AF},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828},
			{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828}
		};
		*(uint2x4*)&hash[ 0] = __ldg4((uint2x4*)&g_hash[(thread<<3) + 0]);
		*(uint2x4*)&hash[ 4] = __ldg4((uint2x4*)&g_hash[(thread<<3) + 4]);

		__syncthreads();

		#pragma unroll 8
		for(int i=0;i<8;i++)
			n[i]=hash[i];

		// __syncthreads();
		// Round 1 of the first compression (the 64-byte input hash is the single message block).
		tmp[ 0]^= d_ROUND_ELT(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1);
		tmp[ 1]^= d_ROUND_ELT_LDG(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2);
		tmp[ 2]^= d_ROUND_ELT(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3);
		tmp[ 3]^= d_ROUND_ELT_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4);
		tmp[ 4]^= d_ROUND_ELT(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5);
		tmp[ 5]^= d_ROUND_ELT_LDG(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6);
		tmp[ 6]^= d_ROUND_ELT(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7);
		tmp[ 7]^= d_ROUND_ELT_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0);
		// Rounds 2..10, using the round keys precomputed for the all-zero initial key.
		for (int i=1; i <10; i++){
			TRANSFER(n, tmp);
			tmp[ 0] = d_ROUND_ELT1_LDG(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, precomputed_round_key_64[(i-1)*8+0]);
			tmp[ 1] = d_ROUND_ELT1(    sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, precomputed_round_key_64[(i-1)*8+1]);
			tmp[ 2] = d_ROUND_ELT1(    sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, precomputed_round_key_64[(i-1)*8+2]);
			tmp[ 3] = d_ROUND_ELT1_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, precomputed_round_key_64[(i-1)*8+3]);
			tmp[ 4] = d_ROUND_ELT1(    sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, precomputed_round_key_64[(i-1)*8+4]);
			tmp[ 5] = d_ROUND_ELT1(    sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, precomputed_round_key_64[(i-1)*8+5]);
			tmp[ 6] = d_ROUND_ELT1(    sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, precomputed_round_key_64[(i-1)*8+6]);
			tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, precomputed_round_key_64[(i-1)*8+7]);
		}
		TRANSFER(h, tmp);
		// Miyaguchi-Preneel feed-forward; with the zero IV this is H1 = E(0, m) ^ m.
		#pragma unroll 8
		for (int i=0; i<8; i++)
			h[i] = h[i] ^ hash[i];
		// Second message block: padding only (0x80 terminator byte plus the message length).
		#pragma unroll 6
		for (int i=1; i<7; i++)
			n[i]=vectorize(0);

		n[0] = vectorize(0x80);
		n[7] = vectorize(0x2000000000000); // 512-bit length, byte-swapped

		// Initial key addition of the second compression (key = chaining value h).
		#pragma unroll 8
		for (int i=0; i < 8; i++) {
			n[i] = n[i] ^ h[i];
		}
		// Save h[3]: only word 3 of the final state is compared against the target.
		backup = h[ 3];

		// Second compression: interleaved key-schedule (h) and state (n) rounds.
		// #pragma unroll 8
		for (int i=0; i < 8; i++) {
			tmp[ 0] = d_ROUND_ELT1(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1, InitVector_RC[i]);
			tmp[ 1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2);
			tmp[ 2] = d_ROUND_ELT_LDG(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3);
			tmp[ 3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4);
			tmp[ 4] = d_ROUND_ELT_LDG(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5);
			tmp[ 5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6);
			tmp[ 6] = d_ROUND_ELT_LDG(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7);
			tmp[ 7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0);
			TRANSFER(h, tmp);
			tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]);
			tmp[ 1] = d_ROUND_ELT1(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]);
			tmp[ 2] = d_ROUND_ELT1_LDG(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]);
			tmp[ 3] = d_ROUND_ELT1(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]);
			tmp[ 4] = d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]);
			tmp[ 5] = d_ROUND_ELT1(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]);
			tmp[ 6] = d_ROUND_ELT1(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]);
			tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]);
			TRANSFER(n, tmp);
		}
		// Round 9.
		tmp[ 0] = d_ROUND_ELT1(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1, InitVector_RC[8]);
		tmp[ 1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2);
		tmp[ 2] = d_ROUND_ELT_LDG(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3);
		tmp[ 3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4);
		tmp[ 4] = d_ROUND_ELT_LDG(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5);
		tmp[ 5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6);
		tmp[ 6] = d_ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7);
		tmp[ 7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0);
		TRANSFER(h, tmp);
		tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]);
		tmp[ 1] = d_ROUND_ELT1(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]);
		tmp[ 2] = d_ROUND_ELT1(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]);
		tmp[ 3] = d_ROUND_ELT1(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]);
		tmp[ 4] = d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]);
		tmp[ 5] = d_ROUND_ELT1(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]);
		tmp[ 6] = d_ROUND_ELT1_LDG(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]);
		tmp[ 7] = d_ROUND_ELT1(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]);
		// Round 10 (word 3 only), folded with the feed-forward: only this word is tested.
		n[ 3] = backup ^ d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4)
			^ d_ROUND_ELT(sharedMemory,tmp, 3, 2, 1, 0, 7, 6, 5, 4);

		if(devectorize(n[3]) <= target) {
			// Keep up to two candidate nonces; resNonce is assumed to start at UINT32_MAX.
			uint32_t tmp = atomicExch(&resNonce[0], thread);
			if (tmp != UINT32_MAX)
				resNonce[1] = tmp;
		}
	}
}
extern void x15_whirlpool_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target)
{
	dim3 grid((threads + TPB64-1) / TPB64);
	dim3 block(TPB64);

	x15_whirlpool_gpu_hash_64_final <<<grid, block>>> (threads, (uint64_t*)d_hash, d_resNonce, target);
}
#endif
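/*
 * Usage sketch (not part of this patch): how the disabled final-hash path is
 * typically driven from a ccminer-style scan loop. The names d_resNonce,
 * h_resNonce, ptarget and pdata, and the 64-bit target extraction, are
 * assumptions based on common ccminer conventions, not taken from this file.
 *
 *	uint32_t h_resNonce[2];
 *	cudaMemset(d_resNonce, 0xFF, 2 * sizeof(uint32_t));            // UINT32_MAX = "no result yet"
 *	x15_whirlpool_cpu_hash_64_final(thr_id, throughput, d_hash, d_resNonce,
 *		*(uint64_t*)&ptarget[6]);                                  // high 64 bits of the target
 *	cudaMemcpy(h_resNonce, d_resNonce, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 *	if (h_resNonce[0] != UINT32_MAX) {
 *		uint32_t nonce = pdata[19] + h_resNonce[0];                // thread index -> absolute nonce
 *	}
 */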