whirlpool: restore old source code for SM 3.0
The SM 3.0 implementation needs a manual define (SM3_VARIANT) in whirlpool.cu; the alexis variant is 2x slower on SM 3.0 (GT 740).
This commit is contained in:
parent
e388c11c02
commit
73f6720121
@ -67,7 +67,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
|
||||
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu \
|
||||
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
|
||||
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
|
||||
x15/whirlpool.cu \
|
||||
x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
|
||||
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
|
||||
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SKEIN 64 and 80 based on Alexis Provost version */
|
||||
/* SKEIN 64 and 80 based on Alexis Provos version */
|
||||
|
||||
#define TPB52 512
|
||||
#define TPB50 256
|
||||
|
@ -743,140 +743,3 @@ void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounc
|
||||
x15_whirlpool_cpu_hash_64(thr_id, threads, d_hash);
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
/*
 * Final 64-byte whirlpool pass with an on-device target check.
 *
 * Each thread reads eight 64-bit words from g_hash (starting at word index
 * thread*8), runs the round sequence below, and compares the resulting word 3
 * against `target`. A thread whose value is <= target publishes its global
 * thread index through resNonce[0]; the value previously stored there is moved
 * to resNonce[1] unless it was UINT32_MAX.
 *
 * Launch layout: 1D grid of 1D blocks, compiled with
 * __launch_bounds__(TPB64, 2). Uses 7 * 256 * sizeof(uint2) bytes of static
 * shared memory for the lookup tables derived from b0.
 *
 * NOTE(review): resNonce is presumably pre-filled with UINT32_MAX by the
 * caller -- confirm against the host code.
 */
__global__ __launch_bounds__(TPB64,2)
void x15_whirlpool_gpu_hash_64_final(uint32_t threads,const uint64_t* __restrict__ g_hash, uint32_t* resNonce, const uint64_t target)
{
	__shared__ uint2 sharedMemory[7][256];

	// Build the seven shared tables from b0. A block-stride loop is used so
	// that all 256 entries are written whatever the block size is (the
	// original `if (threadIdx.x < 256)` left entries unset for blocks
	// smaller than 256 threads); for blocks of 256+ threads it is identical.
	for (uint32_t idx = threadIdx.x; idx < 256; idx += blockDim.x) {
		const uint2 entry = __ldg((uint2*)&b0[idx]);
		sharedMemory[0][idx] = entry;
		sharedMemory[1][idx] = ROL8(entry);
		sharedMemory[2][idx] = ROL16(entry);
		sharedMemory[3][idx] = ROL24(entry);
		sharedMemory[4][idx] = SWAPUINT2(entry);
		sharedMemory[5][idx] = ROR24(entry);
		sharedMemory[6][idx] = ROR16(entry);
	}

	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	const bool active = (thread < threads);

	uint2 hash[8];
	if (active) {
		// Issue the input loads before the barrier, as the original did.
		*(uint2x4*)&hash[ 0] = __ldg4((uint2x4*)&g_hash[(thread<<3) + 0]);
		*(uint2x4*)&hash[ 4] = __ldg4((uint2x4*)&g_hash[(thread<<3) + 4]);
	}

	// The barrier must be reached by EVERY thread of the block. It used to
	// sit inside `if (thread < threads)`, so threads of the tail block that
	// failed the guard skipped it (undefined behaviour / possible hang).
	__syncthreads();

	// No further barriers below, so out-of-range threads can leave now.
	if (!active)
		return;

	uint2 n[8], h[ 8], backup;
	uint2 tmp[8] = {
		{0xC0EE0B30,0x672990AF},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828},
		{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828}
	};

	#pragma unroll 8
	for(int i=0;i<8;i++)
		n[i]=hash[i];

	// First round: folded into the constant initial values of tmp[].
	tmp[ 0]^= d_ROUND_ELT(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1);
	tmp[ 1]^= d_ROUND_ELT_LDG(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2);
	tmp[ 2]^= d_ROUND_ELT(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3);
	tmp[ 3]^= d_ROUND_ELT_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4);
	tmp[ 4]^= d_ROUND_ELT(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5);
	tmp[ 5]^= d_ROUND_ELT_LDG(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6);
	tmp[ 6]^= d_ROUND_ELT(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7);
	tmp[ 7]^= d_ROUND_ELT_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0);

	// Nine more rounds, keyed from the precomputed_round_key_64 table.
	for (int i=1; i <10; i++){
		TRANSFER(n, tmp);
		tmp[ 0] = d_ROUND_ELT1_LDG(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, precomputed_round_key_64[(i-1)*8+0]);
		tmp[ 1] = d_ROUND_ELT1(    sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, precomputed_round_key_64[(i-1)*8+1]);
		tmp[ 2] = d_ROUND_ELT1(    sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, precomputed_round_key_64[(i-1)*8+2]);
		tmp[ 3] = d_ROUND_ELT1_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, precomputed_round_key_64[(i-1)*8+3]);
		tmp[ 4] = d_ROUND_ELT1(    sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, precomputed_round_key_64[(i-1)*8+4]);
		tmp[ 5] = d_ROUND_ELT1(    sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, precomputed_round_key_64[(i-1)*8+5]);
		tmp[ 6] = d_ROUND_ELT1(    sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, precomputed_round_key_64[(i-1)*8+6]);
		tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, precomputed_round_key_64[(i-1)*8+7]);
	}

	// Feed-forward: h = round output XOR the loaded input words.
	TRANSFER(h, tmp);
	#pragma unroll 8
	for (int i=0; i<8; i++)
		h[i] = h[i] ^ hash[i];

	// Second block: constant words 0x80 (word 0) and 0x2000000000000
	// (word 7), zeros in between, XORed with h.
	#pragma unroll 6
	for (int i=1; i<7; i++)
		n[i]=vectorize(0);

	n[0] = vectorize(0x80);
	n[7] = vectorize(0x2000000000000);

	#pragma unroll 8
	for (int i=0; i < 8; i++) {
		n[i] = n[i] ^ h[i];
	}

	// Only word 3 of the result is compared, so only h[3] is kept for the
	// final XOR.
	backup = h[ 3];

//	#pragma unroll 8
	for (int i=0; i < 8; i++) {
		// Advance h using InitVector_RC[i] ...
		tmp[ 0] = d_ROUND_ELT1(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1, InitVector_RC[i]);
		tmp[ 1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2);
		tmp[ 2] = d_ROUND_ELT_LDG(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3);
		tmp[ 3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4);
		tmp[ 4] = d_ROUND_ELT_LDG(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5);
		tmp[ 5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6);
		tmp[ 6] = d_ROUND_ELT_LDG(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7);
		tmp[ 7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0);
		TRANSFER(h, tmp);
		// ... then advance n, passing the freshly computed h words (still in
		// tmp) as the extra operand.
		tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]);
		tmp[ 1] = d_ROUND_ELT1(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]);
		tmp[ 2] = d_ROUND_ELT1_LDG(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]);
		tmp[ 3] = d_ROUND_ELT1(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]);
		tmp[ 4] = d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]);
		tmp[ 5] = d_ROUND_ELT1(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]);
		tmp[ 6] = d_ROUND_ELT1(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]);
		tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]);
		TRANSFER(n, tmp);
	}

	// Same h/n step once more with InitVector_RC[8], written out separately.
	tmp[ 0] = d_ROUND_ELT1(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1, InitVector_RC[8]);
	tmp[ 1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2);
	tmp[ 2] = d_ROUND_ELT_LDG(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3);
	tmp[ 3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4);
	tmp[ 4] = d_ROUND_ELT_LDG(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5);
	tmp[ 5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6);
	tmp[ 6] = d_ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7);
	tmp[ 7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0);
	TRANSFER(h, tmp);
	tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]);
	tmp[ 1] = d_ROUND_ELT1(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]);
	tmp[ 2] = d_ROUND_ELT1(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]);
	tmp[ 3] = d_ROUND_ELT1(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]);
	tmp[ 4] = d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]);
	tmp[ 5] = d_ROUND_ELT1(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]);
	tmp[ 6] = d_ROUND_ELT1_LDG(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]);
	tmp[ 7] = d_ROUND_ELT1(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]);

	// Last step is evaluated for word 3 only, since that is the only word
	// compared against the target.
	n[ 3] = backup ^ d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4)
	               ^ d_ROUND_ELT(sharedMemory,tmp, 3, 2, 1, 0, 7, 6, 5, 4);

	if(devectorize(n[3]) <= target) {
		// Publish this thread index; keep a previously stored candidate in
		// the second slot. (Renamed from `tmp`, which shadowed tmp[8].)
		const uint32_t previous = atomicExch(&resNonce[0], thread);
		if (previous != UINT32_MAX)
			resNonce[1] = previous;
	}
}
|
||||
|
||||
/*
 * Host-side launcher for x15_whirlpool_gpu_hash_64_final.
 *
 * Launches ceil(threads / TPB64) blocks of TPB64 threads on the default
 * stream, handing d_hash to the kernel as 64-bit words. thr_id is not used
 * here. The launch is asynchronous and no error check is performed in this
 * function.
 */
extern void x15_whirlpool_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target)
{
	const uint32_t threadsPerBlock = TPB64;
	// Round up so a partial tail block still covers the remaining threads.
	const uint32_t blockCount = (threads + threadsPerBlock - 1) / threadsPerBlock;

	const dim3 launchBlock(threadsPerBlock);
	const dim3 launchGrid(blockCount);

	x15_whirlpool_gpu_hash_64_final <<<launchGrid, launchBlock>>> (threads, (uint64_t*)d_hash, d_resNonce, target);
}
|
||||
|
||||
#endif
|
||||
|
2365
x15/cuda_x15_whirlpool_sm3.cu
Normal file
2365
x15/cuda_x15_whirlpool_sm3.cu
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,30 +1,33 @@
|
||||
/*
|
||||
* whirlpool routine (djm)
|
||||
* whirlpool routine
|
||||
*/
|
||||
extern "C"
|
||||
{
|
||||
#include "sph/sph_whirlpool.h"
|
||||
#include "miner.h"
|
||||
extern "C" {
|
||||
#include <sph/sph_whirlpool.h>
|
||||
#include <miner.h>
|
||||
}
|
||||
|
||||
#include "cuda_helper.h"
|
||||
#include <cuda_helper.h>
|
||||
|
||||
//static uint32_t *d_hash[MAX_GPUS];
|
||||
|
||||
extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode);
|
||||
extern void x15_whirlpool_cpu_free(int thr_id);
|
||||
|
||||
extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget);
|
||||
|
||||
//extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
//extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
||||
//extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, const uint64_t target);
|
||||
//#define SM3_VARIANT
|
||||
|
||||
#ifdef SM3_VARIANT
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
extern void whirlpool512_init_sm3(int thr_id, uint32_t threads, int mode);
|
||||
extern void whirlpool512_free_sm3(int thr_id);
|
||||
extern void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget);
|
||||
extern void whirlpool512_hash_64_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
extern void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
//#define _DEBUG
|
||||
#define _DEBUG_PREFIX "whirl"
|
||||
#include "cuda_debug.cuh"
|
||||
#include <cuda_debug.cuh>
|
||||
#else
|
||||
extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode);
|
||||
extern void x15_whirlpool_cpu_free(int thr_id);
|
||||
extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget);
|
||||
extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, const uint64_t target);
|
||||
#endif
|
||||
|
||||
|
||||
// CPU Hash function
|
||||
extern "C" void wcoinhash(void *state, const void *input)
|
||||
@ -90,10 +93,12 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
CUDA_LOG_ERROR();
|
||||
}
|
||||
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
|
||||
|
||||
//CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
|
||||
#ifdef SM3_VARIANT
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
|
||||
whirlpool512_init_sm3(thr_id, throughput, 1 /* old whirlpool */);
|
||||
#else
|
||||
x15_whirlpool_cpu_init(thr_id, throughput, 1 /* old whirlpool */);
|
||||
|
||||
#endif
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
@ -101,22 +106,26 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
}
|
||||
|
||||
#ifdef SM3_VARIANT
|
||||
whirlpool512_setBlock_80_sm3((void*)endiandata, ptarget);
|
||||
#else
|
||||
whirlpool512_setBlock_80((void*)endiandata, ptarget);
|
||||
#endif
|
||||
|
||||
do {
|
||||
/*
|
||||
int order = 0;
|
||||
whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
#ifdef SM3_VARIANT
|
||||
int order = 1;
|
||||
whirlpool512_hash_80_sm3(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
||||
TRACE64(" 80 :", d_hash);
|
||||
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
whirlpool512_hash_64_sm3(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
TRACE64(" 64 :", d_hash);
|
||||
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
whirlpool512_hash_64_sm3(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
TRACE64(" 64 :", d_hash);
|
||||
work->nonces[0] = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
*/
|
||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||
|
||||
work->nonces[0] = whirlpool512_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
#else
|
||||
whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], work->nonces, *(uint64_t*)&ptarget[6]);
|
||||
#endif
|
||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||
|
||||
if (work->nonces[0] != UINT32_MAX && bench_algo < 0)
|
||||
{
|
||||
@ -159,9 +168,12 @@ extern "C" void free_whirl(int thr_id)
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
||||
//cudaFree(d_hash[thr_id]);
|
||||
|
||||
#ifdef SM3_VARIANT
|
||||
cudaFree(d_hash[thr_id]);
|
||||
whirlpool512_free_sm3(thr_id);
|
||||
#else
|
||||
x15_whirlpool_cpu_free(thr_id);
|
||||
#endif
|
||||
init[thr_id] = false;
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
Loading…
x
Reference in New Issue
Block a user