simd512: restore SM3/3.5 perfs
Simple change which affect all algos based on SIMD512 fresh, qubit, s3, x11 to x17...
This commit is contained in:
parent
f42bb85785
commit
c88750332c
@ -3,9 +3,6 @@
|
|||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
// aus heavy.cu
|
|
||||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
|
||||||
|
|
||||||
#include "cuda_x11_aes.cu"
|
#include "cuda_x11_aes.cu"
|
||||||
|
|
||||||
__device__ __forceinline__ void AES_2ROUND(
|
__device__ __forceinline__ void AES_2ROUND(
|
||||||
|
@ -576,7 +576,7 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
__global__ void __launch_bounds__(TPB, 4)
|
__global__ void __launch_bounds__(TPB, 4)
|
||||||
x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
|
x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
|
||||||
{
|
{
|
||||||
@ -604,7 +604,7 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash
|
|||||||
Compression2(hashPosition, g_fft4, g_state);
|
Compression2(hashPosition, g_fft4, g_state);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
__global__ void __launch_bounds__(TPB, 4)
|
__global__ void __launch_bounds__(TPB, 4)
|
||||||
x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
|
x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
|
||||||
@ -670,13 +670,18 @@ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint
|
|||||||
const int threadsperblock = TPB;
|
const int threadsperblock = TPB;
|
||||||
|
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8);
|
dim3 grid((threads + threadsperblock-1) / threadsperblock);
|
||||||
|
|
||||||
|
dim3 grid8(((threads + threadsperblock - 1) / threadsperblock) * 8);
|
||||||
|
|
||||||
x11_simd512_gpu_expand_64 <<<grid8, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]);
|
x11_simd512_gpu_expand_64 <<<grid8, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]);
|
||||||
|
if (device_sm[device_map[thr_id]] >= 500) {
|
||||||
|
x11_simd512_gpu_compress_64_maxwell <<< grid8, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
|
} else {
|
||||||
|
x11_simd512_gpu_compress1_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
|
x11_simd512_gpu_compress2_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
|
}
|
||||||
|
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
|
||||||
|
|
||||||
x11_simd512_gpu_compress_64_maxwell << <grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
|
||||||
x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
|
|
||||||
MyStreamSynchronize(NULL, order, thr_id);
|
MyStreamSynchronize(NULL, order, thr_id);
|
||||||
|
14
x11/x11.cu
14
x11/x11.cu
@ -58,7 +58,11 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
|||||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||||
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
|
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
|
||||||
|
|
||||||
// X11 Hashfunktion
|
// to check... new sp method
|
||||||
|
//extern void x11_echo512_cpu_setTarget(const void *ptarget);
|
||||||
|
//extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
// X11 CPU Hash
|
||||||
extern "C" void x11hash(void *output, const void *input)
|
extern "C" void x11hash(void *output, const void *input)
|
||||||
{
|
{
|
||||||
// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11
|
// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11
|
||||||
@ -146,7 +150,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
|||||||
throughput = min(throughput, (int)(max_nonce - first_nonce));
|
throughput = min(throughput, (int)(max_nonce - first_nonce));
|
||||||
|
|
||||||
if (opt_benchmark)
|
if (opt_benchmark)
|
||||||
((uint32_t*)ptarget)[7] = 0x000f;
|
((uint32_t*)ptarget)[7] = 0x5;
|
||||||
|
|
||||||
if (!init[thr_id])
|
if (!init[thr_id])
|
||||||
{
|
{
|
||||||
@ -176,6 +180,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
|||||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||||
|
|
||||||
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
||||||
|
|
||||||
|
//x11_echo512_cpu_setTarget(ptarget);
|
||||||
cuda_check_cpu_setTarget(ptarget);
|
cuda_check_cpu_setTarget(ptarget);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
@ -196,8 +202,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
|||||||
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
// Scan nach Gewinner Hashes auf der GPU
|
// todo...
|
||||||
|
//foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||||
|
|
||||||
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
if (foundNonce != 0xffffffff)
|
if (foundNonce != 0xffffffff)
|
||||||
{
|
{
|
||||||
|
@ -67,7 +67,7 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
|||||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||||
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
|
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
|
||||||
|
|
||||||
// X13 Hashfunktion
|
// X13 CPU Hash
|
||||||
extern "C" void x13hash(void *output, const void *input)
|
extern "C" void x13hash(void *output, const void *input)
|
||||||
{
|
{
|
||||||
// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13
|
// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13
|
||||||
|
Loading…
x
Reference in New Issue
Block a user