1
0
mirror of https://github.com/GOSTSec/ccminer synced 2025-01-31 08:54:38 +00:00

m7: fine tune threads and max registers

TODO:

On the 750 Ti, this version of RIPEMD seems very slow...
It could run in about 3 ms, but it actually takes 25 ms.

Whirlpool can also be optimized, using the latest djm code...
This commit is contained in:
Tanguy Pruvot 2014-08-25 06:39:06 +02:00
parent fcd381cda2
commit c9ce05a82b
2 changed files with 15 additions and 4 deletions

View File

@ -82,6 +82,18 @@ x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
m7/cuda_tiger192.o: m7/cuda_tiger192.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=64 -o $@ -c $<
m7/cuda_m7_sha256.o: m7/cuda_m7_sha256.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
m7/m7_keccak512.o: m7/m7_keccak512.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
m7/cuda_m7_whirlpool.o: m7/cuda_m7_whirlpool.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=64 -o $@ -c $<
m7/cuda_mul.o: m7/cuda_mul.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=32 -o $@ -c $<
# ABI requiring code modules
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<

View File

@ -282,7 +282,7 @@ static const uint32_t IV[5] = {
(h)[0] = tmp; \
}
__global__
__global__ __launch_bounds__(256, 4)
void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
@ -307,10 +307,9 @@ void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outp
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)
uint32_t in2[16],in3[16];
uint32_t in[16],buf[5];
uint32_t buf[5], in2[16], in3[16];
#pragma unroll 16
for (int i=0;i<16;i++) {
for (int i=0; i<16; i++) {
if ((i+16) < 29)
in2[i] = c_PaddedMessage80[i+16];
else if ((i+16)==29)