Browse Source

m7: fine-tune threads and max registers

TODO:

On the 750 Ti, this version of ripemd seems very slow...
It could run in 3 ms, but it actually takes 25 ms.

whirlpool can also be optimized, with the latest djm code...
Tanguy Pruvot 10 years ago
parent
commit
c9ce05a82b
  1. 12
      Makefile.am
  2. 7
      m7/cuda_ripemd160.cu

12
Makefile.am

@@ -82,6 +82,18 @@ x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
m7/cuda_tiger192.o: m7/cuda_tiger192.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=64 -o $@ -c $<
m7/cuda_m7_sha256.o: m7/cuda_m7_sha256.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
m7/m7_keccak512.o: m7/m7_keccak512.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
m7/cuda_m7_whirlpool.o: m7/cuda_m7_whirlpool.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=64 -o $@ -c $<
m7/cuda_mul.o: m7/cuda_mul.cu
$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=32 -o $@ -c $<
# ABI requiring code modules
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<

7
m7/cuda_ripemd160.cu

@@ -282,7 +282,7 @@ static const uint32_t IV[5] = {
(h)[0] = tmp; \
}
__global__
__global__ __launch_bounds__(256, 4)
void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -307,10 +307,9 @@ void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outp
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)
uint32_t in2[16],in3[16];
uint32_t in[16],buf[5];
uint32_t buf[5], in2[16], in3[16];
#pragma unroll 16
for (int i=0;i<16;i++) {
for (int i=0; i<16; i++) {
if ((i+16) < 29)
in2[i] = c_PaddedMessage80[i+16];
else if ((i+16)==29)

Loading…
Cancel
Save