m7: fine tune threads and max registers

TODO: on the 750 Ti, this version of ripemd seems very slow... It could be done in 3 ms but actually takes 25 ms. Whirlpool can also be optimized, using the latest djm code...
2025-01-31 08:54:38 +00:00 · 2014-08-25 06:39:06 +02:00 · 2014-08-25 06:39:06 +02:00 · c9ce05a82b
commit c9ce05a82b
parent fcd381cda2
2 changed files with 15 additions and 4 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -82,6 +82,18 @@ x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 m7/cuda_tiger192.o: m7/cuda_tiger192.cu
 	$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=64 -o $@ -c $<

+m7/cuda_m7_sha256.o: m7/cuda_m7_sha256.cu
+	$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
+
+m7/m7_keccak512.o: m7/m7_keccak512.cu
+	$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
+
+m7/cuda_m7_whirlpool.o: m7/cuda_m7_whirlpool.cu
+	$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=64 -o $@ -c $<
+
+m7/cuda_mul.o: m7/cuda_mul.cu
+	$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=32 -o $@ -c $<
+
 # ABI requiring code modules
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<
--- a/m7/cuda_ripemd160.cu
+++ b/m7/cuda_ripemd160.cu
@@ -282,7 +282,7 @@ static const uint32_t IV[5] = {
 		(h)[0] = tmp; \
 	}

-__global__
+__global__ __launch_bounds__(256, 4)
 void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -307,10 +307,9 @@ void m7_ripemd160_gpu_hash_120(int threads, uint32_t startNounce, uint64_t *outp
 		#define F4(x, y, z)   xandx(z,x,y)
 		#define F5(x, y, z)   xornt64(x,y,z)

-		uint32_t in2[16],in3[16];
-		uint32_t in[16],buf[5];
+		uint32_t buf[5], in2[16], in3[16];
 		#pragma unroll 16
-		for (int i=0;i<16;i++) {
+		for (int i=0; i<16; i++) {
 			if ((i+16) < 29)
 				in2[i] = c_PaddedMessage80[i+16];
 			else if ((i+16)==29)