Tune register count for qubit (Luffa) algorithms

10 years ago · 1aec4555cc
1 changed file with 4 additions and 1 deletion
--- a/Makefile.am
+++ b/Makefile.am
@@ -57,7 +57,7 @@ nvcc_ARCH  = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
 #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"

 nvcc_FLAGS = $(nvcc_ARCH) -I . @CUDA_CFLAGS@
-nvcc_FLAGS += $(JANSSON_INCLUDES)
+nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"

 # we're now targeting all major compute architectures within one binary.
 .cu.o:
@@ -66,6 +66,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES)
 blake32.o: blake32.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<

+qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
+
 # Luffa and Echo are faster with 80 registers than 128
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<