diff --git a/Makefile.am b/Makefile.am
index 520dff0..875f8b1 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -53,33 +53,34 @@ nvcc_ARCH  = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
 #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
 #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
 
-nvcc_FLAGS = $(nvcc_ARCH) -I . --ptxas-options=-v --use_fast_math
+# CUDA_CFLAGS is assembled by configure: the CUDA include path plus any user-supplied nvcc flags.
+nvcc_FLAGS = $(nvcc_ARCH) -I . $(CUDA_CFLAGS)
 nvcc_FLAGS += $(JANSSON_INCLUDES)
 
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=128 -o $@ -c $<
 
 blake32.o: blake32.cu
-	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=64 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 
 # Luffa and Echo are faster with 80 registers than 128
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
-	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
 x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
-	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
 # Shavite compiles faster with 128 regs
 x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ --maxrregcount=128 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=128 -o $@ -c $<
 
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
-	$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
 # ABI requiring code modules
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
 
 JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
diff --git a/build.sh b/build.sh
index 2905734..17935f3 100755
--- a/build.sh
+++ b/build.sh
@@ -4,7 +4,8 @@
 
 # export PATH="$PATH:/usr/local/cuda/bin/"
 
-#make distclean || echo clean
+# Clean any previous build first; if `make distclean` fails, print a notice and carry on.
+if ! make distclean; then echo clean; fi
 
 rm -f Makefile.in
 rm -f config.status
diff --git a/configure.ac b/configure.ac
index 2f52cdf..f7924d4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -144,12 +144,14 @@ AC_ARG_WITH([cuda],
 
+dnl CUDA_CFLAGS is user input: declare it so it shows in --help and is remembered across re-runs.
+AC_ARG_VAR([CUDA_CFLAGS], [extra nvcc compiler flags (appended after the CUDA include path)])
 if test -n "$with_cuda"
 then
-   CUDA_CFLAGS="-I$with_cuda/include"
+   CUDA_CFLAGS="-I$with_cuda/include $CUDA_CFLAGS"
    CUDA_LIBS="-lcudart"
    CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX"
    NVCC="$with_cuda/bin/nvcc"
 else
-   CUDA_CFLAGS="-I/usr/local/cuda/include"
+   CUDA_CFLAGS="-I/usr/local/cuda/include $CUDA_CFLAGS"
    CUDA_LIBS="-lcudart -static-libstdc++"
    CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX"
    NVCC="nvcc"
diff --git a/configure.sh b/configure.sh
index c0cdd0d..142b59e 100755
--- a/configure.sh
+++ b/configure.sh
@@ -1 +1,9 @@
-./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
+# possible additional CUDA_CFLAGS
+#-gencode=arch=compute_50,code=\"sm_50,compute_50\"
+#-gencode=arch=compute_35,code=\"sm_35,compute_35\"
+#-gencode=arch=compute_30,code=\"sm_30,compute_30\"
+
+#--ptxas-options=\"-v -dlcm=cg\"
+# CUDA_CFLAGS is passed as a configure argument (like CFLAGS) so it is kept when configure is re-run.
+./configure "CFLAGS=-O3" "CXXFLAGS=-O3" "CUDA_CFLAGS=-O3" --with-cuda=/usr/local/cuda
+