From 4f379803d3506cdffe83ee3703982e056ab345d1 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 21 Apr 2015 09:20:18 +0200
Subject: [PATCH] scrypt: remove some unused functions/algo checks

there was remains of blake algo
---
 Makefile.am            |  2 +-
 scrypt/salsa_kernel.cu | 72 ++++++++++++++----------------------------
 scrypt/salsa_kernel.h  |  3 --
 3 files changed, 24 insertions(+), 53 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index d6edb34..6b4fd01 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -127,7 +127,7 @@ JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
 
 # This kernel need also an older SM to be able to autotune kernels
 scrypt/salsa_kernel.o: scrypt/salsa_kernel.cu
-	$(NVCC) $(nvcc_FLAGS) -gencode=arch=compute_20,code=\"sm_21,compute_20\" --maxrregcount=80 -o $@ -c $<
+	$(NVCC) -I . -gencode=arch=compute_20,code=\"sm_21,compute_20\" -o $@ -c $<
 
 skein.o: skein.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
diff --git a/scrypt/salsa_kernel.cu b/scrypt/salsa_kernel.cu
index 2eabb4b..273d2bf 100644
--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@@ -1,10 +1,7 @@
-
 //
 // Contains the autotuning logic and some utility functions.
 // Note that all CUDA kernels have been moved to other .cu files
 //
-// NOTE: compile this .cu module for compute_20,sm_21 with --maxrregcount=64
-//
 
 #include <stdio.h>
 #include <map>
@@ -152,13 +149,6 @@ std::map<int, uint32_t *> context_hash[2];
 
 int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &wpb);
 
-void cuda_shutdown(int thr_id)
-{
-	cudaDeviceSynchronize();
-	cudaDeviceReset();
-	cudaThreadExit();
-}
-
 int cuda_throughput(int thr_id)
 {
 	int GRID_BLOCKS, WARPS_PER_BLOCK;
@@ -215,7 +205,7 @@ int cuda_throughput(int thr_id)
 				checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp;
 			}
 		}
-		else if (IS_SCRYPT_JANE())
+		else /* if (IS_SCRYPT_JANE()) */
 		{
 			// allocate pinned host memory for scrypt_core input/output
 			checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp;
@@ -502,9 +492,7 @@ int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurre
 		}
 		MAXWARPS[thr_id] = warp;
 	}
-	if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
-		kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]);
-	}
+	kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]);
 
 	if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK))
 	{
@@ -531,28 +519,16 @@ int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurre
 
 			// allocate device memory
 			uint32_t *d_idata = NULL, *d_odata = NULL;
-			if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
-				unsigned int mem_size = MAXWARPS[thr_id] * WU_PER_WARP * sizeof(uint32_t) * 32;
-				checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
-				checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));
-
-				// pre-initialize some device memory
-				uint32_t *h_idata = (uint32_t*)malloc(mem_size);
-				for (unsigned int i=0; i < mem_size/sizeof(uint32_t); ++i) h_idata[i] = i*2654435761UL; // knuth's method
-				checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
-				free(h_idata);
-			}
-#if 0
-			else if (opt_algo == ALGO_KECCAK) {
-				uint32_t pdata[20] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
-				uint32_t ptarget[8] = {0,0,0,0,0,0,0,0};
-				kernel->prepare_keccak256(thr_id, pdata, ptarget);
-			} else if (opt_algo == ALGO_BLAKE) {
-				uint32_t pdata[20] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
-				uint32_t ptarget[8] = {0,0,0,0,0,0,0,0};
-				kernel->prepare_blake256(thr_id, pdata, ptarget);
-			}
-#endif
+			unsigned int mem_size = MAXWARPS[thr_id] * WU_PER_WARP * sizeof(uint32_t) * 32;
+			checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
+			checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));
+
+			// pre-initialize some device memory
+			uint32_t *h_idata = (uint32_t*)malloc(mem_size);
+			for (unsigned int i=0; i < mem_size/sizeof(uint32_t); ++i) h_idata[i] = i*2654435761UL; // knuth's method
+			checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
+			free(h_idata);
+
 			double best_hash_sec = 0.0;
 			int best_wpb = 0;
 
@@ -592,11 +568,10 @@ int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurre
 							int repeat = 0;
 							do  // average several measurements for better exactness
 							{
-								if (IS_SCRYPT() || IS_SCRYPT_JANE())
-									kernel->run_kernel(
-										grid, threads, WARPS_PER_BLOCK, thr_id, NULL,
-										d_idata, d_odata, N, LOOKUP_GAP, device_interactive[thr_id], true, device_texturecache[thr_id]
-									);
+								kernel->run_kernel(
+									grid, threads, WARPS_PER_BLOCK, thr_id, NULL, d_idata, d_odata, N,
+									LOOKUP_GAP, device_interactive[thr_id], true, device_texturecache[thr_id]
+								);
 								if(cudaDeviceSynchronize() != cudaSuccess)
 									break;
 								++repeat;
@@ -609,8 +584,11 @@ int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurre
 
 							// for scrypt: in interactive mode only find launch configs where kernel launch times are short enough
 							// TODO: instead we could reduce the batchsize parameter to meet the launch time requirement.
-							if (IS_SCRYPT() && device_interactive[thr_id] && GRID_BLOCKS > 2*props.multiProcessorCount && tdelta > 1.0/30)
+							if (IS_SCRYPT() && device_interactive[thr_id]
+								&& GRID_BLOCKS > 2*props.multiProcessorCount && tdelta > 1.0/30)
+							{
 								if (WARPS_PER_BLOCK == 1) goto skip; else goto skip2;
+							}
 
 							hash_sec = (double)WU_PER_LAUNCH / tdelta;
 							Hash[WARPS_PER_BLOCK] = hash_sec;
@@ -668,10 +646,8 @@ skip2:              ;
 skip:           ;
 			}
 
-			if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
-				checkCudaErrors(cudaFree(d_odata));
-				checkCudaErrors(cudaFree(d_idata));
-			}
+			checkCudaErrors(cudaFree(d_odata));
+			checkCudaErrors(cudaFree(d_idata));
 
 			WARPS_PER_BLOCK = best_wpb;
 			applog(LOG_INFO, "GPU #%d: %7.2f hash/s with configuration %c%dx%d", device_map[thr_id], best_hash_sec, kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK);
@@ -775,9 +751,7 @@ skip:           ;
 				}
 
 				// update pointers to scratch buffer in constant memory after reallocation
-				if (IS_SCRYPT() || IS_SCRYPT_JANE()) {
-					kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]);
-				}
+				kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]);
 			}
 			else
 			{
diff --git a/scrypt/salsa_kernel.h b/scrypt/salsa_kernel.h
index 5e2d7f7..405207b 100644
--- a/scrypt/salsa_kernel.h
+++ b/scrypt/salsa_kernel.h
@@ -48,10 +48,7 @@ static __inline bool IS_SCRYPT() { get_scrypt_type(); return (scrypt_algo == A_S
 static __inline bool IS_SCRYPT_JANE() { get_scrypt_type(); return (scrypt_algo == A_SCRYPT_JANE); }
 
 // CUDA externals
-extern int cuda_num_devices();
-extern void cuda_shutdown(int thr_id);
 extern int cuda_throughput(int thr_id);
-
 extern uint32_t *cuda_transferbuffer(int thr_id, int stream);
 extern uint32_t *cuda_hashbuffer(int thr_id, int stream);