From 03b2bddc16a18950b4710b831ee175222d5baca4 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 29 Oct 2015 13:09:27 +0000
Subject: [PATCH] lyra2v2: fix SM 3.5 support

May also work on SM 3.0 (still to be verified)
---
 lyra2/lyra2REv2.cu | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index 4553b5a..67e2dc9 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -90,10 +90,13 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 
 	if (!init[thr_id])
 	{
+		size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
 		cudaSetDevice(dev_id);
-		//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-		//if (gpu_threads == 1)
-		//	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		}
 		CUDA_LOG_ERROR();
 
 		blake256_cpu_init(thr_id, throughput);
@@ -102,8 +105,11 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		bmw256_cpu_init(thr_id, throughput);
 		CUDA_LOG_ERROR();
 
-		// DMatrix (780Ti may prefer 16 instead of 12, cf djm34)
-		CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], (size_t)12 * sizeof(uint64_t) * 4 * 4 * throughput));
+		// SM 3 implementation requires a bit more memory
+		if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
+			matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
+			
+		CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
 		lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));