From a23760174770acb36edfac805670aa46773a6c3a Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 26 Jan 2016 20:38:17 +0100
Subject: [PATCH] 1.7.1 release

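Set the CUDA device schedule flag (cudaDeviceScheduleBlockingSync) in each
algo's init path, when opt_cudaschedule is -1 and gpu_threads is 1, to reduce
CPU usage on Linux without relying on MyStreamSynchronize().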
---
 Algo256/blake256.cu | 11 ++++++++---
 JHA/jackpotcoin.cu  |  6 ++++++
 groestlcoin.cpp     |  6 ++++++
 heavy/heavy.cu      |  6 ++++++
 myriadgroestl.cpp   |  6 ++++++
 pentablake.cu       |  7 ++++++-
 quark/quarkcoin.cu  |  6 ++++++
 qubit/deep.cu       |  5 +++++
 qubit/qubit.cu      |  6 ++++++
 res/ccminer.rc      |  8 ++++----
 skein.cu            |  8 +++++++-
 skein2.cpp          |  6 ++++++
 x11/c11.cu          |  6 ++++++
 x13/x13.cu          |  6 ++++++
 x15/whirlpoolx.cu   |  6 ++++++
 x15/x14.cu          |  6 ++++++
 x17/x17.cu          |  5 +++++
 zr5.cu              |  6 ++++++
 18 files changed, 107 insertions(+), 9 deletions(-)

diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index 5fc48f2..1fb5538 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -256,7 +256,7 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
 		return result;
 
 	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
-	MyStreamSynchronize(NULL, 0, thr_id);
+	//MyStreamSynchronize(NULL, 0, thr_id);
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		result = h_resNonce[thr_id][0];
 		for (int n=0; n < (NBN-1); n++)
@@ -343,7 +343,7 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c
 		return result;
 
 	blake256_gpu_hash_16 <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, opt_tracegpu);
-	MyStreamSynchronize(NULL, 0, thr_id);
+	//MyStreamSynchronize(NULL, 0, thr_id);
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		result = h_resNonce[thr_id][0];
 		for (int n=0; n < (NBN-1); n++)
@@ -413,7 +413,12 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
-		CUDA_LOG_ERROR();
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage (linux)
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t));
 		cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t));
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index 495057f..d2872c5 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -94,6 +94,12 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 	if (!init[thr_id])
 	{
 		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 		cuda_get_arch(thr_id);
 		if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
 			gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
diff --git a/groestlcoin.cpp b/groestlcoin.cpp
index 89f178a..1b0ec4e 100644
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@@ -44,6 +44,12 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 		CUDA_LOG_ERROR();
 		groestlcoin_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index 40fb0c9..8782616 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -172,6 +172,12 @@ int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned l
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		hefty_cpu_init(thr_id, throughput);
 		sha256_cpu_init(thr_id, throughput);
diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp
index 4311120..b4ce83e 100644
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@@ -49,6 +49,12 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 	if(!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 		myriadgroestl_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
diff --git a/pentablake.cu b/pentablake.cu
index 2abdeb7..97dde16 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -63,7 +63,12 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
-		CUDA_LOG_ERROR();
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index 4241d0b..98e4599 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -139,6 +139,12 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		cudaGetLastError();
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
diff --git a/qubit/deep.cu b/qubit/deep.cu
index a5d4c63..f884602 100644
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@@ -62,6 +62,11 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		}
 		CUDA_LOG_ERROR();
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
diff --git a/qubit/qubit.cu b/qubit/qubit.cu
index de72f51..e29f8c5 100644
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@@ -73,6 +73,12 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		x11_cubehash512_cpu_init(thr_id, throughput);
diff --git a/res/ccminer.rc b/res/ccminer.rc
index 79501e6..ad65b6f 100644
--- a/res/ccminer.rc
+++ b/res/ccminer.rc
@@ -60,8 +60,8 @@ IDI_ICON1               ICON                    "ccminer.ico"
 //
 
 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 1,7,0,0
- PRODUCTVERSION 1,7,0,0
+ FILEVERSION 1,7,1,0
+ PRODUCTVERSION 1,7,1,0
  FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
  FILEFLAGS 0x21L
@@ -76,10 +76,10 @@ BEGIN
     BEGIN
         BLOCK "040904e4"
         BEGIN
-            VALUE "FileVersion", "1.7"
+            VALUE "FileVersion", "1.7.1"
             VALUE "LegalCopyright", "Copyright (C) 2015"
             VALUE "ProductName", "ccminer"
-            VALUE "ProductVersion", "1.7"
+            VALUE "ProductVersion", "1.7.1"
         END
     END
     BLOCK "VarFileInfo"
diff --git a/skein.cu b/skein.cu
index 97a7c0b..f6b1619 100644
--- a/skein.cu
+++ b/skein.cu
@@ -22,7 +22,7 @@ extern void skeincoin_free(int thr_id);
 extern void skeincoin_setBlock_80(int thr_id, void *pdata);
 extern uint32_t skeincoin_hash_sm5(int thr_id, uint32_t threads, uint32_t startNounce, int swap, uint64_t target64, uint32_t *secNonce);
 
-static __device__ __constant__ uint32_t sha256_hashTable[] = {
+static __device__ uint32_t sha256_hashTable[] = {
 	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
 };
 
@@ -372,6 +372,12 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		if (sm5) {
 			skeincoin_init(thr_id);
diff --git a/skein2.cpp b/skein2.cpp
index 38c0a5b..6e3e5a7 100644
--- a/skein2.cpp
+++ b/skein2.cpp
@@ -53,6 +53,12 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 	if (!init[thr_id])
 	{
 		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);
 
diff --git a/x11/c11.cu b/x11/c11.cu
index b48e85e..ae5068a 100644
--- a/x11/c11.cu
+++ b/x11/c11.cu
@@ -119,6 +119,12 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
diff --git a/x13/x13.cu b/x13/x13.cu
index 39e0fe2..1093042 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -127,6 +127,12 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu
index 517d5ca..8622643 100644
--- a/x15/whirlpoolx.cu
+++ b/x15/whirlpoolx.cu
@@ -53,6 +53,12 @@ extern "C" int scanhash_whirlx(int thr_id,  struct work* work, uint32_t max_nonc
 
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
 
diff --git a/x15/x14.cu b/x15/x14.cu
index d020f5c..06f40e2 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -141,6 +141,12 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
diff --git a/x17/x17.cu b/x17/x17.cu
index 3be51d1..fcfdf42 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -169,6 +169,11 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		}
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
diff --git a/zr5.cu b/zr5.cu
index 3d32e21..cf86819 100644
--- a/zr5.cu
+++ b/zr5.cu
@@ -351,6 +351,12 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		// constants
 		cudaMemcpyToSymbol(c_permut, permut, 24*4, 0, cudaMemcpyHostToDevice);