@@ -256,7 +256,7 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
 		return result;
 	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
-	MyStreamSynchronize(NULL, 0, thr_id);
+	// MyStreamSynchronize(NULL, 0, thr_id);
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		result = h_resNonce[thr_id][0];
 		for (int n=0; n < (NBN-1); n++)
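
The explicit MyStreamSynchronize call can be commented out because the synchronous cudaMemcpy that follows already blocks until all prior work on the default stream has finished. A minimal standalone sketch of that pattern (the kernel and names here are hypothetical, not from the patch):

#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>

// hypothetical stand-in for a search kernel writing a result nonce
__global__ void dummy_search(uint32_t *d_res)
{
	if (threadIdx.x == 0 && blockIdx.x == 0)
		d_res[0] = 0xdeadbeef;
}

int main()
{
	uint32_t *d_res = NULL, h_res = UINT32_MAX;
	cudaMalloc(&d_res, sizeof(uint32_t));
	cudaMemset(d_res, 0xff, sizeof(uint32_t));

	dummy_search<<<1, 32>>>(d_res);
	// no explicit synchronize needed: this blocking copy on the default
	// stream waits for the kernel above before transferring the result
	if (cudaSuccess == cudaMemcpy(&h_res, d_res, sizeof(uint32_t), cudaMemcpyDeviceToHost))
		printf("result: %08x\n", h_res);

	cudaFree(d_res);
	return 0;
}
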
@@ -343,7 +343,7 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c
 		return result;
 	blake256_gpu_hash_16 <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, opt_tracegpu);
-	MyStreamSynchronize(NULL, 0, thr_id);
+	// MyStreamSynchronize(NULL, 0, thr_id);
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		result = h_resNonce[thr_id][0];
 		for (int n=0; n < (NBN-1); n++)
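
Both hunks share the same readback pattern: NBN result slots are copied back in one transfer, slot 0 holds the first candidate nonce, and the remaining slots hold any extra candidates found in the same launch. The body of the for loop is cut off in the hunks above; a plausible host-side sketch of the pattern, with NBN assumed to be 2 and extra[] as a hypothetical output array:

#include <cuda_runtime.h>
#include <stdint.h>

#define NBN 2  // assumed slot count, matching the arrays allocated below

// hedged sketch of the readback both call sites perform; 'extra' is a
// hypothetical array receiving the additional candidate nonces
static uint32_t read_result_nonces(int thr_id, uint32_t *h_resNonce[], uint32_t *d_resNonce[], uint32_t *extra)
{
	uint32_t result = UINT32_MAX;
	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id],
			NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
		result = h_resNonce[thr_id][0];      // first candidate nonce
		for (int n = 0; n < (NBN - 1); n++)  // copy any further candidates
			extra[n] = h_resNonce[thr_id][n + 1];
	}
	return result;
}
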
@@ -413,7 +413,12 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
-		CUDA_LOG_ERROR();
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage (linux)
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 		cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t));
 		cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t));
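
The ordering in the new block matters: device flags only take effect on a freshly created CUDA context, so cudaDeviceReset() tears the current one down before cudaDeviceScheduleBlockingSync is set, after which host threads sleep in synchronization calls instead of busy-polling. A small standalone sketch of that ordering (not the patch itself):

#include <cuda_runtime.h>
#include <stdio.h>

int main()
{
	cudaSetDevice(0);
	cudaFree(0); // touch the device so a context exists (default spin scheduling)

	// flags only apply to a newly created context, hence the reset first
	cudaDeviceReset();
	cudaError_t err = cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
	if (err != cudaSuccess)
		fprintf(stderr, "cudaSetDeviceFlags: %s\n", cudaGetErrorString(err));

	// from here on, cudaDeviceSynchronize() and blocking copies put the
	// calling host thread to sleep instead of spinning, reducing cpu load
	cudaFree(0);
	return 0;
}
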