From 4868c412b0d4605b116efde08ec7538af579a360 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 15 Oct 2015 23:00:56 +0200 Subject: [PATCH] windows: add support for SM 2.1, drop SM 3.5 (x86) Mostly to do compatibility tests; SM 2.1 support is very limited. SM 3.0 code should run on SM 3.5 (only a few cards use this arch). As I can't test SM 3.5, it's best to let users do their own tests... --- Algo256/blake256.cu | 4 ++-- README.txt | 3 ++- bench.cpp | 1 + ccminer.vcxproj | 2 +- lyra2/cuda_lyra2v2_sm3.cuh | 1 - lyra2/lyra2REv2.cu | 5 +++-- 6 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 191db70..c3a3ea2 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -409,9 +409,9 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non } if (!init[thr_id]) { - if (active_gpus > 1) - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(device_map[thr_id]); CUDA_LOG_ERROR(); + cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)); cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)); CUDA_LOG_ERROR(); diff --git a/README.txt b/README.txt index 5c30709..4c1c0b7 100644 --- a/README.txt +++ b/README.txt @@ -229,7 +229,8 @@ features. >>> RELEASE HISTORY <<< Under Dev... 
v1.7 - Improve lyra2 (v1) cuda implementation + Add windows support for SM 2.1 and drop SM 3.5 (x86) + Improve lyra2 (v1/v2) cuda implementation Restore whirlpool algo (and whirlcoin variant) Prepare algo switch ability Add --benchmark alone to run a benchmark for all algos diff --git a/bench.cpp b/bench.cpp index 0a45a41..31944ae 100644 --- a/bench.cpp +++ b/bench.cpp @@ -96,6 +96,7 @@ bool bench_algo_switch_next(int thr_id) // skip some duplicated algos if (algo == ALGO_C11) algo++; // same as x11 if (algo == ALGO_DMD_GR) algo++; // same as groestl + if (algo == ALGO_MJOLLNIR) algo++; // same as heavy if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool if (device_sm[dev_id] && device_sm[dev_id] < 300) { diff --git a/ccminer.vcxproj b/ccminer.vcxproj index f97035c..f6465b3 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -158,7 +158,7 @@ 80 true true - compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_35,sm_35 + compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21 --ptxas-options="-O2" %(AdditionalOptions) O2 diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh index e928e75..c85b862 100644 --- a/lyra2/cuda_lyra2v2_sm3.cuh +++ b/lyra2/cuda_lyra2v2_sm3.cuh @@ -17,7 +17,6 @@ #define Ncol 4 #define vectype ulonglong4 -#define u64type uint64_t #define memshift 4 __device__ vectype *DMatrix; diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index 54b8303..4553b5a 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -82,7 +82,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc const uint32_t first_nonce = pdata[19]; int dev_id = device_map[thr_id]; int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
20 : 18; - uint32_t throughput = cuda_default_throughput(dev_id, 1U << intensity); + uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) @@ -100,6 +100,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc keccak256_cpu_init(thr_id,throughput); skein256_cpu_init(thr_id, throughput); bmw256_cpu_init(thr_id, throughput); + CUDA_LOG_ERROR(); // DMatrix (780Ti may prefer 16 instead of 12, cf djm34) CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], (size_t)12 * sizeof(uint64_t) * 4 * 4 * throughput)); @@ -118,7 +119,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); blake256_cpu_setBlock_80(pdata); bmw256_setTarget(ptarget);