From 5bc969fa573d07f262280d835464ca122bf0be30 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 3 Nov 2014 16:06:49 +0100 Subject: [PATCH] Some work on data alignment linux: add -march=native (we build it ourself) and some other flags + remove unused vars (seen with -Wall) --- Makefile.am | 5 ++++- blake32.cu | 6 +++--- configure.sh | 4 +++- cpu-miner.c | 20 +++++++++----------- cuda_helper.h | 2 +- groestlcoin.cpp | 9 ++------- hashlog.cpp | 5 ++--- miner.h | 12 ++++++++++++ sph/haval_helper.c | 2 +- util.c | 29 ++++++++++++++++++++++++++++- 10 files changed, 65 insertions(+), 29 deletions(-) diff --git a/Makefile.am b/Makefile.am index 85d2439..f854915 100644 --- a/Makefile.am +++ b/Makefile.am @@ -49,10 +49,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \ x11/s3.cu +if HAVE_WINDOWS +ccminer_SOURCES += compat/winansi.c +endif ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ -ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME +ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" diff --git a/blake32.cu b/blake32.cu index be43604..e7f944d 100644 --- a/blake32.cu +++ b/blake32.cu @@ -386,7 +386,7 @@ static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds = __host__ void blake256_cpu_setBlock_16(uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget) { - uint32_t data[11]; + uint32_t _ALIGN(64) data[11]; memcpy(data, midstate, 32); data[8] = penddata[0]; data[9] = penddata[1]; @@ -402,9 +402,9 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt const uint32_t first_nonce = pdata[19]; static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; uint64_t targetHigh = ((uint64_t*)ptarget)[3]; // 0x00000000.0fffffff - uint32_t endiandata[20]; + uint32_t _ALIGN(64) endiandata[20]; #if PRECALC64 - uint32_t midstate[8]; + uint32_t _ALIGN(64) midstate[8]; #else uint32_t crcsum; #endif diff --git a/configure.sh b/configure.sh index 142b59e..7e277f7 100755 --- a/configure.sh +++ b/configure.sh @@ -5,5 +5,7 @@ #--ptxas-options=\"-v -dlcm=cg\"" -CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda +extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16" + +CUDA_CFLAGS="-O3 -Xcompiler -Wall" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda diff --git a/cpu-miner.c b/cpu-miner.c index e352b98..ed03e6a 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -383,7 +383,7 @@ struct work { uint32_t scanned_to; }; -static struct work g_work; +static struct work _ALIGN(64) g_work; static time_t g_work_time; static pthread_mutex_t g_work_lock; @@ -484,11 +484,10 @@ static int share_result(int result, const char *reason) { char s[345]; double hashrate; - int i, ret = 0; hashrate = 0.; pthread_mutex_lock(&stats_lock); - for (i = 0; i < opt_n_threads; i++) + for (int i = 0; i < opt_n_threads; i++) hashrate += thr_hashrates[i]; result ? accepted_count++ : rejected_count++; pthread_mutex_unlock(&stats_lock); @@ -651,8 +650,8 @@ static bool get_upstream_work(CURL *curl, struct work *work) if (opt_protocol && rc) { timeval_subtract(&diff, &tv_end, &tv_start); /* show time because curl can be slower against versions/config */ - applog(LOG_DEBUG, "got new work in %u µs", - diff.tv_sec * 1000000 + diff.tv_usec); + applog(LOG_DEBUG, "got new work in %.2f ms", + (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); } json_decref(val); @@ -667,7 +666,7 @@ static void workio_cmd_free(struct workio_cmd *wc) switch (wc->cmd) { case WC_SUBMIT_WORK: - free(wc->u.work); + aligned_free(wc->u.work); break; default: /* do nothing */ break; @@ -682,7 +681,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) struct work *ret_work; int failures = 0; - ret_work = (struct work*)calloc(1, sizeof(*ret_work)); + ret_work = (struct work*)aligned_calloc(sizeof(*ret_work)); if (!ret_work) return false; @@ -690,7 +689,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) while (!get_upstream_work(curl, ret_work)) { if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); - free(ret_work); + aligned_free(ret_work); return false; } @@ -702,7 +701,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) /* send work to requesting thread */ if (!tq_push(wc->thr->q, ret_work)) - free(ret_work); + aligned_free(ret_work); return true; } @@ -822,7 +821,7 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in) if (!wc) return false; - wc->u.work = (struct work *)malloc(sizeof(*work_in)); + wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in)); if (!wc->u.work) goto err_out; @@ -946,7 +945,6 @@ static void *miner_thread(void *userdata) struct work work; uint32_t max_nonce; uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1); - unsigned char *scratchbuf = NULL; bool work_done = false; bool extrajob = false; char s[16]; diff --git a/cuda_helper.h b/cuda_helper.h index 920a12d..03e38cb 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -4,7 +4,7 @@ #include #include -#if defined(_MSC_VER) +#if defined(__INTELLISENSE__) /* reduce warnings */ #include #include diff --git a/groestlcoin.cpp b/groestlcoin.cpp index 63b6401..d38f4c7 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -42,13 +42,8 @@ void sha256func(unsigned char *hash, const unsigned char *data, int len) extern "C" void groestlhash(void *state, const void *input) { - // Tryout GPU-groestl - - sph_groestl512_context ctx_groestl[2]; - static unsigned char pblank[1]; - uint32_t mask = 8; - uint32_t zero = 0; - + // CPU-groestl + sph_groestl512_context ctx_groestl[2]; //these uint512 in the c++ source of the client are backed by an array of uint32 uint32_t hashA[16], hashB[16]; diff --git a/hashlog.cpp b/hashlog.cpp index 9bcd04b..025ba14 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -219,10 +219,9 @@ extern "C" void hashlog_purge_all(void) extern "C" void hashlog_dump_job(char* jobid) { if (opt_debug) { - int deleted = 0; uint64_t njobid = hextouint(jobid); uint64_t keypfx = (njobid << 32); - uint32_t sz = tlastshares.size(); + // uint32_t sz = tlastshares.size(); std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { if ((keypfx & i->first) == keypfx) { @@ -235,4 +234,4 @@ extern "C" void hashlog_dump_job(char* jobid) i++; } } -} \ No newline at end of file +} diff --git a/miner.h b/miner.h index 12a4b24..60a6a39 100644 --- a/miner.h +++ b/miner.h @@ -51,6 +51,14 @@ void *alloca (size_t); # endif #endif +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0 +# define _ALIGN(x) __align__(x) +#elif _MSC_VER +# define _ALIGN(x) __declspec(align(x)) +#else +# define _ALIGN(x) __attribute__ ((aligned(x))) +#endif + #ifdef HAVE_SYSLOG_H #include #define LOG_BLUE 0x10 /* unique value */ @@ -200,6 +208,10 @@ static inline void le16enc(void *pp, uint16_t x) } #endif +/* used for struct work */ +void *aligned_calloc(int size); +void aligned_free(void *ptr); + #if JANSSON_MAJOR_VERSION >= 2 #define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr)) #else diff --git a/sph/haval_helper.c b/sph/haval_helper.c index cf078e0..ce0a9fd 100644 --- a/sph/haval_helper.c +++ b/sph/haval_helper.c @@ -149,7 +149,7 @@ static void SPH_XCAT(SPH_XCAT(haval, PASSES), _close)(sph_haval_context *sc, unsigned ub, unsigned n, void *dst) { - unsigned current,j; + unsigned current; DSTATE; #if SPH_64 diff --git a/util.c b/util.c index 41cbb10..350665f 100644 --- a/util.c +++ b/util.c @@ -382,7 +382,7 @@ json_t *json_rpc_call(CURL *curl, const char *url, curl_easy_setopt(curl, CURLOPT_POST, 1); if (opt_protocol) - applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req); + applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req); upload_data.buf = rpc_req; upload_data.len = strlen(rpc_req); @@ -481,6 +481,33 @@ err_out: return NULL; } +/** + * Unlike malloc, calloc set the memory to zero + */ +void *aligned_calloc(int size) +{ + const int ALIGN = 64; // cache line +#ifdef _MSC_VER + void* res = _aligned_malloc(size, ALIGN); + memset(res, 0, size); + return res; +#else + void *mem = calloc(1, size+ALIGN+sizeof(void*)); + void **ptr = (void**)((size_t)(mem+ALIGN+sizeof(void*)) & ~(ALIGN-1)); + ptr[-1] = mem; + return ptr; +#endif +} + +void aligned_free(void *ptr) +{ +#ifdef _MSC_VER + return _aligned_free(ptr); +#else + free(((void**)ptr)[-1]); +#endif +} + void cbin2hex(char *out, const char *in, size_t len) { if (out) {