Some work on data alignment

linux: add -march=native (we build it ourselves) and some other flags + remove unused vars (seen with -Wall)
10 years ago · 5bc969fa57
10 changed files with 65 additions and 29 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -49,10 +49,13 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
				@@ -49,10 +49,13 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
 			  x11/s3.cu

+if HAVE_WINDOWS
+ccminer_SOURCES += compat/winansi.c
+endif

 ccminer_LDFLAGS		= $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
 ccminer_LDADD		= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
-ccminer_CPPFLAGS	= -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
+ccminer_CPPFLAGS	= @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME

 nvcc_ARCH  = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
 #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
--- a/blake32.cu
+++ b/blake32.cu
@ -386,7 +386,7 @@ static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds =
				@@ -386,7 +386,7 @@ static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds =
 __host__
 void blake256_cpu_setBlock_16(uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget)
 {
-	uint32_t data[11];
+	uint32_t _ALIGN(64) data[11];
 	memcpy(data, midstate, 32);
 	data[8] = penddata[0];
 	data[9] = penddata[1];
@ -402,9 +402,9 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
				@@ -402,9 +402,9 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	const uint32_t first_nonce = pdata[19];
 	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 	uint64_t targetHigh = ((uint64_t*)ptarget)[3]; // 0x00000000.0fffffff
-	uint32_t endiandata[20];
+	uint32_t _ALIGN(64) endiandata[20];
 #if PRECALC64
-	uint32_t midstate[8];
+	uint32_t _ALIGN(64) midstate[8];
 #else
 	uint32_t crcsum;
 #endif
--- a/configure.sh
+++ b/configure.sh
@ -5,5 +5,7 @@
				@@ -5,5 +5,7 @@

 #--ptxas-options=\"-v -dlcm=cg\""

-CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
+extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16"
+
+CUDA_CFLAGS="-O3 -Xcompiler -Wall" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda

--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -383,7 +383,7 @@ struct work {
				@@ -383,7 +383,7 @@ struct work {
 	uint32_t scanned_to;
 };

-static struct work g_work;
+static struct work _ALIGN(64) g_work;
 static time_t g_work_time;
 static pthread_mutex_t g_work_lock;

@ -484,11 +484,10 @@ static int share_result(int result, const char *reason)
				@@ -484,11 +484,10 @@ static int share_result(int result, const char *reason)
 {
 	char s[345];
 	double hashrate;
-	int i, ret = 0;

 	hashrate = 0.;
 	pthread_mutex_lock(&stats_lock);
-	for (i = 0; i < opt_n_threads; i++)
+	for (int i = 0; i < opt_n_threads; i++)
 		hashrate += thr_hashrates[i];
 	result ? accepted_count++ : rejected_count++;
 	pthread_mutex_unlock(&stats_lock);
@ -651,8 +650,8 @@ static bool get_upstream_work(CURL *curl, struct work *work)
				@@ -651,8 +650,8 @@ static bool get_upstream_work(CURL *curl, struct work *work)
 	if (opt_protocol && rc) {
 		timeval_subtract(&diff, &tv_end, &tv_start);
 		/* show time because curl can be slower against versions/config */
-		applog(LOG_DEBUG, "got new work in %u µs",
-			diff.tv_sec * 1000000 + diff.tv_usec);
+		applog(LOG_DEBUG, "got new work in %.2f ms",
+		       (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec));
 	}

 	json_decref(val);
@ -667,7 +666,7 @@ static void workio_cmd_free(struct workio_cmd *wc)
				@@ -667,7 +666,7 @@ static void workio_cmd_free(struct workio_cmd *wc)

 	switch (wc->cmd) {
 	case WC_SUBMIT_WORK:
-		free(wc->u.work);
+		aligned_free(wc->u.work);
 		break;
 	default: /* do nothing */
 		break;
@ -682,7 +681,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
				@@ -682,7 +681,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
 	struct work *ret_work;
 	int failures = 0;

-	ret_work = (struct work*)calloc(1, sizeof(*ret_work));
+	ret_work = (struct work*)aligned_calloc(sizeof(*ret_work));
 	if (!ret_work)
 		return false;

@ -690,7 +689,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
				@@ -690,7 +689,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
 	while (!get_upstream_work(curl, ret_work)) {
 		if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) {
 			applog(LOG_ERR, "json_rpc_call failed, terminating workio thread");
-			free(ret_work);
+			aligned_free(ret_work);
 			return false;
 		}

@ -702,7 +701,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
				@@ -702,7 +701,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)

 	/* send work to requesting thread */
 	if (!tq_push(wc->thr->q, ret_work))
-		free(ret_work);
+		aligned_free(ret_work);

 	return true;
 }
@ -822,7 +821,7 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in)
				@@ -822,7 +821,7 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in)
 	if (!wc)
 		return false;

-	wc->u.work = (struct work *)malloc(sizeof(*work_in));
+	wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in));
 	if (!wc->u.work)
 		goto err_out;

@ -946,7 +945,6 @@ static void *miner_thread(void *userdata)
				@@ -946,7 +945,6 @@ static void *miner_thread(void *userdata)
 	struct work work;
 	uint32_t max_nonce;
 	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1);
-	unsigned char *scratchbuf = NULL;
 	bool work_done = false;
 	bool extrajob = false;
 	char s[16];
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -4,7 +4,7 @@
				@@ -4,7 +4,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>

-#if defined(_MSC_VER)
+#if defined(__INTELLISENSE__)
 /* reduce warnings */
 #include <device_functions.h>
 #include <device_launch_parameters.h>
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@ -42,13 +42,8 @@ void sha256func(unsigned char *hash, const unsigned char *data, int len)
				@@ -42,13 +42,8 @@ void sha256func(unsigned char *hash, const unsigned char *data, int len)

 extern "C" void groestlhash(void *state, const void *input)
 {
-    // Tryout GPU-groestl
-
-    sph_groestl512_context     ctx_groestl[2];
-    static unsigned char pblank[1];
-    uint32_t mask = 8;
-    uint32_t zero = 0;
-
+    // CPU-groestl
+    sph_groestl512_context ctx_groestl[2];

    //these uint512 in the c++ source of the client are backed by an array of uint32
    uint32_t hashA[16], hashB[16];    
--- a/hashlog.cpp
+++ b/hashlog.cpp
@ -219,10 +219,9 @@ extern "C" void hashlog_purge_all(void)
				@@ -219,10 +219,9 @@ extern "C" void hashlog_purge_all(void)
 extern "C" void hashlog_dump_job(char* jobid)
 {
 	if (opt_debug) {
-		int deleted = 0;
 		uint64_t njobid = hextouint(jobid);
 		uint64_t keypfx = (njobid << 32);
-		uint32_t sz = tlastshares.size();
+		// uint32_t sz = tlastshares.size();
 		std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 		while (i != tlastshares.end()) {
 			if ((keypfx & i->first) == keypfx) {
--- a/miner.h
+++ b/miner.h
@ -51,6 +51,14 @@ void *alloca (size_t);
				@@ -51,6 +51,14 @@ void *alloca (size_t);
 # endif
 #endif

+/*
+ * _ALIGN(x): request x-byte alignment for the declaration it is attached to.
+ * The spelling depends on the compiler processing the translation unit.
+ */
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0
+# define _ALIGN(x) __align__(x)
+#elif defined(_MSC_VER)
+# define _ALIGN(x) __declspec(align(x))
+#else
+# define _ALIGN(x) __attribute__ ((aligned(x)))
+#endif
+
 #ifdef HAVE_SYSLOG_H
 #include <syslog.h>
 #define LOG_BLUE 0x10 /* unique value */
@ -200,6 +208,10 @@ static inline void le16enc(void *pp, uint16_t x)
				@@ -200,6 +208,10 @@ static inline void le16enc(void *pp, uint16_t x)
 }
 #endif

+/* zero-filled, 64-byte aligned allocation used for struct work; release with aligned_free() */
+void *aligned_calloc(int size);
+void aligned_free(void *ptr);
+
 #if JANSSON_MAJOR_VERSION >= 2
 #define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr))
 #else
--- a/sph/haval_helper.c
+++ b/sph/haval_helper.c
@ -149,7 +149,7 @@ static void
				@@ -149,7 +149,7 @@ static void
 SPH_XCAT(SPH_XCAT(haval, PASSES), _close)(sph_haval_context *sc,
 	unsigned ub, unsigned n, void *dst)
 {
-	unsigned current,j;
+	unsigned current;
 	DSTATE;

 #if SPH_64
--- a/util.c
+++ b/util.c
@ -382,7 +382,7 @@ json_t *json_rpc_call(CURL *curl, const char *url,
				@@ -382,7 +382,7 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 	curl_easy_setopt(curl, CURLOPT_POST, 1);

 	if (opt_protocol)
-		applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);
+		applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req);

 	upload_data.buf = rpc_req;
 	upload_data.len = strlen(rpc_req);
@ -481,6 +481,33 @@ err_out:
				@@ -481,6 +481,33 @@ err_out:
 	return NULL;
 }

+/**
+ * Allocate a zero-filled buffer of `size` bytes aligned on a 64-byte
+ * (cache line) boundary.
+ *
+ * Unlike malloc, calloc sets the memory to zero; both code paths below
+ * keep that guarantee.
+ *
+ * @param size number of usable bytes requested
+ * @return aligned pointer, to be released with aligned_free(), or NULL
+ *         when size is negative or the underlying allocation fails
+ */
+void *aligned_calloc(int size)
+{
+	const int ALIGN = 64; // cache line
+#ifdef _MSC_VER
+	void* res = NULL;
+	if (size >= 0)
+		res = _aligned_malloc(size, ALIGN);
+	/* the block is cleared by hand, and only when the allocation succeeded */
+	if (res)
+		memset(res, 0, size);
+	return res;
+#else
+	void *mem = NULL;
+	void **ptr;
+	/* over-allocate: up to ALIGN bytes of padding plus one slot for the raw pointer */
+	if (size >= 0)
+		mem = calloc(1, (size_t)size+ALIGN+sizeof(void*));
+	if (!mem)
+		return NULL;
+	/* round down with integer arithmetic; arithmetic on void* is a GNU extension */
+	ptr = (void**)(((size_t)mem+ALIGN+sizeof(void*)) & ~(size_t)(ALIGN-1));
+	ptr[-1] = mem; /* aligned_free() reads the raw pointer back from this slot */
+	return ptr;
+#endif
+}
+
+/**
+ * Release a buffer obtained from aligned_calloc().
+ *
+ * @param ptr pointer returned by aligned_calloc(); like free(), a NULL
+ *            pointer is accepted and ignored
+ */
+void aligned_free(void *ptr)
+{
+	if (!ptr)
+		return;
+#ifdef _MSC_VER
+	_aligned_free(ptr);
+#else
+	/* aligned_calloc() stored the pointer returned by calloc() in the slot just below ptr */
+	free(((void**)ptr)[-1]);
+#endif
+}
+
 void cbin2hex(char *out, const char *in, size_t len)
 {
 	if (out) {