Browse Source

Some work on data alignment

linux: add -march=native (we build it ourselves) and some other flags

+ remove unused vars (seen with -Wall)
master
Tanguy Pruvot 10 years ago
parent
commit
5bc969fa57
  1. 5
      Makefile.am
  2. 6
      blake32.cu
  3. 4
      configure.sh
  4. 20
      cpu-miner.c
  5. 2
      cuda_helper.h
  6. 9
      groestlcoin.cpp
  7. 3
      hashlog.cpp
  8. 12
      miner.h
  9. 2
      sph/haval_helper.c
  10. 29
      util.c

5
Makefile.am

@ -49,10 +49,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -49,10 +49,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
x11/s3.cu
if HAVE_WINDOWS
ccminer_SOURCES += compat/winansi.c
endif
ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"

6
blake32.cu

@ -386,7 +386,7 @@ static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds = @@ -386,7 +386,7 @@ static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds =
__host__
void blake256_cpu_setBlock_16(uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget)
{
uint32_t data[11];
uint32_t _ALIGN(64) data[11];
memcpy(data, midstate, 32);
data[8] = penddata[0];
data[9] = penddata[1];
@ -402,9 +402,9 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt @@ -402,9 +402,9 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
const uint32_t first_nonce = pdata[19];
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
uint64_t targetHigh = ((uint64_t*)ptarget)[3]; // 0x00000000.0fffffff
uint32_t endiandata[20];
uint32_t _ALIGN(64) endiandata[20];
#if PRECALC64
uint32_t midstate[8];
uint32_t _ALIGN(64) midstate[8];
#else
uint32_t crcsum;
#endif

4
configure.sh

@ -5,5 +5,7 @@ @@ -5,5 +5,7 @@
#--ptxas-options=\"-v -dlcm=cg\""
CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16"
CUDA_CFLAGS="-O3 -Xcompiler -Wall" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda

20
cpu-miner.c

@ -383,7 +383,7 @@ struct work { @@ -383,7 +383,7 @@ struct work {
uint32_t scanned_to;
};
static struct work g_work;
static struct work _ALIGN(64) g_work;
static time_t g_work_time;
static pthread_mutex_t g_work_lock;
@ -484,11 +484,10 @@ static int share_result(int result, const char *reason) @@ -484,11 +484,10 @@ static int share_result(int result, const char *reason)
{
char s[345];
double hashrate;
int i, ret = 0;
hashrate = 0.;
pthread_mutex_lock(&stats_lock);
for (i = 0; i < opt_n_threads; i++)
for (int i = 0; i < opt_n_threads; i++)
hashrate += thr_hashrates[i];
result ? accepted_count++ : rejected_count++;
pthread_mutex_unlock(&stats_lock);
@ -651,8 +650,8 @@ static bool get_upstream_work(CURL *curl, struct work *work) @@ -651,8 +650,8 @@ static bool get_upstream_work(CURL *curl, struct work *work)
if (opt_protocol && rc) {
timeval_subtract(&diff, &tv_end, &tv_start);
/* show time because curl can be slower against versions/config */
applog(LOG_DEBUG, "got new work in %u µs",
diff.tv_sec * 1000000 + diff.tv_usec);
applog(LOG_DEBUG, "got new work in %.2f ms",
(1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec));
}
json_decref(val);
@ -667,7 +666,7 @@ static void workio_cmd_free(struct workio_cmd *wc) @@ -667,7 +666,7 @@ static void workio_cmd_free(struct workio_cmd *wc)
switch (wc->cmd) {
case WC_SUBMIT_WORK:
free(wc->u.work);
aligned_free(wc->u.work);
break;
default: /* do nothing */
break;
@ -682,7 +681,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) @@ -682,7 +681,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
struct work *ret_work;
int failures = 0;
ret_work = (struct work*)calloc(1, sizeof(*ret_work));
ret_work = (struct work*)aligned_calloc(sizeof(*ret_work));
if (!ret_work)
return false;
@ -690,7 +689,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) @@ -690,7 +689,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
while (!get_upstream_work(curl, ret_work)) {
if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) {
applog(LOG_ERR, "json_rpc_call failed, terminating workio thread");
free(ret_work);
aligned_free(ret_work);
return false;
}
@ -702,7 +701,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) @@ -702,7 +701,7 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl)
/* send work to requesting thread */
if (!tq_push(wc->thr->q, ret_work))
free(ret_work);
aligned_free(ret_work);
return true;
}
@ -822,7 +821,7 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in) @@ -822,7 +821,7 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in)
if (!wc)
return false;
wc->u.work = (struct work *)malloc(sizeof(*work_in));
wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in));
if (!wc->u.work)
goto err_out;
@ -946,7 +945,6 @@ static void *miner_thread(void *userdata) @@ -946,7 +945,6 @@ static void *miner_thread(void *userdata)
struct work work;
uint32_t max_nonce;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1);
unsigned char *scratchbuf = NULL;
bool work_done = false;
bool extrajob = false;
char s[16];

2
cuda_helper.h

@ -4,7 +4,7 @@ @@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#if defined(_MSC_VER)
#if defined(__INTELLISENSE__)
/* reduce warnings */
#include <device_functions.h>
#include <device_launch_parameters.h>

9
groestlcoin.cpp

@ -42,13 +42,8 @@ void sha256func(unsigned char *hash, const unsigned char *data, int len) @@ -42,13 +42,8 @@ void sha256func(unsigned char *hash, const unsigned char *data, int len)
extern "C" void groestlhash(void *state, const void *input)
{
// Tryout GPU-groestl
sph_groestl512_context ctx_groestl[2];
static unsigned char pblank[1];
uint32_t mask = 8;
uint32_t zero = 0;
// CPU-groestl
sph_groestl512_context ctx_groestl[2];
//these uint512 in the c++ source of the client are backed by an array of uint32
uint32_t hashA[16], hashB[16];

3
hashlog.cpp

@ -219,10 +219,9 @@ extern "C" void hashlog_purge_all(void) @@ -219,10 +219,9 @@ extern "C" void hashlog_purge_all(void)
extern "C" void hashlog_dump_job(char* jobid)
{
if (opt_debug) {
int deleted = 0;
uint64_t njobid = hextouint(jobid);
uint64_t keypfx = (njobid << 32);
uint32_t sz = tlastshares.size();
// uint32_t sz = tlastshares.size();
std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
while (i != tlastshares.end()) {
if ((keypfx & i->first) == keypfx) {

12
miner.h

@ -51,6 +51,14 @@ void *alloca (size_t); @@ -51,6 +51,14 @@ void *alloca (size_t);
# endif
#endif
/*
 * _ALIGN(x): ask the compiler to align the declared variable on x bytes.
 * The spelling depends on which compiler is processing the translation unit.
 */
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0
/* __CUDA_ARCH__ is set to a positive value: use the CUDA spelling */
# define _ALIGN(x) __align__(x)
#elif defined(_MSC_VER)
/* test with defined(): _MSC_VER does not exist on other compilers (-Wundef) */
# define _ALIGN(x) __declspec(align(x))
#else
/* fallback: GNU-style attribute syntax */
# define _ALIGN(x) __attribute__ ((aligned(x)))
#endif
#ifdef HAVE_SYSLOG_H
#include <syslog.h>
#define LOG_BLUE 0x10 /* unique value */
@ -200,6 +208,10 @@ static inline void le16enc(void *pp, uint16_t x) @@ -200,6 +208,10 @@ static inline void le16enc(void *pp, uint16_t x)
}
#endif
/* used for struct work */
/*
 * aligned_calloc: returns a zero-filled buffer of `size` bytes whose address
 * is a multiple of 64. The result must be released with aligned_free(),
 * never with plain free().
 */
void *aligned_calloc(int size);
/* aligned_free: releases a buffer previously returned by aligned_calloc() */
void aligned_free(void *ptr);
#if JANSSON_MAJOR_VERSION >= 2
#define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr))
#else

2
sph/haval_helper.c

@ -149,7 +149,7 @@ static void @@ -149,7 +149,7 @@ static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _close)(sph_haval_context *sc,
unsigned ub, unsigned n, void *dst)
{
unsigned current,j;
unsigned current;
DSTATE;
#if SPH_64

29
util.c

@ -382,7 +382,7 @@ json_t *json_rpc_call(CURL *curl, const char *url, @@ -382,7 +382,7 @@ json_t *json_rpc_call(CURL *curl, const char *url,
curl_easy_setopt(curl, CURLOPT_POST, 1);
if (opt_protocol)
applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);
applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req);
upload_data.buf = rpc_req;
upload_data.len = strlen(rpc_req);
@ -481,6 +481,33 @@ err_out: @@ -481,6 +481,33 @@ err_out:
return NULL;
}
/**
 * Allocate a zero-filled buffer aligned on a 64-byte (cache line) boundary.
 * Unlike malloc, calloc sets the memory to zero.
 *
 * @param size  number of usable bytes requested (negative values are rejected)
 * @return      aligned pointer, or NULL if size is negative or the allocation
 *              fails. Must be released with aligned_free(), not free().
 */
void *aligned_calloc(int size)
{
	const int ALIGN = 64; // cache line
#ifdef _MSC_VER
	void* res = (size < 0) ? NULL : _aligned_malloc(size, ALIGN);
	/* _aligned_malloc does not clear the memory, and may fail */
	if (res)
		memset(res, 0, size);
	return res;
#else
	char *mem;
	void **ptr;

	if (size < 0)
		return NULL;

	/* over-allocate: alignment slack + one slot to remember the raw pointer */
	mem = (char*) calloc(1, (size_t)size + ALIGN + sizeof(void*));
	if (!mem)
		return NULL;

	/* round (mem + ALIGN + slot) down to a multiple of ALIGN; the result is
	 * always more than sizeof(void*) bytes past mem, so ptr[-1] stays inside
	 * the allocation */
	ptr = (void**)(((size_t)mem + ALIGN + sizeof(void*)) & ~(size_t)(ALIGN - 1));
	/* stash the address calloc() returned so aligned_free() can free it */
	ptr[-1] = mem;
	return ptr;
#endif
}
/**
 * Release a buffer obtained from aligned_calloc().
 * A NULL pointer is ignored, like free(NULL).
 */
void aligned_free(void *ptr)
{
	if (!ptr)
		return;
#ifdef _MSC_VER
	_aligned_free(ptr);
#else
	/* aligned_calloc() stored the raw calloc() address in the slot just
	 * below the aligned pointer */
	free(((void**)ptr)[-1]);
#endif
}
void cbin2hex(char *out, const char *in, size_t len)
{
if (out) {

Loading…
Cancel
Save