diff --git a/Makefile.am b/Makefile.am
index 635a530..cb4ba62 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -46,15 +46,16 @@ ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
-nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . -Xptxas "-v" --ptxas-options=-v
+nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . --ptxas-options=-v --use_fast_math
nvcc_FLAGS += $(JANSSON_INCLUDES)
# we're now targeting all major compute architectures within one binary.
.cu.o:
$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $<
-# Luffa is faster with 80 registers than 128
+# Luffa and Echo are faster with 80 registers than 128
x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
+	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
+x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
# Shavite compiles faster with 128 regs
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 689b017..a0edd97 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -479,6 +479,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"
%(AdditionalOptions)
+ 80
--ptxas-options=-O3 %(AdditionalOptions)
%(AdditionalOptions)
--ptxas-options=-O3 %(AdditionalOptions)
@@ -577,4 +578,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"
-
\ No newline at end of file
+
diff --git a/config.sh b/config.sh
index 04ef6f9..9955155 100755
--- a/config.sh
+++ b/config.sh
@@ -3,7 +3,7 @@
# Simple script to create the Makefile
# then type 'make'
-# export PATH="$PATH:/usr/local/cuda-6.5/bin/"
+# export PATH="$PATH:/usr/local/cuda/bin/"
make clean || echo clean
@@ -11,4 +11,4 @@ rm -f Makefile.in
rm -f config.status
./autogen.sh || echo done
-CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure
+CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2" ./configure
diff --git a/cpu-miner.c b/cpu-miner.c
index 4dcb71b..8295d1a 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -153,7 +153,7 @@ static const char *algo_names[] = {
"anime",
"fresh",
"nist5",
- "whirlcoin",
+ "whirl",
"x11",
"x13",
"x14",
@@ -231,7 +231,7 @@ Options:\n\
anime Animecoin hash\n\
fresh Freshcoin hash (shavite 80)\n\
nist5 NIST5 (TalkCoin) hash\n\
- whirlcoin Whirlcoin hash\n\
+ whirl Whirlcoin (old whirlpool)\n\
x11 X11 (DarkCoin) hash\n\
x13 X13 (MaruCoin) hash\n\
x14 X14 hash\n\
@@ -1526,17 +1526,17 @@ int main(int argc, char *argv[])
int i;
printf("*** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n");
- printf("\t This is version "PROGRAM_VERSION" (tpruvot@github)\n");
+ printf("\t This is the forked version "PROGRAM_VERSION" (tpruvot@github)\n");
#ifdef WIN32
- printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5 RC (DC 5.0)\n\n");
+ printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5\n\n");
#else
- printf("\t Built with the nVidia CUDA SDK 6.5 RC\n\n");
+ printf("\t Built with the nVidia CUDA SDK 6.5\n\n");
#endif
printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n");
- printf("\t based on pooler-cpuminer extension for HVC from http://hvc.1gh.com/" "\n\n");
+ printf("\t and HVC extension from http://hvc.1gh.com/" "\n\n");
printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n");
printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n");
- printf("\tCuda X14 and X15 added by Tanguy Pruvot (also in cpuminer-multi)\n");
+ printf("\tCleaned and optimized by Tanguy Pruvot\n");
printf("\t BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n");
rpc_user = strdup("");
diff --git a/cuda_helper.h b/cuda_helper.h
index c2c9f3e..8563737 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -38,12 +38,12 @@ extern const uint3 threadIdx;
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
-__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
+__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO));
#else
- return (unsigned long long)LO | (((unsigned long long)HI) << 32);
+ return (uint64_t)LO | (((uint64_t)HI) << 32);
#endif
}
@@ -94,11 +94,8 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
{
// Input: 77665544 33221100
// Output: 00112233 44556677
- uint64_t temp[2];
- temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
- temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);
-
- return temp[0] | (temp[1]<<32);
+ uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123);
+ return (result << 32) | __byte_perm(_HIWORD(x), 0, 0x0123);
}
#else
/* host */
@@ -132,7 +129,7 @@ __device__ __forceinline__
uint64_t xor1(uint64_t a, uint64_t b)
{
uint64_t result;
- asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a) ,"l"(b));
+ asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a),"l"(b));
return result;
}
@@ -141,10 +138,10 @@ __device__ __forceinline__
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
{
uint64_t result;
- asm("{\n\t"
- " .reg .u64 t1;\n\t"
- "xor.b64 t1, %2, %3;\n\t"
- "xor.b64 %0, %1, t1;\n\t"
+ asm("{"
+ ".reg .u64 lt;\n\t"
+ "xor.b64 lt, %2, %3;\n\t"
+ "xor.b64 %0, %1, lt;\n\t"
"}"
: "=l"(result) : "l"(a) ,"l"(b),"l"(c));
return result;
diff --git a/util.c b/util.c
index 81cc85b..9ccbbdf 100644
--- a/util.c
+++ b/util.c
@@ -1366,7 +1366,7 @@ void print_hash_tests(void)
memset(hash, 0, sizeof hash);
wcoinhash(&hash[0], &buf[0]);
- printf("\nwhirlc: "); print_hash(hash);
+ printf("\nwhirl: "); print_hash(hash);
memset(hash, 0, sizeof hash);
x11hash(&hash[0], &buf[0]);
diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu
index 97cd1dd..fca1b05 100644
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@@ -298,10 +298,11 @@ static void aes_cpu_init()
0, cudaMemcpyHostToDevice);
}
-static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
+__device__ __forceinline__
+void aes_gpu_init(uint32_t *sharedMemory)
{
- if(threadIdx.x < 256)
- {
+ /* each thread startup will fill a uint32 */
+ if (threadIdx.x < 256) {
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x];
@@ -309,10 +310,13 @@ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
}
}
-static __device__ __forceinline__ void aes_round(
+/* tried with 3 xor.b32 asm, not faster */
+#define xor4_32(a,b,c,d) ((a) ^ (b) ^ (c) ^ (d))
+
+__device__
+static void aes_round(
const uint32_t *sharedMemory,
- uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
- uint32_t k0,
+ uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
uint32_t idx0, idx1, idx2, idx3;
@@ -321,42 +325,47 @@ static __device__ __forceinline__ void aes_round(
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
- y0 =sharedMemory[idx0] ^
- sharedMemory[idx1] ^
- sharedMemory[idx2] ^
- sharedMemory[idx3] ^
- k0;
+ y0 = xor4_32(
+ sharedMemory[idx0],
+ sharedMemory[idx1],
+ sharedMemory[idx2],
+ sharedMemory[idx3]);
+ y0 ^= k0;
idx0 = __byte_perm(x1, 0, 0x4440);
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
- y1 =sharedMemory[idx0] ^
- sharedMemory[idx1] ^
- sharedMemory[idx2] ^
- sharedMemory[idx3]; // ^k3
+ y1 = xor4_32(
+ sharedMemory[idx0],
+ sharedMemory[idx1],
+ sharedMemory[idx2],
+ sharedMemory[idx3]);
idx0 = __byte_perm(x2, 0, 0x4440);
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
- y2 =sharedMemory[idx0] ^
- sharedMemory[idx1] ^
- sharedMemory[idx2] ^
- sharedMemory[idx3]; // ^k2
+ y2 = xor4_32(
+ sharedMemory[idx0],
+ sharedMemory[idx1],
+ sharedMemory[idx2],
+ sharedMemory[idx3]); // ^k2
idx0 = __byte_perm(x3, 0, 0x4440);
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
- y3 =sharedMemory[idx0] ^
- sharedMemory[idx1] ^
- sharedMemory[idx2] ^
- sharedMemory[idx3]; // ^k3
+ y3 = xor4_32(
+ sharedMemory[idx0],
+ sharedMemory[idx1],
+ sharedMemory[idx2],
+ sharedMemory[idx3]); // ^k3
}
-static __device__ __forceinline__ void aes_round(
+__device__
+static void aes_round(
const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
@@ -367,36 +376,40 @@ static __device__ __forceinline__ void aes_round(
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
- y0 =sharedMemory[idx0] ^
- sharedMemory[idx1] ^
- sharedMemory[idx2] ^
- sharedMemory[idx3];
+ y0 = xor4_32(
+ sharedMemory[idx0],
+ sharedMemory[idx1],
+ sharedMemory[idx2],
+ sharedMemory[idx3]);
idx0 = __byte_perm(x1, 0, 0x4440);
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
- y1 =sharedMemory[idx0] ^
- sharedMemory[idx1] ^
- sharedMemory[idx2] ^
- sharedMemory[idx3]; // ^k3
+ y1 = xor4_32(
+ sharedMemory[idx0],
+ sharedMemory[idx1],
+ sharedMemory[idx2],
+ sharedMemory[idx3]);
idx0 = __byte_perm(x2, 0, 0x4440);
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
- y2 =sharedMemory[idx0] ^
- sharedMemory[idx1] ^
- sharedMemory[idx2] ^
- sharedMemory[idx3]; // ^k2
+ y2 = xor4_32(
+ sharedMemory[idx0],
+ sharedMemory[idx1],
+ sharedMemory[idx2],
+ sharedMemory[idx3]); // ^k2
idx0 = __byte_perm(x3, 0, 0x4440);
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
- y3 =sharedMemory[idx0] ^
- sharedMemory[idx1] ^
- sharedMemory[idx2] ^
- sharedMemory[idx3]; // ^k3
+ y3 = xor4_32(
+ sharedMemory[idx0],
+ sharedMemory[idx1],
+ sharedMemory[idx2],
+ sharedMemory[idx3]); // ^k3
}