Browse Source

small reg tunes, rename whirlcoin to whirl

2upstream
Tanguy Pruvot 10 years ago
parent
commit
912ef1215d
  1. 5
      Makefile.am
  2. 3
      ccminer.vcxproj
  3. 4
      config.sh
  4. 14
      cpu-miner.c
  5. 21
      cuda_helper.h
  6. 2
      util.c
  7. 93
      x11/cuda_x11_aes.cu

5
Makefile.am

@ -46,15 +46,16 @@ ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . -Xptxas "-v" --ptxas-options=-v nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . --ptxas-options=-v --use_fast_math
nvcc_FLAGS += $(JANSSON_INCLUDES) nvcc_FLAGS += $(JANSSON_INCLUDES)
# we're now targeting all major compute architectures within one binary. # we're now targeting all major compute architectures within one binary.
.cu.o: .cu.o:
$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $< $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $<
# Luffa is faster with 80 registers than 128 # Luffa and Echo are faster with 80 registers than 128
x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $< $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
# Shavite compiles faster with 128 regs # Shavite compiles faster with 128 regs

3
ccminer.vcxproj

@ -479,6 +479,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
</CudaCompile> </CudaCompile>
<CudaCompile Include="x11\cuda_x11_echo.cu"> <CudaCompile Include="x11\cuda_x11_echo.cu">
<MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
@ -577,4 +578,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" /> <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

4
config.sh

@ -3,7 +3,7 @@
# Simple script to create the Makefile # Simple script to create the Makefile
# then type 'make' # then type 'make'
# export PATH="$PATH:/usr/local/cuda-6.5/bin/" # export PATH="$PATH:/usr/local/cuda/bin/"
make clean || echo clean make clean || echo clean
@ -11,4 +11,4 @@ rm -f Makefile.in
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2" ./configure

14
cpu-miner.c

@ -153,7 +153,7 @@ static const char *algo_names[] = {
"anime", "anime",
"fresh", "fresh",
"nist5", "nist5",
"whirlcoin", "whirl",
"x11", "x11",
"x13", "x13",
"x14", "x14",
@ -231,7 +231,7 @@ Options:\n\
anime Animecoin hash\n\ anime Animecoin hash\n\
fresh Freshcoin hash (shavite 80)\n\ fresh Freshcoin hash (shavite 80)\n\
nist5 NIST5 (TalkCoin) hash\n\ nist5 NIST5 (TalkCoin) hash\n\
whirlcoin Whirlcoin hash\n\ whirl Whirlcoin (old whirlpool)\n\
x11 X11 (DarkCoin) hash\n\ x11 X11 (DarkCoin) hash\n\
x13 X13 (MaruCoin) hash\n\ x13 X13 (MaruCoin) hash\n\
x14 X14 hash\n\ x14 X14 hash\n\
@ -1526,17 +1526,17 @@ int main(int argc, char *argv[])
int i; int i;
printf("*** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n"); printf("*** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n");
printf("\t This is version "PROGRAM_VERSION" (tpruvot@github)\n"); printf("\t This is the forked version "PROGRAM_VERSION" (tpruvot@github)\n");
#ifdef WIN32 #ifdef WIN32
printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5 RC (DC 5.0)\n\n"); printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5\n\n");
#else #else
printf("\t Built with the nVidia CUDA SDK 6.5 RC\n\n"); printf("\t Built with the nVidia CUDA SDK 6.5\n\n");
#endif #endif
printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n");
printf("\t based on pooler-cpuminer extension for HVC from http://hvc.1gh.com/" "\n\n"); printf("\t and HVC extension from http://hvc.1gh.com/" "\n\n");
printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n"); printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n");
printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n"); printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n");
printf("\tCuda X14 and X15 added by Tanguy Pruvot (also in cpuminer-multi)\n"); printf("\tCleaned and optimized by Tanguy Pruvot\n");
printf("\t BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n"); printf("\t BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n");
rpc_user = strdup(""); rpc_user = strdup("");

21
cuda_helper.h

@ -38,12 +38,12 @@ extern const uint3 threadIdx;
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif #endif
__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI) __device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{ {
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO)); return __double_as_longlong(__hiloint2double(HI, LO));
#else #else
return (unsigned long long)LO | (((unsigned long long)HI) << 32); return (uint64_t)LO | (((uint64_t)HI) << 32);
#endif #endif
} }
@ -94,11 +94,8 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
{ {
// Input: 77665544 33221100 // Input: 77665544 33221100
// Output: 00112233 44556677 // Output: 00112233 44556677
uint64_t temp[2]; uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123);
temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123); return (result << 32) | __byte_perm(_HIWORD(x), 0, 0x0123);
temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);
return temp[0] | (temp[1]<<32);
} }
#else #else
/* host */ /* host */
@ -132,7 +129,7 @@ __device__ __forceinline__
uint64_t xor1(uint64_t a, uint64_t b) uint64_t xor1(uint64_t a, uint64_t b)
{ {
uint64_t result; uint64_t result;
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a) ,"l"(b)); asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a),"l"(b));
return result; return result;
} }
@ -141,10 +138,10 @@ __device__ __forceinline__
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
{ {
uint64_t result; uint64_t result;
asm("{\n\t" asm("{"
" .reg .u64 t1;\n\t" ".reg .u64 lt;\n\t"
"xor.b64 t1, %2, %3;\n\t" "xor.b64 lt, %2, %3;\n\t"
"xor.b64 %0, %1, t1;\n\t" "xor.b64 %0, %1, lt;\n\t"
"}" "}"
: "=l"(result) : "l"(a) ,"l"(b),"l"(c)); : "=l"(result) : "l"(a) ,"l"(b),"l"(c));
return result; return result;

2
util.c

@ -1366,7 +1366,7 @@ void print_hash_tests(void)
memset(hash, 0, sizeof hash); memset(hash, 0, sizeof hash);
wcoinhash(&hash[0], &buf[0]); wcoinhash(&hash[0], &buf[0]);
printf("\nwhirlc: "); print_hash(hash); printf("\nwhirl: "); print_hash(hash);
memset(hash, 0, sizeof hash); memset(hash, 0, sizeof hash);
x11hash(&hash[0], &buf[0]); x11hash(&hash[0], &buf[0]);

93
x11/cuda_x11_aes.cu

@ -298,10 +298,11 @@ static void aes_cpu_init()
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
} }
static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory) __device__ __forceinline__
void aes_gpu_init(uint32_t *sharedMemory)
{ {
if(threadIdx.x < 256) /* each thread startup will fill a uint32 */
{ if (threadIdx.x < 256) {
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x]; sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x]; sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x];
@ -309,10 +310,13 @@ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
} }
} }
static __device__ __forceinline__ void aes_round( /* tried with 3 xor.b32 asm, not faster */
#define xor4_32(a,b,c,d) (a ^ b ^ c ^ d);
__device__
static void aes_round(
const uint32_t *sharedMemory, const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0,
uint32_t k0,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{ {
uint32_t idx0, idx1, idx2, idx3; uint32_t idx0, idx1, idx2, idx3;
@ -321,42 +325,47 @@ static __device__ __forceinline__ void aes_round(
idx1 = __byte_perm(x1, 0, 0x4441) + 256; idx1 = __byte_perm(x1, 0, 0x4441) + 256;
idx2 = __byte_perm(x2, 0, 0x4442) + 512; idx2 = __byte_perm(x2, 0, 0x4442) + 512;
idx3 = __byte_perm(x3, 0, 0x4443) + 768; idx3 = __byte_perm(x3, 0, 0x4443) + 768;
y0 =sharedMemory[idx0] ^ y0 = xor4_32(
sharedMemory[idx1] ^ sharedMemory[idx0],
sharedMemory[idx2] ^ sharedMemory[idx1],
sharedMemory[idx3] ^ sharedMemory[idx2],
k0; sharedMemory[idx3]);
y0 ^= k0;
idx0 = __byte_perm(x1, 0, 0x4440); idx0 = __byte_perm(x1, 0, 0x4440);
idx1 = __byte_perm(x2, 0, 0x4441) + 256; idx1 = __byte_perm(x2, 0, 0x4441) + 256;
idx2 = __byte_perm(x3, 0, 0x4442) + 512; idx2 = __byte_perm(x3, 0, 0x4442) + 512;
idx3 = __byte_perm(x0, 0, 0x4443) + 768; idx3 = __byte_perm(x0, 0, 0x4443) + 768;
y1 =sharedMemory[idx0] ^ y1 = xor4_32(
sharedMemory[idx1] ^ sharedMemory[idx0],
sharedMemory[idx2] ^ sharedMemory[idx1],
sharedMemory[idx3]; // ^k3 sharedMemory[idx2],
sharedMemory[idx3]);
idx0 = __byte_perm(x2, 0, 0x4440); idx0 = __byte_perm(x2, 0, 0x4440);
idx1 = __byte_perm(x3, 0, 0x4441) + 256; idx1 = __byte_perm(x3, 0, 0x4441) + 256;
idx2 = __byte_perm(x0, 0, 0x4442) + 512; idx2 = __byte_perm(x0, 0, 0x4442) + 512;
idx3 = __byte_perm(x1, 0, 0x4443) + 768; idx3 = __byte_perm(x1, 0, 0x4443) + 768;
y2 =sharedMemory[idx0] ^ y2 = xor4_32(
sharedMemory[idx1] ^ sharedMemory[idx0],
sharedMemory[idx2] ^ sharedMemory[idx1],
sharedMemory[idx3]; // ^k2 sharedMemory[idx2],
sharedMemory[idx3]); // ^k2
idx0 = __byte_perm(x3, 0, 0x4440); idx0 = __byte_perm(x3, 0, 0x4440);
idx1 = __byte_perm(x0, 0, 0x4441) + 256; idx1 = __byte_perm(x0, 0, 0x4441) + 256;
idx2 = __byte_perm(x1, 0, 0x4442) + 512; idx2 = __byte_perm(x1, 0, 0x4442) + 512;
idx3 = __byte_perm(x2, 0, 0x4443) + 768; idx3 = __byte_perm(x2, 0, 0x4443) + 768;
y3 =sharedMemory[idx0] ^ y3 = xor4_32(
sharedMemory[idx1] ^ sharedMemory[idx0],
sharedMemory[idx2] ^ sharedMemory[idx1],
sharedMemory[idx3]; // ^k3 sharedMemory[idx2],
sharedMemory[idx3]); // ^k3
} }
static __device__ __forceinline__ void aes_round( __device__
static void aes_round(
const uint32_t *sharedMemory, const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
@ -367,36 +376,40 @@ static __device__ __forceinline__ void aes_round(
idx1 = __byte_perm(x1, 0, 0x4441) + 256; idx1 = __byte_perm(x1, 0, 0x4441) + 256;
idx2 = __byte_perm(x2, 0, 0x4442) + 512; idx2 = __byte_perm(x2, 0, 0x4442) + 512;
idx3 = __byte_perm(x3, 0, 0x4443) + 768; idx3 = __byte_perm(x3, 0, 0x4443) + 768;
y0 =sharedMemory[idx0] ^ y0 = xor4_32(
sharedMemory[idx1] ^ sharedMemory[idx0],
sharedMemory[idx2] ^ sharedMemory[idx1],
sharedMemory[idx3]; sharedMemory[idx2],
sharedMemory[idx3]);
idx0 = __byte_perm(x1, 0, 0x4440); idx0 = __byte_perm(x1, 0, 0x4440);
idx1 = __byte_perm(x2, 0, 0x4441) + 256; idx1 = __byte_perm(x2, 0, 0x4441) + 256;
idx2 = __byte_perm(x3, 0, 0x4442) + 512; idx2 = __byte_perm(x3, 0, 0x4442) + 512;
idx3 = __byte_perm(x0, 0, 0x4443) + 768; idx3 = __byte_perm(x0, 0, 0x4443) + 768;
y1 =sharedMemory[idx0] ^ y1 = xor4_32(
sharedMemory[idx1] ^ sharedMemory[idx0],
sharedMemory[idx2] ^ sharedMemory[idx1],
sharedMemory[idx3]; // ^k3 sharedMemory[idx2],
sharedMemory[idx3]);
idx0 = __byte_perm(x2, 0, 0x4440); idx0 = __byte_perm(x2, 0, 0x4440);
idx1 = __byte_perm(x3, 0, 0x4441) + 256; idx1 = __byte_perm(x3, 0, 0x4441) + 256;
idx2 = __byte_perm(x0, 0, 0x4442) + 512; idx2 = __byte_perm(x0, 0, 0x4442) + 512;
idx3 = __byte_perm(x1, 0, 0x4443) + 768; idx3 = __byte_perm(x1, 0, 0x4443) + 768;
y2 =sharedMemory[idx0] ^ y2 = xor4_32(
sharedMemory[idx1] ^ sharedMemory[idx0],
sharedMemory[idx2] ^ sharedMemory[idx1],
sharedMemory[idx3]; // ^k2 sharedMemory[idx2],
sharedMemory[idx3]); // ^k2
idx0 = __byte_perm(x3, 0, 0x4440); idx0 = __byte_perm(x3, 0, 0x4440);
idx1 = __byte_perm(x0, 0, 0x4441) + 256; idx1 = __byte_perm(x0, 0, 0x4441) + 256;
idx2 = __byte_perm(x1, 0, 0x4442) + 512; idx2 = __byte_perm(x1, 0, 0x4442) + 512;
idx3 = __byte_perm(x2, 0, 0x4443) + 768; idx3 = __byte_perm(x2, 0, 0x4443) + 768;
y3 =sharedMemory[idx0] ^ y3 = xor4_32(
sharedMemory[idx1] ^ sharedMemory[idx0],
sharedMemory[idx2] ^ sharedMemory[idx1],
sharedMemory[idx3]; // ^k3 sharedMemory[idx2],
sharedMemory[idx3]); // ^k3
} }

Loading…
Cancel
Save