small reg tunes, rename whirlcoin to whirl

This commit is contained in:
Tanguy Pruvot 2014-08-21 02:21:39 +02:00
parent 7d430edc25
commit 912ef1215d
7 changed files with 77 additions and 65 deletions

View File

@ -46,15 +46,16 @@ ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . -Xptxas "-v" --ptxas-options=-v
nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . --ptxas-options=-v --use_fast_math
nvcc_FLAGS += $(JANSSON_INCLUDES)
# we're now targeting all major compute architectures within one binary.
.cu.o:
$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $<
# Luffa is faster with 80 registers than 128
# Luffa and Echo are faster with 80 registers than 128
x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
# Shavite compiles faster with 128 regs

View File

@ -479,6 +479,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_echo.cu">
<MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
@ -577,4 +578,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
</ImportGroup>
</Project>
</Project>

View File

@ -3,7 +3,7 @@
# Simple script to create the Makefile
# then type 'make'
# export PATH="$PATH:/usr/local/cuda-6.5/bin/"
# export PATH="$PATH:/usr/local/cuda/bin/"
make clean || echo clean
@ -11,4 +11,4 @@ rm -f Makefile.in
rm -f config.status
./autogen.sh || echo done
CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure
CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2" ./configure

View File

@ -153,7 +153,7 @@ static const char *algo_names[] = {
"anime",
"fresh",
"nist5",
"whirlcoin",
"whirl",
"x11",
"x13",
"x14",
@ -231,7 +231,7 @@ Options:\n\
anime Animecoin hash\n\
fresh Freshcoin hash (shavite 80)\n\
nist5 NIST5 (TalkCoin) hash\n\
whirlcoin Whirlcoin hash\n\
whirl Whirlcoin (old whirlpool)\n\
x11 X11 (DarkCoin) hash\n\
x13 X13 (MaruCoin) hash\n\
x14 X14 hash\n\
@ -1526,17 +1526,17 @@ int main(int argc, char *argv[])
int i;
printf("*** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n");
printf("\t This is version "PROGRAM_VERSION" (tpruvot@github)\n");
printf("\t This is the forked version "PROGRAM_VERSION" (tpruvot@github)\n");
#ifdef WIN32
printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5 RC (DC 5.0)\n\n");
printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5\n\n");
#else
printf("\t Built with the nVidia CUDA SDK 6.5 RC\n\n");
printf("\t Built with the nVidia CUDA SDK 6.5\n\n");
#endif
printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n");
printf("\t based on pooler-cpuminer extension for HVC from http://hvc.1gh.com/" "\n\n");
printf("\t and HVC extension from http://hvc.1gh.com/" "\n\n");
printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n");
printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n");
printf("\tCuda X14 and X15 added by Tanguy Pruvot (also in cpuminer-multi)\n");
printf("\tCleaned and optimized by Tanguy Pruvot\n");
printf("\t BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n");
rpc_user = strdup("");

View File

@ -38,12 +38,12 @@ extern const uint3 threadIdx;
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO));
#else
return (unsigned long long)LO | (((unsigned long long)HI) << 32);
return (uint64_t)LO | (((uint64_t)HI) << 32);
#endif
}
@ -94,11 +94,8 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
{
// Input: 77665544 33221100
// Output: 00112233 44556677
uint64_t temp[2];
temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);
return temp[0] | (temp[1]<<32);
uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123);
return (result << 32) | __byte_perm(_HIWORD(x), 0, 0x0123);
}
#else
/* host */
@ -132,7 +129,7 @@ __device__ __forceinline__
uint64_t xor1(uint64_t a, uint64_t b)
{
uint64_t result;
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a) ,"l"(b));
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a),"l"(b));
return result;
}
@ -141,10 +138,10 @@ __device__ __forceinline__
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
{
uint64_t result;
asm("{\n\t"
" .reg .u64 t1;\n\t"
"xor.b64 t1, %2, %3;\n\t"
"xor.b64 %0, %1, t1;\n\t"
asm("{"
".reg .u64 lt;\n\t"
"xor.b64 lt, %2, %3;\n\t"
"xor.b64 %0, %1, lt;\n\t"
"}"
: "=l"(result) : "l"(a) ,"l"(b),"l"(c));
return result;

2
util.c
View File

@ -1366,7 +1366,7 @@ void print_hash_tests(void)
memset(hash, 0, sizeof hash);
wcoinhash(&hash[0], &buf[0]);
printf("\nwhirlc: "); print_hash(hash);
printf("\nwhirl: "); print_hash(hash);
memset(hash, 0, sizeof hash);
x11hash(&hash[0], &buf[0]);

View File

@ -298,10 +298,11 @@ static void aes_cpu_init()
0, cudaMemcpyHostToDevice);
}
static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
__device__ __forceinline__
void aes_gpu_init(uint32_t *sharedMemory)
{
if(threadIdx.x < 256)
{
/* each of the first 256 threads copies one uint32 entry from every AES table into sharedMemory */
if (threadIdx.x < 256) {
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x];
@ -309,10 +310,13 @@ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
}
}
static __device__ __forceinline__ void aes_round(
/* XOR of four 32-bit values (tried with 3 xor.b32 asm, not faster).
 * Every argument is parenthesized and the expansion carries no trailing ';',
 * so the macro is a single well-formed expression: it can be nested inside a
 * larger expression and accepts arguments that contain lower-precedence
 * operators (e.g. `a | b`) without changing their meaning. */
#define xor4_32(a,b,c,d) ((a) ^ (b) ^ (c) ^ (d))
__device__
static void aes_round(
const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t k0,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
uint32_t idx0, idx1, idx2, idx3;
@ -321,42 +325,47 @@ static __device__ __forceinline__ void aes_round(
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
y0 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3] ^
k0;
y0 = xor4_32(
sharedMemory[idx0],
sharedMemory[idx1],
sharedMemory[idx2],
sharedMemory[idx3]);
y0 ^= k0;
idx0 = __byte_perm(x1, 0, 0x4440);
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
y1 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k3
y1 = xor4_32(
sharedMemory[idx0],
sharedMemory[idx1],
sharedMemory[idx2],
sharedMemory[idx3]);
idx0 = __byte_perm(x2, 0, 0x4440);
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
y2 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k2
y2 = xor4_32(
sharedMemory[idx0],
sharedMemory[idx1],
sharedMemory[idx2],
sharedMemory[idx3]); // ^k2
idx0 = __byte_perm(x3, 0, 0x4440);
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
y3 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k3
y3 = xor4_32(
sharedMemory[idx0],
sharedMemory[idx1],
sharedMemory[idx2],
sharedMemory[idx3]); // ^k3
}
static __device__ __forceinline__ void aes_round(
__device__
static void aes_round(
const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
@ -367,36 +376,40 @@ static __device__ __forceinline__ void aes_round(
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
y0 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3];
y0 = xor4_32(
sharedMemory[idx0],
sharedMemory[idx1],
sharedMemory[idx2],
sharedMemory[idx3]);
idx0 = __byte_perm(x1, 0, 0x4440);
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
y1 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k3
y1 = xor4_32(
sharedMemory[idx0],
sharedMemory[idx1],
sharedMemory[idx2],
sharedMemory[idx3]);
idx0 = __byte_perm(x2, 0, 0x4440);
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
y2 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k2
y2 = xor4_32(
sharedMemory[idx0],
sharedMemory[idx1],
sharedMemory[idx2],
sharedMemory[idx3]); // ^k2
idx0 = __byte_perm(x3, 0, 0x4440);
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
y3 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k3
y3 = xor4_32(
sharedMemory[idx0],
sharedMemory[idx1],
sharedMemory[idx2],
sharedMemory[idx3]); // ^k3
}