Tune quark part of Xn funcs

based on klaus commits, will increase a bit speed of most algos

PS: main increase is due to the register count tuning in Makefile

and for skein512 on linux, its the ROTL64

but almost no changes on X11 : 2648MH/s vs 2630 before
This commit is contained in:
Tanguy Pruvot 2014-10-20 02:18:21 +02:00
parent 0720797f1b
commit d8a23fa970
9 changed files with 96 additions and 65 deletions

View File

@ -87,7 +87,15 @@ x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
# ABI requiring code modules
quark/cuda_quark_blake512.o: quark/cuda_quark_blake512.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
quark/cuda_jh512.o: quark/cuda_jh512.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
quark/cuda_quark_keccak512.o: quark/cuda_quark_keccak512.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=88 -o $@ -c $<
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<

View File

@ -405,11 +405,13 @@
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
</CudaCompile>
<CudaCompile Include="quark\cuda_jh512.cu">
<MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
</CudaCompile>
<CudaCompile Include="quark\cuda_quark_blake512.cu">
<MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
@ -431,6 +433,7 @@
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
</CudaCompile>
<CudaCompile Include="quark\cuda_quark_keccak512.cu">
<MaxRegCount>88</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
@ -579,4 +582,4 @@
<Target Name="AfterClean">
<Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
</Target>
</Project>
</Project>

View File

@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.09.28])
AC_INIT([ccminer], [1.4.5])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@ -5,5 +5,5 @@
#--ptxas-options=\"-v -dlcm=cg\""
CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda

View File

@ -156,7 +156,7 @@
#define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.09.28"
#define PACKAGE_STRING "ccminer 1.4.5"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "ccminer"
@ -165,7 +165,7 @@
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "2014.09.28"
#define PACKAGE_VERSION "1.4.5"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be
@ -188,7 +188,7 @@
#define USE_XOP 1
/* Version number of package */
#define VERSION "2014.09.28"
#define VERSION "1.4.5"
/* Define curl_free() as free() if our version of curl lacks curl_free. */
/* #undef curl_free */

View File

@ -318,6 +318,25 @@ uint64_t ROTL64(const uint64_t x, const int offset)
: "=l"(result) : "l"(x), "r"(offset));
return result;
}
#elif __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 3
__device__
uint64_t ROTL64(const uint64_t x, const int offset)
{
uint64_t res;
asm("{\n\t"
".reg .u32 tl,th,vl,vh;\n\t"
".reg .pred p;\n\t"
"mov.b64 {tl,th}, %1;\n\t"
"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
"setp.lt.u32 p, %2, 32;\n\t"
"@!p mov.b64 %0, {vl,vh};\n\t"
"@p mov.b64 %0, {vh,vl};\n\t"
"}"
: "=l"(res) : "l"(x) , "r"(offset)
);
return res;
}
#else
/* host */
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))

View File

@ -7,9 +7,6 @@
#define USE_SHUFFLE 0
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// die Message it Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
@ -106,18 +103,6 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
h[i % 8] ^= v[i];
}
__device__ __constant__
static const uint64_t d_constMem[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL
};
// Hash-Padding
__device__ __constant__
static const uint64_t d_constHashPadding[8] = {
@ -157,10 +142,16 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
uint64_t buf[16];
// State
uint64_t h[8];
#pragma unroll 8
for (int i=0;i<8;i++)
h[i] = d_constMem[i];
uint64_t h[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL
};
// Message for first round
#pragma unroll 8
@ -195,13 +186,19 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t h[8];
uint64_t buf[16];
uint32_t nounce = startNounce + thread;
#pragma unroll 8
for(int i=0; i<8; i++)
h[i] = d_constMem[i];
uint64_t h[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL
};
// Message für die erste Runde in Register holen
#pragma unroll 16

View File

@ -5,19 +5,36 @@
#include "cuda_helper.h"
// aus cpu-miner.c
extern "C" extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
extern int device_map[8];
// Take a look at: https://www.schneier.com/skein1.3.pdf
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
// Zum testen Hostcode...
/* Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll.
mal direkt in den Code eingesetzt werden
*/
__device__
uint64_t skein_rotl64(const uint64_t x, const int offset)
{
uint64_t res;
asm("{\n\t"
".reg .u32 tl,th,vl,vh;\n\t"
".reg .pred p;\n\t"
"mov.b64 {tl,th}, %1;\n\t"
"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
"setp.lt.u32 p, %2, 32;\n\t"
"@!p mov.b64 %0, {vl,vh};\n\t"
"@p mov.b64 %0, {vh,vl};\n\t"
"}"
: "=l"(res) : "l"(x) , "r"(offset)
);
return res;
}
#if __CUDA_ARCH__ >= 350
#undef ROTL64
#define ROTL64 skein_rotl64
#endif
/*
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
}
static __constant__ uint64_t d_constMem[8];
static const uint64_t h_constMem[8] = {
SPH_C64(0x4903ADFF749C51CE),
SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE),
SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0),
SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523),
SPH_C64(0xAE18A40B660FCC33) };
__global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
__global__
void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -315,14 +322,14 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
uint64_t *inpHash = &g_hash[8 * hashPosition];
// Initialisierung
h0 = d_constMem[0];
h1 = d_constMem[1];
h2 = d_constMem[2];
h3 = d_constMem[3];
h4 = d_constMem[4];
h5 = d_constMem[5];
h6 = d_constMem[6];
h7 = d_constMem[7];
h0 = 0x4903ADFF749C51CEull;
h1 = 0x0D95DE399746DF03ull;
h2 = 0x8FD1934127C79BCEull;
h3 = 0x9A255629FF352CB1ull;
h4 = 0x5DB62599DF6CA7B0ull;
h5 = 0xEABE394CA9D5C3F4ull;
h6 = 0x991112C71A75B523ull;
h7 = 0xAE18A40B660FCC33ull;
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
#pragma unroll 8
@ -399,16 +406,13 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
}
// Setup-Funktionen
__host__ void quark_skein512_cpu_init(int thr_id, int threads)
__host__
void quark_skein512_cpu_init(int thr_id, int threads)
{
// nix zu tun ;-)
cudaMemcpyToSymbol( d_constMem,
h_constMem,
sizeof(h_constMem),
0, cudaMemcpyHostToDevice);
}
__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
__host__
void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;

View File

@ -142,7 +142,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
static bool init[8] = {0,0,0,0,0,0,0,0};
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
((uint32_t*)ptarget)[7] = 0x0000f;
if (!init[thr_id])
{