Tune quark part of Xn funcs
based on klaus commits, will increase a bit speed of most algos PS: main increase is due to the register count tuning in Makefile and for skein512 on linux, its the ROTL64 but almost no changes on X11 : 2648MH/s vs 2630 before
This commit is contained in:
parent
0720797f1b
commit
d8a23fa970
10
Makefile.am
10
Makefile.am
@ -87,7 +87,15 @@ x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
|
||||
x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
|
||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
|
||||
|
||||
# ABI requiring code modules
|
||||
quark/cuda_quark_blake512.o: quark/cuda_quark_blake512.cu
|
||||
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
|
||||
|
||||
quark/cuda_jh512.o: quark/cuda_jh512.cu
|
||||
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
|
||||
|
||||
quark/cuda_quark_keccak512.o: quark/cuda_quark_keccak512.cu
|
||||
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=88 -o $@ -c $<
|
||||
|
||||
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
|
||||
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
|
||||
|
||||
|
@ -405,11 +405,13 @@
|
||||
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="quark\cuda_jh512.cu">
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
|
||||
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="quark\cuda_quark_blake512.cu">
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
|
||||
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
|
||||
@ -431,6 +433,7 @@
|
||||
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="quark\cuda_quark_keccak512.cu">
|
||||
<MaxRegCount>88</MaxRegCount>
|
||||
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
|
||||
<TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
|
||||
@ -579,4 +582,4 @@
|
||||
<Target Name="AfterClean">
|
||||
<Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
|
||||
</Target>
|
||||
</Project>
|
||||
</Project>
|
||||
|
@ -1,4 +1,4 @@
|
||||
AC_INIT([ccminer], [2014.09.28])
|
||||
AC_INIT([ccminer], [1.4.5])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
@ -5,5 +5,5 @@
|
||||
|
||||
#--ptxas-options=\"-v -dlcm=cg\""
|
||||
|
||||
CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
|
||||
CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
|
||||
|
||||
|
@ -156,7 +156,7 @@
|
||||
#define PACKAGE_NAME "ccminer"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "ccminer 2014.09.28"
|
||||
#define PACKAGE_STRING "ccminer 1.4.5"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "ccminer"
|
||||
@ -165,7 +165,7 @@
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "2014.09.28"
|
||||
#define PACKAGE_VERSION "1.4.5"
|
||||
|
||||
/* If using the C implementation of alloca, define if you know the
|
||||
direction of stack growth for your system; otherwise it will be
|
||||
@ -188,7 +188,7 @@
|
||||
#define USE_XOP 1
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "2014.09.28"
|
||||
#define VERSION "1.4.5"
|
||||
|
||||
/* Define curl_free() as free() if our version of curl lacks curl_free. */
|
||||
/* #undef curl_free */
|
||||
|
@ -318,6 +318,25 @@ uint64_t ROTL64(const uint64_t x, const int offset)
|
||||
: "=l"(result) : "l"(x), "r"(offset));
|
||||
return result;
|
||||
}
|
||||
#elif __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 3
|
||||
__device__
|
||||
uint64_t ROTL64(const uint64_t x, const int offset)
|
||||
{
|
||||
uint64_t res;
|
||||
asm("{\n\t"
|
||||
".reg .u32 tl,th,vl,vh;\n\t"
|
||||
".reg .pred p;\n\t"
|
||||
"mov.b64 {tl,th}, %1;\n\t"
|
||||
"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
|
||||
"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
|
||||
"setp.lt.u32 p, %2, 32;\n\t"
|
||||
"@!p mov.b64 %0, {vl,vh};\n\t"
|
||||
"@p mov.b64 %0, {vh,vl};\n\t"
|
||||
"}"
|
||||
: "=l"(res) : "l"(x) , "r"(offset)
|
||||
);
|
||||
return res;
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
|
@ -7,9 +7,6 @@
|
||||
|
||||
#define USE_SHUFFLE 0
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// die Message it Padding zur Berechnung auf der GPU
|
||||
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
|
||||
|
||||
@ -106,18 +103,6 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
|
||||
h[i % 8] ^= v[i];
|
||||
}
|
||||
|
||||
__device__ __constant__
|
||||
static const uint64_t d_constMem[8] = {
|
||||
0x6a09e667f3bcc908ULL,
|
||||
0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL,
|
||||
0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL,
|
||||
0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL,
|
||||
0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
// Hash-Padding
|
||||
__device__ __constant__
|
||||
static const uint64_t d_constHashPadding[8] = {
|
||||
@ -157,10 +142,16 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
|
||||
uint64_t buf[16];
|
||||
|
||||
// State
|
||||
uint64_t h[8];
|
||||
#pragma unroll 8
|
||||
for (int i=0;i<8;i++)
|
||||
h[i] = d_constMem[i];
|
||||
uint64_t h[8] = {
|
||||
0x6a09e667f3bcc908ULL,
|
||||
0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL,
|
||||
0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL,
|
||||
0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL,
|
||||
0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
// Message for first round
|
||||
#pragma unroll 8
|
||||
@ -195,13 +186,19 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint64_t h[8];
|
||||
uint64_t buf[16];
|
||||
uint32_t nounce = startNounce + thread;
|
||||
|
||||
#pragma unroll 8
|
||||
for(int i=0; i<8; i++)
|
||||
h[i] = d_constMem[i];
|
||||
uint64_t h[8] = {
|
||||
0x6a09e667f3bcc908ULL,
|
||||
0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL,
|
||||
0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL,
|
||||
0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL,
|
||||
0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
// Message für die erste Runde in Register holen
|
||||
#pragma unroll 16
|
||||
|
@ -5,19 +5,36 @@
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern "C" extern int device_map[8];
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
extern int device_map[8];
|
||||
|
||||
// Take a look at: https://www.schneier.com/skein1.3.pdf
|
||||
|
||||
#define SHL(x, n) ((x) << (n))
|
||||
#define SHR(x, n) ((x) >> (n))
|
||||
|
||||
// Zum testen Hostcode...
|
||||
/* Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll.
|
||||
mal direkt in den Code eingesetzt werden
|
||||
*/
|
||||
__device__
|
||||
uint64_t skein_rotl64(const uint64_t x, const int offset)
|
||||
{
|
||||
uint64_t res;
|
||||
asm("{\n\t"
|
||||
".reg .u32 tl,th,vl,vh;\n\t"
|
||||
".reg .pred p;\n\t"
|
||||
"mov.b64 {tl,th}, %1;\n\t"
|
||||
"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
|
||||
"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
|
||||
"setp.lt.u32 p, %2, 32;\n\t"
|
||||
"@!p mov.b64 %0, {vl,vh};\n\t"
|
||||
"@p mov.b64 %0, {vh,vl};\n\t"
|
||||
"}"
|
||||
: "=l"(res) : "l"(x) , "r"(offset)
|
||||
);
|
||||
return res;
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
#undef ROTL64
|
||||
#define ROTL64 skein_rotl64
|
||||
#endif
|
||||
|
||||
/*
|
||||
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
|
||||
@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
|
||||
TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
|
||||
}
|
||||
|
||||
static __constant__ uint64_t d_constMem[8];
|
||||
static const uint64_t h_constMem[8] = {
|
||||
SPH_C64(0x4903ADFF749C51CE),
|
||||
SPH_C64(0x0D95DE399746DF03),
|
||||
SPH_C64(0x8FD1934127C79BCE),
|
||||
SPH_C64(0x9A255629FF352CB1),
|
||||
SPH_C64(0x5DB62599DF6CA7B0),
|
||||
SPH_C64(0xEABE394CA9D5C3F4),
|
||||
SPH_C64(0x991112C71A75B523),
|
||||
SPH_C64(0xAE18A40B660FCC33) };
|
||||
|
||||
__global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
||||
__global__
|
||||
void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
|
||||
{
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
@ -315,14 +322,14 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
|
||||
uint64_t *inpHash = &g_hash[8 * hashPosition];
|
||||
|
||||
// Initialisierung
|
||||
h0 = d_constMem[0];
|
||||
h1 = d_constMem[1];
|
||||
h2 = d_constMem[2];
|
||||
h3 = d_constMem[3];
|
||||
h4 = d_constMem[4];
|
||||
h5 = d_constMem[5];
|
||||
h6 = d_constMem[6];
|
||||
h7 = d_constMem[7];
|
||||
h0 = 0x4903ADFF749C51CEull;
|
||||
h1 = 0x0D95DE399746DF03ull;
|
||||
h2 = 0x8FD1934127C79BCEull;
|
||||
h3 = 0x9A255629FF352CB1ull;
|
||||
h4 = 0x5DB62599DF6CA7B0ull;
|
||||
h5 = 0xEABE394CA9D5C3F4ull;
|
||||
h6 = 0x991112C71A75B523ull;
|
||||
h7 = 0xAE18A40B660FCC33ull;
|
||||
|
||||
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
|
||||
#pragma unroll 8
|
||||
@ -399,16 +406,13 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
|
||||
}
|
||||
|
||||
// Setup-Funktionen
|
||||
__host__ void quark_skein512_cpu_init(int thr_id, int threads)
|
||||
__host__
|
||||
void quark_skein512_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
// nix zu tun ;-)
|
||||
cudaMemcpyToSymbol( d_constMem,
|
||||
h_constMem,
|
||||
sizeof(h_constMem),
|
||||
0, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
__host__
|
||||
void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
{
|
||||
const int threadsperblock = 256;
|
||||
|
||||
|
@ -142,7 +142,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||
static bool init[8] = {0,0,0,0,0,0,0,0};
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
((uint32_t*)ptarget)[7] = 0x0000f;
|
||||
|
||||
if (!init[thr_id])
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user