Browse Source

phi: maxwell opt (aes final + streebog)

+ fix the fugue leak..

Also update sib algo with this improvement
pull/2/head
Tanguy Pruvot 7 years ago
parent
commit
5a90db192a
  1. 3
      Makefile.am
  2. 3
      ccminer.vcxproj
  3. 3
      ccminer.vcxproj.filters
  4. 8
      res/ccminer.rc
  5. 2
      skunk/cuda_skunk_streebog.cu
  6. 309
      x11/cuda_streebog_maxwell.cu
  7. 48
      x11/phi.cu
  8. 10
      x11/sib.cu
  9. 0
      x11/streebog_arrays.cuh

3
Makefile.am

@ -74,7 +74,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
x11/c11.cu x11/phi.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu x11/phi.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
# scrypt # scrypt
ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \ ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \

3
ccminer.vcxproj

@ -487,7 +487,6 @@
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
<FastMath>true</FastMath> <FastMath>true</FastMath>
</CudaCompile> </CudaCompile>
<CudaCompile Include="x11\phi.cu" />
<CudaCompile Include="quark\cuda_bmw512.cu"> <CudaCompile Include="quark\cuda_bmw512.cu">
<MaxRegCount>128</MaxRegCount> <MaxRegCount>128</MaxRegCount>
</CudaCompile> </CudaCompile>
@ -560,8 +559,10 @@
<MaxRegCount>64</MaxRegCount> <MaxRegCount>64</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile Include="x11\cuda_streebog.cu" /> <CudaCompile Include="x11\cuda_streebog.cu" />
<CudaCompile Include="x11\cuda_streebog_maxwell.cu" />
<CudaCompile Include="x11\c11.cu" /> <CudaCompile Include="x11\c11.cu" />
<CudaCompile Include="x11\fresh.cu" /> <CudaCompile Include="x11\fresh.cu" />
<CudaCompile Include="x11\phi.cu" />
<CudaCompile Include="x11\sib.cu" /> <CudaCompile Include="x11\sib.cu" />
<CudaCompile Include="x11\s3.cu" /> <CudaCompile Include="x11\s3.cu" />
<CudaCompile Include="x11\timetravel.cu" /> <CudaCompile Include="x11\timetravel.cu" />

3
ccminer.vcxproj.filters

@ -787,6 +787,9 @@
<CudaCompile Include="x11\cuda_streebog.cu"> <CudaCompile Include="x11\cuda_streebog.cu">
<Filter>Source Files\CUDA\x11</Filter> <Filter>Source Files\CUDA\x11</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="x11\cuda_streebog_maxwell.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\s3.cu"> <CudaCompile Include="x11\s3.cu">
<Filter>Source Files\CUDA\x11</Filter> <Filter>Source Files\CUDA\x11</Filter>
</CudaCompile> </CudaCompile>

8
res/ccminer.rc

@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
// //
VS_VERSION_INFO VERSIONINFO VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,2,1,0 FILEVERSION 2,2,2,0
PRODUCTVERSION 2,2,1,0 PRODUCTVERSION 2,2,2,0
FILEFLAGSMASK 0x3fL FILEFLAGSMASK 0x3fL
#ifdef _DEBUG #ifdef _DEBUG
FILEFLAGS 0x21L FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN
BEGIN BEGIN
BLOCK "040904e4" BLOCK "040904e4"
BEGIN BEGIN
VALUE "FileVersion", "2.2.1" VALUE "FileVersion", "2.2.2"
VALUE "LegalCopyright", "Copyright (C) 2017" VALUE "LegalCopyright", "Copyright (C) 2017"
VALUE "ProductName", "ccminer" VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "2.2.1" VALUE "ProductVersion", "2.2.2"
END END
END END
BLOCK "VarFileInfo" BLOCK "VarFileInfo"

2
skunk/cuda_skunk_streebog.cu

@ -18,7 +18,7 @@
#include <cuda_vectors.h> #include <cuda_vectors.h>
#include <cuda_vector_uint2x4.h> #include <cuda_vector_uint2x4.h>
#include "skunk/streebog_arrays.cuh" #include "x11/streebog_arrays.cuh"
//#define FULL_UNROLL //#define FULL_UNROLL
__device__ __forceinline__ __device__ __forceinline__

309
x11/cuda_streebog_maxwell.cu

@ -0,0 +1,309 @@
/*
* Streebog GOST R 34.10-2012 CUDA implementation.
*
* https://tools.ietf.org/html/rfc6986
* https://en.wikipedia.org/wiki/Streebog
*
* ==========================(LICENSE BEGIN)============================
*
* @author Tanguy Pruvot - 2015
* @author Alexis Provos - 2016
*/
// Further improved with shared memory partial utilization
// Tested under CUDA7.5 toolkit for cp 5.0/5.2
//#include <miner.h>
#include <cuda_helper.h>
#include <cuda_vectors.h>
#include <cuda_vector_uint2x4.h>
#include "streebog_arrays.cuh"
//#define FULL_UNROLL
__device__ __forceinline__
static void GOST_FS(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state)
{
return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)])
^ shared[1][__byte_perm(state[6].x,0,0x44440)]
^ shared[2][__byte_perm(state[5].x,0,0x44440)]
^ shared[3][__byte_perm(state[4].x,0,0x44440)]
^ shared[4][__byte_perm(state[3].x,0,0x44440)]
^ shared[5][__byte_perm(state[2].x,0,0x44440)]
^ shared[6][__byte_perm(state[1].x,0,0x44440)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]);
return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)])
^ shared[2][__byte_perm(state[5].x,0,0x44441)]
^ shared[3][__byte_perm(state[4].x,0,0x44441)]
^ shared[4][__byte_perm(state[3].x,0,0x44441)]
^ shared[5][__byte_perm(state[2].x,0,0x44441)]
^ shared[6][__byte_perm(state[1].x,0,0x44441)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]);
return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)])
^ shared[2][__byte_perm(state[5].x,0,0x44442)]
^ shared[3][__byte_perm(state[4].x,0,0x44442)]
^ shared[4][__byte_perm(state[3].x,0,0x44442)]
^ shared[5][__byte_perm(state[2].x,0,0x44442)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)])
^ shared[6][__byte_perm(state[1].x,0,0x44442)];
return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)])
^ shared[1][__byte_perm(state[6].x,0,0x44443)]
^ shared[2][__byte_perm(state[5].x,0,0x44443)]
^ shared[3][__byte_perm(state[4].x,0,0x44443)]
^ __ldg(&T42[__byte_perm(state[3].x,0,0x44443)])
^ shared[5][__byte_perm(state[2].x,0,0x44443)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)])
^ shared[6][__byte_perm(state[1].x,0,0x44443)];
return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)])
^ shared[1][__byte_perm(state[6].y,0,0x44440)]
^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)])
^ shared[3][__byte_perm(state[4].y,0,0x44440)]
^ shared[4][__byte_perm(state[3].y,0,0x44440)]
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)])
^ shared[5][__byte_perm(state[2].y,0,0x44440)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]);
return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)])
^ shared[2][__byte_perm(state[5].y,0,0x44441)]
^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)])
^ shared[3][__byte_perm(state[4].y,0,0x44441)]
^ shared[4][__byte_perm(state[3].y,0,0x44441)]
^ shared[5][__byte_perm(state[2].y,0,0x44441)]
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)])
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]);
return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)])
^ shared[1][__byte_perm(state[6].y,0,0x44442)]
^ shared[2][__byte_perm(state[5].y,0,0x44442)]
^ shared[3][__byte_perm(state[4].y,0,0x44442)]
^ shared[4][__byte_perm(state[3].y,0,0x44442)]
^ shared[5][__byte_perm(state[2].y,0,0x44442)]
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)])
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]);
return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)])
^ __ldg(&T12[__byte_perm(state[6].y,0,0x44443)])
^ shared[2][__byte_perm(state[5].y,0,0x44443)]
^ shared[3][__byte_perm(state[4].y,0,0x44443)]
^ shared[4][__byte_perm(state[3].y,0,0x44443)]
^ shared[5][__byte_perm(state[2].y,0,0x44443)]
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)])
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]);
}
__device__ __forceinline__
static void GOST_FS_LDG(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state)
{
return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44440)])
^ shared[2][__byte_perm(state[5].x,0,0x44440)]
^ shared[3][__byte_perm(state[4].x,0,0x44440)]
^ shared[4][__byte_perm(state[3].x,0,0x44440)]
^ shared[5][__byte_perm(state[2].x,0,0x44440)]
^ shared[6][__byte_perm(state[1].x,0,0x44440)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]);
return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)])
^ shared[2][__byte_perm(state[5].x,0,0x44441)]
^ shared[3][__byte_perm(state[4].x,0,0x44441)]
^ shared[4][__byte_perm(state[3].x,0,0x44441)]
^ shared[5][__byte_perm(state[2].x,0,0x44441)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)])
^ shared[6][__byte_perm(state[1].x,0,0x44441)];
return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)])
^ shared[2][__byte_perm(state[5].x,0,0x44442)]
^ shared[3][__byte_perm(state[4].x,0,0x44442)]
^ shared[4][__byte_perm(state[3].x,0,0x44442)]
^ shared[5][__byte_perm(state[2].x,0,0x44442)]
^ shared[6][__byte_perm(state[1].x,0,0x44442)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]);
return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44443)])
^ shared[2][__byte_perm(state[5].x,0,0x44443)]
^ shared[3][__byte_perm(state[4].x,0,0x44443)]
^ shared[4][__byte_perm(state[3].x,0,0x44443)]
^ shared[5][__byte_perm(state[2].x,0,0x44443)]
^ shared[6][__byte_perm(state[1].x,0,0x44443)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]);
return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)])
^ shared[1][__byte_perm(state[6].y,0,0x44440)]
^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)])
^ shared[3][__byte_perm(state[4].y,0,0x44440)]
^ shared[4][__byte_perm(state[3].y,0,0x44440)]
^ shared[5][__byte_perm(state[2].y,0,0x44440)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)])
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]);
return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)])
^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)])
^ shared[2][__byte_perm(state[5].y,0,0x44441)]
^ shared[3][__byte_perm(state[4].y,0,0x44441)]
^ shared[4][__byte_perm(state[3].y,0,0x44441)]
^ shared[5][__byte_perm(state[2].y,0,0x44441)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)])
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]);
return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)])
^ __ldg(&T12[__byte_perm(state[6].y,0,0x44442)])
^ __ldg(&T22[__byte_perm(state[5].y,0,0x44442)])
^ shared[3][__byte_perm(state[4].y,0,0x44442)]
^ shared[4][__byte_perm(state[3].y,0,0x44442)]
^ shared[5][__byte_perm(state[2].y,0,0x44442)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)])
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]);
return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)])
^ shared[1][__byte_perm(state[6].y,0,0x44443)]
^ __ldg(&T22[__byte_perm(state[5].y,0,0x44443)])
^ shared[3][__byte_perm(state[4].y,0,0x44443)]
^ shared[4][__byte_perm(state[3].y,0,0x44443)]
^ shared[5][__byte_perm(state[2].y,0,0x44443)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)])
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]);
}
__device__ __forceinline__
static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uint2 *const __restrict__ state)
{
uint2 t[8];
for(int i=0; i<12; i++){
GOST_FS(shared,state, t);
#pragma unroll 8
for(int j=0;j<8;j++)
K[ j] ^= *(uint2*)&CC[i][j];
#pragma unroll 8
for(int j=0;j<8;j++)
state[ j] = t[ j];
GOST_FS_LDG(shared,K, t);
#pragma unroll 8
for(int j=0;j<8;j++)
state[ j]^= t[ j];
#pragma unroll 8
for(int j=0;j<8;j++)
K[ j] = t[ j];
}
}
#define TPB 256
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(TPB, 3)
#else
__launch_bounds__(TPB, 3)
#endif
void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint2 buf[8], t[8], temp[8], K0[8], hash[8];
__shared__ uint2 shared[8][256];
shared[0][threadIdx.x] = __ldg(&T02[threadIdx.x]);
shared[1][threadIdx.x] = __ldg(&T12[threadIdx.x]);
shared[2][threadIdx.x] = __ldg(&T22[threadIdx.x]);
shared[3][threadIdx.x] = __ldg(&T32[threadIdx.x]);
shared[4][threadIdx.x] = __ldg(&T42[threadIdx.x]);
shared[5][threadIdx.x] = __ldg(&T52[threadIdx.x]);
shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);
uint64_t* inout = &g_hash[thread<<3];
*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
__threadfence_block();
K0[0] = vectorize(0x74a5d4ce2efc83b3);
#pragma unroll 8
for(int i=0;i<8;i++){
buf[ i] = K0[ 0] ^ hash[ i];
}
for(int i=0; i<12; i++){
GOST_FS(shared, buf, temp);
#pragma unroll 8
for(uint32_t j=0;j<8;j++){
buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j];
}
}
#pragma unroll 8
for(int j=0;j<8;j++){
buf[ j]^= hash[ j];
}
#pragma unroll 8
for(int j=0;j<8;j++){
K0[ j] = buf[ j];
}
K0[7].y ^= 0x00020000;
GOST_FS(shared, K0, t);
#pragma unroll 8
for(int i=0;i<8;i++)
K0[ i] = t[ i];
t[7].y ^= 0x01000000;
GOST_E12(shared, K0, t);
#pragma unroll 8
for(int j=0;j<8;j++)
buf[ j] ^= t[ j];
buf[7].y ^= 0x01000000;
GOST_FS(shared, buf,K0);
buf[7].y ^= 0x00020000;
#pragma unroll 8
for(int j=0;j<8;j++)
t[ j] = K0[ j];
t[7].y ^= 0x00020000;
GOST_E12(shared, K0, t);
#pragma unroll 8
for(int j=0;j<8;j++)
buf[ j] ^= t[ j];
GOST_FS(shared, buf,K0); // K = F(h)
hash[7]+= vectorize(0x0100000000000000);
#pragma unroll 8
for(int j=0;j<8;j++)
t[ j] = K0[ j] ^ hash[ j];
GOST_E12(shared, K0, t);
*(uint2x4*)&inout[0] = *(uint2x4*)&t[0] ^ *(uint2x4*)&hash[0] ^ *(uint2x4*)&buf[0];
*(uint2x4*)&inout[4] = *(uint2x4*)&t[4] ^ *(uint2x4*)&hash[4] ^ *(uint2x4*)&buf[4];
}
__host__
void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash)
{
dim3 grid((threads + TPB-1) / TPB);
dim3 block(TPB);
streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash);
}

48
x11/phi.cu

@ -22,17 +22,21 @@ extern "C" {
#include "cuda_x11.h" #include "cuda_x11.h"
extern void skein512_cpu_setBlock_80(void *pdata); extern void skein512_cpu_setBlock_80(void *pdata);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_free(int thr_id); extern void x13_fugue512_cpu_free(int thr_id);
extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target);
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS];
extern "C" void phihash(void *output, const void *input) extern "C" void phihash(void *output, const void *input)
{ {
@ -76,6 +80,7 @@ extern "C" void phihash(void *output, const void *input)
#include "cuda_debug.cuh" #include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 }; static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{ {
@ -96,7 +101,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
if (!init[thr_id]) if (!init[thr_id])
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) { if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset(); cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@ -105,13 +110,19 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
} }
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
quark_skein512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput);
x11_cubehash512_cpu_init(thr_id, throughput); x11_cubehash512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput); if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput), -1); CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput), -1);
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
cuda_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }
@ -122,7 +133,10 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
be32enc(&endiandata[k], pdata[k]); be32enc(&endiandata[k], pdata[k]);
skein512_cpu_setBlock_80((void*)endiandata); skein512_cpu_setBlock_80((void*)endiandata);
cuda_check_cpu_setTarget(ptarget); if (use_compat_kernels[thr_id])
cuda_check_cpu_setTarget(ptarget);
else
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
do { do {
int order = 0; int order = 0;
@ -131,24 +145,33 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); if (use_compat_kernels[thr_id]) {
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
} else {
streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]);
tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6]));
cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
}
if (work->nonces[0] != UINT32_MAX) if (work->nonces[0] != UINT32_MAX)
{ {
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
const uint32_t startNonce = pdata[19];
uint32_t _ALIGN(64) vhash[8]; uint32_t _ALIGN(64) vhash[8];
if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce;
be32enc(&endiandata[19], work->nonces[0]); be32enc(&endiandata[19], work->nonces[0]);
phihash(vhash, endiandata); phihash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1; work->valid_nonces = 1;
work_set_target_ratio(work, vhash); work_set_target_ratio(work, vhash);
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
*hashes_done = pdata[19] - first_nonce + throughput; *hashes_done = pdata[19] - first_nonce + throughput;
if (work->nonces[1] != 0) { //work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
//if (work->nonces[1] != 0) {
if (work->nonces[1] != UINT32_MAX) {
work->nonces[1] += startNonce;
be32enc(&endiandata[19], work->nonces[1]); be32enc(&endiandata[19], work->nonces[1]);
phihash(vhash, endiandata); phihash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1); bn_set_target_ratio(work, vhash, 1);
@ -164,6 +187,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
gpu_increment_reject(thr_id); gpu_increment_reject(thr_id);
if (!opt_quiet) if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
pdata[19] = work->nonces[0] + 1; pdata[19] = work->nonces[0] + 1;
continue; continue;
} }
@ -189,6 +213,8 @@ extern "C" void free_phi(int thr_id)
cudaThreadSynchronize(); cudaThreadSynchronize();
cudaFree(d_hash[thr_id]); cudaFree(d_hash[thr_id]);
cudaFree(d_resNonce[thr_id]);
x13_fugue512_cpu_free(thr_id);
cuda_check_cpu_free(thr_id); cuda_check_cpu_free(thr_id);
init[thr_id] = false; init[thr_id] = false;

10
x11/sib.cu

@ -18,6 +18,7 @@ extern "C" {
#include "cuda_x11.h" #include "cuda_x11.h"
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash);
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
@ -98,6 +99,7 @@ extern "C" void sibhash(void *output, const void *input)
#include "cuda_debug.cuh" #include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 }; static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{ {
@ -124,6 +126,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
} }
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
quark_blake512_cpu_init(thr_id, throughput); quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput);
@ -166,7 +171,10 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
TRACE("jh512 :"); TRACE("jh512 :");
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :"); TRACE("keccak :");
streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); if (use_compat_kernels[thr_id])
streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
else
streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]);
TRACE("gost :"); TRACE("gost :");
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
TRACE("luffa+c:"); TRACE("luffa+c:");

0
skunk/streebog_arrays.cuh → x11/streebog_arrays.cuh

Loading…
Cancel
Save