Browse Source

blake2s cleanup

2upstream 1.7.5-tpruvot
Tanguy Pruvot 8 years ago
parent
commit
5a69056ee5
  1. 80
      Algo256/blake2s.cu
  2. 5
      README.txt
  3. 5
      ccminer.cpp
  4. 8
      res/ccminer.rc

80
Algo256/blake2s.cu

@ -1,3 +1,7 @@ @@ -1,3 +1,7 @@
/**
* Blake2-S 256 CUDA implementation
* @author tpruvot@github March 2016
*/
#include <stdio.h>
#include <string.h>
#include <stdint.h>
@ -5,19 +9,18 @@ @@ -5,19 +9,18 @@
#include "miner.h"
#define NATIVE_LITTLE_ENDIAN
extern "C" {
#define NATIVE_LITTLE_ENDIAN
#include <sph/blake2s.h>
}
static __thread blake2s_state ALIGN(64) s_midstate;
static __thread blake2s_state ALIGN(64) s_ctx;
//#define GPU_MIDSTATE
#define MIDLEN 76
#define A 64
static __thread blake2s_state ALIGN(A) s_midstate;
static __thread blake2s_state ALIGN(A) s_ctx;
#include "cuda_helper.h"
#ifdef __INTELLISENSE__
@ -63,7 +66,7 @@ inline void blake2s_hash_end(uint32_t *output, const uint32_t *input) @@ -63,7 +66,7 @@ inline void blake2s_hash_end(uint32_t *output, const uint32_t *input)
}
__host__
void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate)
void blake2s_setBlock(uint32_t *penddata, blake2s_state *pstate)
{
#ifndef GPU_MIDSTATE
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, penddata, 80, 0, cudaMemcpyHostToDevice));
@ -73,31 +76,18 @@ void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate) @@ -73,31 +76,18 @@ void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate)
}
__device__ __forceinline__
uint32_t gpu_load32(const void *src) {
return *(uint32_t *)(src);
uint64_t gpu_load64(void *src) {
return *(uint64_t*)(src);
}
__device__ __forceinline__
void gpu_store32(void *dst, uint32_t dw) {
*(uint32_t *)(dst) = dw;
*(uint32_t*)(dst) = dw;
}
__device__ __forceinline__
void gpu_store64(void *dst, uint64_t lw) {
*(uint64_t *)(dst) = lw;
}
__device__ __forceinline__
uint64_t gpu_load48(const void *src)
{
const uint8_t *p = (const uint8_t *)src;
uint64_t w = *p++;
w |= (uint64_t)(*p++) << 8;
w |= (uint64_t)(*p++) << 16;
w |= (uint64_t)(*p++) << 24;
w |= (uint64_t)(*p++) << 32;
w |= (uint64_t)(*p++) << 40;
return w;
*(uint64_t*)(dst) = lw;
}
__device__ __forceinline__
@ -202,10 +192,11 @@ void gpu_blake2s_compress(blake2s_state *S, const uint32_t *block) @@ -202,10 +192,11 @@ void gpu_blake2s_compress(blake2s_state *S, const uint32_t *block)
for(int i = 0; i < 8; i++)
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
#undef G
#undef ROUND
#undef G
#undef ROUND
}
/* unused but kept as reference */
__device__ __forceinline__
void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
{
@ -236,10 +227,10 @@ void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen) @@ -236,10 +227,10 @@ void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
}
__device__ __forceinline__
void gpu_blake2s_update76(blake2s_state *S, const void *in)
void gpu_blake2s_update76(blake2s_state *S, const void *input)
{
uint64_t *b64 = (uint64_t*) S->buf;
uint64_t *i64 = (uint64_t*) in;
uint64_t *i64 = (uint64_t*) input;
#pragma unroll
for (int i=0; i < 80/8; i++)
b64[i] = i64[i];
@ -295,14 +286,14 @@ void gpu_blake2s_init_param(blake2s_state *S, const blake2s_param *P) @@ -295,14 +286,14 @@ void gpu_blake2s_init_param(blake2s_state *S, const blake2s_param *P)
S->buflen = 0;
#pragma unroll
for (int i = 0; i < sizeof(S->buf)/4; i++)
gpu_store32(S->buf + (4*i), 0);
for (int i = 0; i < sizeof(S->buf)/8; i++)
gpu_store64(S->buf + (8*i), 0);
uint32_t *p = (uint32_t*) P;
uint64_t *p = (uint64_t*) P;
/* IV XOR ParamBlock */
for (int i = 0; i < 8; i++)
S->h[i] ^= gpu_load32(&p[i]);
for (int i = 0; i < 4; i++)
S->h[i] ^= gpu_load64(&p[i]);
}
// Sequential blake2s initialization
@ -373,7 +364,7 @@ void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_ @@ -373,7 +364,7 @@ void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_
}
__host__
uint32_t blake2s_host_hash(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2)
uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2)
{
uint32_t result = UINT32_MAX;
@ -406,14 +397,15 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc @@ -406,14 +397,15 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 26 : 22;
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 28 : 25;
if (device_sm[dev_id] < 350) intensity = 22;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) {
ptarget[7] = swab32(0xff);
ptarget[6] = swab32(0xFFFF0);
ptarget[7] = 0;
}
if (!init[thr_id])
@ -442,13 +434,14 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc @@ -442,13 +434,14 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
blake2s_cpu_setBlock(endiandata, &s_midstate);
blake2s_setBlock(endiandata, &s_midstate);
uint2 gpu_target = make_uint2(ptarget[7], ptarget[6]);
const uint32_t Htarg = ptarget[7];
const uint2 target = make_uint2(ptarget[7], ptarget[6]);
do {
uint32_t foundNonce = blake2s_host_hash(thr_id, throughput, pdata[19], gpu_target);
uint32_t foundNonce = blake2s_hash_cuda(thr_id, throughput, pdata[19], target);
*hashes_done = pdata[19] - first_nonce + throughput;
if (foundNonce != UINT32_MAX)
{
@ -458,15 +451,14 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc @@ -458,15 +451,14 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
le32enc(&endiandata[19], foundNonce);
blake2s_hash_end(vhashcpu, endiandata);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) {
work_set_target_ratio(work, vhashcpu);
*hashes_done = pdata[19] + throughput - first_nonce + 1;
pdata[19] = work->nonces[0] = swab32(foundNonce);
#if NBN > 1
if (extra_results[0] != UINT32_MAX) {
le32enc(&endiandata[19], extra_results[0]);
blake2s_hash_end(vhashcpu, endiandata);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) {
work->nonces[1] = swab32(extra_results[0]);
if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) {
work_set_target_ratio(work, vhashcpu);
@ -490,9 +482,7 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc @@ -490,9 +482,7 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
} while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + pdata[19]);
*hashes_done = pdata[19] - first_nonce + 1;
MyStreamSynchronize(NULL, 0, device_map[thr_id]);
*hashes_done = pdata[19] - first_nonce;
return 0;
}

5
README.txt

@ -238,9 +238,8 @@ features. @@ -238,9 +238,8 @@ features.
>>> RELEASE HISTORY <<<
Mar. 12th 2015 v1.7.5
Blake2S Algo
...
Mar. 13th 2015 v1.7.5
Blake2S Algo (NEVA/OXEN)
Feb. 28th 2015 v1.7.4 (1.7.3 was a preview, not official)
Decred simplified stratum (getwork over stratum)

5
ccminer.cpp

@ -1713,6 +1713,9 @@ static void *miner_thread(void *userdata) @@ -1713,6 +1713,9 @@ static void *miner_thread(void *userdata)
// and make an unique work (extradata)
nonceptr[1] += 1;
nonceptr[2] |= thr_id;
} else if (opt_benchmark) {
// randomize work
nonceptr[-1] += 1;
}
pthread_mutex_unlock(&g_work_lock);
@ -1826,11 +1829,11 @@ static void *miner_thread(void *userdata) @@ -1826,11 +1829,11 @@ static void *miner_thread(void *userdata)
if (max64 < minmax) {
switch (opt_algo) {
case ALGO_BLAKECOIN:
case ALGO_BLAKE2S:
case ALGO_VANILLA:
minmax = 0x80000000U;
break;
case ALGO_BLAKE:
case ALGO_BLAKE2S:
case ALGO_BMW:
case ALGO_DECRED:
//case ALGO_WHIRLPOOLX:

8
res/ccminer.rc

@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 1,7,4,0
PRODUCTVERSION 1,7,4,0
FILEVERSION 1,7,5,0
PRODUCTVERSION 1,7,5,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN @@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
VALUE "FileVersion", "1.7.4"
VALUE "FileVersion", "1.7.5"
VALUE "LegalCopyright", "Copyright (C) 2016"
VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "1.7.4"
VALUE "ProductVersion", "1.7.5"
END
END
BLOCK "VarFileInfo"

Loading…
Cancel
Save