mirror of https://github.com/GOSTSec/sgminer
djm34
10 years ago
8 changed files with 1092 additions and 6 deletions
@ -0,0 +1,482 @@
@@ -0,0 +1,482 @@
|
||||
/*-
|
||||
* Copyright 2014 James Lovejoy |
||||
* Copyright 2014 phm |
||||
* All rights reserved. |
||||
* |
||||
* Redistribution and use in source and binary forms, with or without |
||||
* modification, are permitted provided that the following conditions |
||||
* are met: |
||||
* 1. Redistributions of source code must retain the above copyright |
||||
* notice, this list of conditions and the following disclaimer. |
||||
* 2. Redistributions in binary form must reproduce the above copyright |
||||
* notice, this list of conditions and the following disclaimer in the |
||||
* documentation and/or other materials provided with the distribution. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
||||
* SUCH DAMAGE. |
||||
*/ |
||||
|
||||
#include "config.h" |
||||
#include "miner.h" |
||||
|
||||
#include <stdlib.h> |
||||
#include <stdint.h> |
||||
#include <string.h> |
||||
|
||||
|
||||
|
||||
static const uint32_t sha256_h[8] = { |
||||
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, |
||||
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 |
||||
}; |
||||
|
||||
static const uint32_t sha256_k[64] = { |
||||
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
||||
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
||||
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
||||
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, |
||||
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, |
||||
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, |
||||
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, |
||||
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, |
||||
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, |
||||
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, |
||||
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, |
||||
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, |
||||
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, |
||||
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, |
||||
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, |
||||
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 |
||||
}; |
||||
|
||||
void sha256_init(uint32_t *state) |
||||
{ |
||||
memcpy(state, sha256_h, 32); |
||||
} |
||||
|
||||
/* Elementary functions used by SHA256 */ |
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z) |
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z)) |
||||
#define ROTR(x, n) ((x >> n) | (x << (32 - n))) |
||||
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) |
||||
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) |
||||
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) |
||||
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) |
||||
|
||||
/* SHA256 round function */ |
||||
#define RND(a, b, c, d, e, f, g, h, k) \ |
||||
do { \ |
||||
t0 = h + S1(e) + Ch(e, f, g) + k; \ |
||||
t1 = S0(a) + Maj(a, b, c); \ |
||||
d += t0; \ |
||||
h = t0 + t1; \ |
||||
} while (0) |
||||
|
||||
/* Adjusted round function for rotating state */ |
||||
#define RNDr(S, W, i) \ |
||||
RND(S[(64 - i) % 8], S[(65 - i) % 8], \ |
||||
S[(66 - i) % 8], S[(67 - i) % 8], \ |
||||
S[(68 - i) % 8], S[(69 - i) % 8], \ |
||||
S[(70 - i) % 8], S[(71 - i) % 8], \ |
||||
W[i] + sha256_k[i]) |
||||
|
||||
|
||||
/*
|
||||
* SHA256 block compression function. The 256-bit state is transformed via |
||||
* the 512-bit input block to produce a new state. |
||||
*/ |
||||
void sha256_transform(uint32_t *state, const uint32_t *block, int swap) |
||||
{ |
||||
uint32_t W[64]; |
||||
uint32_t S[8]; |
||||
uint32_t t0, t1; |
||||
int i; |
||||
|
||||
/* 1. Prepare message schedule W. */ |
||||
if (swap) { |
||||
for (i = 0; i < 16; i++) |
||||
W[i] = swab32(block[i]); |
||||
} |
||||
else |
||||
memcpy(W, block, 64); |
||||
for (i = 16; i < 64; i += 2) { |
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; |
||||
W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; |
||||
} |
||||
|
||||
/* 2. Initialize working variables. */ |
||||
memcpy(S, state, 32); |
||||
|
||||
/* 3. Mix. */ |
||||
RNDr(S, W, 0); |
||||
RNDr(S, W, 1); |
||||
RNDr(S, W, 2); |
||||
RNDr(S, W, 3); |
||||
RNDr(S, W, 4); |
||||
RNDr(S, W, 5); |
||||
RNDr(S, W, 6); |
||||
RNDr(S, W, 7); |
||||
RNDr(S, W, 8); |
||||
RNDr(S, W, 9); |
||||
RNDr(S, W, 10); |
||||
RNDr(S, W, 11); |
||||
RNDr(S, W, 12); |
||||
RNDr(S, W, 13); |
||||
RNDr(S, W, 14); |
||||
RNDr(S, W, 15); |
||||
RNDr(S, W, 16); |
||||
RNDr(S, W, 17); |
||||
RNDr(S, W, 18); |
||||
RNDr(S, W, 19); |
||||
RNDr(S, W, 20); |
||||
RNDr(S, W, 21); |
||||
RNDr(S, W, 22); |
||||
RNDr(S, W, 23); |
||||
RNDr(S, W, 24); |
||||
RNDr(S, W, 25); |
||||
RNDr(S, W, 26); |
||||
RNDr(S, W, 27); |
||||
RNDr(S, W, 28); |
||||
RNDr(S, W, 29); |
||||
RNDr(S, W, 30); |
||||
RNDr(S, W, 31); |
||||
RNDr(S, W, 32); |
||||
RNDr(S, W, 33); |
||||
RNDr(S, W, 34); |
||||
RNDr(S, W, 35); |
||||
RNDr(S, W, 36); |
||||
RNDr(S, W, 37); |
||||
RNDr(S, W, 38); |
||||
RNDr(S, W, 39); |
||||
RNDr(S, W, 40); |
||||
RNDr(S, W, 41); |
||||
RNDr(S, W, 42); |
||||
RNDr(S, W, 43); |
||||
RNDr(S, W, 44); |
||||
RNDr(S, W, 45); |
||||
RNDr(S, W, 46); |
||||
RNDr(S, W, 47); |
||||
RNDr(S, W, 48); |
||||
RNDr(S, W, 49); |
||||
RNDr(S, W, 50); |
||||
RNDr(S, W, 51); |
||||
RNDr(S, W, 52); |
||||
RNDr(S, W, 53); |
||||
RNDr(S, W, 54); |
||||
RNDr(S, W, 55); |
||||
RNDr(S, W, 56); |
||||
RNDr(S, W, 57); |
||||
RNDr(S, W, 58); |
||||
RNDr(S, W, 59); |
||||
RNDr(S, W, 60); |
||||
RNDr(S, W, 61); |
||||
RNDr(S, W, 62); |
||||
RNDr(S, W, 63); |
||||
|
||||
/* 4. Mix local working variables into global state */ |
||||
for (i = 0; i < 8; i++) |
||||
state[i] += S[i]; |
||||
} |
||||
|
||||
/*
|
||||
* Encode a length len/4 vector of (uint32_t) into a length len vector of |
||||
* (unsigned char) in big-endian form. Assumes len is a multiple of 4. |
||||
*/ |
||||
static inline void |
||||
be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) |
||||
{ |
||||
uint32_t i; |
||||
|
||||
for (i = 0; i < len; i++) |
||||
dst[i] = htobe32(src[i]); |
||||
} |
||||
static inline void be32enc(void *pp, uint32_t x) |
||||
{ |
||||
uint8_t *p = (uint8_t *)pp; |
||||
p[3] = x & 0xff; |
||||
p[2] = (x >> 8) & 0xff; |
||||
p[1] = (x >> 16) & 0xff; |
||||
p[0] = (x >> 24) & 0xff; |
||||
} |
||||
static inline uint32_t be32dec(const void *pp) |
||||
{ |
||||
const uint8_t *p = (uint8_t const *)pp; |
||||
return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + |
||||
((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); |
||||
} |
||||
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) |
||||
//note, this is 64 bytes
|
||||
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) |
||||
{ |
||||
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) |
||||
uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15; |
||||
int i; |
||||
|
||||
x00 = (B[0] ^= Bx[0]); |
||||
x01 = (B[1] ^= Bx[1]); |
||||
x02 = (B[2] ^= Bx[2]); |
||||
x03 = (B[3] ^= Bx[3]); |
||||
x04 = (B[4] ^= Bx[4]); |
||||
x05 = (B[5] ^= Bx[5]); |
||||
x06 = (B[6] ^= Bx[6]); |
||||
x07 = (B[7] ^= Bx[7]); |
||||
x08 = (B[8] ^= Bx[8]); |
||||
x09 = (B[9] ^= Bx[9]); |
||||
x10 = (B[10] ^= Bx[10]); |
||||
x11 = (B[11] ^= Bx[11]); |
||||
x12 = (B[12] ^= Bx[12]); |
||||
x13 = (B[13] ^= Bx[13]); |
||||
x14 = (B[14] ^= Bx[14]); |
||||
x15 = (B[15] ^= Bx[15]); |
||||
for (i = 0; i < 8; i += 2) { |
||||
/* Operate on columns. */ |
||||
x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7); |
||||
x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7); |
||||
|
||||
x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9); |
||||
x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9); |
||||
|
||||
x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13); |
||||
x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13); |
||||
|
||||
x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18); |
||||
x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18); |
||||
|
||||
/* Operate on rows. */ |
||||
x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7); |
||||
x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7); |
||||
|
||||
x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9); |
||||
x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9); |
||||
|
||||
x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13); |
||||
x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13); |
||||
|
||||
x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18); |
||||
x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18); |
||||
} |
||||
B[0] += x00; |
||||
B[1] += x01; |
||||
B[2] += x02; |
||||
B[3] += x03; |
||||
B[4] += x04; |
||||
B[5] += x05; |
||||
B[6] += x06; |
||||
B[7] += x07; |
||||
B[8] += x08; |
||||
B[9] += x09; |
||||
B[10] += x10; |
||||
B[11] += x11; |
||||
B[12] += x12; |
||||
B[13] += x13; |
||||
B[14] += x14; |
||||
B[15] += x15; |
||||
#undef ROTL |
||||
} |
||||
|
||||
void sha256_hash(unsigned char *hash, const unsigned char *data, int len) |
||||
{ |
||||
uint32_t S[16], T[16]; |
||||
int i, r; |
||||
|
||||
sha256_init(S); |
||||
for (r = len; r > -9; r -= 64) { |
||||
if (r < 64) |
||||
memset(T, 0, 64); |
||||
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r)); |
||||
if (r >= 0 && r < 64) |
||||
((unsigned char *)T)[r] = 0x80; |
||||
for (i = 0; i < 16; i++) |
||||
T[i] = be32dec(T + i); |
||||
|
||||
if (r < 56) |
||||
T[15] = 8 * len; |
||||
sha256_transform(S, T, 0); |
||||
} |
||||
for (i = 0; i < 8; i++) |
||||
be32enc((uint32_t *)hash + i, S[i]); |
||||
} |
||||
|
||||
void sha256_hash512(unsigned char *hash, const unsigned char *data) |
||||
{ |
||||
uint32_t S[16], T[16]; |
||||
int i; |
||||
|
||||
sha256_init(S); |
||||
|
||||
memcpy(T, data, 64); |
||||
|
||||
for (i = 0; i < 16; i++) |
||||
T[i] = be32dec(T + i); |
||||
sha256_transform(S, T, 0); |
||||
|
||||
memset(T, 0, 64); |
||||
//memcpy(T, data + 64, 0);
|
||||
((unsigned char *)T)[0] = 0x80; |
||||
for (i = 0; i < 16; i++) |
||||
T[i] = be32dec(T + i); |
||||
T[15] = 8 * 64; |
||||
sha256_transform(S, T, 0); |
||||
|
||||
for (i = 0; i < 8; i++) |
||||
be32enc((uint32_t *)hash + i, S[i]); |
||||
} |
||||
|
||||
inline void pluckrehash(void *state, const void *input) |
||||
{ |
||||
|
||||
int i,j; |
||||
uint32_t data[20]; |
||||
|
||||
const int HASH_MEMORY = 128 * 1024; |
||||
uint8_t * scratchbuf = (uint8_t*)malloc(HASH_MEMORY); |
||||
memcpy(data,input,80); |
||||
|
||||
uint8_t hashbuffer[128*1024]; //don't allocate this on stack, since it's huge..
|
||||
int size = HASH_MEMORY; |
||||
memset(hashbuffer, 0, 64); |
||||
sha256_hash(&hashbuffer[0], (uint8_t*)data, 80); |
||||
for (i = 64; i < size - 32; i += 32) |
||||
{ |
||||
int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases
|
||||
uint32_t joint[16]; |
||||
uint32_t randbuffer[16]; |
||||
|
||||
uint32_t randseed[16]; |
||||
memcpy(randseed, &hashbuffer[i - 64], 64); |
||||
if (i>128) |
||||
{ |
||||
memcpy(randbuffer, &hashbuffer[i - 128], 64); |
||||
} |
||||
else |
||||
{ |
||||
memset(&randbuffer, 0, 64); |
||||
} |
||||
|
||||
xor_salsa8(randbuffer, randseed); |
||||
|
||||
memcpy(joint, &hashbuffer[i - 32], 32); |
||||
//use the last hash value as the seed
|
||||
for (j = 32; j < 64; j += 4) |
||||
{ |
||||
uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); |
||||
joint[j / 4] = *((uint32_t*)&hashbuffer[rand]); |
||||
|
||||
} |
||||
sha256_hash512(&hashbuffer[i], (uint8_t*)joint); |
||||
|
||||
memcpy(randseed, &hashbuffer[i - 32], 64); |
||||
if (i>128) |
||||
{ |
||||
memcpy(randbuffer, &hashbuffer[i - 128], 64); |
||||
} |
||||
else |
||||
{ |
||||
memset(randbuffer, 0, 64); |
||||
} |
||||
xor_salsa8(randbuffer, randseed); |
||||
for (j = 0; j < 32; j += 2) |
||||
{ |
||||
uint32_t rand = randbuffer[j / 2] % randmax; |
||||
*((uint32_t*)&hashbuffer[rand]) = *((uint32_t*)&hashbuffer[j + i - 4]); |
||||
} |
||||
} |
||||
|
||||
|
||||
//printf("cpu hashbuffer %08x nonce %08x\n", ((uint32_t*)hashbuffer)[7],data[19]);
|
||||
|
||||
memcpy(state, hashbuffer, 32); |
||||
} |
||||
|
||||
static const uint32_t diff1targ = 0x0000ffff; |
||||
|
||||
|
||||
/* Used externally as confirmation of correct OCL code */ |
||||
int pluck_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) |
||||
{ |
||||
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); |
||||
uint32_t data[20], ohash[8]; |
||||
|
||||
be32enc_vect(data, (const uint32_t *)pdata, 19); |
||||
data[19] = htobe32(nonce); |
||||
pluckrehash(ohash, data); |
||||
|
||||
tmp_hash7 = be32toh(ohash[7]); |
||||
|
||||
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", |
||||
(long unsigned int)Htarg, |
||||
(long unsigned int)diff1targ, |
||||
(long unsigned int)tmp_hash7); |
||||
|
||||
if (tmp_hash7 > diff1targ) |
||||
return -1; |
||||
|
||||
if (tmp_hash7 > Htarg) |
||||
return 0; |
||||
|
||||
return 1; |
||||
} |
||||
|
||||
void pluck_regenhash(struct work *work) |
||||
{ |
||||
uint32_t data[20]; |
||||
uint32_t *nonce = (uint32_t *)(work->data + 76); |
||||
uint32_t *ohash = (uint32_t *)(work->hash); |
||||
|
||||
be32enc_vect(data, (const uint32_t *)work->data, 19); |
||||
data[19] = htobe32(*nonce); |
||||
|
||||
pluckrehash(ohash, data); |
||||
} |
||||
|
||||
|
||||
bool scanhash_pluck(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, |
||||
unsigned char *pdata, unsigned char __maybe_unused *phash1, |
||||
unsigned char __maybe_unused *phash, const unsigned char *ptarget, |
||||
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) |
||||
{ |
||||
uint32_t *nonce = (uint32_t *)(pdata + 76); |
||||
uint32_t data[20]; |
||||
uint32_t tmp_hash7; |
||||
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); |
||||
bool ret = false; |
||||
|
||||
be32enc_vect(data, (const uint32_t *)pdata, 19); |
||||
|
||||
while (1) |
||||
{ |
||||
uint32_t ostate[8]; |
||||
|
||||
*nonce = ++n; |
||||
data[19] = (n); |
||||
pluckrehash(ostate, data); |
||||
tmp_hash7 = (ostate[7]); |
||||
|
||||
applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]); |
||||
|
||||
if (unlikely(tmp_hash7 <= Htarg)) |
||||
{ |
||||
((uint32_t *)pdata)[19] = htobe32(n); |
||||
*last_nonce = n; |
||||
ret = true; |
||||
break; |
||||
} |
||||
|
||||
if (unlikely((n >= max_nonce) || thr->work_restart)) |
||||
{ |
||||
*last_nonce = n; |
||||
break; |
||||
} |
||||
} |
||||
|
||||
return ret; |
||||
} |
@ -0,0 +1,10 @@
@@ -0,0 +1,10 @@
|
||||
#ifndef PLUCK_H |
||||
#define PLUCK_H |
||||
|
||||
#include "miner.h" |
||||
#define PLUCK_SCRATCHBUF_SIZE (128 * 1024) |
||||
extern int pluck_test(unsigned char *pdata, const unsigned char *ptarget, |
||||
uint32_t nonce); |
||||
extern void pluck_regenhash(struct work *work); |
||||
|
||||
#endif /* PLUCK_H */ |
@ -0,0 +1,463 @@
@@ -0,0 +1,463 @@
|
||||
/* |
||||
* "pluck" kernel implementation. |
||||
* |
||||
* ==========================(LICENSE BEGIN)============================ |
||||
* |
||||
* Copyright (c) 2015 djm34 |
||||
* |
||||
* Permission is hereby granted, free of charge, to any person obtaining |
||||
* a copy of this software and associated documentation files (the |
||||
* "Software"), to deal in the Software without restriction, including |
||||
* without limitation the rights to use, copy, modify, merge, publish, |
||||
* distribute, sublicense, and/or sell copies of the Software, and to |
||||
* permit persons to whom the Software is furnished to do so, subject to |
||||
* the following conditions: |
||||
* |
||||
* The above copyright notice and this permission notice shall be |
||||
* included in all copies or substantial portions of the Software. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
||||
* |
||||
* ===========================(LICENSE END)============================= |
||||
* |
||||
* @author djm34 |
||||
*/ |
||||
#if !defined(cl_khr_byte_addressable_store) |
||||
#error "Device does not support unaligned stores" |
||||
#endif |
||||
#define ROL32(x, n) rotate(x, (uint) n) |
||||
//#define ROL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) |
||||
#define HASH_MEMORY 4096 |
||||
|
||||
|
||||
#define SALSA(a,b,c,d) do { \ |
||||
t =a+d; b^=rotate(t, 7U); \ |
||||
t =b+a; c^=rotate(t, 9U); \ |
||||
t =c+b; d^=rotate(t, 13U); \ |
||||
t =d+c; a^=rotate(t, 18U); \ |
||||
} while(0) |
||||
|
||||
|
||||
#define SALSA_CORE(state) do { \ |
||||
\ |
||||
SALSA(state.s0,state.s4,state.s8,state.sc); \ |
||||
SALSA(state.s5,state.s9,state.sd,state.s1); \ |
||||
SALSA(state.sa,state.se,state.s2,state.s6); \ |
||||
SALSA(state.sf,state.s3,state.s7,state.sb); \ |
||||
SALSA(state.s0,state.s1,state.s2,state.s3); \ |
||||
SALSA(state.s5,state.s6,state.s7,state.s4); \ |
||||
SALSA(state.sa,state.sb,state.s8,state.s9); \ |
||||
SALSA(state.sf,state.sc,state.sd,state.se); \ |
||||
} while(0) |
||||
|
||||
/* |
||||
#define SALSA_CORE(state) do { \ |
||||
state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \ |
||||
state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \ |
||||
state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \ |
||||
state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \ |
||||
state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \ |
||||
state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \ |
||||
state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \ |
||||
state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \ |
||||
} while(0) |
||||
*/ |
||||
uint16 xor_salsa8(uint16 Bx) |
||||
{ |
||||
uint t; |
||||
uint16 st = Bx; |
||||
SALSA_CORE(st); |
||||
SALSA_CORE(st); |
||||
SALSA_CORE(st); |
||||
SALSA_CORE(st); |
||||
return(st + Bx); |
||||
} |
||||
|
||||
|
||||
|
||||
#define SHR(x, n) ((x) >> n) |
||||
#define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) |
||||
|
||||
#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) |
||||
#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) |
||||
|
||||
#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) |
||||
#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) |
||||
|
||||
#define P(a,b,c,d,e,f,g,h,x,K) \ |
||||
{ \ |
||||
temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ |
||||
d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ |
||||
} |
||||
|
||||
#define PLAST(a,b,c,d,e,f,g,h,x,K) \ |
||||
{ \ |
||||
d += h + S3(e) + F1(e,f,g) + (x + K); \ |
||||
} |
||||
|
||||
#define F0(y, x, z) bitselect(z, y, z ^ x) |
||||
#define F1(x, y, z) bitselect(z, y, x) |
||||
|
||||
#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) |
||||
#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) |
||||
#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) |
||||
#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) |
||||
#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) |
||||
#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) |
||||
#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) |
||||
#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) |
||||
#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) |
||||
#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) |
||||
#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) |
||||
#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) |
||||
#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) |
||||
#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) |
||||
#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) |
||||
#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) |
||||
|
||||
#define RD14 (S1(W12) + W7 + S0(W15) + W14) |
||||
#define RD15 (S1(W13) + W8 + S0(W0) + W15) |
||||
|
||||
inline uint8 sha256_round1(uint16 data) |
||||
{ |
||||
uint temp1; |
||||
uint8 res; |
||||
uint W0 = SWAP32(data.s0); |
||||
uint W1 = SWAP32(data.s1); |
||||
uint W2 = SWAP32(data.s2); |
||||
uint W3 = SWAP32(data.s3); |
||||
uint W4 = SWAP32(data.s4); |
||||
uint W5 = SWAP32(data.s5); |
||||
uint W6 = SWAP32(data.s6); |
||||
uint W7 = SWAP32(data.s7); |
||||
uint W8 = SWAP32(data.s8); |
||||
uint W9 = SWAP32(data.s9); |
||||
uint W10 = SWAP32(data.sA); |
||||
uint W11 = SWAP32(data.sB); |
||||
uint W12 = SWAP32(data.sC); |
||||
uint W13 = SWAP32(data.sD); |
||||
uint W14 = SWAP32(data.sE); |
||||
uint W15 = SWAP32(data.sF); |
||||
|
||||
uint v0 = 0x6A09E667; |
||||
uint v1 = 0xBB67AE85; |
||||
uint v2 = 0x3C6EF372; |
||||
uint v3 = 0xA54FF53A; |
||||
uint v4 = 0x510E527F; |
||||
uint v5 = 0x9B05688C; |
||||
uint v6 = 0x1F83D9AB; |
||||
uint v7 = 0x5BE0CD19; |
||||
|
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); |
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); |
||||
|
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); |
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); |
||||
|
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); |
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); |
||||
|
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); |
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); |
||||
|
||||
res.s0 = v0 + 0x6A09E667; |
||||
res.s1 = v1 + 0xBB67AE85; |
||||
res.s2 = v2 + 0x3C6EF372; |
||||
res.s3 = v3 + 0xA54FF53A; |
||||
res.s4 = v4 + 0x510E527F; |
||||
res.s5 = v5 + 0x9B05688C; |
||||
res.s6 = v6 + 0x1F83D9AB; |
||||
res.s7 = v7 + 0x5BE0CD19; |
||||
return (res); |
||||
} |
||||
|
||||
|
||||
inline uint8 sha256_round2(uint16 data,uint8 buf) |
||||
{ |
||||
uint temp1; |
||||
uint8 res; |
||||
uint W0 = data.s0; |
||||
uint W1 = data.s1; |
||||
uint W2 = data.s2; |
||||
uint W3 = data.s3; |
||||
uint W4 = data.s4; |
||||
uint W5 = data.s5; |
||||
uint W6 = data.s6; |
||||
uint W7 = data.s7; |
||||
uint W8 = data.s8; |
||||
uint W9 = data.s9; |
||||
uint W10 = data.sA; |
||||
uint W11 = data.sB; |
||||
uint W12 = data.sC; |
||||
uint W13 = data.sD; |
||||
uint W14 = data.sE; |
||||
uint W15 = data.sF; |
||||
|
||||
uint v0 = buf.s0; |
||||
uint v1 = buf.s1; |
||||
uint v2 = buf.s2; |
||||
uint v3 = buf.s3; |
||||
uint v4 = buf.s4; |
||||
uint v5 = buf.s5; |
||||
uint v6 = buf.s6; |
||||
uint v7 = buf.s7; |
||||
|
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); |
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); |
||||
|
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); |
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); |
||||
|
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); |
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); |
||||
|
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); |
||||
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); |
||||
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); |
||||
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); |
||||
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); |
||||
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); |
||||
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); |
||||
P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); |
||||
P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); |
||||
|
||||
res.s0 = SWAP32(v0 + buf.s0); |
||||
res.s1 = SWAP32(v1 + buf.s1); |
||||
res.s2 = SWAP32(v2 + buf.s2); |
||||
res.s3 = SWAP32(v3 + buf.s3); |
||||
res.s4 = SWAP32(v4 + buf.s4); |
||||
res.s5 = SWAP32(v5 + buf.s5); |
||||
res.s6 = SWAP32(v6 + buf.s6); |
||||
res.s7 = SWAP32(v7 + buf.s7); |
||||
return (res); |
||||
} |
||||
|
||||
inline uint8 sha256_80(uint* data,uint nonce) |
||||
{ |
||||
|
||||
uint8 buf = sha256_round1( ((uint16*)data)[0]); |
||||
uint in[16]; |
||||
for (int i = 0; i<3; i++) { in[i] = SWAP32(data[i + 16]); } |
||||
in[3] = SWAP32(nonce); |
||||
in[4] = 0x80000000; |
||||
in[15] = 0x280; |
||||
for (int i = 5; i<15; i++) { in[i] = 0; } |
||||
|
||||
return(sha256_round2(((uint16*)in)[0], buf)); |
||||
} |
||||
|
||||
inline uint8 sha256_64(uint* data) |
||||
{ |
||||
|
||||
uint8 buf=sha256_round1(((uint16*)data)[0]); |
||||
uint in[16]; |
||||
for (int i = 1; i<15; i++) { in[i] = 0; } |
||||
in[0] = 0x80000000; |
||||
in[15] = 0x200; |
||||
|
||||
return(sha256_round2(((uint16*)in)[0],buf)); |
||||
} |
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) |
||||
__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target) |
||||
{ |
||||
|
||||
__global uchar *hashbuffer = (__global uchar *)(padcache + (1024*128 * (get_global_id(0) % MAX_GLOBAL_THREADS))); |
||||
|
||||
uint data[20]; |
||||
|
||||
((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; |
||||
((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; |
||||
|
||||
((__global uint8*)hashbuffer)[0] = sha256_80(data,get_global_id(0)); |
||||
((__global uint8*)hashbuffer)[1] = 0; |
||||
|
||||
for (int i = 2; i < 4096 - 1; i++) |
||||
{ |
||||
uint randmax = i * 32 - 4; |
||||
uint randseed[16]; |
||||
uint randbuffer[16]; |
||||
uint joint[16]; |
||||
|
||||
((uint8*)randseed)[0] = ((__global uint8*)hashbuffer)[i - 2]; |
||||
((uint8*)randseed)[1] = ((__global uint8*)hashbuffer)[i - 1]; |
||||
|
||||
if (i>4) |
||||
{ |
||||
|
||||
((uint8*)randseed)[0] ^= ((__global uint8*)hashbuffer)[i - 4]; |
||||
((uint8*)randseed)[1] ^= ((__global uint8*)hashbuffer)[i - 3]; |
||||
} |
||||
|
||||
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); |
||||
|
||||
|
||||
|
||||
((uint8*)joint)[0] = ((__global uint8*)hashbuffer)[i - 1]; |
||||
for (int j = 0; j < 8; j++) |
||||
{ |
||||
uint rand = randbuffer[j] % (randmax - 32); |
||||
|
||||
((uchar4*)joint)[(j + 8)].x =((__global uchar*)(hashbuffer))[0+rand]; |
||||
((uchar4*)joint)[(j + 8)].y =((__global uchar*)(hashbuffer))[1+rand]; |
||||
((uchar4*)joint)[(j + 8)].z =((__global uchar*)(hashbuffer))[2+rand]; |
||||
((uchar4*)joint)[(j + 8)].w =((__global uchar*)(hashbuffer))[3+rand]; |
||||
} |
||||
((__global uint8*)(hashbuffer))[i] = sha256_64(joint); |
||||
|
||||
|
||||
|
||||
(( uint8*)randseed)[0] = ((__global uint8*)(hashbuffer))[i - 1]; |
||||
(( uint8*)randseed)[1] = ((__global uint8*)(hashbuffer))[i]; |
||||
|
||||
|
||||
if (i>4) |
||||
{ |
||||
|
||||
((uint8*)randseed)[0] ^= ((__global uint8*)(hashbuffer))[i - 4]; |
||||
((uint8*)randseed)[1] ^= ((__global uint8*)(hashbuffer))[i - 3]; |
||||
|
||||
} |
||||
|
||||
((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]); |
||||
|
||||
for (int j = 0; j < 32; j += 2) |
||||
{ |
||||
uint rand = randbuffer[j / 2] % randmax; |
||||
uchar4 Tohere; |
||||
|
||||
Tohere.x = ((__global uchar*)(hashbuffer))[randmax + j]; |
||||
Tohere.y = ((__global uchar*)(hashbuffer))[randmax + j + 1]; |
||||
Tohere.z = ((__global uchar*)(hashbuffer))[randmax + j + 2]; |
||||
Tohere.w = ((__global uchar*)(hashbuffer))[randmax + j + 3]; |
||||
((__global uchar*)(hashbuffer))[rand] = Tohere.x; |
||||
((__global uchar*)(hashbuffer))[rand+1] = Tohere.y; |
||||
((__global uchar*)(hashbuffer))[rand+2] = Tohere.z; |
||||
((__global uchar*)(hashbuffer))[rand+3] = Tohere.w; |
||||
|
||||
} |
||||
|
||||
} // main loop |
||||
|
||||
|
||||
if( ((__global uint *)hashbuffer)[7] <= (target)) {output[atomic_inc(output + 0xFF)] = SWAP32(get_global_id(0)); |
||||
//printf("gpu hashbuffer %08x nonce %08x\n",((__global uint *)hashbuffer)[7] ,SWAP32(get_global_id(0))); |
||||
} |
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////// |
||||
|
||||
} |
Loading…
Reference in new issue