mirror of https://github.com/GOSTSec/sgminer
Con Kolivas
12 years ago
22 changed files with 7 additions and 4596 deletions
@ -1,863 +0,0 @@ |
|||||||
/*
|
|
||||||
* Copyright 2011-2012 Con Kolivas |
|
||||||
* Copyright 2011-2012 Luke Dashjr |
|
||||||
* Copyright 2010 Jeff Garzik |
|
||||||
* |
|
||||||
* This program is free software; you can redistribute it and/or modify it |
|
||||||
* under the terms of the GNU General Public License as published by the Free |
|
||||||
* Software Foundation; either version 3 of the License, or (at your option) |
|
||||||
* any later version. See COPYING for more details. |
|
||||||
*/ |
|
||||||
|
|
||||||
#include "config.h" |
|
||||||
|
|
||||||
|
|
||||||
#include <stdio.h> |
|
||||||
#include <stdlib.h> |
|
||||||
#include <string.h> |
|
||||||
#include <stdbool.h> |
|
||||||
#include <stdint.h> |
|
||||||
#include <unistd.h> |
|
||||||
#include <signal.h> |
|
||||||
|
|
||||||
#include <sys/stat.h> |
|
||||||
#include <sys/types.h> |
|
||||||
|
|
||||||
#ifndef WIN32 |
|
||||||
#include <sys/wait.h> |
|
||||||
#include <sys/resource.h> |
|
||||||
#endif |
|
||||||
#include <libgen.h> |
|
||||||
|
|
||||||
#include "compat.h" |
|
||||||
#include "miner.h" |
|
||||||
#include "bench_block.h" |
|
||||||
#include "driver-cpu.h" |
|
||||||
|
|
||||||
#if defined(unix) |
|
||||||
#include <errno.h> |
|
||||||
#include <fcntl.h> |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(__linux) && defined(cpu_set_t) /* Linux specific policy and affinity management */ |
|
||||||
#include <sched.h> |
|
||||||
/* Lower the calling thread's scheduling class so mining does not compete
 * with interactive work: SCHED_IDLE is tried first (when available) and
 * SCHED_BATCH is the fallback. Failures are deliberately ignored. */
static inline void drop_policy(void)
{
	struct sched_param param;

	/* SCHED_IDLE and SCHED_BATCH require a static priority of 0; passing
	 * an uninitialised structure can make the call fail with EINVAL. */
	param.sched_priority = 0;

#ifdef SCHED_BATCH
#ifdef SCHED_IDLE
	if (unlikely(sched_setscheduler(0, SCHED_IDLE, &param) == -1))
#endif
		sched_setscheduler(0, SCHED_BATCH, &param);
#endif
}
|
||||||
|
|
||||||
static inline void affine_to_cpu(int id, int cpu) |
|
||||||
{ |
|
||||||
cpu_set_t set; |
|
||||||
|
|
||||||
CPU_ZERO(&set); |
|
||||||
CPU_SET(cpu, &set); |
|
||||||
sched_setaffinity(0, sizeof(&set), &set); |
|
||||||
applog(LOG_INFO, "Binding cpu mining thread %d to cpu %d", id, cpu); |
|
||||||
} |
|
||||||
#else |
|
||||||
/* Fallback used when the Linux scheduling-policy branch is not compiled:
 * intentionally does nothing. */
static inline void drop_policy(void)
{
	return;
}
|
||||||
|
|
||||||
/* Fallback used when CPU affinity management is not compiled in: both
 * arguments are ignored and nothing happens. */
static inline void affine_to_cpu(int __maybe_unused id, int __maybe_unused cpu)
{
	return;
}
|
||||||
#endif |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/* TODO: resolve externals */ |
|
||||||
extern char *set_int_range(const char *arg, int *i, int min, int max); |
|
||||||
extern int dev_from_id(int thr_id); |
|
||||||
|
|
||||||
|
|
||||||
/* chipset-optimized hash functions */ |
|
||||||
extern bool ScanHash_4WaySSE2(struct thr_info*, const unsigned char *pmidstate, |
|
||||||
unsigned char *pdata, unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce); |
|
||||||
|
|
||||||
extern bool ScanHash_altivec_4way(struct thr_info*, const unsigned char *pmidstate, |
|
||||||
unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce); |
|
||||||
|
|
||||||
extern bool scanhash_via(struct thr_info*, const unsigned char *pmidstate, |
|
||||||
unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *target, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n); |
|
||||||
|
|
||||||
extern bool scanhash_c(struct thr_info*, const unsigned char *midstate, unsigned char *data, |
|
||||||
unsigned char *hash1, unsigned char *hash, |
|
||||||
const unsigned char *target, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n); |
|
||||||
|
|
||||||
extern bool scanhash_cryptopp(struct thr_info*, const unsigned char *midstate,unsigned char *data, |
|
||||||
unsigned char *hash1, unsigned char *hash, |
|
||||||
const unsigned char *target, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n); |
|
||||||
|
|
||||||
extern bool scanhash_asm32(struct thr_info*, const unsigned char *midstate,unsigned char *data, |
|
||||||
unsigned char *hash1, unsigned char *hash, |
|
||||||
const unsigned char *target, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce); |
|
||||||
|
|
||||||
extern bool scanhash_sse2_64(struct thr_info*, const unsigned char *pmidstate, unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t nonce); |
|
||||||
|
|
||||||
extern bool scanhash_sse4_64(struct thr_info*, const unsigned char *pmidstate, unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t nonce); |
|
||||||
|
|
||||||
extern bool scanhash_sse2_32(struct thr_info*, const unsigned char *pmidstate, unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t nonce); |
|
||||||
|
|
||||||
extern bool scanhash_scrypt(struct thr_info *thr, int thr_id, unsigned char *pdata, unsigned char *scratchbuf, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, unsigned long *hashes_done); |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef WANT_CPUMINE |
|
||||||
static size_t max_name_len = 0; |
|
||||||
static char *name_spaces_pad = NULL; |
|
||||||
const char *algo_names[] = { |
|
||||||
[ALGO_C] = "c", |
|
||||||
#ifdef WANT_SSE2_4WAY |
|
||||||
[ALGO_4WAY] = "4way", |
|
||||||
#endif |
|
||||||
#ifdef WANT_VIA_PADLOCK |
|
||||||
[ALGO_VIA] = "via", |
|
||||||
#endif |
|
||||||
[ALGO_CRYPTOPP] = "cryptopp", |
|
||||||
#ifdef WANT_CRYPTOPP_ASM32 |
|
||||||
[ALGO_CRYPTOPP_ASM32] = "cryptopp_asm32", |
|
||||||
#endif |
|
||||||
#ifdef WANT_X8632_SSE2 |
|
||||||
[ALGO_SSE2_32] = "sse2_32", |
|
||||||
#endif |
|
||||||
#ifdef WANT_X8664_SSE2 |
|
||||||
[ALGO_SSE2_64] = "sse2_64", |
|
||||||
#endif |
|
||||||
#ifdef WANT_X8664_SSE4 |
|
||||||
[ALGO_SSE4_64] = "sse4_64", |
|
||||||
#endif |
|
||||||
#ifdef WANT_ALTIVEC_4WAY |
|
||||||
[ALGO_ALTIVEC_4WAY] = "altivec_4way", |
|
||||||
#endif |
|
||||||
#ifdef WANT_SCRYPT |
|
||||||
[ALGO_SCRYPT] = "scrypt", |
|
||||||
#endif |
|
||||||
}; |
|
||||||
|
|
||||||
static const sha256_func sha256_funcs[] = { |
|
||||||
[ALGO_C] = (sha256_func)scanhash_c, |
|
||||||
#ifdef WANT_SSE2_4WAY |
|
||||||
[ALGO_4WAY] = (sha256_func)ScanHash_4WaySSE2, |
|
||||||
#endif |
|
||||||
#ifdef WANT_ALTIVEC_4WAY |
|
||||||
[ALGO_ALTIVEC_4WAY] = (sha256_func) ScanHash_altivec_4way, |
|
||||||
#endif |
|
||||||
#ifdef WANT_VIA_PADLOCK |
|
||||||
[ALGO_VIA] = (sha256_func)scanhash_via, |
|
||||||
#endif |
|
||||||
[ALGO_CRYPTOPP] = (sha256_func)scanhash_cryptopp, |
|
||||||
#ifdef WANT_CRYPTOPP_ASM32 |
|
||||||
[ALGO_CRYPTOPP_ASM32] = (sha256_func)scanhash_asm32, |
|
||||||
#endif |
|
||||||
#ifdef WANT_X8632_SSE2 |
|
||||||
[ALGO_SSE2_32] = (sha256_func)scanhash_sse2_32, |
|
||||||
#endif |
|
||||||
#ifdef WANT_X8664_SSE2 |
|
||||||
[ALGO_SSE2_64] = (sha256_func)scanhash_sse2_64, |
|
||||||
#endif |
|
||||||
#ifdef WANT_X8664_SSE4 |
|
||||||
[ALGO_SSE4_64] = (sha256_func)scanhash_sse4_64, |
|
||||||
#endif |
|
||||||
#ifdef WANT_SCRYPT |
|
||||||
[ALGO_SCRYPT] = (sha256_func)scanhash_scrypt |
|
||||||
#endif |
|
||||||
}; |
|
||||||
#endif |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef WANT_CPUMINE |
|
||||||
#if defined(WANT_X8664_SSE4) && defined(__SSE4_1__) |
|
||||||
enum sha256_algos opt_algo = ALGO_SSE4_64; |
|
||||||
#elif defined(WANT_X8664_SSE2) && defined(__SSE2__) |
|
||||||
enum sha256_algos opt_algo = ALGO_SSE2_64; |
|
||||||
#elif defined(WANT_X8632_SSE2) && defined(__SSE2__) |
|
||||||
enum sha256_algos opt_algo = ALGO_SSE2_32; |
|
||||||
#else |
|
||||||
enum sha256_algos opt_algo = ALGO_C; |
|
||||||
#endif |
|
||||||
bool opt_usecpu = false; |
|
||||||
static int cpur_thr_id; |
|
||||||
static bool forced_n_threads; |
|
||||||
#endif |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef WANT_CPUMINE |
|
||||||
// Algo benchmark, crash-prone, system independent stage
|
|
||||||
double bench_algo_stage3( |
|
||||||
enum sha256_algos algo |
|
||||||
) |
|
||||||
{ |
|
||||||
// Use a random work block pulled from a pool
|
|
||||||
static uint8_t bench_block[] = { CGMINER_BENCHMARK_BLOCK }; |
|
||||||
struct work work __attribute__((aligned(128))); |
|
||||||
unsigned char hash1[64]; |
|
||||||
|
|
||||||
size_t bench_size = sizeof(work); |
|
||||||
size_t work_size = sizeof(bench_block); |
|
||||||
size_t min_size = (work_size < bench_size ? work_size : bench_size); |
|
||||||
memset(&work, 0, sizeof(work)); |
|
||||||
memcpy(&work, &bench_block, min_size); |
|
||||||
|
|
||||||
struct thr_info dummy = {0}; |
|
||||||
|
|
||||||
struct timeval end; |
|
||||||
struct timeval start; |
|
||||||
uint32_t max_nonce = (1<<22); |
|
||||||
uint32_t last_nonce = 0; |
|
||||||
|
|
||||||
hex2bin(hash1, "00000000000000000000000000000000000000000000000000000000000000000000008000000000000000000000000000000000000000000000000000010000", 64); |
|
||||||
|
|
||||||
gettimeofday(&start, 0); |
|
||||||
{ |
|
||||||
sha256_func func = sha256_funcs[algo]; |
|
||||||
(*func)( |
|
||||||
&dummy, |
|
||||||
work.midstate, |
|
||||||
work.data, |
|
||||||
hash1, |
|
||||||
work.hash, |
|
||||||
work.target, |
|
||||||
max_nonce, |
|
||||||
&last_nonce, |
|
||||||
work.blk.nonce |
|
||||||
); |
|
||||||
} |
|
||||||
gettimeofday(&end, 0); |
|
||||||
|
|
||||||
uint64_t usec_end = ((uint64_t)end.tv_sec)*1000*1000 + end.tv_usec; |
|
||||||
uint64_t usec_start = ((uint64_t)start.tv_sec)*1000*1000 + start.tv_usec; |
|
||||||
uint64_t usec_elapsed = usec_end - usec_start; |
|
||||||
|
|
||||||
double rate = -1.0; |
|
||||||
if (0<usec_elapsed) { |
|
||||||
rate = (1.0*(last_nonce+1))/usec_elapsed; |
|
||||||
} |
|
||||||
return rate; |
|
||||||
} |
|
||||||
|
|
||||||
#if defined(unix) |
|
||||||
|
|
||||||
// Switch O_NONBLOCK on (yes != 0) or off (yes == 0) for file descriptor fd.
// Any fcntl() failure is reported with perror() and terminates the process.
static void set_non_blocking(
	int fd,
	int yes
)
{
	int mode = fcntl(fd, F_GETFL, 0);

	if (mode < 0) {
		perror("fcntl(GET) failed");
		exit(1);
	}

	if (yes)
		mode |= O_NONBLOCK;
	else
		mode &= ~O_NONBLOCK;

	if (fcntl(fd, F_SETFL, mode) < 0) {
		perror("fcntl(SET) failed");
		exit(1);
	}
}
|
||||||
|
|
||||||
#endif // defined(unix)
|
|
||||||
|
|
||||||
// Algo benchmark, crash-safe, system-dependent stage
|
|
||||||
static double bench_algo_stage2( |
|
||||||
enum sha256_algos algo |
|
||||||
) |
|
||||||
{ |
|
||||||
// Here, the gig is to safely run a piece of code that potentially
|
|
||||||
// crashes. Unfortunately, the Right Way (tm) to do this is rather
|
|
||||||
// heavily platform dependent :(
|
|
||||||
|
|
||||||
double rate = -1.23457; |
|
||||||
|
|
||||||
#if defined(unix) |
|
||||||
|
|
||||||
// Make a pipe: [readFD, writeFD]
|
|
||||||
int pfd[2]; |
|
||||||
int r = pipe(pfd); |
|
||||||
if (r<0) { |
|
||||||
perror("pipe - failed to create pipe for --algo auto"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
// Make pipe non blocking
|
|
||||||
set_non_blocking(pfd[0], 1); |
|
||||||
set_non_blocking(pfd[1], 1); |
|
||||||
|
|
||||||
// Don't allow a crashing child to kill the main process
|
|
||||||
sighandler_t sr0 = signal(SIGPIPE, SIG_IGN); |
|
||||||
sighandler_t sr1 = signal(SIGPIPE, SIG_IGN); |
|
||||||
if (SIG_ERR==sr0 || SIG_ERR==sr1) { |
|
||||||
perror("signal - failed to edit signal mask for --algo auto"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
// Fork a child to do the actual benchmarking
|
|
||||||
pid_t child_pid = fork(); |
|
||||||
if (child_pid<0) { |
|
||||||
perror("fork - failed to create a child process for --algo auto"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
// Do the dangerous work in the child, knowing we might crash
|
|
||||||
if (0==child_pid) { |
|
||||||
|
|
||||||
// TODO: some umask trickery to prevent coredumps
|
|
||||||
|
|
||||||
// Benchmark this algorithm
|
|
||||||
double r = bench_algo_stage3(algo); |
|
||||||
|
|
||||||
// We survived, send result to parent and bail
|
|
||||||
int loop_count = 0; |
|
||||||
while (1) { |
|
||||||
ssize_t bytes_written = write(pfd[1], &r, sizeof(r)); |
|
||||||
int try_again = (0==bytes_written || (bytes_written<0 && EAGAIN==errno)); |
|
||||||
int success = (sizeof(r)==(size_t)bytes_written); |
|
||||||
|
|
||||||
if (success) |
|
||||||
break; |
|
||||||
|
|
||||||
if (!try_again) { |
|
||||||
perror("write - child failed to write benchmark result to pipe"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
if (5<loop_count) { |
|
||||||
applog(LOG_ERR, "child tried %d times to communicate with parent, giving up", loop_count); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
++loop_count; |
|
||||||
sleep(1); |
|
||||||
} |
|
||||||
exit(0); |
|
||||||
} |
|
||||||
|
|
||||||
// Parent waits for a result from child
|
|
||||||
int loop_count = 0; |
|
||||||
while (1) { |
|
||||||
|
|
||||||
// Wait for child to die
|
|
||||||
int status; |
|
||||||
int r = waitpid(child_pid, &status, WNOHANG); |
|
||||||
if ((child_pid==r) || (r<0 && ECHILD==errno)) { |
|
||||||
|
|
||||||
// Child died somehow. Grab result and bail
|
|
||||||
double tmp; |
|
||||||
ssize_t bytes_read = read(pfd[0], &tmp, sizeof(tmp)); |
|
||||||
if (sizeof(tmp)==(size_t)bytes_read) |
|
||||||
rate = tmp; |
|
||||||
break; |
|
||||||
|
|
||||||
} else if (r<0) { |
|
||||||
perror("bench_algo: waitpid failed. giving up."); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
// Give up on child after a ~60s
|
|
||||||
if (60<loop_count) { |
|
||||||
kill(child_pid, SIGKILL); |
|
||||||
waitpid(child_pid, &status, 0); |
|
||||||
break; |
|
||||||
} |
|
||||||
|
|
||||||
// Wait a bit longer
|
|
||||||
++loop_count; |
|
||||||
sleep(1); |
|
||||||
} |
|
||||||
|
|
||||||
// Close pipe
|
|
||||||
r = close(pfd[0]); |
|
||||||
if (r<0) { |
|
||||||
perror("close - failed to close read end of pipe for --algo auto"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
r = close(pfd[1]); |
|
||||||
if (r<0) { |
|
||||||
perror("close - failed to close read end of pipe for --algo auto"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
#elif defined(WIN32) |
|
||||||
|
|
||||||
// Get handle to current exe
|
|
||||||
HINSTANCE module = GetModuleHandle(0); |
|
||||||
if (!module) { |
|
||||||
applog(LOG_ERR, "failed to retrieve module handle"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
// Create a unique name
|
|
||||||
char unique_name[32]; |
|
||||||
snprintf( |
|
||||||
unique_name, |
|
||||||
sizeof(unique_name)-1, |
|
||||||
"cgminer-%p", |
|
||||||
(void*)module |
|
||||||
); |
|
||||||
|
|
||||||
// Create and init a chunked of shared memory
|
|
||||||
HANDLE map_handle = CreateFileMapping( |
|
||||||
INVALID_HANDLE_VALUE, // use paging file
|
|
||||||
NULL, // default security attributes
|
|
||||||
PAGE_READWRITE, // read/write access
|
|
||||||
0, // size: high 32-bits
|
|
||||||
4096, // size: low 32-bits
|
|
||||||
unique_name // name of map object
|
|
||||||
); |
|
||||||
if (NULL==map_handle) { |
|
||||||
applog(LOG_ERR, "could not create shared memory"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
void *shared_mem = MapViewOfFile( |
|
||||||
map_handle, // object to map view of
|
|
||||||
FILE_MAP_WRITE, // read/write access
|
|
||||||
0, // high offset: map from
|
|
||||||
0, // low offset: beginning
|
|
||||||
0 // default: map entire file
|
|
||||||
); |
|
||||||
if (NULL==shared_mem) { |
|
||||||
applog(LOG_ERR, "could not map shared memory"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
SetEnvironmentVariable("CGMINER_SHARED_MEM", unique_name); |
|
||||||
CopyMemory(shared_mem, &rate, sizeof(rate)); |
|
||||||
|
|
||||||
// Get path to current exe
|
|
||||||
char cmd_line[256 + MAX_PATH]; |
|
||||||
const size_t n = sizeof(cmd_line)-200; |
|
||||||
DWORD size = GetModuleFileName(module, cmd_line, n); |
|
||||||
if (0==size) { |
|
||||||
applog(LOG_ERR, "failed to retrieve module path"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
// Construct new command line based on that
|
|
||||||
char *p = strlen(cmd_line) + cmd_line; |
|
||||||
sprintf(p, " --bench-algo %d", algo); |
|
||||||
SetEnvironmentVariable("CGMINER_BENCH_ALGO", "1"); |
|
||||||
|
|
||||||
// Launch a debug copy of cgminer
|
|
||||||
STARTUPINFO startup_info; |
|
||||||
PROCESS_INFORMATION process_info; |
|
||||||
ZeroMemory(&startup_info, sizeof(startup_info)); |
|
||||||
ZeroMemory(&process_info, sizeof(process_info)); |
|
||||||
startup_info.cb = sizeof(startup_info); |
|
||||||
|
|
||||||
BOOL ok = CreateProcess( |
|
||||||
NULL, // No module name (use command line)
|
|
||||||
cmd_line, // Command line
|
|
||||||
NULL, // Process handle not inheritable
|
|
||||||
NULL, // Thread handle not inheritable
|
|
||||||
FALSE, // Set handle inheritance to FALSE
|
|
||||||
DEBUG_ONLY_THIS_PROCESS,// We're going to debug the child
|
|
||||||
NULL, // Use parent's environment block
|
|
||||||
NULL, // Use parent's starting directory
|
|
||||||
&startup_info, // Pointer to STARTUPINFO structure
|
|
||||||
&process_info // Pointer to PROCESS_INFORMATION structure
|
|
||||||
); |
|
||||||
if (!ok) { |
|
||||||
applog(LOG_ERR, "CreateProcess failed with error %d\n", GetLastError() ); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
|
|
||||||
// Debug the child (only clean way to catch exceptions)
|
|
||||||
while (1) { |
|
||||||
|
|
||||||
// Wait for child to do something
|
|
||||||
DEBUG_EVENT debug_event; |
|
||||||
ZeroMemory(&debug_event, sizeof(debug_event)); |
|
||||||
|
|
||||||
BOOL ok = WaitForDebugEvent(&debug_event, 60 * 1000); |
|
||||||
if (!ok) |
|
||||||
break; |
|
||||||
|
|
||||||
// Decide if event is "normal"
|
|
||||||
int go_on = |
|
||||||
CREATE_PROCESS_DEBUG_EVENT== debug_event.dwDebugEventCode || |
|
||||||
CREATE_THREAD_DEBUG_EVENT == debug_event.dwDebugEventCode || |
|
||||||
EXIT_THREAD_DEBUG_EVENT == debug_event.dwDebugEventCode || |
|
||||||
EXCEPTION_DEBUG_EVENT == debug_event.dwDebugEventCode || |
|
||||||
LOAD_DLL_DEBUG_EVENT == debug_event.dwDebugEventCode || |
|
||||||
OUTPUT_DEBUG_STRING_EVENT == debug_event.dwDebugEventCode || |
|
||||||
UNLOAD_DLL_DEBUG_EVENT == debug_event.dwDebugEventCode; |
|
||||||
if (!go_on) |
|
||||||
break; |
|
||||||
|
|
||||||
// Some exceptions are also "normal", apparently.
|
|
||||||
if (EXCEPTION_DEBUG_EVENT== debug_event.dwDebugEventCode) { |
|
||||||
|
|
||||||
int go_on = |
|
||||||
EXCEPTION_BREAKPOINT== debug_event.u.Exception.ExceptionRecord.ExceptionCode; |
|
||||||
if (!go_on) |
|
||||||
break; |
|
||||||
} |
|
||||||
|
|
||||||
// If nothing unexpected happened, let child proceed
|
|
||||||
ContinueDebugEvent( |
|
||||||
debug_event.dwProcessId, |
|
||||||
debug_event.dwThreadId, |
|
||||||
DBG_CONTINUE |
|
||||||
); |
|
||||||
} |
|
||||||
|
|
||||||
// Clean up child process
|
|
||||||
TerminateProcess(process_info.hProcess, 1); |
|
||||||
CloseHandle(process_info.hProcess); |
|
||||||
CloseHandle(process_info.hThread); |
|
||||||
|
|
||||||
// Reap return value and cleanup
|
|
||||||
CopyMemory(&rate, shared_mem, sizeof(rate)); |
|
||||||
(void)UnmapViewOfFile(shared_mem); |
|
||||||
(void)CloseHandle(map_handle); |
|
||||||
|
|
||||||
#else |
|
||||||
|
|
||||||
// Not linux, not unix, not WIN32 ... do our best
|
|
||||||
rate = bench_algo_stage3(algo); |
|
||||||
|
|
||||||
#endif // defined(unix)
|
|
||||||
|
|
||||||
// Done
|
|
||||||
return rate; |
|
||||||
} |
|
||||||
|
|
||||||
static void bench_algo( |
|
||||||
double *best_rate, |
|
||||||
enum sha256_algos *best_algo, |
|
||||||
enum sha256_algos algo |
|
||||||
) |
|
||||||
{ |
|
||||||
size_t n = max_name_len - strlen(algo_names[algo]); |
|
||||||
memset(name_spaces_pad, ' ', n); |
|
||||||
name_spaces_pad[n] = 0; |
|
||||||
|
|
||||||
applog( |
|
||||||
LOG_ERR, |
|
||||||
"\"%s\"%s : benchmarking algorithm ...", |
|
||||||
algo_names[algo], |
|
||||||
name_spaces_pad |
|
||||||
); |
|
||||||
|
|
||||||
double rate = bench_algo_stage2(algo); |
|
||||||
if (rate<0.0) { |
|
||||||
applog( |
|
||||||
LOG_ERR, |
|
||||||
"\"%s\"%s : algorithm fails on this platform", |
|
||||||
algo_names[algo], |
|
||||||
name_spaces_pad |
|
||||||
); |
|
||||||
} else { |
|
||||||
applog( |
|
||||||
LOG_ERR, |
|
||||||
"\"%s\"%s : algorithm runs at %.5f MH/s", |
|
||||||
algo_names[algo], |
|
||||||
name_spaces_pad, |
|
||||||
rate |
|
||||||
); |
|
||||||
if (*best_rate<rate) { |
|
||||||
*best_rate = rate; |
|
||||||
*best_algo = algo; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Figure out the longest algorithm name
|
|
||||||
void init_max_name_len() |
|
||||||
{ |
|
||||||
size_t i; |
|
||||||
size_t nb_names = sizeof(algo_names)/sizeof(algo_names[0]); |
|
||||||
for (i=0; i<nb_names; ++i) { |
|
||||||
const char *p = algo_names[i]; |
|
||||||
size_t name_len = p ? strlen(p) : 0; |
|
||||||
if (max_name_len<name_len) |
|
||||||
max_name_len = name_len; |
|
||||||
} |
|
||||||
|
|
||||||
name_spaces_pad = (char*) malloc(max_name_len+16); |
|
||||||
if (0==name_spaces_pad) { |
|
||||||
perror("malloc failed"); |
|
||||||
exit(1); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// Pick the fastest CPU hasher
|
|
||||||
static enum sha256_algos pick_fastest_algo() |
|
||||||
{ |
|
||||||
double best_rate = -1.0; |
|
||||||
enum sha256_algos best_algo = 0; |
|
||||||
applog(LOG_ERR, "benchmarking all sha256 algorithms ..."); |
|
||||||
|
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_C); |
|
||||||
|
|
||||||
#if defined(WANT_SSE2_4WAY) |
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_4WAY); |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(WANT_VIA_PADLOCK) |
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_VIA); |
|
||||||
#endif |
|
||||||
|
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_CRYPTOPP); |
|
||||||
|
|
||||||
#if defined(WANT_CRYPTOPP_ASM32) |
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_CRYPTOPP_ASM32); |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(WANT_X8632_SSE2) |
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_SSE2_32); |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(WANT_X8664_SSE2) |
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_SSE2_64); |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(WANT_X8664_SSE4) |
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_SSE4_64); |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(WANT_ALTIVEC_4WAY) |
|
||||||
bench_algo(&best_rate, &best_algo, ALGO_ALTIVEC_4WAY); |
|
||||||
#endif |
|
||||||
|
|
||||||
size_t n = max_name_len - strlen(algo_names[best_algo]); |
|
||||||
memset(name_spaces_pad, ' ', n); |
|
||||||
name_spaces_pad[n] = 0; |
|
||||||
applog( |
|
||||||
LOG_ERR, |
|
||||||
"\"%s\"%s : is fastest algorithm at %.5f MH/s", |
|
||||||
algo_names[best_algo], |
|
||||||
name_spaces_pad, |
|
||||||
best_rate |
|
||||||
); |
|
||||||
return best_algo; |
|
||||||
} |
|
||||||
|
|
||||||
/* FIXME: Use asprintf for better errors. */ |
|
||||||
char *set_algo(const char *arg, enum sha256_algos *algo) |
|
||||||
{ |
|
||||||
enum sha256_algos i; |
|
||||||
|
|
||||||
if (opt_scrypt) |
|
||||||
return "Can only use scrypt algorithm"; |
|
||||||
|
|
||||||
if (!strcmp(arg, "auto")) { |
|
||||||
*algo = pick_fastest_algo(); |
|
||||||
return NULL; |
|
||||||
} |
|
||||||
|
|
||||||
for (i = 0; i < ARRAY_SIZE(algo_names); i++) { |
|
||||||
if (algo_names[i] && !strcmp(arg, algo_names[i])) { |
|
||||||
*algo = i; |
|
||||||
return NULL; |
|
||||||
} |
|
||||||
} |
|
||||||
return "Unknown algorithm"; |
|
||||||
} |
|
||||||
|
|
||||||
#ifdef WANT_SCRYPT |
|
||||||
void set_scrypt_algo(enum sha256_algos *algo) |
|
||||||
{ |
|
||||||
*algo = ALGO_SCRYPT; |
|
||||||
} |
|
||||||
#endif |
|
||||||
|
|
||||||
void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo) |
|
||||||
{ |
|
||||||
strncpy(buf, algo_names[*algo], OPT_SHOW_LEN); |
|
||||||
} |
|
||||||
#endif |
|
||||||
|
|
||||||
#ifdef WANT_CPUMINE |
|
||||||
char *force_nthreads_int(const char *arg, int *i) |
|
||||||
{ |
|
||||||
forced_n_threads = true; |
|
||||||
return set_int_range(arg, i, 0, 9999); |
|
||||||
} |
|
||||||
#endif |
|
||||||
|
|
||||||
#ifdef WANT_CPUMINE |
|
||||||
static void cpu_detect() |
|
||||||
{ |
|
||||||
int i; |
|
||||||
|
|
||||||
// Reckon number of cores in the box
|
|
||||||
#if defined(WIN32) |
|
||||||
{ |
|
||||||
DWORD_PTR system_am; |
|
||||||
DWORD_PTR process_am; |
|
||||||
BOOL ok = GetProcessAffinityMask( |
|
||||||
GetCurrentProcess(), |
|
||||||
&system_am, |
|
||||||
&process_am |
|
||||||
); |
|
||||||
if (!ok) { |
|
||||||
applog(LOG_ERR, "couldn't figure out number of processors :("); |
|
||||||
num_processors = 1; |
|
||||||
} else { |
|
||||||
size_t n = 32; |
|
||||||
num_processors = 0; |
|
||||||
while (n--) |
|
||||||
if (process_am & (1<<n)) |
|
||||||
++num_processors; |
|
||||||
} |
|
||||||
} |
|
||||||
#else |
|
||||||
num_processors = sysconf(_SC_NPROCESSORS_ONLN); |
|
||||||
#endif /* !WIN32 */ |
|
||||||
|
|
||||||
if (opt_n_threads < 0 || !forced_n_threads) { |
|
||||||
if (total_devices && !opt_usecpu) |
|
||||||
opt_n_threads = 0; |
|
||||||
else |
|
||||||
opt_n_threads = num_processors; |
|
||||||
} |
|
||||||
if (num_processors < 1) |
|
||||||
return; |
|
||||||
|
|
||||||
cpus = calloc(opt_n_threads, sizeof(struct cgpu_info)); |
|
||||||
if (unlikely(!cpus)) |
|
||||||
quit(1, "Failed to calloc cpus"); |
|
||||||
for (i = 0; i < opt_n_threads; ++i) { |
|
||||||
struct cgpu_info *cgpu; |
|
||||||
|
|
||||||
cgpu = &cpus[i]; |
|
||||||
cgpu->drv = &cpu_drv; |
|
||||||
cgpu->deven = DEV_ENABLED; |
|
||||||
cgpu->threads = 1; |
|
||||||
cgpu->kname = algo_names[opt_algo]; |
|
||||||
if (opt_scrypt) |
|
||||||
cgpu->drv->max_diff = 0xffffffff; |
|
||||||
add_cgpu(cgpu); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
static void reinit_cpu_device(struct cgpu_info *cpu) |
|
||||||
{ |
|
||||||
tq_push(control_thr[cpur_thr_id].q, cpu); |
|
||||||
} |
|
||||||
|
|
||||||
/* Driver thread_prepare hook: report the thread in and always succeed. */
static bool cpu_thread_prepare(struct thr_info *thr)
{
	const bool prepared = true;

	thread_reportin(thr);
	return prepared;
}
|
||||||
|
|
||||||
/* Driver can_limit_work hook: returns the fixed value 0xffff regardless of
 * the thread passed in. */
static uint64_t cpu_can_limit_work(struct thr_info __maybe_unused *thr)
{
	const uint64_t limit = 0xffff;

	return limit;
}
|
||||||
|
|
||||||
static bool cpu_thread_init(struct thr_info *thr) |
|
||||||
{ |
|
||||||
const int thr_id = thr->id; |
|
||||||
|
|
||||||
/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
|
|
||||||
* and if that fails, then SCHED_BATCH. No need for this to be an |
|
||||||
* error if it fails */ |
|
||||||
setpriority(PRIO_PROCESS, 0, 19); |
|
||||||
drop_policy(); |
|
||||||
/* Cpu affinity only makes sense if the number of threads is a multiple
|
|
||||||
* of the number of CPUs */ |
|
||||||
if (!(opt_n_threads % num_processors)) |
|
||||||
affine_to_cpu(dev_from_id(thr_id), dev_from_id(thr_id) % num_processors); |
|
||||||
return true; |
|
||||||
} |
|
||||||
|
|
||||||
static int64_t cpu_scanhash(struct thr_info *thr, struct work *work, int64_t max_nonce) |
|
||||||
{ |
|
||||||
const int thr_id = thr->id; |
|
||||||
unsigned char hash1[64]; |
|
||||||
uint32_t first_nonce = work->blk.nonce; |
|
||||||
uint32_t last_nonce; |
|
||||||
bool rc; |
|
||||||
|
|
||||||
hex2bin(hash1, "00000000000000000000000000000000000000000000000000000000000000000000008000000000000000000000000000000000000000000000000000010000", 64); |
|
||||||
CPUSearch: |
|
||||||
last_nonce = first_nonce; |
|
||||||
rc = false; |
|
||||||
|
|
||||||
/* scan nonces for a proof-of-work hash */ |
|
||||||
{ |
|
||||||
sha256_func func = sha256_funcs[opt_algo]; |
|
||||||
rc = (*func)( |
|
||||||
thr, |
|
||||||
work->midstate, |
|
||||||
work->data, |
|
||||||
hash1, |
|
||||||
work->hash, |
|
||||||
work->target, |
|
||||||
max_nonce, |
|
||||||
&last_nonce, |
|
||||||
work->blk.nonce |
|
||||||
); |
|
||||||
} |
|
||||||
|
|
||||||
/* if nonce found, submit work */ |
|
||||||
if (unlikely(rc)) { |
|
||||||
applog(LOG_DEBUG, "CPU %d found something?", dev_from_id(thr_id)); |
|
||||||
submit_nonce(thr, work, last_nonce); |
|
||||||
work->blk.nonce = last_nonce + 1; |
|
||||||
goto CPUSearch; |
|
||||||
} |
|
||||||
else |
|
||||||
if (unlikely(last_nonce == first_nonce)) |
|
||||||
return 0; |
|
||||||
|
|
||||||
work->blk.nonce = last_nonce + 1; |
|
||||||
return last_nonce - first_nonce + 1; |
|
||||||
} |
|
||||||
|
|
||||||
struct device_drv cpu_drv = { |
|
||||||
.drv_id = DRIVER_CPU, |
|
||||||
.dname = "cpu", |
|
||||||
.name = "CPU", |
|
||||||
.drv_detect = cpu_detect, |
|
||||||
.reinit_device = reinit_cpu_device, |
|
||||||
.thread_prepare = cpu_thread_prepare, |
|
||||||
.can_limit_work = cpu_can_limit_work, |
|
||||||
.thread_init = cpu_thread_init, |
|
||||||
.scanhash = cpu_scanhash, |
|
||||||
}; |
|
||||||
#endif |
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,65 +0,0 @@ |
|||||||
#ifndef __DEVICE_CPU_H__ |
|
||||||
#define __DEVICE_CPU_H__ |
|
||||||
|
|
||||||
#include "miner.h" |
|
||||||
|
|
||||||
#include "config.h" |
|
||||||
#include <stdbool.h> |
|
||||||
|
|
||||||
#ifndef OPT_SHOW_LEN |
|
||||||
#define OPT_SHOW_LEN 80 |
|
||||||
#endif |
|
||||||
|
|
||||||
#ifdef __SSE2__ |
|
||||||
#define WANT_SSE2_4WAY 1 |
|
||||||
#endif |
|
||||||
|
|
||||||
#ifdef __ALTIVEC__ |
|
||||||
#define WANT_ALTIVEC_4WAY 1 |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(__i386__) && defined(HAS_YASM) && defined(__SSE2__) |
|
||||||
#define WANT_X8632_SSE2 1 |
|
||||||
#endif |
|
||||||
|
|
||||||
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__APPLE__) |
|
||||||
#define WANT_VIA_PADLOCK 1 |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(__x86_64__) && defined(HAS_YASM) |
|
||||||
#define WANT_X8664_SSE2 1 |
|
||||||
#endif |
|
||||||
|
|
||||||
#if defined(__x86_64__) && defined(HAS_YASM) && defined(__SSE4_1__) |
|
||||||
#define WANT_X8664_SSE4 1 |
|
||||||
#endif |
|
||||||
|
|
||||||
#ifdef USE_SCRYPT |
|
||||||
#define WANT_SCRYPT |
|
||||||
#endif |
|
||||||
|
|
||||||
/* Identifiers of the CPU hashing implementations; the values are used as
 * indices into per-algorithm tables such as algo_names. */
enum sha256_algos {
	ALGO_C,			/* plain C */
	ALGO_4WAY,		/* parallel SSE2 */
	ALGO_VIA,		/* VIA padlock */
	ALGO_CRYPTOPP,		/* Crypto++ (C) */
	ALGO_CRYPTOPP_ASM32,	/* Crypto++ 32-bit assembly */
	ALGO_SSE2_32,		/* SSE2 for x86_32 */
	ALGO_SSE2_64,		/* SSE2 for x86_64 */
	ALGO_SSE4_64,		/* SSE4 for x86_64 */
	ALGO_ALTIVEC_4WAY,	/* parallel Altivec */
	ALGO_SCRYPT,		/* scrypt */
};
|
||||||
|
|
||||||
extern const char *algo_names[]; |
|
||||||
extern bool opt_usecpu; |
|
||||||
extern struct device_drv cpu_drv; |
|
||||||
|
|
||||||
extern char *set_algo(const char *arg, enum sha256_algos *algo); |
|
||||||
extern void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo); |
|
||||||
extern char *force_nthreads_int(const char *arg, int *i); |
|
||||||
extern void init_max_name_len(); |
|
||||||
extern double bench_algo_stage3(enum sha256_algos algo); |
|
||||||
extern void set_scrypt_algo(enum sha256_algos *algo); |
|
||||||
|
|
||||||
#endif /* __DEVICE_CPU_H__ */ |
|
@ -1,488 +0,0 @@ |
|||||||
// Copyright (c) 2010 Satoshi Nakamoto
|
|
||||||
// Distributed under the MIT/X11 software license, see the accompanying
|
|
||||||
// file license.txt or http://www.opensource.org/licenses/mit-license.php.
|
|
||||||
|
|
||||||
// tcatm's 4-way 128-bit SSE2 SHA-256
|
|
||||||
|
|
||||||
#include "driver-cpu.h" |
|
||||||
|
|
||||||
#ifdef WANT_SSE2_4WAY |
|
||||||
|
|
||||||
#include <string.h> |
|
||||||
#include <assert.h> |
|
||||||
|
|
||||||
#include <xmmintrin.h> |
|
||||||
#include <stdint.h> |
|
||||||
#include <stdio.h> |
|
||||||
|
|
||||||
/* Number of nonces hashed per DoubleBlockSHA256 call (processed four at a time). */
#define NPAR 32

/*
 * Forward declaration.  The result array has nine rows: rows 0-7 receive the
 * hash words and row 8 the nonce of each lane, so the row count is spelled
 * as 9 to match the definition and the caller's buffer.  Parameter names
 * follow the definition.
 */
static void DoubleBlockSHA256(const void *pin, void *pad, const void *pre,
			      unsigned int thash[9][NPAR], const void *init);
|
||||||
|
|
||||||
/* The 64 SHA-256 round constants, indexed by round number (eight per row). */
static const unsigned int sha256_consts[64] = {
	/*  0 */ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	/*  8 */ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	/* 16 */ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	/* 24 */ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	/* 32 */ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	/* 40 */ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	/* 48 */ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	/* 56 */ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
|
||||||
|
|
||||||
|
|
||||||
/*
 * SHA-256 "choose" on four lanes: every result bit is taken from c where
 * the corresponding bit of b is set, and from d where it is clear.
 */
static inline __m128i Ch(const __m128i b, const __m128i c, const __m128i d)
{
	const __m128i from_c = _mm_and_si128(b, c);
	const __m128i from_d = _mm_andnot_si128(b, d);	/* (~b) & d */

	return _mm_xor_si128(from_c, from_d);
}
|
||||||
|
|
||||||
/*
 * SHA-256 "majority" on four lanes: XOR of the three pairwise ANDs of the
 * inputs.
 */
static inline __m128i Maj(const __m128i b, const __m128i c, const __m128i d)
{
	const __m128i bc = _mm_and_si128(b, c);
	const __m128i bd = _mm_and_si128(b, d);
	const __m128i cd = _mm_and_si128(c, d);

	return _mm_xor_si128(_mm_xor_si128(bc, bd), cd);
}
|
||||||
|
|
||||||
/* Rotate each 32-bit lane of x right by n bits. */
static inline __m128i ROTR(__m128i x, const int n)
{
	const __m128i shifted_down = _mm_srli_epi32(x, n);
	const __m128i wrapped_bits = _mm_slli_epi32(x, 32 - n);

	return _mm_or_si128(shifted_down, wrapped_bits);
}
|
||||||
|
|
||||||
/* Logical right shift of each 32-bit lane of x by n bits (zero fill). */
static inline __m128i SHR(__m128i x, const int n)
{
	const __m128i shifted = _mm_srli_epi32(x, n);

	return shifted;
}
|
||||||
|
|
||||||
/*
 * SHA-256 mixing functions on four 32-bit lanes.  The BIGSIGMA pair is used
 * by SHA256ROUND; the SIGMA pair is used when extending the message
 * schedule.  Each macro expands its argument three times.
 */
#define BIGSIGMA0_256(x) \
	(_mm_xor_si128(ROTR((x), 22), _mm_xor_si128(ROTR((x), 13), ROTR((x), 2))))
#define BIGSIGMA1_256(x) \
	(_mm_xor_si128(ROTR((x), 25), _mm_xor_si128(ROTR((x), 11), ROTR((x), 6))))

#define SIGMA0_256(x) \
	(_mm_xor_si128(SHR((x), 3), _mm_xor_si128(ROTR((x), 18), ROTR((x), 7))))
#define SIGMA1_256(x) \
	(_mm_xor_si128(SHR((x), 10), _mm_xor_si128(ROTR((x), 19), ROTR((x), 17))))
|
||||||
|
|
||||||
/* Return 32-bit lane i (0 = lowest) of the vector x. */
static inline unsigned int store32(const __m128i x, int i)
{
	unsigned int lanes[4];

	/* spill the register to memory so a lane can be picked by run-time index */
	_mm_storeu_si128((__m128i *)lanes, x);
	return lanes[i];
}
|
||||||
|
|
||||||
/*
 * Unpack the four 32-bit lanes of x, highest lane first: *x0 receives
 * lane 3, *x1 lane 2, *x2 lane 1 and *x3 lane 0.
 */
static inline void store_epi32(const __m128i x, unsigned int *x0, unsigned int *x1, unsigned int *x2, unsigned int *x3)
{
	unsigned int lanes[4];

	_mm_storeu_si128((__m128i *)lanes, x);

	/* same write order as the destinations are listed */
	*x0 = lanes[3];
	*x1 = lanes[2];
	*x2 = lanes[1];
	*x3 = lanes[0];
}
|
||||||
|
|
||||||
/* Lane-wise 32-bit sums of four and of five vectors. */
#define add4(x0, x1, x2, x3) \
	_mm_add_epi32(_mm_add_epi32((x0), (x1)), _mm_add_epi32((x2), (x3)))
#define add5(x0, x1, x2, x3, x4) \
	_mm_add_epi32(add4((x0), (x1), (x2), (x3)), (x4))

/*
 * One SHA-256 round on four lanes.
 *
 * a..h name the working variables in their role for this round, i selects
 * the round constant and w is the schedule word.  Only d and h are
 * assigned.  An __m128i variable named T1 must exist in the enclosing scope;
 * it is used as scratch.
 *
 * The body is wrapped in do { } while (0) so the macro is one statement and
 * stays correct under an unbraced if/else; invoke it with a trailing ';'.
 */
#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \
	do { \
		T1 = add5((h), BIGSIGMA1_256(e), Ch((e), (f), (g)), \
			  _mm_set1_epi32(sha256_consts[(i)]), (w)); \
		(d) = _mm_add_epi32((d), T1); \
		(h) = _mm_add_epi32(T1, _mm_add_epi32(BIGSIGMA0_256(a), Maj((a), (b), (c)))); \
	} while (0)
|
||||||
|
|
||||||
/*
 * Debug helper: print msg followed by the four 32-bit lanes of x in hex,
 * lowest lane first.  msg is only read, so it is taken as const char *.
 */
static inline void dumpreg(__m128i x, const char *msg)
{
	union { unsigned int ret[4]; __m128i x; } box;

	box.x = x;
	printf("%s %08x %08x %08x %08x\n", msg, box.ret[0], box.ret[1], box.ret[2], box.ret[3]);
}
|
||||||
|
|
||||||
/*
 * Debug helper: print lane i of w0 and of the working variables a..h,
 * prefixed with the current function name.  It refers to those variables by
 * name, so it can only be used where they are in scope.
 *
 * Both variants take the same single argument, so switching the condition
 * below does not break call sites.
 */
#if 1
#define dumpstate(i) printf("%s: %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", \
	__func__, store32(w0, i), store32(a, i), store32(b, i), store32(c, i), store32(d, i), store32(e, i), store32(f, i), store32(g, i), store32(h, i));
#else
#define dumpstate(i)
#endif
|
||||||
|
|
||||||
/* Initial chaining value handed to DoubleBlockSHA256 for the second hash. */
static const unsigned int pSHA256InitState[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
|
||||||
|
|
||||||
|
|
||||||
bool ScanHash_4WaySSE2(struct thr_info*thr, const unsigned char *pmidstate, |
|
||||||
unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t nonce) |
|
||||||
{ |
|
||||||
unsigned int *nNonce_p = (unsigned int*)(pdata + 76); |
|
||||||
|
|
||||||
pdata += 64; |
|
||||||
|
|
||||||
for (;;) |
|
||||||
{ |
|
||||||
unsigned int thash[9][NPAR] __attribute__((aligned(128))); |
|
||||||
int j; |
|
||||||
|
|
||||||
nonce += NPAR; |
|
||||||
*nNonce_p = nonce; |
|
||||||
|
|
||||||
DoubleBlockSHA256(pdata, phash1, pmidstate, thash, pSHA256InitState); |
|
||||||
|
|
||||||
for (j = 0; j < NPAR; j++) |
|
||||||
{ |
|
||||||
if (unlikely(thash[7][j] == 0)) |
|
||||||
{ |
|
||||||
int i; |
|
||||||
|
|
||||||
for (i = 0; i < 32/4; i++) |
|
||||||
((unsigned int*)phash)[i] = thash[i][j]; |
|
||||||
|
|
||||||
if (fulltest(phash, ptarget)) { |
|
||||||
nonce += j; |
|
||||||
*last_nonce = nonce; |
|
||||||
*nNonce_p = nonce; |
|
||||||
return true; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
if ((nonce >= max_nonce) || thr->work_restart) |
|
||||||
{ |
|
||||||
*last_nonce = nonce; |
|
||||||
return false; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
|
|
||||||
static void DoubleBlockSHA256(const void* pin, void* pad, const void *pre, unsigned int thash[9][NPAR], const void *init) |
|
||||||
{ |
|
||||||
unsigned int* In = (unsigned int*)pin; |
|
||||||
unsigned int* Pad = (unsigned int*)pad; |
|
||||||
unsigned int* hPre = (unsigned int*)pre; |
|
||||||
unsigned int* hInit = (unsigned int*)init; |
|
||||||
unsigned int /* i, j, */ k; |
|
||||||
|
|
||||||
/* vectors used in calculation */ |
|
||||||
__m128i w0, w1, w2, w3, w4, w5, w6, w7; |
|
||||||
__m128i w8, w9, w10, w11, w12, w13, w14, w15; |
|
||||||
__m128i T1; |
|
||||||
__m128i a, b, c, d, e, f, g, h; |
|
||||||
__m128i nonce, preNonce; |
|
||||||
|
|
||||||
/* nonce offset for vector */ |
|
||||||
__m128i offset = _mm_set_epi32(0x00000003, 0x00000002, 0x00000001, 0x00000000); |
|
||||||
|
|
||||||
|
|
||||||
preNonce = _mm_add_epi32(_mm_set1_epi32(In[3]), offset); |
|
||||||
|
|
||||||
for(k = 0; k<NPAR; k+=4) { |
|
||||||
w0 = _mm_set1_epi32(In[0]); |
|
||||||
w1 = _mm_set1_epi32(In[1]); |
|
||||||
w2 = _mm_set1_epi32(In[2]); |
|
||||||
//w3 = _mm_set1_epi32(In[3]); nonce will be later hacked into the hash
|
|
||||||
w4 = _mm_set1_epi32(In[4]); |
|
||||||
w5 = _mm_set1_epi32(In[5]); |
|
||||||
w6 = _mm_set1_epi32(In[6]); |
|
||||||
w7 = _mm_set1_epi32(In[7]); |
|
||||||
w8 = _mm_set1_epi32(In[8]); |
|
||||||
w9 = _mm_set1_epi32(In[9]); |
|
||||||
w10 = _mm_set1_epi32(In[10]); |
|
||||||
w11 = _mm_set1_epi32(In[11]); |
|
||||||
w12 = _mm_set1_epi32(In[12]); |
|
||||||
w13 = _mm_set1_epi32(In[13]); |
|
||||||
w14 = _mm_set1_epi32(In[14]); |
|
||||||
w15 = _mm_set1_epi32(In[15]); |
|
||||||
|
|
||||||
/* hack nonce into lowest byte of w3 */ |
|
||||||
nonce = _mm_add_epi32(preNonce, _mm_set1_epi32(k)); |
|
||||||
w3 = nonce; |
|
||||||
|
|
||||||
a = _mm_set1_epi32(hPre[0]); |
|
||||||
b = _mm_set1_epi32(hPre[1]); |
|
||||||
c = _mm_set1_epi32(hPre[2]); |
|
||||||
d = _mm_set1_epi32(hPre[3]); |
|
||||||
e = _mm_set1_epi32(hPre[4]); |
|
||||||
f = _mm_set1_epi32(hPre[5]); |
|
||||||
g = _mm_set1_epi32(hPre[6]); |
|
||||||
h = _mm_set1_epi32(hPre[7]); |
|
||||||
|
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15); |
|
||||||
|
|
||||||
#define store_load(x, i, dest) \ |
|
||||||
T1 = _mm_set1_epi32((hPre)[i]); \ |
|
||||||
dest = _mm_add_epi32(T1, x); |
|
||||||
|
|
||||||
store_load(a, 0, w0); |
|
||||||
store_load(b, 1, w1); |
|
||||||
store_load(c, 2, w2); |
|
||||||
store_load(d, 3, w3); |
|
||||||
store_load(e, 4, w4); |
|
||||||
store_load(f, 5, w5); |
|
||||||
store_load(g, 6, w6); |
|
||||||
store_load(h, 7, w7); |
|
||||||
|
|
||||||
w8 = _mm_set1_epi32(Pad[8]); |
|
||||||
w9 = _mm_set1_epi32(Pad[9]); |
|
||||||
w10 = _mm_set1_epi32(Pad[10]); |
|
||||||
w11 = _mm_set1_epi32(Pad[11]); |
|
||||||
w12 = _mm_set1_epi32(Pad[12]); |
|
||||||
w13 = _mm_set1_epi32(Pad[13]); |
|
||||||
w14 = _mm_set1_epi32(Pad[14]); |
|
||||||
w15 = _mm_set1_epi32(Pad[15]); |
|
||||||
|
|
||||||
a = _mm_set1_epi32(hInit[0]); |
|
||||||
b = _mm_set1_epi32(hInit[1]); |
|
||||||
c = _mm_set1_epi32(hInit[2]); |
|
||||||
d = _mm_set1_epi32(hInit[3]); |
|
||||||
e = _mm_set1_epi32(hInit[4]); |
|
||||||
f = _mm_set1_epi32(hInit[5]); |
|
||||||
g = _mm_set1_epi32(hInit[6]); |
|
||||||
h = _mm_set1_epi32(hInit[7]); |
|
||||||
|
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12); |
|
||||||
|
|
||||||
/* Skip last 3-rounds; not necessary for H==0 */ |
|
||||||
#if 0 |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15); |
|
||||||
#endif |
|
||||||
|
|
||||||
/* store resulsts directly in thash */ |
|
||||||
#define store_2(x,i) \ |
|
||||||
w0 = _mm_set1_epi32(hInit[i]); \ |
|
||||||
*(__m128i *)&(thash)[i][0+k] = _mm_add_epi32(w0, x); |
|
||||||
|
|
||||||
store_2(a, 0); |
|
||||||
store_2(b, 1); |
|
||||||
store_2(c, 2); |
|
||||||
store_2(d, 3); |
|
||||||
store_2(e, 4); |
|
||||||
store_2(f, 5); |
|
||||||
store_2(g, 6); |
|
||||||
store_2(h, 7); |
|
||||||
*(__m128i *)&(thash)[8][0+k] = nonce; |
|
||||||
} |
|
||||||
|
|
||||||
} |
|
||||||
|
|
||||||
#endif /* WANT_SSE2_4WAY */ |
|
@ -1,469 +0,0 @@ |
|||||||
// Copyright (c) 2010 Satoshi Nakamoto
|
|
||||||
// Copyright (c) 2011 Gilles Risch
|
|
||||||
// Distributed under the MIT/X11 software license, see the accompanying
|
|
||||||
// file license.txt or http://www.opensource.org/licenses/mit-license.php.
|
|
||||||
|
|
||||||
|
|
||||||
// 4-way 128-bit Altivec SHA-256,
|
|
||||||
// based on tcatm's 4-way 128-bit SSE2 SHA-256
|
|
||||||
//
|
|
||||||
|
|
||||||
|
|
||||||
#include "driver-cpu.h" |
|
||||||
|
|
||||||
#ifdef WANT_ALTIVEC_4WAY |
|
||||||
|
|
||||||
#include <string.h> |
|
||||||
#include <assert.h> |
|
||||||
|
|
||||||
//#include <altivec.h>
|
|
||||||
#include <stdint.h> |
|
||||||
#include <stdio.h> |
|
||||||
|
|
||||||
/* Number of nonces hashed per DoubleBlockSHA256 call (processed four at a time). */
#define NPAR 32

/*
 * Forward declaration.  The definition and the caller's buffer use nine
 * rows of NPAR words, so the row count is spelled as 9 here as well.
 * Parameter names follow the definition.
 */
static void DoubleBlockSHA256(const void *pin, void *pad, const void *pre,
			      unsigned int thash[9][NPAR], const void *init);
|
||||||
|
|
||||||
/* The 64 SHA-256 round constants, indexed by round number (eight per row). */
static const unsigned int sha256_consts[64] = {
	/*  0 */ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	/*  8 */ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	/* 16 */ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	/* 24 */ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	/* 32 */ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	/* 40 */ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	/* 48 */ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	/* 56 */ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
|
||||||
|
|
||||||
|
|
||||||
/*
 * SHA-256 "choose" on four lanes, done with a single vec_sel: b is the
 * selector between c and d.
 */
static inline vector unsigned int Ch(const vector unsigned int b, const vector unsigned int c, const vector unsigned int d)
{
	const vector unsigned int chosen = vec_sel(d, c, b);

	return chosen;
}
|
||||||
|
|
||||||
/*
 * SHA-256 "majority" on four lanes: vec_sel between b and c, with b XOR d
 * as the selector.
 */
static inline vector unsigned int Maj(const vector unsigned int b, const vector unsigned int c, const vector unsigned int d)
{
	const vector unsigned int selector = vec_xor(b, d);

	return vec_sel(b, c, selector);
}
|
||||||
|
|
||||||
/*
 * SHA-256 mixing functions on four lanes.  Right rotations are written as
 * vec_rl (rotate left) by 32 - n:  RotateRight(x, n) := RotateLeft(x, 32-n).
 * Each macro expands its argument three times.
 */
#define BIGSIGMA0_256(x) \
	(vec_xor(vec_rl((x), (vector unsigned int)(32-22)), \
		 vec_xor(vec_rl((x), (vector unsigned int)(32-13)), \
			 vec_rl((x), (vector unsigned int)(32-2)))))
#define BIGSIGMA1_256(x) \
	(vec_xor(vec_rl((x), (vector unsigned int)(32-25)), \
		 vec_xor(vec_rl((x), (vector unsigned int)(32-11)), \
			 vec_rl((x), (vector unsigned int)(32-6)))))

#define SIGMA0_256(x) \
	(vec_xor(vec_sr((x), (vector unsigned int)(3 )), \
		 vec_xor(vec_rl((x), (vector unsigned int)(32-18)), \
			 vec_rl((x), (vector unsigned int)(32- 7)))))
#define SIGMA1_256(x) \
	(vec_xor(vec_sr((x), (vector unsigned int)(10)), \
		 vec_xor(vec_rl((x), (vector unsigned int)(32-19)), \
			 vec_rl((x), (vector unsigned int)(32-17)))))

/* Lane-wise sums of four and of five vectors. */
#define add4(x0, x1, x2, x3) \
	vec_add(vec_add(x0, x1),vec_add( x2,x3))
#define add5(x0, x1, x2, x3, x4) \
	vec_add(add4(x0, x1, x2, x3), x4)

/*
 * One SHA-256 round on four lanes.  a..h name the working variables in
 * their role for this round, i selects the round constant, w is the
 * schedule word.  Only d and h are assigned; a vector variable named T1
 * must exist in the enclosing scope.  The macro expands to three separate
 * statements, so it must not be the sole body of an unbraced if/else.
 */
#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \
	T1 = add5(h, BIGSIGMA1_256(e), Ch(e, f, g), \
		  (vector unsigned int)(sha256_consts[i],sha256_consts[i],sha256_consts[i],sha256_consts[i]), \
		  w); \
	d = vec_add(d, T1); \
	h = vec_add(T1, vec_add(BIGSIGMA0_256(a), Maj(a, b, c)));
|
||||||
|
|
||||||
|
|
||||||
/* Initial chaining value handed to DoubleBlockSHA256 for the second hash. */
static const unsigned int pSHA256InitState[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
|
||||||
|
|
||||||
|
|
||||||
bool ScanHash_altivec_4way(struct thr_info*thr, const unsigned char *pmidstate, |
|
||||||
unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t nonce) |
|
||||||
{ |
|
||||||
unsigned int *nNonce_p = (unsigned int*)(pdata + 76); |
|
||||||
|
|
||||||
pdata += 64; |
|
||||||
|
|
||||||
for (;;) |
|
||||||
{ |
|
||||||
unsigned int thash[9][NPAR] __attribute__((aligned(128))); |
|
||||||
int j; |
|
||||||
|
|
||||||
*nNonce_p = nonce; |
|
||||||
|
|
||||||
DoubleBlockSHA256(pdata, phash1, pmidstate, thash, pSHA256InitState); |
|
||||||
|
|
||||||
for (j = 0; j < NPAR; j++) |
|
||||||
{ |
|
||||||
if (unlikely(thash[7][j] == 0)) |
|
||||||
{ |
|
||||||
int i; |
|
||||||
|
|
||||||
for (i = 0; i < 32/4; i++) |
|
||||||
((unsigned int*)phash)[i] = thash[i][j]; |
|
||||||
|
|
||||||
if (fulltest(phash, ptarget)) { |
|
||||||
nonce += j; |
|
||||||
*last_nonce = nonce; |
|
||||||
*nNonce_p = nonce; |
|
||||||
return true; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
if ((nonce >= max_nonce) || thr->work_restart) |
|
||||||
{ |
|
||||||
*last_nonce = nonce; |
|
||||||
return false; |
|
||||||
} |
|
||||||
|
|
||||||
nonce += NPAR; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
|
|
||||||
static void DoubleBlockSHA256(const void* pin, void* pad, const void *pre, unsigned int thash[9][NPAR], const void *init) |
|
||||||
{ |
|
||||||
unsigned int* In = (unsigned int*)pin; |
|
||||||
unsigned int* Pad = (unsigned int*)pad; |
|
||||||
unsigned int* hPre = (unsigned int*)pre; |
|
||||||
unsigned int* hInit = (unsigned int*)init; |
|
||||||
unsigned int /* i, j, */ k; |
|
||||||
|
|
||||||
/* vectors used in calculation */ |
|
||||||
vector unsigned int w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; |
|
||||||
vector unsigned int T1; |
|
||||||
vector unsigned int a, b, c, d, e, f, g, h; |
|
||||||
vector unsigned int nonce, preNonce; |
|
||||||
|
|
||||||
/* nonce offset for vector */ |
|
||||||
vector unsigned int offset = (vector unsigned int)(0, 1, 2, 3); |
|
||||||
|
|
||||||
preNonce = vec_add((vector unsigned int)(In[3],In[3],In[3],In[3]), offset); |
|
||||||
|
|
||||||
for(k = 0; k<NPAR; k+=4) |
|
||||||
{ |
|
||||||
w0 = (vector unsigned int)(In[0],In[0],In[0],In[0]); |
|
||||||
w1 = (vector unsigned int)(In[1],In[1],In[1],In[1]); |
|
||||||
w2 = (vector unsigned int)(In[2],In[2],In[2],In[2]); |
|
||||||
//w3 = (vector unsigned int)(In[3],In[3],In[3],In[3]); nonce will be later hacked into the hash
|
|
||||||
w4 = (vector unsigned int)(In[4],In[4],In[4],In[4]); |
|
||||||
w5 = (vector unsigned int)(In[5],In[5],In[5],In[5]); |
|
||||||
w6 = (vector unsigned int)(In[6],In[6],In[6],In[6]); |
|
||||||
w7 = (vector unsigned int)(In[7],In[7],In[7],In[7]); |
|
||||||
w8 = (vector unsigned int)(In[8],In[8],In[8],In[8]); |
|
||||||
w9 = (vector unsigned int)(In[9],In[9],In[9],In[9]); |
|
||||||
w10 = (vector unsigned int)(In[10],In[10],In[10],In[10]); |
|
||||||
w11 = (vector unsigned int)(In[11],In[11],In[11],In[11]); |
|
||||||
w12 = (vector unsigned int)(In[12],In[12],In[12],In[12]); |
|
||||||
w13 = (vector unsigned int)(In[13],In[13],In[13],In[13]); |
|
||||||
w14 = (vector unsigned int)(In[14],In[14],In[14],In[14]); |
|
||||||
w15 = (vector unsigned int)(In[15],In[15],In[15],In[15]); |
|
||||||
|
|
||||||
/* hack nonce into lowest byte of w3 */ |
|
||||||
nonce = vec_add(preNonce, (vector unsigned int)(k,k,k,k)); |
|
||||||
|
|
||||||
w3 = nonce; |
|
||||||
//printf ("W3: %08vlx\n", w3);
|
|
||||||
|
|
||||||
a = (vector unsigned int)(hPre[0],hPre[0],hPre[0],hPre[0]); |
|
||||||
b = (vector unsigned int)(hPre[1],hPre[1],hPre[1],hPre[1]); |
|
||||||
c = (vector unsigned int)(hPre[2],hPre[2],hPre[2],hPre[2]); |
|
||||||
d = (vector unsigned int)(hPre[3],hPre[3],hPre[3],hPre[3]); |
|
||||||
e = (vector unsigned int)(hPre[4],hPre[4],hPre[4],hPre[4]); |
|
||||||
f = (vector unsigned int)(hPre[5],hPre[5],hPre[5],hPre[5]); |
|
||||||
g = (vector unsigned int)(hPre[6],hPre[6],hPre[6],hPre[6]); |
|
||||||
h = (vector unsigned int)(hPre[7],hPre[7],hPre[7],hPre[7]); |
|
||||||
|
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15); |
|
||||||
|
|
||||||
#define store_load(x, i, dest) \ |
|
||||||
T1 = (vector unsigned int)((hPre)[i],(hPre)[i],(hPre)[i],(hPre)[i]); \ |
|
||||||
dest = vec_add(T1, x); |
|
||||||
|
|
||||||
store_load(a, 0, w0); |
|
||||||
store_load(b, 1, w1); |
|
||||||
store_load(c, 2, w2); |
|
||||||
store_load(d, 3, w3); |
|
||||||
store_load(e, 4, w4); |
|
||||||
store_load(f, 5, w5); |
|
||||||
store_load(g, 6, w6); |
|
||||||
store_load(h, 7, w7); |
|
||||||
|
|
||||||
/* end of first SHA256 round */ |
|
||||||
|
|
||||||
w8 = (vector unsigned int)(Pad[8],Pad[8],Pad[8],Pad[8]); |
|
||||||
w9 = (vector unsigned int)(Pad[9],Pad[9],Pad[9],Pad[9]); |
|
||||||
w10 = (vector unsigned int)(Pad[10],Pad[10],Pad[10],Pad[10]); |
|
||||||
w11 = (vector unsigned int)(Pad[11],Pad[11],Pad[11],Pad[11]); |
|
||||||
w12 = (vector unsigned int)(Pad[12],Pad[12],Pad[12],Pad[12]); |
|
||||||
w13 = (vector unsigned int)(Pad[13],Pad[13],Pad[13],Pad[13]); |
|
||||||
w14 = (vector unsigned int)(Pad[14],Pad[14],Pad[14],Pad[14]); |
|
||||||
w15 = (vector unsigned int)(Pad[15],Pad[15],Pad[15],Pad[15]); |
|
||||||
|
|
||||||
a = (vector unsigned int)(hInit[0],hInit[0],hInit[0],hInit[0]); |
|
||||||
b = (vector unsigned int)(hInit[1],hInit[1],hInit[1],hInit[1]); |
|
||||||
c = (vector unsigned int)(hInit[2],hInit[2],hInit[2],hInit[2]); |
|
||||||
d = (vector unsigned int)(hInit[3],hInit[3],hInit[3],hInit[3]); |
|
||||||
e = (vector unsigned int)(hInit[4],hInit[4],hInit[4],hInit[4]); |
|
||||||
f = (vector unsigned int)(hInit[5],hInit[5],hInit[5],hInit[5]); |
|
||||||
g = (vector unsigned int)(hInit[6],hInit[6],hInit[6],hInit[6]); |
|
||||||
h = (vector unsigned int)(hInit[7],hInit[7],hInit[7],hInit[7]); |
|
||||||
|
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12); |
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15); |
|
||||||
|
|
||||||
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0); |
|
||||||
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1); |
|
||||||
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2); |
|
||||||
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3); |
|
||||||
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4); |
|
||||||
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5); |
|
||||||
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6); |
|
||||||
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7); |
|
||||||
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8); |
|
||||||
SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8); |
|
||||||
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9); |
|
||||||
SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9); |
|
||||||
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10); |
|
||||||
SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10); |
|
||||||
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11); |
|
||||||
SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11); |
|
||||||
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12); |
|
||||||
SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12); |
|
||||||
|
|
||||||
/* Skip last 3-rounds; not necessary for H==0 */ |
|
||||||
/*#if 0
|
|
||||||
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13); |
|
||||||
SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13); |
|
||||||
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14); |
|
||||||
SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14); |
|
||||||
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15); |
|
||||||
SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15); |
|
||||||
#endif*/ |
|
||||||
|
|
||||||
/* store results directly in thash */ |
|
||||||
#define store_2(x,i) \ |
|
||||||
w0 = (vector unsigned int)(hInit[i],hInit[i],hInit[i],hInit[i]); \ |
|
||||||
vec_st(vec_add(w0, x), 0 ,&thash[i][k]); |
|
||||||
|
|
||||||
store_2(a, 0); |
|
||||||
store_2(b, 1); |
|
||||||
store_2(c, 2); |
|
||||||
store_2(d, 3); |
|
||||||
store_2(e, 4); |
|
||||||
store_2(f, 5); |
|
||||||
store_2(g, 6); |
|
||||||
store_2(h, 7); |
|
||||||
|
|
||||||
vec_st(nonce, 0 ,&thash[8][k]); |
|
||||||
/* writing the results into the array is time intensive */ |
|
||||||
/* -> check whether it's faster to compare the results with the target inside this function */ |
|
||||||
} |
|
||||||
|
|
||||||
} |
|
||||||
|
|
||||||
#endif /* WANT_ALTIVEC_4WAY */ |
|
||||||
|
|
@ -1,609 +0,0 @@ |
|||||||
|
|
||||||
#include "config.h" |
|
||||||
|
|
||||||
#include <stdint.h> |
|
||||||
#include <stdbool.h> |
|
||||||
#include <string.h> |
|
||||||
#include <stdlib.h> |
|
||||||
#include <stdio.h> |
|
||||||
#include "miner.h" |
|
||||||
|
|
||||||
/* 32-bit word type used throughout this SHA-256 implementation. */
typedef uint32_t word32;

/*
 * Rotate a 32-bit word right by `shift` bit positions.
 *
 * word:  value to rotate.
 * shift: rotation count; it is reduced modulo 32, so 0 and 32 both return
 *        `word` unchanged.
 *
 * Returns the rotated value.
 */
static word32 rotrFixed(word32 word, unsigned int shift)
{
	/*
	 * Without the masks a count of 0 would make the complementary left
	 * shift a shift by 32, i.e. by the full width of the type, which is
	 * undefined behaviour in C.
	 */
	shift &= 31u;
	return (word >> shift) | (word << ((32u - shift) & 31u));
}
|
||||||
|
|
||||||
#define blk0(i) (W[i] = data[i]) |
|
||||||
|
|
||||||
/*
 * The 64 SHA-256 round constants, indexed by round number.  Laid out eight
 * per row; the trailing comment on each row gives the rounds it covers.
 */
static const word32 SHA256_K[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, /*  0 ..  7 */
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, /*  8 .. 15 */
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, /* 16 .. 23 */
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, /* 24 .. 31 */
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, /* 32 .. 39 */
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, /* 40 .. 47 */
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, /* 48 .. 55 */
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2  /* 56 .. 63 */
};
|
||||||
|
|
||||||
/*
 * Round helpers used by SHA256_Transform.  They expand in a scope that
 * provides the schedule window W[16], the working variables T[8], the input
 * pointer `data` (via blk0) and the round-group offset `j`.
 *
 * Every macro parameter is parenthesised so that expression arguments expand
 * with the intended precedence.
 */

/* Schedule word for rounds >= 16, updated in place in the 16-entry window. */
#define blk2(i) (W[(i)&15]+=s1(W[((i)-2)&15])+W[((i)-7)&15]+s0(W[((i)-15)&15]))

/* Bitwise choose: each result bit comes from y where x is 1, else from z. */
#define Ch(x,y,z) ((z)^((x)&((y)^(z))))
/* Bitwise majority of x, y and z. */
#define Maj(x,y,z) ((y)^(((x)^(y))&((y)^(z))))

/*
 * Working variables a..h for round i.  Instead of shifting eight values every
 * round, the index into T is rotated by the round number.
 */
#define a(i) T[(0-(i))&7]
#define b(i) T[(1-(i))&7]
#define c(i) T[(2-(i))&7]
#define d(i) T[(3-(i))&7]
#define e(i) T[(4-(i))&7]
#define f(i) T[(5-(i))&7]
#define g(i) T[(6-(i))&7]
#define h(i) T[(7-(i))&7]

/*
 * One round.  j == 0 selects blk0 (load from input), any other j selects blk2
 * (extend the schedule).  Wrapped in do/while(0) so that `R(i);` is a single
 * statement wherever it is used.
 */
#define R(i) do { \
	h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[(i)+j]+(j?blk2(i):blk0(i)); \
	d(i)+=h(i); \
	h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)); \
} while (0)

// for SHA256
#define S0(x) (rotrFixed((x),2)^rotrFixed((x),13)^rotrFixed((x),22))
#define S1(x) (rotrFixed((x),6)^rotrFixed((x),11)^rotrFixed((x),25))
#define s0(x) (rotrFixed((x),7)^rotrFixed((x),18)^((x)>>3))
#define s1(x) (rotrFixed((x),17)^rotrFixed((x),19)^((x)>>10))
|
||||||
|
|
||||||
/*
 * Run the SHA-256 compression function over one block of sixteen 32-bit words.
 *
 * state: eight 32-bit chaining words; read as the starting value and updated
 *        in place with the result.
 * data:  sixteen 32-bit message words, consumed as-is (blk0 copies them into
 *        the schedule without byte swapping).
 */
static void SHA256_Transform(word32 *state, const word32 *data)
{
	/*
	 * { 0 } rather than an empty brace pair: an empty initializer list is
	 * not valid ISO C before C23.
	 */
	word32 W[16] = { 0 };
	word32 T[8];
	unsigned int j;

	/* Copy context->state[] to working vars */
	memcpy(T, state, sizeof(T));

	/*
	 * 64 operations, partially loop unrolled: four passes of 16 rounds.
	 * The R macro reads j, so the first pass (j == 0) loads the schedule
	 * from data and the later passes extend it.
	 */
	for (j = 0; j < 64; j += 16) {
		R( 0); R( 1); R( 2); R( 3);
		R( 4); R( 5); R( 6); R( 7);
		R( 8); R( 9); R(10); R(11);
		R(12); R(13); R(14); R(15);
	}

	/* Add the working vars back into context.state[] */
	state[0] += a(0);
	state[1] += b(0);
	state[2] += c(0);
	state[3] += d(0);
	state[4] += e(0);
	state[5] += f(0);
	state[6] += g(0);
	state[7] += h(0);
}
|
||||||
|
|
||||||
static void runhash(void *state, const void *input, const void *init) |
|
||||||
{ |
|
||||||
memcpy(state, init, 32); |
|
||||||
SHA256_Transform(state, input); |
|
||||||
} |
|
||||||
|
|
||||||
/* suspiciously similar to ScanHash* from bitcoin */ |
|
||||||
bool scanhash_cryptopp(struct thr_info*thr, const unsigned char *midstate, |
|
||||||
unsigned char *data, |
|
||||||
unsigned char *hash1, unsigned char *hash, |
|
||||||
const unsigned char *target, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t n) |
|
||||||
{ |
|
||||||
uint32_t *hash32 = (uint32_t *) hash; |
|
||||||
uint32_t *nonce = (uint32_t *)(data + 76); |
|
||||||
|
|
||||||
data += 64; |
|
||||||
|
|
||||||
while (1) { |
|
||||||
n++; |
|
||||||
*nonce = n; |
|
||||||
|
|
||||||
runhash(hash1, data, midstate); |
|
||||||
runhash(hash, hash1, sha256_init_state); |
|
||||||
|
|
||||||
if (unlikely((hash32[7] == 0) && fulltest(hash, target))) { |
|
||||||
*last_nonce = n; |
|
||||||
return true; |
|
||||||
} |
|
||||||
|
|
||||||
if ((n >= max_nonce) || thr->work_restart) { |
|
||||||
*last_nonce = n; |
|
||||||
return false; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
#if defined(WANT_CRYPTOPP_ASM32) |
|
||||||
|
|
||||||
/*
 * Fixed build configuration for the assembler section of this file:
 * CRYPTOPP_FASTCALL expands to nothing, the 32-bit x86 path is selected,
 * and the x64 and SSE2 paths are switched off.
 */
#define CRYPTOPP_FASTCALL
#define CRYPTOPP_BOOL_X86 1
#define CRYPTOPP_BOOL_X64 0
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
|
||||||
|
|
||||||
/*
 * Assembler-syntax abstraction.  The same instruction list is written once
 * with these macros and spelled in one of three ways:
 *   AS1/AS2/AS3  instruction with one, two or three operands
 *   ASS          instruction whose last operand is a*64+b*16+c*4+d
 *   ASL          label definition
 *   ASJ/ASC      instruction that refers to a label
 *   AS_HEX       hexadecimal literal
 */
#ifdef CRYPTOPP_GENERATE_X64_MASM
/*
 * Plain assembler text.  "*newline*" is left literally in the expansion;
 * presumably a later step turns it into a line break (not shown here).
 */
#define AS1(x) x*newline*
#define AS2(x, y) x, y*newline*
#define AS3(x, y, z) x, y, z*newline*
#define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline*
#define ASL(x) label##x:*newline*
#define ASJ(x, y, z) x label##y*newline*
#define ASC(x, y) x label##y*newline*
#define AS_HEX(y) 0##y##h
#elif defined(_MSC_VER) || defined(__BORLANDC__)
/* One __asm { } statement per instruction; ASJ ignores its third argument. */
#define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y}
#define AS3(x, y, z) __asm {x, y, z}
#define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)}
#define ASL(x) __asm {label##x:}
#define ASJ(x, y, z) __asm {x label##y}
#define ASC(x, y) __asm {x label##y}
#define CRYPTOPP_NAKED __declspec(naked)
#define AS_HEX(y) 0x##y
#else
/*
 * Each instruction is stringized into a ";"-terminated string literal.
 * Labels are the bare argument, and ASJ appends its third argument directly
 * after the label.
 */
#define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
// define these in two steps to allow arguments to be expanded
#define GNU_AS1(x) #x ";"
#define GNU_AS2(x, y) #x ", " #y ";"
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
#define GNU_ASL(x) "\n" #x ":"
#define GNU_ASJ(x, y, z) #x " " #y #z ";"
#define AS1(x) GNU_AS1(x)
#define AS2(x, y) GNU_AS2(x, y)
#define AS3(x, y, z) GNU_AS3(x, y, z)
#define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
#define ASL(x) GNU_ASL(x)
#define ASJ(x, y, z) GNU_ASJ(x, y, z)
#define ASC(x, y) #x " " #y ";"
#define CRYPTOPP_NAKED
#define AS_HEX(y) 0x##y
#endif
|
||||||
|
|
||||||
/* Conditional emission: IF0 drops its argument, IF1 passes it through. */
#define IF0(y)
#define IF1(y) y

/*
 * ASM_MOD(x, y): remainder of x divided by y, spelled for the target
 * assembler.  XMMWORD_PTR: operand-size prefix, emitted only for MASM.
 */
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define ASM_MOD(x, y) ((x) MOD (y))
#define XMMWORD_PTR XMMWORD PTR
#else
// GNU assembler doesn't seem to have mod operator
#define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
// GAS 2.15 doesn't support XMMWORD PTR. it seems necessary only for MASM
#define XMMWORD_PTR
#endif
|
||||||
|
|
||||||
/*
 * Register and word-size mapping.
 *   AS_REG_n    n-th general-purpose register at native width
 *   AS_REG_nd   the same register at 32-bit width
 *   WORD_SZ     native word size in bytes
 *   WORD_REG(x) native-width name of register x (e##x or r##x)
 *   WORD_PTR    native-width memory operand prefix
 *   AS_PUSH_IF86 / AS_POP_IF86  push/pop that is emitted only on 32-bit x86
 *   AS_JCXZ     jump-if-count-register-zero mnemonic for the native width
 */
#if CRYPTOPP_BOOL_X86
/* 32-bit x86: the native and 32-bit register names are identical. */
#define AS_REG_1 ecx
#define AS_REG_2 edx
#define AS_REG_3 esi
#define AS_REG_4 edi
#define AS_REG_5 eax
#define AS_REG_6 ebx
#define AS_REG_7 ebp
#define AS_REG_1d ecx
#define AS_REG_2d edx
#define AS_REG_3d esi
#define AS_REG_4d edi
#define AS_REG_5d eax
#define AS_REG_6d ebx
#define AS_REG_7d ebp
#define WORD_SZ 4
#define WORD_REG(x) e##x
#define WORD_PTR DWORD PTR
#define AS_PUSH_IF86(x) AS1(push e##x)
#define AS_POP_IF86(x) AS1(pop e##x)
#define AS_JCXZ jecxz
#elif CRYPTOPP_BOOL_X64
#ifdef CRYPTOPP_GENERATE_X64_MASM
/*
 * NOTE(review): rcx, rdx, r8, r9 coincide with the first integer argument
 * registers of the Microsoft x64 calling convention -- presumably
 * intentional; confirm against the caller.
 */
#define AS_REG_1 rcx
#define AS_REG_2 rdx
#define AS_REG_3 r8
#define AS_REG_4 r9
#define AS_REG_5 rax
#define AS_REG_6 r10
#define AS_REG_7 r11
#define AS_REG_1d ecx
#define AS_REG_2d edx
#define AS_REG_3d r8d
#define AS_REG_4d r9d
#define AS_REG_5d eax
#define AS_REG_6d r10d
#define AS_REG_7d r11d
#else
/*
 * NOTE(review): rdi, rsi, rdx, rcx, r8, r9 coincide with the System V AMD64
 * integer argument registers -- presumably intentional; confirm.
 */
#define AS_REG_1 rdi
#define AS_REG_2 rsi
#define AS_REG_3 rdx
#define AS_REG_4 rcx
#define AS_REG_5 r8
#define AS_REG_6 r9
#define AS_REG_7 r10
#define AS_REG_1d edi
#define AS_REG_2d esi
#define AS_REG_3d edx
#define AS_REG_4d ecx
#define AS_REG_5d r8d
#define AS_REG_6d r9d
#define AS_REG_7d r10d
#endif
/* 64-bit: the IF86 push/pop helpers expand to nothing. */
#define WORD_SZ 8
#define WORD_REG(x) r##x
#define WORD_PTR QWORD PTR
#define AS_PUSH_IF86(x)
#define AS_POP_IF86(x)
#define AS_JCXZ jrcxz
#endif
|
||||||
|
|
||||||
static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len |
|
||||||
#if defined(_MSC_VER) && (_MSC_VER == 1200) |
|
||||||
, ... // VC60 workaround: prevent VC 6 from inlining this function
|
|
||||||
#endif |
|
||||||
) |
|
||||||
{ |
|
||||||
#if defined(_MSC_VER) && (_MSC_VER == 1200) |
|
||||||
AS2(mov ecx, [state]) |
|
||||||
AS2(mov edx, [data]) |
|
||||||
#endif |
|
||||||
|
|
||||||
#define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ |
|
||||||
#define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4] |
|
||||||
#define G(i) H(i+1) |
|
||||||
#define F(i) H(i+2) |
|
||||||
#define E(i) H(i+3) |
|
||||||
#define D(i) H(i+4) |
|
||||||
#define C(i) H(i+5) |
|
||||||
#define B(i) H(i+6) |
|
||||||
#define A(i) H(i+7) |
|
||||||
#define Wt(i) BASE+8*4+ASM_MOD(1024+15-(i),16)*4 |
|
||||||
#define Wt_2(i) Wt((i)-2) |
|
||||||
#define Wt_15(i) Wt((i)-15) |
|
||||||
#define Wt_7(i) Wt((i)-7) |
|
||||||
#define K_END [BASE+8*4+16*4+0*WORD_SZ] |
|
||||||
#define STATE_SAVE [BASE+8*4+16*4+1*WORD_SZ] |
|
||||||
#define DATA_SAVE [BASE+8*4+16*4+2*WORD_SZ] |
|
||||||
#define DATA_END [BASE+8*4+16*4+3*WORD_SZ] |
|
||||||
#define Kt(i) WORD_REG(si)+(i)*4 |
|
||||||
#if CRYPTOPP_BOOL_X86 |
|
||||||
#define BASE esp+4 |
|
||||||
#elif defined(__GNUC__) |
|
||||||
#define BASE r8 |
|
||||||
#else |
|
||||||
#define BASE rsp |
|
||||||
#endif |
|
||||||
|
|
||||||
#define RA0(i, edx, edi) \ |
|
||||||
AS2( add edx, [Kt(i)] )\ |
|
||||||
AS2( add edx, [Wt(i)] )\ |
|
||||||
AS2( add edx, H(i) )\ |
|
||||||
|
|
||||||
#define RA1(i, edx, edi) |
|
||||||
|
|
||||||
#define RB0(i, edx, edi) |
|
||||||
|
|
||||||
#define RB1(i, edx, edi) \ |
|
||||||
AS2( mov AS_REG_7d, [Wt_2(i)] )\ |
|
||||||
AS2( mov edi, [Wt_15(i)])\ |
|
||||||
AS2( mov ebx, AS_REG_7d )\ |
|
||||||
AS2( shr AS_REG_7d, 10 )\ |
|
||||||
AS2( ror ebx, 17 )\ |
|
||||||
AS2( xor AS_REG_7d, ebx )\ |
|
||||||
AS2( ror ebx, 2 )\ |
|
||||||
AS2( xor ebx, AS_REG_7d )/* s1(W_t-2) */\ |
|
||||||
AS2( add ebx, [Wt_7(i)])\ |
|
||||||
AS2( mov AS_REG_7d, edi )\ |
|
||||||
AS2( shr AS_REG_7d, 3 )\ |
|
||||||
AS2( ror edi, 7 )\ |
|
||||||
AS2( add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\ |
|
||||||
AS2( xor AS_REG_7d, edi )\ |
|
||||||
AS2( add edx, [Kt(i)])\ |
|
||||||
AS2( ror edi, 11 )\ |
|
||||||
AS2( add edx, H(i) )\ |
|
||||||
AS2( xor AS_REG_7d, edi )/* s0(W_t-15) */\ |
|
||||||
AS2( add AS_REG_7d, ebx )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) W_t-16*/\ |
|
||||||
AS2( mov [Wt(i)], AS_REG_7d)\ |
|
||||||
AS2( add edx, AS_REG_7d )\ |
|
||||||
|
|
||||||
#define ROUND(i, r, eax, ecx, edi, edx)\ |
|
||||||
/* in: edi = E */\ |
|
||||||
/* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\ |
|
||||||
AS2( mov edx, F(i) )\ |
|
||||||
AS2( xor edx, G(i) )\ |
|
||||||
AS2( and edx, edi )\ |
|
||||||
AS2( xor edx, G(i) )/* Ch(E,F,G) = (G^(E&(F^G))) */\ |
|
||||||
AS2( mov AS_REG_7d, edi )\ |
|
||||||
AS2( ror edi, 6 )\ |
|
||||||
AS2( ror AS_REG_7d, 25 )\ |
|
||||||
RA##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\ |
|
||||||
AS2( xor AS_REG_7d, edi )\ |
|
||||||
AS2( ror edi, 5 )\ |
|
||||||
AS2( xor AS_REG_7d, edi )/* S1(E) */\ |
|
||||||
AS2( add edx, AS_REG_7d )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\ |
|
||||||
RB##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\ |
|
||||||
/* in: ecx = A, eax = B^C, edx = T1 */\ |
|
||||||
/* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\ |
|
||||||
AS2( mov ebx, ecx )\ |
|
||||||
AS2( xor ecx, B(i) )/* A^B */\ |
|
||||||
AS2( and eax, ecx )\ |
|
||||||
AS2( xor eax, B(i) )/* Maj(A,B,C) = B^((A^B)&(B^C) */\ |
|
||||||
AS2( mov AS_REG_7d, ebx )\ |
|
||||||
AS2( ror ebx, 2 )\ |
|
||||||
AS2( add eax, edx )/* T1 + Maj(A,B,C) */\ |
|
||||||
AS2( add edx, D(i) )\ |
|
||||||
AS2( mov D(i), edx )\ |
|
||||||
AS2( ror AS_REG_7d, 22 )\ |
|
||||||
AS2( xor AS_REG_7d, ebx )\ |
|
||||||
AS2( ror ebx, 11 )\ |
|
||||||
AS2( xor AS_REG_7d, ebx )\ |
|
||||||
AS2( add eax, AS_REG_7d )/* T1 + S0(A) + Maj(A,B,C) */\ |
|
||||||
AS2( mov H(i), eax )\ |
|
||||||
|
|
||||||
#define SWAP_COPY(i) \ |
|
||||||
AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\ |
|
||||||
AS1( bswap WORD_REG(bx))\ |
|
||||||
AS2( mov [Wt(i*(1+CRYPTOPP_BOOL_X64)+CRYPTOPP_BOOL_X64)], WORD_REG(bx)) |
|
||||||
|
|
||||||
#if defined(__GNUC__) |
|
||||||
#if CRYPTOPP_BOOL_X64 |
|
||||||
FixedSizeAlignedSecBlock<byte, LOCALS_SIZE> workspace; |
|
||||||
#endif |
|
||||||
__asm__ __volatile__ |
|
||||||
( |
|
||||||
#if CRYPTOPP_BOOL_X64 |
|
||||||
"lea %4, %%r8;" |
|
||||||
#endif |
|
||||||
".intel_syntax noprefix;" |
|
||||||
#elif defined(CRYPTOPP_GENERATE_X64_MASM) |
|
||||||
ALIGN 8 |
|
||||||
X86_SHA256_HashBlocks PROC FRAME |
|
||||||
rex_push_reg rsi |
|
||||||
push_reg rdi |
|
||||||
push_reg rbx |
|
||||||
push_reg rbp |
|
||||||
alloc_stack(LOCALS_SIZE+8) |
|
||||||
.endprolog |
|
||||||
mov rdi, r8 |
|
||||||
lea rsi, [?SHA256_K@CryptoPP@@3QBIB + 48*4] |
|
||||||
#endif |
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_X86 |
|
||||||
#ifndef __GNUC__ |
|
||||||
AS2( mov edi, [len]) |
|
||||||
AS2( lea WORD_REG(si), [SHA256_K+48*4]) |
|
||||||
#endif |
|
||||||
#if !defined(_MSC_VER) || (_MSC_VER < 1400) |
|
||||||
AS_PUSH_IF86(bx) |
|
||||||
#endif |
|
||||||
|
|
||||||
AS_PUSH_IF86(bp) |
|
||||||
AS2( mov ebx, esp) |
|
||||||
AS2( and esp, -16) |
|
||||||
AS2( sub WORD_REG(sp), LOCALS_SIZE) |
|
||||||
AS_PUSH_IF86(bx) |
|
||||||
#endif |
|
||||||
AS2( mov STATE_SAVE, WORD_REG(cx)) |
|
||||||
AS2( mov DATA_SAVE, WORD_REG(dx)) |
|
||||||
AS2( lea WORD_REG(ax), [WORD_REG(di) + WORD_REG(dx)]) |
|
||||||
AS2( mov DATA_END, WORD_REG(ax)) |
|
||||||
AS2( mov K_END, WORD_REG(si)) |
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE |
|
||||||
#if CRYPTOPP_BOOL_X86 |
|
||||||
AS2( test edi, 1) |
|
||||||
ASJ( jnz, 2, f) |
|
||||||
AS1( dec DWORD PTR K_END) |
|
||||||
#endif |
|
||||||
AS2( movdqa xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16]) |
|
||||||
AS2( movdqa xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16]) |
|
||||||
#endif |
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_X86 |
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE |
|
||||||
ASJ( jmp, 0, f) |
|
||||||
#endif |
|
||||||
ASL(2) // non-SSE2
|
|
||||||
AS2( mov esi, ecx) |
|
||||||
AS2( lea edi, A(0)) |
|
||||||
AS2( mov ecx, 8) |
|
||||||
AS1( rep movsd) |
|
||||||
AS2( mov esi, K_END) |
|
||||||
ASJ( jmp, 3, f) |
|
||||||
#endif |
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE |
|
||||||
ASL(0) |
|
||||||
AS2( movdqa E(0), xmm1) |
|
||||||
AS2( movdqa A(0), xmm0) |
|
||||||
#endif |
|
||||||
#if CRYPTOPP_BOOL_X86 |
|
||||||
ASL(3) |
|
||||||
#endif |
|
||||||
AS2( sub WORD_REG(si), 48*4) |
|
||||||
SWAP_COPY(0) SWAP_COPY(1) SWAP_COPY(2) SWAP_COPY(3) |
|
||||||
SWAP_COPY(4) SWAP_COPY(5) SWAP_COPY(6) SWAP_COPY(7) |
|
||||||
#if CRYPTOPP_BOOL_X86 |
|
||||||
SWAP_COPY(8) SWAP_COPY(9) SWAP_COPY(10) SWAP_COPY(11) |
|
||||||
SWAP_COPY(12) SWAP_COPY(13) SWAP_COPY(14) SWAP_COPY(15) |
|
||||||
#endif |
|
||||||
AS2( mov edi, E(0)) // E
|
|
||||||
AS2( mov eax, B(0)) // B
|
|
||||||
AS2( xor eax, C(0)) // B^C
|
|
||||||
AS2( mov ecx, A(0)) // A
|
|
||||||
|
|
||||||
ROUND(0, 0, eax, ecx, edi, edx) |
|
||||||
ROUND(1, 0, ecx, eax, edx, edi) |
|
||||||
ROUND(2, 0, eax, ecx, edi, edx) |
|
||||||
ROUND(3, 0, ecx, eax, edx, edi) |
|
||||||
ROUND(4, 0, eax, ecx, edi, edx) |
|
||||||
ROUND(5, 0, ecx, eax, edx, edi) |
|
||||||
ROUND(6, 0, eax, ecx, edi, edx) |
|
||||||
ROUND(7, 0, ecx, eax, edx, edi) |
|
||||||
ROUND(8, 0, eax, ecx, edi, edx) |
|
||||||
ROUND(9, 0, ecx, eax, edx, edi) |
|
||||||
ROUND(10, 0, eax, ecx, edi, edx) |
|
||||||
ROUND(11, 0, ecx, eax, edx, edi) |
|
||||||
ROUND(12, 0, eax, ecx, edi, edx) |
|
||||||
ROUND(13, 0, ecx, eax, edx, edi) |
|
||||||
ROUND(14, 0, eax, ecx, edi, edx) |
|
||||||
ROUND(15, 0, ecx, eax, edx, edi) |
|
||||||
|
|
||||||
ASL(1) |
|
||||||
AS2(add WORD_REG(si), 4*16) |
|
||||||
ROUND(0, 1, eax, ecx, edi, edx) |
|
||||||
ROUND(1, 1, ecx, eax, edx, edi) |
|
||||||
ROUND(2, 1, eax, ecx, edi, edx) |
|
||||||
ROUND(3, 1, ecx, eax, edx, edi) |
|
||||||
ROUND(4, 1, eax, ecx, edi, edx) |
|
||||||
ROUND(5, 1, ecx, eax, edx, edi) |
|
||||||
ROUND(6, 1, eax, ecx, edi, edx) |
|
||||||
ROUND(7, 1, ecx, eax, edx, edi) |
|
||||||
ROUND(8, 1, eax, ecx, edi, edx) |
|
||||||
ROUND(9, 1, ecx, eax, edx, edi) |
|
||||||
ROUND(10, 1, eax, ecx, edi, edx) |
|
||||||
ROUND(11, 1, ecx, eax, edx, edi) |
|
||||||
ROUND(12, 1, eax, ecx, edi, edx) |
|
||||||
ROUND(13, 1, ecx, eax, edx, edi) |
|
||||||
ROUND(14, 1, eax, ecx, edi, edx) |
|
||||||
ROUND(15, 1, ecx, eax, edx, edi) |
|
||||||
AS2( cmp WORD_REG(si), K_END) |
|
||||||
ASJ( jb, 1, b) |
|
||||||
|
|
||||||
AS2( mov WORD_REG(dx), DATA_SAVE) |
|
||||||
AS2( add WORD_REG(dx), 64) |
|
||||||
AS2( mov AS_REG_7, STATE_SAVE) |
|
||||||
AS2( mov DATA_SAVE, WORD_REG(dx)) |
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE |
|
||||||
#if CRYPTOPP_BOOL_X86 |
|
||||||
AS2( test DWORD PTR K_END, 1) |
|
||||||
ASJ( jz, 4, f) |
|
||||||
#endif |
|
||||||
AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_7+1*16]) |
|
||||||
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_7+0*16]) |
|
||||||
AS2( paddd xmm1, E(0)) |
|
||||||
AS2( paddd xmm0, A(0)) |
|
||||||
AS2( movdqa [AS_REG_7+1*16], xmm1) |
|
||||||
AS2( movdqa [AS_REG_7+0*16], xmm0) |
|
||||||
AS2( cmp WORD_REG(dx), DATA_END) |
|
||||||
ASJ( jb, 0, b) |
|
||||||
#endif |
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_X86 |
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE |
|
||||||
ASJ( jmp, 5, f) |
|
||||||
ASL(4) // non-SSE2
|
|
||||||
#endif |
|
||||||
AS2( add [AS_REG_7+0*4], ecx) // A
|
|
||||||
AS2( add [AS_REG_7+4*4], edi) // E
|
|
||||||
AS2( mov eax, B(0)) |
|
||||||
AS2( mov ebx, C(0)) |
|
||||||
AS2( mov ecx, D(0)) |
|
||||||
AS2( add [AS_REG_7+1*4], eax) |
|
||||||
AS2( add [AS_REG_7+2*4], ebx) |
|
||||||
AS2( add [AS_REG_7+3*4], ecx) |
|
||||||
AS2( mov eax, F(0)) |
|
||||||
AS2( mov ebx, G(0)) |
|
||||||
AS2( mov ecx, H(0)) |
|
||||||
AS2( add [AS_REG_7+5*4], eax) |
|
||||||
AS2( add [AS_REG_7+6*4], ebx) |
|
||||||
AS2( add [AS_REG_7+7*4], ecx) |
|
||||||
AS2( mov ecx, AS_REG_7d) |
|
||||||
AS2( cmp WORD_REG(dx), DATA_END) |
|
||||||
ASJ( jb, 2, b) |
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE |
|
||||||
ASL(5) |
|
||||||
#endif |
|
||||||
#endif |
|
||||||
|
|
||||||
AS_POP_IF86(sp) |
|
||||||
AS_POP_IF86(bp) |
|
||||||
#if !defined(_MSC_VER) || (_MSC_VER < 1400) |
|
||||||
AS_POP_IF86(bx) |
|
||||||
#endif |
|
||||||
|
|
||||||
#ifdef CRYPTOPP_GENERATE_X64_MASM |
|
||||||
add rsp, LOCALS_SIZE+8 |
|
||||||
pop rbp |
|
||||||
pop rbx |
|
||||||
pop rdi |
|
||||||
pop rsi |
|
||||||
ret |
|
||||||
X86_SHA256_HashBlocks ENDP |
|
||||||
#endif |
|
||||||
|
|
||||||
#ifdef __GNUC__ |
|
||||||
".att_syntax prefix;" |
|
||||||
: |
|
||||||
: "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len) |
|
||||||
#if CRYPTOPP_BOOL_X64 |
|
||||||
, "m" (workspace[0]) |
|
||||||
#endif |
|
||||||
: "memory", "cc", "%eax" |
|
||||||
#if CRYPTOPP_BOOL_X64 |
|
||||||
, "%rbx", "%r8", "%r10" |
|
||||||
#endif |
|
||||||
); |
|
||||||
#endif |
|
||||||
} |
|
||||||
|
|
||||||
/* SSE2 capability probe: hard-wired to report that SSE2 is not available. */
static inline bool HasSSE2(void)
{
	return false;
}
|
||||||
|
|
||||||
/*
 * Run one 64-byte block through X86_SHA256_HashBlocks().
 *
 * Each of the 16 input words is passed through swab32() (presumably a
 * 32-bit byte swap -- confirm against its definition) into a scratch
 * buffer, and that buffer is then hashed into `state`.
 */
static void SHA256_Transform32(word32 *state, const word32 *data)
{
	word32 swapped[16];
	int idx = 0;

	while (idx < 16) {
		swapped[idx] = swab32(((word32 *)(data))[idx]);
		idx++;
	}

	/* 16 words * 4 bytes = exactly one block */
	X86_SHA256_HashBlocks(state, swapped, 16 * 4);
}
|
||||||
|
|
||||||
static void runhash32(void *state, const void *input, const void *init) |
|
||||||
{ |
|
||||||
memcpy(state, init, 32); |
|
||||||
SHA256_Transform32(state, input); |
|
||||||
} |
|
||||||
|
|
||||||
/* suspiciously similar to ScanHash* from bitcoin */ |
|
||||||
bool scanhash_asm32(struct thr_info*thr, const unsigned char *midstate, |
|
||||||
unsigned char *data, |
|
||||||
unsigned char *hash1, unsigned char *hash, |
|
||||||
const unsigned char *target, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t n) |
|
||||||
{ |
|
||||||
uint32_t *hash32 = (uint32_t *) hash; |
|
||||||
uint32_t *nonce = (uint32_t *)(data + 76); |
|
||||||
|
|
||||||
data += 64; |
|
||||||
|
|
||||||
while (1) { |
|
||||||
n++; |
|
||||||
*nonce = n; |
|
||||||
|
|
||||||
runhash32(hash1, data, midstate); |
|
||||||
runhash32(hash, hash1, sha256_init_state); |
|
||||||
|
|
||||||
if (unlikely((hash32[7] == 0) && fulltest(hash, target))) { |
|
||||||
*last_nonce = n; |
|
||||||
return true; |
|
||||||
} |
|
||||||
|
|
||||||
if ((n >= max_nonce) || thr->work_restart) { |
|
||||||
*last_nonce = n; |
|
||||||
return false; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
#endif // #if defined(WANT_CRYPTOPP_ASM32)
|
|
@ -1,274 +0,0 @@ |
|||||||
/*
|
|
||||||
* Cryptographic API. |
|
||||||
* |
|
||||||
* SHA-256, as specified in |
|
||||||
* http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
|
|
||||||
* |
|
||||||
* SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. |
|
||||||
* |
|
||||||
* Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> |
|
||||||
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> |
|
||||||
* Copyright (c) 2002 James Morris <jmorris@intercode.com.au> |
|
||||||
* SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> |
|
||||||
* |
|
||||||
* This program is free software; you can redistribute it and/or modify it |
|
||||||
* under the terms of the GNU General Public License as published by the Free |
|
||||||
* Software Foundation; either version 2 of the License, or (at your option) |
|
||||||
* any later version. |
|
||||||
* |
|
||||||
*/ |
|
||||||
|
|
||||||
#include "config.h" |
|
||||||
|
|
||||||
#include <stdint.h> |
|
||||||
#include <stdbool.h> |
|
||||||
#include <stdlib.h> |
|
||||||
#include <string.h> |
|
||||||
#include "miner.h" |
|
||||||
|
|
||||||
/* Short aliases used throughout this file. */
typedef uint32_t u32;
typedef uint8_t u8;

/*
 * Rotate `word` right by `shift` bits.
 *
 * Both shift counts are reduced modulo 32, so a rotation by 0 (or by a
 * multiple of 32) returns `word` unchanged instead of evaluating
 * `word << 32`, which is undefined behaviour in C.  For shifts in 1..31 the
 * result is the same as the plain two-shift formulation.
 */
static inline u32 ror32(u32 word, unsigned int shift)
{
	shift &= 31;
	return (word >> shift) | (word << ((32 - shift) & 31));
}
|
||||||
|
|
||||||
/*
 * Bitwise "choose": for every bit position, take the bit from y where x is
 * set and the bit from z where x is clear.
 */
static inline u32 Ch(u32 x, u32 y, u32 z)
{
	u32 differing = y ^ z;

	return (differing & x) ^ z;
}
|
||||||
|
|
||||||
/*
 * Bitwise "majority": a result bit is set when at least two of the
 * corresponding bits of x, y and z are set.
 */
static inline u32 Maj(u32 x, u32 y, u32 z)
{
	u32 both = x & y;
	u32 either = x | y;

	return both | (either & z);
}
|
||||||
|
|
||||||
/*
 * SHA-256 mixing functions built on ror32(): e0/e1 XOR three rotations,
 * s0/s1 XOR two rotations and one plain right shift.
 *
 * The argument is fully parenthesised so that compound expressions expand
 * correctly.  Note that it is still evaluated three times, so it must not
 * have side effects.
 */
#define e0(x)       (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
#define e1(x)       (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
#define s0(x)       (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
#define s1(x)       (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))
|
||||||
|
|
||||||
/*
 * Copy the I-th 32-bit word of `input` into W[I].
 *
 * No byte swap is applied: per the original author's note, bitcoin input is
 * already big-endian.  The word is fetched with memcpy() rather than by
 * casting `input` to u32 *, which avoids discarding the const qualifier and
 * avoids a misaligned / type-punned load from a byte buffer.
 */
static inline void LOAD_OP(int I, u32 *W, const u8 *input)
{
	u32 word;

	memcpy(&word, input + (size_t)I * sizeof(word), sizeof(word));
	W[I] = word;
}
|
||||||
|
|
||||||
/*
 * Message-schedule expansion step: derive W[I] from the entries 2, 7, 15
 * and 16 positions earlier.  Only meaningful for I >= 16.
 */
static inline void BLEND_OP(int I, u32 *W)
{
	const u32 w2 = W[I - 2];
	const u32 w7 = W[I - 7];
	const u32 w15 = W[I - 15];
	const u32 w16 = W[I - 16];

	W[I] = s1(w2) + w7 + s0(w15) + w16;
}
|
||||||
|
|
||||||
/*
 * One SHA-256 round.  The eight working variables are passed in their
 * current logical order (A..H); the round updates D and H in place, so the
 * caller rotates the argument list by one position for the next round
 * instead of shuffling values between variables.  Uses the caller's t1/t2.
 */
#define SHA256_ROUND(A, B, C, D, E, F, G, H, K, Wt) do {	\
	t1 = (H) + e1(E) + Ch(E, F, G) + (K) + (Wt);		\
	t2 = e0(A) + Maj(A, B, C);				\
	(D) += t1;						\
	(H) = t1 + t2;						\
} while (0)

/*
 * SHA-256 compression function: mixes one 64-byte block at `input` into the
 * eight-word chaining value `state`.
 *
 * The 16 input words are loaded by LOAD_OP() (no byte swap is applied) and
 * expanded to 64 schedule words by BLEND_OP().  The 64 rounds are processed
 * eight at a time; after each group of eight the variable roles are back in
 * their starting order.  The schedule and working variables are not wiped
 * afterwards.
 */
static void sha256_transform(u32 *state, const u8 *input)
{
	/* per-round additive constants, in round order */
	static const u32 round_k[64] = {
		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	};
	u32 a, b, c, d, e, f, g, h, t1, t2;
	u32 W[64];
	int i;

	/* message schedule: 16 loaded words, then 48 derived ones */
	for (i = 0; i < 16; i++)
		LOAD_OP(i, W, input);
	for (i = 16; i < 64; i++)
		BLEND_OP(i, W);

	/* working variables start from the current chaining value */
	a = state[0];
	b = state[1];
	c = state[2];
	d = state[3];
	e = state[4];
	f = state[5];
	g = state[6];
	h = state[7];

	for (i = 0; i < 64; i += 8) {
		SHA256_ROUND(a, b, c, d, e, f, g, h, round_k[i + 0], W[i + 0]);
		SHA256_ROUND(h, a, b, c, d, e, f, g, round_k[i + 1], W[i + 1]);
		SHA256_ROUND(g, h, a, b, c, d, e, f, round_k[i + 2], W[i + 2]);
		SHA256_ROUND(f, g, h, a, b, c, d, e, round_k[i + 3], W[i + 3]);
		SHA256_ROUND(e, f, g, h, a, b, c, d, round_k[i + 4], W[i + 4]);
		SHA256_ROUND(d, e, f, g, h, a, b, c, round_k[i + 5], W[i + 5]);
		SHA256_ROUND(c, d, e, f, g, h, a, b, round_k[i + 6], W[i + 6]);
		SHA256_ROUND(b, c, d, e, f, g, h, a, round_k[i + 7], W[i + 7]);
	}

	/* fold the result back into the chaining value */
	state[0] += a;
	state[1] += b;
	state[2] += c;
	state[3] += d;
	state[4] += e;
	state[5] += f;
	state[6] += g;
	state[7] += h;
}

#undef SHA256_ROUND
|
||||||
|
|
||||||
static void runhash(void *state, const void *input, const void *init) |
|
||||||
{ |
|
||||||
memcpy(state, init, 32); |
|
||||||
sha256_transform(state, input); |
|
||||||
} |
|
||||||
|
|
||||||
/* Initial eight-word chaining value used for the second hashing pass. */
const uint32_t sha256_init_state[8] = {
	0x6a09e667,
	0xbb67ae85,
	0x3c6ef372,
	0xa54ff53a,
	0x510e527f,
	0x9b05688c,
	0x1f83d9ab,
	0x5be0cd19
};
|
||||||
|
|
||||||
/* suspiciously similar to ScanHash* from bitcoin */ |
|
||||||
bool scanhash_c(struct thr_info*thr, const unsigned char *midstate, unsigned char *data, |
|
||||||
unsigned char *hash1, unsigned char *hash, |
|
||||||
const unsigned char *target, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t n) |
|
||||||
{ |
|
||||||
uint32_t *hash32 = (uint32_t *) hash; |
|
||||||
uint32_t *nonce = (uint32_t *)(data + 76); |
|
||||||
unsigned long stat_ctr = 0; |
|
||||||
|
|
||||||
data += 64; |
|
||||||
|
|
||||||
while (1) { |
|
||||||
n++; |
|
||||||
*nonce = n; |
|
||||||
|
|
||||||
runhash(hash1, data, midstate); |
|
||||||
runhash(hash, hash1, sha256_init_state); |
|
||||||
|
|
||||||
stat_ctr++; |
|
||||||
|
|
||||||
if (unlikely((hash32[7] == 0) && fulltest(hash, target))) { |
|
||||||
*last_nonce = n; |
|
||||||
return true; |
|
||||||
} |
|
||||||
|
|
||||||
if ((n >= max_nonce) || thr->work_restart) { |
|
||||||
*last_nonce = n; |
|
||||||
return false; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
@ -1,133 +0,0 @@ |
|||||||
/*
|
|
||||||
* SHA-256 driver for ASM routine for x86_64 on Linux |
|
||||||
* Copyright (c) Mark Crichton <crichton@gimp.org> |
|
||||||
* |
|
||||||
* This program is free software; you can redistribute it and/or modify it |
|
||||||
* under the terms of the GNU General Public License as published by the Free |
|
||||||
* Software Foundation; either version 2 of the License, or (at your option) |
|
||||||
* any later version. |
|
||||||
* |
|
||||||
*/ |
|
||||||
|
|
||||||
#include "driver-cpu.h" |
|
||||||
|
|
||||||
#ifdef WANT_X8664_SSE2 |
|
||||||
|
|
||||||
#include <string.h> |
|
||||||
#include <assert.h> |
|
||||||
|
|
||||||
#include <xmmintrin.h> |
|
||||||
#include <stdint.h> |
|
||||||
#include <stdio.h> |
|
||||||
|
|
||||||
extern void sha256_sse2_64_new (__m128i *res, __m128i *res1, __m128i *data, const uint32_t init[8]); |
|
||||||
|
|
||||||
/*
 * The 64 SHA-256 round constants, in round order.  The table is only ever
 * read in this file (it is broadcast into sha256_consts_m128i), so it is
 * declared const.
 */
static const uint32_t g_sha256_k[] __attribute__((aligned(0x100))) = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /*  0 */
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /*  8 */
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
|
||||||
|
|
||||||
|
|
||||||
/* Eight-word SHA-256 initial chaining value, exported with 256-byte alignment. */
const uint32_t sha256_init[8] __attribute__((aligned(0x100))) = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
|
||||||
|
|
||||||
/* Not referenced by the C code in this file; presumably used elsewhere -- TODO confirm. */
__m128i g_4sha256_k[64];

/*
 * Round constants with each value replicated across all four 32-bit lanes.
 * Filled in by scanhash_sse2_64(); presumably read by the assembly routine
 * -- TODO confirm.
 */
__m128i sha256_consts_m128i[64] __attribute__((aligned(0x1000)));
|
||||||
|
|
||||||
bool scanhash_sse2_64(struct thr_info*thr, const unsigned char *pmidstate, |
|
||||||
unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t nonce) |
|
||||||
{ |
|
||||||
uint32_t *nNonce_p = (uint32_t *)(pdata + 76); |
|
||||||
uint32_t m_midstate[8], m_w[16], m_w1[16]; |
|
||||||
__m128i m_4w[64] __attribute__ ((aligned (0x100))); |
|
||||||
__m128i m_4hash[64] __attribute__ ((aligned (0x100))); |
|
||||||
__m128i m_4hash1[64] __attribute__ ((aligned (0x100))); |
|
||||||
__m128i offset; |
|
||||||
int i; |
|
||||||
|
|
||||||
pdata += 64; |
|
||||||
|
|
||||||
/* For debugging */ |
|
||||||
union { |
|
||||||
__m128i m; |
|
||||||
uint32_t i[4]; |
|
||||||
} mi; |
|
||||||
|
|
||||||
/* Message expansion */ |
|
||||||
memcpy(m_midstate, pmidstate, sizeof(m_midstate)); |
|
||||||
memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ |
|
||||||
memcpy(m_w1, phash1, sizeof(m_w1)); |
|
||||||
memset(m_4hash, 0, sizeof(m_4hash)); |
|
||||||
|
|
||||||
/* Transmongrify */ |
|
||||||
for (i = 0; i < 16; i++) |
|
||||||
m_4w[i] = _mm_set1_epi32(m_w[i]); |
|
||||||
|
|
||||||
for (i = 0; i < 16; i++) |
|
||||||
m_4hash1[i] = _mm_set1_epi32(m_w1[i]); |
|
||||||
|
|
||||||
for (i = 0; i < 64; i++) |
|
||||||
sha256_consts_m128i[i] = _mm_set1_epi32(g_sha256_k[i]); |
|
||||||
|
|
||||||
offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0); |
|
||||||
|
|
||||||
for (;;) |
|
||||||
{ |
|
||||||
int j; |
|
||||||
|
|
||||||
m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce)); |
|
||||||
|
|
||||||
sha256_sse2_64_new (m_4hash, m_4hash1, m_4w, m_midstate); |
|
||||||
|
|
||||||
for (j = 0; j < 4; j++) { |
|
||||||
mi.m = m_4hash[7]; |
|
||||||
if (unlikely(mi.i[j] == 0)) |
|
||||||
break; |
|
||||||
} |
|
||||||
|
|
||||||
/* If j = true, we found a hit...so check it */ |
|
||||||
/* Use the C version for a check... */ |
|
||||||
if (unlikely(j != 4)) { |
|
||||||
for (i = 0; i < 8; i++) { |
|
||||||
mi.m = m_4hash[i]; |
|
||||||
*(uint32_t *)&(phash)[i*4] = mi.i[j]; |
|
||||||
} |
|
||||||
|
|
||||||
if (fulltest(phash, ptarget)) { |
|
||||||
nonce += j; |
|
||||||
*last_nonce = nonce + 1; |
|
||||||
*nNonce_p = nonce; |
|
||||||
return true; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
if (unlikely((nonce >= max_nonce) || thr->work_restart)) |
|
||||||
{ |
|
||||||
*last_nonce = nonce; |
|
||||||
return false; |
|
||||||
} |
|
||||||
|
|
||||||
nonce += 4; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
#endif /* WANT_X8664_SSE2 */ |
|
||||||
|
|
@ -1,125 +0,0 @@ |
|||||||
/*
|
|
||||||
 * SHA-256 driver for ASM routine for 32-bit x86 on Linux |
|
||||||
* Copyright (c) Mark Crichton <crichton@gimp.org> |
|
||||||
* |
|
||||||
* This program is free software; you can redistribute it and/or modify it |
|
||||||
* under the terms of the GNU General Public License as published by the Free |
|
||||||
* Software Foundation; either version 2 of the License, or (at your option) |
|
||||||
* any later version. |
|
||||||
* |
|
||||||
*/ |
|
||||||
|
|
||||||
#include "driver-cpu.h" |
|
||||||
|
|
||||||
#ifdef WANT_X8632_SSE2 |
|
||||||
|
|
||||||
#include <string.h> |
|
||||||
#include <assert.h> |
|
||||||
|
|
||||||
#include <xmmintrin.h> |
|
||||||
#include <stdint.h> |
|
||||||
#include <stdio.h> |
|
||||||
|
|
||||||
extern void CalcSha256_x86 (__m128i *res, __m128i *data, const uint32_t init[8])__attribute__((fastcall)); |
|
||||||
|
|
||||||
/*
 * The 64 SHA-256 round constants, in round order.  The table is only ever
 * read in this file (it is broadcast into sha256_consts_m128i), so it is
 * declared const.
 */
static const uint32_t g_sha256_k[] __attribute__((aligned(0x100))) = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /*  0 */
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /*  8 */
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
|
||||||
|
|
||||||
|
|
||||||
/*
 * Eight-word SHA-256 initial chaining value (256-byte aligned); passed as
 * the starting state of the second hashing pass in scanhash_sse2_32().
 */
const uint32_t sha256_32init[8] __attribute__((aligned(0x100))) = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
|
||||||
|
|
||||||
/* Not referenced by the C code in this file; presumably used elsewhere -- TODO confirm. */
__m128i g_4sha256_k[64];

/*
 * Round constants with each value replicated across all four 32-bit lanes.
 * Filled in by scanhash_sse2_32(); presumably read by the assembly routine
 * -- TODO confirm.
 */
__m128i sha256_consts_m128i[64] __attribute__((aligned(0x1000)));
|
||||||
|
|
||||||
bool scanhash_sse2_32(struct thr_info*thr, const unsigned char *pmidstate, |
|
||||||
unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t nonce) |
|
||||||
{ |
|
||||||
uint32_t *nNonce_p = (uint32_t *)(pdata + 76); |
|
||||||
uint32_t m_midstate[8], m_w[16], m_w1[16]; |
|
||||||
__m128i m_4w[64] __attribute__ ((aligned (0x100))); |
|
||||||
__m128i m_4hash[64] __attribute__ ((aligned (0x100))); |
|
||||||
__m128i m_4hash1[64] __attribute__ ((aligned (0x100))); |
|
||||||
__m128i offset; |
|
||||||
int i; |
|
||||||
|
|
||||||
pdata += 64; |
|
||||||
|
|
||||||
/* Message expansion */ |
|
||||||
memcpy(m_midstate, pmidstate, sizeof(m_midstate)); |
|
||||||
memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ |
|
||||||
memcpy(m_w1, phash1, sizeof(m_w1)); |
|
||||||
memset(m_4hash, 0, sizeof(m_4hash)); |
|
||||||
|
|
||||||
/* Transmongrify */ |
|
||||||
for (i = 0; i < 16; i++) |
|
||||||
m_4w[i] = _mm_set1_epi32(m_w[i]); |
|
||||||
|
|
||||||
for (i = 0; i < 16; i++) |
|
||||||
m_4hash1[i] = _mm_set1_epi32(m_w1[i]); |
|
||||||
|
|
||||||
for (i = 0; i < 64; i++) |
|
||||||
sha256_consts_m128i[i] = _mm_set1_epi32(g_sha256_k[i]); |
|
||||||
|
|
||||||
offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0); |
|
||||||
|
|
||||||
for (;;) |
|
||||||
{ |
|
||||||
int j; |
|
||||||
|
|
||||||
m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce)); |
|
||||||
|
|
||||||
/* Some optimization can be done here W.R.T. precalculating some hash */ |
|
||||||
CalcSha256_x86 (m_4hash1, m_4w, m_midstate); |
|
||||||
CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init); |
|
||||||
|
|
||||||
for (j = 0; j < 4; j++) { |
|
||||||
if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) { |
|
||||||
/* We found a hit...so check it */ |
|
||||||
/* Use the C version for a check... */ |
|
||||||
|
|
||||||
for (i = 0; i < 8; i++) { |
|
||||||
*(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j]; |
|
||||||
} |
|
||||||
|
|
||||||
if (fulltest(phash, ptarget)) { |
|
||||||
nonce += j; |
|
||||||
*last_nonce = nonce; |
|
||||||
*nNonce_p = nonce; |
|
||||||
return true; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
if (unlikely((nonce >= max_nonce) || thr->work_restart)) { |
|
||||||
*last_nonce = nonce; |
|
||||||
return false; |
|
||||||
} |
|
||||||
|
|
||||||
nonce += 4; |
|
||||||
|
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
#endif /* WANT_X8632_SSE2 */ |
|
||||||
|
|
@ -1,132 +0,0 @@ |
|||||||
/*
|
|
||||||
* SHA-256 driver for ASM routine for x86_64 on Linux |
|
||||||
* Copyright (c) Mark Crichton <crichton@gimp.org> |
|
||||||
* |
|
||||||
* This program is free software; you can redistribute it and/or modify it |
|
||||||
* under the terms of the GNU General Public License as published by the Free |
|
||||||
* Software Foundation; either version 2 of the License, or (at your option) |
|
||||||
* any later version. |
|
||||||
* |
|
||||||
*/ |
|
||||||
|
|
||||||
#include "driver-cpu.h" |
|
||||||
|
|
||||||
#ifdef WANT_X8664_SSE4 |
|
||||||
|
|
||||||
#include <string.h> |
|
||||||
#include <assert.h> |
|
||||||
|
|
||||||
#include <xmmintrin.h> |
|
||||||
#include <stdint.h> |
|
||||||
#include <stdio.h> |
|
||||||
|
|
||||||
extern void CalcSha256_x64_sse4(__m128i *res, __m128i *data, uint32_t init[8]); |
|
||||||
|
|
||||||
/*
 * The 64 SHA-256 round constants, in round order.  The table is only ever
 * read in this file (it is broadcast into g_4sha256_k), so it is declared
 * const.
 */
static const uint32_t g_sha256_k[] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /*  0 */
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /*  8 */
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
|
||||||
|
|
||||||
|
|
||||||
/*
 * Eight-word SHA-256 initial chaining value.  Left non-const because it is
 * passed to CalcSha256_x64_sse4(), whose `init` parameter is not
 * const-qualified.
 */
static uint32_t g_sha256_hinit[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
|
||||||
|
|
||||||
/*
 * Round constants with each value replicated across all four 32-bit lanes.
 * Filled in by scanhash_sse4_64(); presumably read by the assembly routine
 * -- TODO confirm.
 */
__m128i g_4sha256_k[64];
|
||||||
|
|
||||||
bool scanhash_sse4_64(struct thr_info*thr, const unsigned char *pmidstate, |
|
||||||
unsigned char *pdata, |
|
||||||
unsigned char *phash1, unsigned char *phash, |
|
||||||
const unsigned char *ptarget, |
|
||||||
uint32_t max_nonce, uint32_t *last_nonce, |
|
||||||
uint32_t nonce) |
|
||||||
{ |
|
||||||
uint32_t *nNonce_p = (uint32_t *)(pdata + 76); |
|
||||||
uint32_t m_midstate[8], m_w[16], m_w1[16]; |
|
||||||
__m128i m_4w[64], m_4hash[64], m_4hash1[64]; |
|
||||||
__m128i offset; |
|
||||||
int i; |
|
||||||
|
|
||||||
pdata += 64; |
|
||||||
|
|
||||||
/* For debugging */ |
|
||||||
union { |
|
||||||
__m128i m; |
|
||||||
uint32_t i[4]; |
|
||||||
} mi; |
|
||||||
|
|
||||||
/* Message expansion */ |
|
||||||
memcpy(m_midstate, pmidstate, sizeof(m_midstate)); |
|
||||||
memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ |
|
||||||
memcpy(m_w1, phash1, sizeof(m_w1)); |
|
||||||
memset(m_4hash, 0, sizeof(m_4hash)); |
|
||||||
|
|
||||||
/* Transmongrify */ |
|
||||||
for (i = 0; i < 16; i++) |
|
||||||
m_4w[i] = _mm_set1_epi32(m_w[i]); |
|
||||||
|
|
||||||
for (i = 0; i < 16; i++) |
|
||||||
m_4hash1[i] = _mm_set1_epi32(m_w1[i]); |
|
||||||
|
|
||||||
for (i = 0; i < 64; i++) |
|
||||||
g_4sha256_k[i] = _mm_set1_epi32(g_sha256_k[i]); |
|
||||||
|
|
||||||
offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0); |
|
||||||
|
|
||||||
for (;;) |
|
||||||
{ |
|
||||||
int j; |
|
||||||
|
|
||||||
m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce)); |
|
||||||
|
|
||||||
/* Some optimization can be done here W.R.T. precalculating some hash */ |
|
||||||
CalcSha256_x64_sse4(m_4hash1, m_4w, m_midstate); |
|
||||||
CalcSha256_x64_sse4(m_4hash, m_4hash1, g_sha256_hinit); |
|
||||||
|
|
||||||
for (j = 0; j < 4; j++) { |
|
||||||
mi.m = m_4hash[7]; |
|
||||||
if (unlikely(mi.i[j] == 0)) |
|
||||||
break; |
|
||||||
} |
|
||||||
|
|
||||||
/* If j = true, we found a hit...so check it */ |
|
||||||
/* Use the C version for a check... */ |
|
||||||
if (unlikely(j != 4)) { |
|
||||||
for (i = 0; i < 8; i++) { |
|
||||||
mi.m = m_4hash[i]; |
|
||||||
*(uint32_t *)&(phash)[i*4] = mi.i[j]; |
|
||||||
} |
|
||||||
|
|
||||||
if (fulltest(phash, ptarget)) { |
|
||||||
nonce += j; |
|
||||||
*last_nonce = nonce; |
|
||||||
*nNonce_p = nonce; |
|
||||||
return true; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
if (unlikely((nonce >= max_nonce) || thr->work_restart)) |
|
||||||
{ |
|
||||||
*last_nonce = nonce; |
|
||||||
return false; |
|
||||||
} |
|
||||||
|
|
||||||
nonce += 4; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
#endif /* WANT_X8664_SSE4 */ |
|
||||||
|
|
@ -1,85 +0,0 @@ |
|||||||
|
|
||||||
#include "driver-cpu.h" |
|
||||||
|
|
||||||
#include <stdint.h> |
|
||||||
#include <stdlib.h> |
|
||||||
#include <string.h> |
|
||||||
#include <stdio.h> |
|
||||||
#include <sys/time.h> |
|
||||||
#include "miner.h" |
|
||||||
|
|
||||||
#ifdef WANT_VIA_PADLOCK |
|
||||||
|
|
||||||
/*
 * Issue the VIA PadLock hardware SHA-256 instruction, emitted as raw opcode
 * bytes (f3 0f a6 d0, i.e. "rep xsha256").
 *
 * hash - digest buffer, passed in the "D" register; both callers in this
 *        file preload it with sha256_init_state before the call
 * buf  - input data, passed in the "S" register (declared read/write, so the
 *        instruction may advance it)
 * len  - passed in the "c" register; NOTE(review): presumably the input
 *        length in bytes - confirm against the PadLock documentation
 *
 * The "memory" clobber tells the compiler the instruction reads and writes
 * memory that is not visible through the operand list.
 */
static void via_sha256(void *hash, void *buf, unsigned len)
{
	/* "a" register is zeroed on entry; its value afterwards is ignored */
	unsigned stat = 0;
	asm volatile(".byte 0xf3, 0x0f, 0xa6, 0xd0"
		     :"+S"(buf), "+a"(stat)
		     :"c"(len), "D" (hash)
		     :"memory");
}
|
||||||
|
|
||||||
/*
 * Scan nonces using the VIA PadLock SHA-256 instruction.
 *
 * data_inout - 128 bytes of work data; on success it is rewritten with the
 *              winning nonce in place
 * target     - target passed to fulltest()
 * max_nonce  - scanning stops once the nonce reaches this value
 * last_nonce - receives the last nonce tried
 * n          - nonce before the first one tried (it is incremented before use)
 *
 * pmidstate, phash1 and phash are unused.
 *
 * Returns true when a hash passes fulltest(), false when max_nonce is reached
 * or thr->work_restart becomes set.
 */
bool scanhash_via(struct thr_info*thr, const unsigned char __maybe_unused *pmidstate,
	unsigned char *data_inout,
	unsigned char __maybe_unused *phash1, unsigned char __maybe_unused *phash,
	const unsigned char *target,
	uint32_t max_nonce, uint32_t *last_nonce,
	uint32_t n)
{
	unsigned char data[128] __attribute__((aligned(128)));
	unsigned char tmp_hash[32] __attribute__((aligned(128)));
	unsigned char tmp_hash1[32] __attribute__((aligned(128)));
	uint32_t *data32 = (uint32_t *) data;
	uint32_t *dout32 = (uint32_t *) data_inout;
	uint32_t *hash32 = (uint32_t *) tmp_hash;
	uint32_t *hash1_32 = (uint32_t *) tmp_hash1;
	uint32_t *nonce = (uint32_t *)(data + 64 + 12);
	int i;

	/* bitcoin gives us big endian input, but via wants LE,
	 * so we reverse the swapping bitcoin has already done (extra work)
	 * in order to permit the hardware to swap everything
	 * back to BE again (extra work).
	 */
	for (i = 0; i < 128/4; i++)
		data32[i] = swab32(dout32[i]);

	while (1) {
		n++;
		*nonce = n;

		/* first SHA256 transform */
		memcpy(tmp_hash1, sha256_init_state, 32);
		via_sha256(tmp_hash1, data, 80);	/* or maybe 128? */

		/* byte-swap the intermediate digest before it is hashed again */
		for (i = 0; i < 32/4; i++)
			hash1_32[i] = swab32(hash1_32[i]);

		/* second SHA256 transform */
		memcpy(tmp_hash, sha256_init_state, 32);
		via_sha256(tmp_hash, tmp_hash1, 32);

		/* cheap filter on the last hash word before the full comparison */
		if (unlikely((hash32[7] == 0) && fulltest(tmp_hash, target))) {
			/* swap nonce'd data back into original storage area;
			 * TODO: only swap back the nonce, rather than all data
			 */
			for (i = 0; i < 128/4; i++)
				dout32[i] = swab32(data32[i]);

			*last_nonce = n;
			return true;
		}

		if ((n >= max_nonce) || thr->work_restart) {
			*last_nonce = n;
			return false;
		}
	}
}
|
||||||
|
|
||||||
#endif /* WANT_VIA_PADLOCK */ |
|
||||||
|
|
@ -1,8 +0,0 @@ |
|||||||
# Convenience library (not installed) holding the 32-bit x86 SHA-256 kernel.
noinst_LIBRARIES = libx8632.a

# Teach automake about the .asm extension.
SUFFIXES = .asm

libx8632_a_SOURCES = sha256_xmm.asm

# Assemble with yasm.  The output file is named explicitly so the object is
# written to the path make asked for rather than one yasm derives from the
# source name; this matches the rule used for the x86-64 library.
.asm.o:
	$(YASM) -f $(YASM_FMT) -o $@ $<
|
@ -1,259 +0,0 @@ |
|||||||
;; SHA-256 for X86 for Linux, based off of: |
|
||||||
|
|
||||||
; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com |
|
||||||
; Version 2011 |
|
||||||
; This software is Public Domain |
|
||||||
|
|
||||||
; SHA-256 CPU SSE cruncher for Bitcoin Miner |
|
||||||
|
|
||||||
ALIGN 32
BITS 32

; Register roles used throughout the routine
%define hash ecx	; output array (also used as scratch for f, g, h)
%define data edx	; message schedule W[], 16 bytes (four lanes) per entry
%define init esi	; starting state, eight 32-bit words

; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
; i.e. the 48 schedule entries W[16..63] are produced by exactly
; LAB_CALC_UNROLL expansions of LAB_CALC_PARA entries each.
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 24

; All 64 rounds are unrolled.
%define LAB_LOOP_UNROLL 64

extern _sha256_consts_m128i

global $@CalcSha256_x86@12
; CalcSha256 hash(ecx), data(edx), init([esp+4])
@CalcSha256_x86@12:
	push esi
	push edi
	mov init, [esp+12]		; stack argument, now 8 bytes deeper after the two pushes

LAB_SHA:
	lea edi, qword [data+256]	; + 256: edi = &W[16], first entry to be computed

LAB_CALC:
|
||||||
; lab_calc_blk n
; Computes two consecutive message-schedule entries, W[I] and W[I+1] with
; I = 16+n, for all four lanes at once:
;   W[I] = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
; edi points at W[16]; each entry is 16 bytes.  The two computations are
; interleaved (xmm0-xmm3 for W[I], xmm4-xmm7 for W[I+1]).  SSE2 has no rotate,
; so each rotate is assembled from a right shift and a left shift.
%macro lab_calc_blk 1
	movdqa xmm0, [edi-(15-%1)*16]		; xmm0 = W[I-15]
	movdqa xmm4, [edi-(15-(%1+1))*16]	; xmm4 = W[I-15+1]
	movdqa xmm2, xmm0			; xmm2 = W[I-15]
	movdqa xmm6, xmm4			; xmm6 = W[I-15+1]
	psrld xmm0, 3				; xmm0 = W[I-15] >> 3
	psrld xmm4, 3				; xmm4 = W[I-15+1] >> 3
	movdqa xmm1, xmm0			; xmm1 = W[I-15] >> 3
	movdqa xmm5, xmm4			; xmm5 = W[I-15+1] >> 3
	pslld xmm2, 14				; xmm2 = W[I-15] << 14
	pslld xmm6, 14				; xmm6 = W[I-15+1] << 14
	psrld xmm1, 4				; xmm1 = W[I-15] >> 7
	psrld xmm5, 4				; xmm5 = W[I-15+1] >> 7
	pxor xmm0, xmm1				; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
	pxor xmm4, xmm5				; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
	psrld xmm1, 11				; xmm1 = W[I-15] >> 18
	psrld xmm5, 11				; xmm5 = W[I-15+1] >> 18
	pxor xmm0, xmm2				; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
	pxor xmm4, xmm6				; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
	pslld xmm2, 11				; xmm2 = W[I-15] << 25
	pslld xmm6, 11				; xmm6 = W[I-15+1] << 25
	pxor xmm0, xmm1				; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
	pxor xmm4, xmm5				; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
	pxor xmm0, xmm2				; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
	pxor xmm4, xmm6				; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)

	movdqa xmm3, [edi-(2-%1)*16]		; xmm3 = W[I-2]
	movdqa xmm7, [edi-(2-(%1+1))*16]	; xmm7 = W[I-2+1]

	paddd xmm0, [edi-(16-%1)*16]		; xmm0 = s0(W[I-15]) + W[I-16]
	paddd xmm4, [edi-(16-(%1+1))*16]	; xmm4 = s0(W[I-15+1]) + W[I-16+1]

;;;;;;;;;;;;;;;;;;

	movdqa xmm2, xmm3			; xmm2 = W[I-2]
	movdqa xmm6, xmm7			; xmm6 = W[I-2+1]
	psrld xmm3, 10				; xmm3 = W[I-2] >> 10
	psrld xmm7, 10				; xmm7 = W[I-2+1] >> 10
	movdqa xmm1, xmm3			; xmm1 = W[I-2] >> 10
	movdqa xmm5, xmm7			; xmm5 = W[I-2+1] >> 10

	paddd xmm0, [edi-(7-%1)*16]		; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]

	pslld xmm2, 13				; xmm2 = W[I-2] << 13
	pslld xmm6, 13				; xmm6 = W[I-2+1] << 13
	psrld xmm1, 7				; xmm1 = W[I-2] >> 17
	psrld xmm5, 7				; xmm5 = W[I-2+1] >> 17

	paddd xmm4, [edi-(7-(%1+1))*16]		; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]

	pxor xmm3, xmm1				; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
	pxor xmm7, xmm5				; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
	psrld xmm1, 2				; xmm1 = W[I-2] >> 19
	psrld xmm5, 2				; xmm5 = W[I-2+1] >> 19
	pxor xmm3, xmm2				; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
	pxor xmm7, xmm6				; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
	pslld xmm2, 2				; xmm2 = W[I-2] << 15
	pslld xmm6, 2				; xmm6 = W[I-2+1] << 15
	pxor xmm3, xmm1				; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
	pxor xmm7, xmm5				; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
	pxor xmm3, xmm2				; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
	pxor xmm7, xmm6				; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)

	paddd xmm0, xmm3			; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
	paddd xmm4, xmm7			; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]
	movdqa [edi+(%1*16)], xmm0		; store W[I]
	movdqa [edi+((%1+1)*16)], xmm4		; store W[I+1]
%endmacro
|
||||||
|
|
||||||
; Expand the message schedule: LAB_CALC_UNROLL copies of lab_calc_blk, each
; producing LAB_CALC_PARA entries, cover W[16..63] with no run-time loop.
%assign i 0
%rep LAB_CALC_UNROLL
	lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep

; Load the init values of the message into the hash.
; Each 32-bit word of init is broadcast to all four lanes with pshufd.
; a, b, c, d and e stay in registers; f, g, h are parked in hash[0..2].

	movdqa xmm7, [init]
	pshufd xmm5, xmm7, 0x55		; xmm5 == b
	pshufd xmm4, xmm7, 0xAA		; xmm4 == c
	pshufd xmm3, xmm7, 0xFF		; xmm3 == d
	pshufd xmm7, xmm7, 0		; xmm7 == a

	movdqa xmm0, [init+4*4]
	pshufd xmm1, xmm0, 0x55		; [hash+0*16] == f
	movdqa [hash+0*16], xmm1

	pshufd xmm1, xmm0, 0xAA		; [hash+1*16] == g
	movdqa [hash+1*16], xmm1

	pshufd xmm1, xmm0, 0xFF		; [hash+2*16] == h
	movdqa [hash+2*16], xmm1

	pshufd xmm0, xmm0, 0		; xmm0 == e


LAB_LOOP:
|
||||||
|
|
||||||
;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]

; lab_loop_blk off
; One SHA-256 round for all four lanes.  off is the byte offset of round j
; (j*16) into both the schedule and the constant table.
; On entry and exit: a=xmm7 b=xmm5 c=xmm4 d=xmm3 e=xmm0,
; f=[hash+0*16] g=[hash+1*16] h=[hash+2*16].  xmm1, xmm2, xmm6 are scratch.
%macro lab_loop_blk 1
	movdqa xmm6, [data+%1]			; w[j]
	paddd xmm6, _sha256_consts_m128i[%1]	; + k[j]

	paddd xmm6, [hash+2*16]			; +h

	movdqa xmm1, xmm0
	movdqa xmm2, [hash+1*16]
	pandn xmm1, xmm2			; ~e & g

	movdqa [hash+2*16], xmm2		; h = g
	movdqa xmm2, [hash+0*16]		; f
	movdqa [hash+1*16], xmm2		; g = f


	pand xmm2, xmm0				; e & f
	pxor xmm1, xmm2				; (e & f) ^ (~e & g)
	movdqa [hash+0*16], xmm0		; f = e

	paddd xmm6, xmm1			; Ch + h + w[i] + k[i]

	; Rotates built from shift pairs: 6/26, 11/21, 25/7
	movdqa xmm1, xmm0
	psrld xmm0, 6				; e >> 6
	movdqa xmm2, xmm0
	pslld xmm1, 7				; e << 7
	psrld xmm2, 5				; e >> 11
	pxor xmm0, xmm1
	pxor xmm0, xmm2
	pslld xmm1, 14				; e << 21
	psrld xmm2, 14				; e >> 25
	pxor xmm0, xmm1
	pxor xmm0, xmm2
	pslld xmm1, 5				; e << 26
	pxor xmm0, xmm1				; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
	paddd xmm6, xmm0			; xmm6 = t1

	movdqa xmm0, xmm3			; d
	paddd xmm0, xmm6			; e = d+t1

	movdqa xmm1, xmm5			; =b
	movdqa xmm3, xmm4			; d = c
	movdqa xmm2, xmm4			; c
	pand xmm2, xmm5				; b & c
	pand xmm4, xmm7				; a & c
	pand xmm1, xmm7				; a & b
	pxor xmm1, xmm4
	movdqa xmm4, xmm5			; c = b
	movdqa xmm5, xmm7			; b = a
	pxor xmm1, xmm2				; Maj(a, b, c) = (a & b) ^ (a & c) ^ (b & c)
	paddd xmm6, xmm1			; t1 + Maj(a, b, c)

	; Rotates built from shift pairs: 2/30, 13/19, 22/10
	movdqa xmm2, xmm7
	psrld xmm7, 2				; a >> 2
	movdqa xmm1, xmm7
	pslld xmm2, 10				; a << 10
	psrld xmm1, 11				; a >> 13
	pxor xmm7, xmm2
	pxor xmm7, xmm1
	pslld xmm2, 9				; a << 19
	psrld xmm1, 9				; a >> 22
	pxor xmm7, xmm2
	pxor xmm7, xmm1
	pslld xmm2, 11				; a << 30
	pxor xmm7, xmm2
	paddd xmm7, xmm6			; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + Maj(a, b, c);
%endmacro
|
||||||
|
|
||||||
; Emit the rounds: LAB_LOOP_UNROLL copies, stepping the byte offset by one
; 16-byte schedule entry each time.
%assign i 0
%rep LAB_LOOP_UNROLL
	lab_loop_blk i
%assign i i+16
%endrep

; Finished the 64 rounds, calculate hash and save
; Each working variable is added to the matching init word (broadcast with
; pshufd) and stored to hash[0..7].  e..h are written first: f, g, h still
; live in hash[0..2], which the a..c stores below overwrite.

	movdqa xmm1, [init+16]

	pshufd xmm2, xmm1, 0xFF
	movdqa xmm6, [hash+2*16]	; h
	paddd xmm2, xmm6
	movdqa [hash+7*16], xmm2

	pshufd xmm2, xmm1, 0xAA
	movdqa xmm6, [hash+1*16]	; g
	paddd xmm2, xmm6
	movdqa [hash+6*16], xmm2

	pshufd xmm2, xmm1, 0x55
	movdqa xmm6, [hash+0*16]	; f
	paddd xmm2, xmm6
	movdqa [hash+5*16], xmm2

	pshufd xmm1, xmm1, 0
	paddd xmm0, xmm1		; e
	movdqa [hash+4*16], xmm0

	movdqa xmm1, [init]

	pshufd xmm2, xmm1, 0xFF
	paddd xmm3, xmm2		; d
	movdqa [hash+3*16], xmm3

	pshufd xmm2, xmm1, 0xAA
	paddd xmm4, xmm2		; c
	movdqa [hash+2*16], xmm4

	pshufd xmm2, xmm1, 0x55
	paddd xmm5, xmm2		; b
	movdqa [hash+1*16], xmm5

	pshufd xmm1, xmm1, 0
	paddd xmm7, xmm1		; a
	movdqa [hash+0*16], xmm7

LAB_RET:
	pop edi
	pop esi
	retn 4				; also pops the 4-byte stack argument (init)
|
||||||
|
|
||||||
; For ELF output (reported as either "elf" or "elf32"), emit an empty
; .note.GNU-stack section flagged noexec.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
|
@ -1,8 +0,0 @@ |
|||||||
# Convenience library (not installed) holding the x86-64 SHA-256 kernels.
noinst_LIBRARIES = libx8664.a

# Teach automake about the .asm extension.
SUFFIXES = .asm

libx8664_a_SOURCES = sha256_xmm_amd64.asm sha256_sse4_amd64.asm

# Assemble with yasm, writing the object to the path make asked for.
.asm.o:
	$(YASM) -f $(YASM_FMT) -o $@ $<
|
@ -1,292 +0,0 @@ |
|||||||
;; SHA-256 for X86-64 for Linux, based off of: |
|
||||||
|
|
||||||
; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com |
|
||||||
; Version 2011 |
|
||||||
; This software is Public Domain |
|
||||||
|
|
||||||
; Significant re-write/optimisation and reordering by, |
|
||||||
; Neil Kettle <mu-b@digit-labs.org> |
|
||||||
; ~18% performance improvement |
|
||||||
|
|
||||||
; SHA-256 CPU SSE cruncher for Bitcoin Miner |
|
||||||
|
|
||||||
ALIGN 32
BITS 64

; Argument registers differ between the win64 output format and the others.
%ifidn __OUTPUT_FORMAT__,win64
%define hash rcx
%define data rdx
%define init r8
%define temp r9
%else
%define hash rdi
%define data rsi
%define init rdx
%define temp rcx
%endif

; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
; i.e. the schedule-expansion loop covers W[16..63] in a whole number of passes.
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8

; Rounds emitted per pass of the round loop.
%define LAB_LOOP_UNROLL 8

extern g_4sha256_k

global CalcSha256_x64_sse4
; CalcSha256 hash(rdi), data(rsi), init(rdx)
; CalcSha256 hash(rcx), data(rdx), init(r8)
CalcSha256_x64_sse4:

	; rbx is not otherwise referenced in the visible code.
	; NOTE(review): the push presumably keeps rsp 16-byte aligned for the
	; movdqa spills below - confirm.
	push rbx
%ifidn __OUTPUT_FORMAT__,win64
	; Save xmm6-xmm11, which this routine overwrites; restored at LAB_RET.
	sub rsp, 16 * 6
	movdqa [rsp + 16*0], xmm6
	movdqa [rsp + 16*1], xmm7
	movdqa [rsp + 16*2], xmm8
	movdqa [rsp + 16*3], xmm9
	movdqa [rsp + 16*4], xmm10
	movdqa [rsp + 16*5], xmm11
%endif

LAB_NEXT_NONCE:

	mov temp, 64*4			; 256 - temp is # of SHA-2 rounds
	mov rax, 16*4			; 64 - rax is where we expand to

LAB_SHA:
	push temp			; keep the round bound; temp is reused as the end pointer
	lea temp, qword [data+temp*4]	; + 1024: one past W[63]
	lea r11, qword [data+rax*4]	; + 256: &W[16]

LAB_CALC:
|
||||||
; lab_calc_blk n
; Computes two consecutive message-schedule entries, W[I] and W[I+1], for all
; four lanes, where r11 points at W[I-n]:
;   W[I] = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
; Each entry is 16 bytes.  xmm0-xmm3 work on W[I], xmm4-xmm7 on W[I+1]; the
; two instruction streams are interleaved.  Each rotate is assembled from a
; right shift and a left shift.
%macro lab_calc_blk 1

	movntdqa xmm0, [r11-(15-%1)*16]		; xmm0 = W[I-15]
	movdqa xmm2, xmm0			; xmm2 = W[I-15]
	movntdqa xmm4, [r11-(15-(%1+1))*16]	; xmm4 = W[I-15+1]
	movdqa xmm6, xmm4			; xmm6 = W[I-15+1]

	psrld xmm0, 3				; xmm0 = W[I-15] >> 3
	movdqa xmm1, xmm0			; xmm1 = W[I-15] >> 3
	pslld xmm2, 14				; xmm2 = W[I-15] << 14
	psrld xmm4, 3				; xmm4 = W[I-15+1] >> 3
	movdqa xmm5, xmm4			; xmm5 = W[I-15+1] >> 3
	psrld xmm5, 4				; xmm5 = W[I-15+1] >> 7
	pxor xmm4, xmm5				; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
	pslld xmm6, 14				; xmm6 = W[I-15+1] << 14
	psrld xmm1, 4				; xmm1 = W[I-15] >> 7
	pxor xmm0, xmm1				; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
	pxor xmm0, xmm2				; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
	psrld xmm1, 11				; xmm1 = W[I-15] >> 18
	psrld xmm5, 11				; xmm5 = W[I-15+1] >> 18
	pxor xmm4, xmm6				; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
	pxor xmm4, xmm5				; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
	pslld xmm2, 11				; xmm2 = W[I-15] << 25
	pslld xmm6, 11				; xmm6 = W[I-15+1] << 25
	pxor xmm4, xmm6				; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
	pxor xmm0, xmm1				; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
	pxor xmm0, xmm2				; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
	paddd xmm0, [r11-(16-%1)*16]		; xmm0 = s0(W[I-15]) + W[I-16]
	paddd xmm4, [r11-(16-(%1+1))*16]	; xmm4 = s0(W[I-15+1]) + W[I-16+1]
	movntdqa xmm3, [r11-(2-%1)*16]		; xmm3 = W[I-2]
	movntdqa xmm7, [r11-(2-(%1+1))*16]	; xmm7 = W[I-2+1]

;;;;;;;;;;;;;;;;;;

	movdqa xmm2, xmm3			; xmm2 = W[I-2]
	psrld xmm3, 10				; xmm3 = W[I-2] >> 10
	movdqa xmm1, xmm3			; xmm1 = W[I-2] >> 10
	movdqa xmm6, xmm7			; xmm6 = W[I-2+1]
	psrld xmm7, 10				; xmm7 = W[I-2+1] >> 10
	movdqa xmm5, xmm7			; xmm5 = W[I-2+1] >> 10

	paddd xmm0, [r11-(7-%1)*16]		; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
	paddd xmm4, [r11-(7-(%1+1))*16]		; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]

	pslld xmm2, 13				; xmm2 = W[I-2] << 13
	pslld xmm6, 13				; xmm6 = W[I-2+1] << 13
	psrld xmm1, 7				; xmm1 = W[I-2] >> 17
	psrld xmm5, 7				; xmm5 = W[I-2+1] >> 17



	pxor xmm3, xmm1				; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
	psrld xmm1, 2				; xmm1 = W[I-2] >> 19
	pxor xmm3, xmm2				; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
	pslld xmm2, 2				; xmm2 = W[I-2] << 15
	pxor xmm7, xmm5				; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
	psrld xmm5, 2				; xmm5 = W[I-2+1] >> 19
	pxor xmm7, xmm6				; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
	pslld xmm6, 2				; xmm6 = W[I-2+1] << 15



	pxor xmm3, xmm1				; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
	pxor xmm3, xmm2				; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
	paddd xmm0, xmm3			; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
	pxor xmm7, xmm5				; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
	pxor xmm7, xmm6				; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
	paddd xmm4, xmm7			; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

	movdqa [r11+(%1*16)], xmm0		; store W[I]
	movdqa [r11+((%1+1)*16)], xmm4		; store W[I+1]
%endmacro
|
||||||
|
|
||||||
; One pass of the expansion loop: LAB_CALC_UNROLL macro copies, each
; producing LAB_CALC_PARA schedule entries.
%assign i 0
%rep LAB_CALC_UNROLL
	lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep

	add r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16	; advance past the entries just written
	cmp r11, temp					; temp = end of the schedule
	jb LAB_CALC

	pop temp		; back to the round bound (256)
	mov rax, 0		; round index, scaled by 4 when used as an offset

; Load the init values of the message into the hash.
; Each 32-bit word of init is broadcast to all four lanes with pshufd.

	movntdqa xmm7, [init]
	pshufd xmm5, xmm7, 0x55		; xmm5 == b
	pshufd xmm4, xmm7, 0xAA		; xmm4 == c
	pshufd xmm3, xmm7, 0xFF		; xmm3 == d
	pshufd xmm7, xmm7, 0		; xmm7 == a

	movntdqa xmm0, [init+4*4]
	pshufd xmm8, xmm0, 0x55		; xmm8 == f
	pshufd xmm9, xmm0, 0xAA		; xmm9 == g
	pshufd xmm10, xmm0, 0xFF	; xmm10 == h
	pshufd xmm0, xmm0, 0		; xmm0 == e

LAB_LOOP:
|
||||||
|
|
||||||
;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]

; lab_loop_blk
; One SHA-256 round for all four lanes.  rax*4 is the byte offset of the
; current round into both the schedule and g_4sha256_k; rax is advanced by 4
; (one 16-byte entry) per round.
; On entry and exit: a=xmm7 b=xmm5 c=xmm4 d=xmm3 e=xmm0 f=xmm8 g=xmm9 h=xmm10.
; xmm1, xmm2, xmm6 are scratch.
%macro lab_loop_blk 0
	movntdqa xmm6, [data+rax*4]		; w[j]
	paddd xmm6, g_4sha256_k[rax*4]		; + k[j]
	add rax, 4

	paddd xmm6, xmm10			; +h

	movdqa xmm1, xmm0
	movdqa xmm2, xmm9
	pandn xmm1, xmm2			; ~e & g

	movdqa xmm10, xmm2			; h = g
	movdqa xmm2, xmm8			; f
	movdqa xmm9, xmm2			; g = f

	pand xmm2, xmm0				; e & f
	pxor xmm1, xmm2				; (e & f) ^ (~e & g)
	movdqa xmm8, xmm0			; f = e

	paddd xmm6, xmm1			; Ch + h + w[i] + k[i]

	; Rotates built from shift pairs: 6/26, 11/21, 25/7
	movdqa xmm1, xmm0
	psrld xmm0, 6				; e >> 6
	movdqa xmm2, xmm0
	pslld xmm1, 7				; e << 7
	psrld xmm2, 5				; e >> 11
	pxor xmm0, xmm1
	pxor xmm0, xmm2
	pslld xmm1, 14				; e << 21
	psrld xmm2, 14				; e >> 25
	pxor xmm0, xmm1
	pxor xmm0, xmm2
	pslld xmm1, 5				; e << 26
	pxor xmm0, xmm1				; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
	paddd xmm6, xmm0			; xmm6 = t1

	movdqa xmm0, xmm3			; d
	paddd xmm0, xmm6			; e = d+t1

	movdqa xmm1, xmm5			; =b
	movdqa xmm3, xmm4			; d = c
	movdqa xmm2, xmm4			; c
	pand xmm2, xmm5				; b & c
	pand xmm4, xmm7				; a & c
	pand xmm1, xmm7				; a & b
	pxor xmm1, xmm4
	movdqa xmm4, xmm5			; c = b
	movdqa xmm5, xmm7			; b = a
	pxor xmm1, xmm2				; Maj(a, b, c) = (a & b) ^ (a & c) ^ (b & c)
	paddd xmm6, xmm1			; t1 + Maj(a, b, c)

	; Rotates built from shift pairs: 2/30, 13/19, 22/10
	movdqa xmm2, xmm7
	psrld xmm7, 2				; a >> 2
	movdqa xmm1, xmm7
	pslld xmm2, 10				; a << 10
	psrld xmm1, 11				; a >> 13
	pxor xmm7, xmm2
	pxor xmm7, xmm1
	pslld xmm2, 9				; a << 19
	psrld xmm1, 9				; a >> 22
	pxor xmm7, xmm2
	pxor xmm7, xmm1
	pslld xmm2, 11				; a << 30
	pxor xmm7, xmm2
	paddd xmm7, xmm6			; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + Maj(a, b, c);
%endmacro
|
||||||
|
|
||||||
; One pass of the round loop: LAB_LOOP_UNROLL rounds.  The macro takes no
; argument, so i is only a counter for the %rep.
%assign i 0
%rep LAB_LOOP_UNROLL
	lab_loop_blk
%assign i i+1
%endrep

	cmp rax, temp		; rax advances 4 per round, temp = 64*4
	jb LAB_LOOP

; Finished the 64 rounds, calculate hash and save
; Each working variable is added to the matching init word (broadcast with
; pshufd), then all eight are stored to hash[0..7].

	movntdqa xmm1, [init]
	pshufd xmm2, xmm1, 0x55
	paddd xmm5, xmm2		; b
	pshufd xmm6, xmm1, 0xAA
	paddd xmm4, xmm6		; c
	pshufd xmm11, xmm1, 0xFF
	paddd xmm3, xmm11		; d
	pshufd xmm1, xmm1, 0
	paddd xmm7, xmm1		; a

	movntdqa xmm1, [init+4*4]
	pshufd xmm2, xmm1, 0x55
	paddd xmm8, xmm2		; f
	pshufd xmm6, xmm1, 0xAA
	paddd xmm9, xmm6		; g
	pshufd xmm11, xmm1, 0xFF
	paddd xmm10, xmm11		; h
	pshufd xmm1, xmm1, 0
	paddd xmm0, xmm1		; e

	movdqa [hash+0*16], xmm7
	movdqa [hash+1*16], xmm5
	movdqa [hash+2*16], xmm4
	movdqa [hash+3*16], xmm3
	movdqa [hash+4*16], xmm0
	movdqa [hash+5*16], xmm8
	movdqa [hash+6*16], xmm9
	movdqa [hash+7*16], xmm10

LAB_RET:
%ifidn __OUTPUT_FORMAT__,win64
	; Restore the xmm registers saved in the prologue.
	movdqa xmm6, [rsp + 16*0]
	movdqa xmm7, [rsp + 16*1]
	movdqa xmm8, [rsp + 16*2]
	movdqa xmm9, [rsp + 16*3]
	movdqa xmm10, [rsp + 16*4]
	movdqa xmm11, [rsp + 16*5]
	add rsp, 16 * 6
%endif
	pop rbx
	ret
|
||||||
|
|
||||||
; For ELF output (reported as either "elf" or "elf64"), emit an empty
; .note.GNU-stack section flagged noexec.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
|
@ -1,354 +0,0 @@ |
|||||||
;/* |
|
||||||
; * Copyright (C) 2011 - Neil Kettle <neil@digit-labs.org> |
|
||||||
; * |
|
||||||
; * This file is part of cpuminer-ng. |
|
||||||
; * |
|
||||||
; * cpuminer-ng is free software: you can redistribute it and/or modify |
|
||||||
; * it under the terms of the GNU General Public License as published by |
|
||||||
; * the Free Software Foundation, either version 3 of the License, or |
|
||||||
; * (at your option) any later version. |
|
||||||
; * |
|
||||||
; * cpuminer-ng is distributed in the hope that it will be useful, |
|
||||||
; * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
||||||
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
||||||
; * GNU General Public License for more details. |
|
||||||
; * |
|
||||||
; * You should have received a copy of the GNU General Public License |
|
||||||
; * along with cpuminer-ng. If not, see <http://www.gnu.org/licenses/>. |
|
||||||
; */ |
|
||||||
|
|
||||||
; %rbp, %rbx, and %r12-%r15 - callee save |
|
||||||
|
|
||||||
ALIGN 32
BITS 64

; Argument registers differ between the win64 output format and the others.
%ifidn __OUTPUT_FORMAT__,win64
%define hash rcx
%define hash1 rdx
%define data r8
%define init r9
%else
%define hash rdi
%define hash1 rsi
%define data rdx
%define init rcx
%endif

; 0 = (1024 - 256) (mod (SHA_CALC_W_UNROLL*SHA_CALC_W_PARA*16))
%define SHA_CALC_W_PARA 2
%define SHA_CALC_W_UNROLL 8

%define SHA_ROUND_LOOP_UNROLL 16

; Mach-O builds use a leading underscore on C symbol names.
%ifidn __YASM_OBJFMT__, macho64
extern _sha256_consts_m128i
extern _sha256_init
%else
extern sha256_consts_m128i
extern sha256_init
%endif

%ifidn __YASM_OBJFMT__, macho64
global _sha256_sse2_64_new
%else
global sha256_sse2_64_new
%endif

; Scratch registers used by the round macro
%define sr1 xmm6
%define sr2 xmm1
%define sr3 xmm2
%define sr4 xmm13

; The eight SHA-256 working variables a..h
%define rA xmm7
%define rB xmm5
%define rC xmm4
%define rD xmm3
%define rE xmm0
%define rF xmm8
%define rG xmm9
%define rH xmm10
|
|
||||||
; sha_round_blk
; One SHA-256 round for all four lanes.  rax is the byte offset of the current
; round into both the schedule (data) and the constant table; it is advanced
; by 16 inside the macro.  rA..rH hold a..h on entry and exit; sr1 accumulates
; T1, sr2 and sr3 are scratch.  Rotates are assembled from shift pairs.
%macro sha_round_blk 0
	movdqa sr1, [data+rax]			; T1 = w;
	;movdqa sr1, xmm11
	movdqa sr2, rE				; sr2 = rE

	pandn sr2, rG				; sr2 = ~rE & rG
	movdqa sr3, rF				; sr3 = rF

	paddd sr1, rH				; T1 = h + w;
	movdqa rH, rG				; rH = rG

	pand sr3, rE				; sr3 = rE & rF
	movdqa rG, rF				; rG = rF

%ifidn __YASM_OBJFMT__, macho64
	; NOTE(review): presumably rcx was loaded with the address of the
	; constant table by code outside this macro - confirm.
	paddd sr1, [rcx+rax]
%else
	paddd sr1, sha256_consts_m128i[rax]	; T1 = h + sha256_consts_m128i[i] + w;
%endif
	pxor sr2, sr3				; sr2 = (rE & rF) ^ (~rE & rG) = Ch (e, f, g)

	movdqa rF, rE				; rF = rE
	paddd sr1, sr2				; T1 = h + Ch (e, f, g) + sha256_consts_m128i[i] + w;

	movdqa sr2, rE				; sr2 = rE
	psrld rE, 6				; e >> 6

	movdqa sr3, rE				; e >> 6
	pslld sr2, 7				; e << 7

	psrld sr3, 5				; e >> 11
	pxor rE, sr2				; e >> 6 ^ e << 7

	pslld sr2, 14				; e << 21
	pxor rE, sr3				; e >> 6 ^ e << 7 ^ e >> 11

	psrld sr3, 14				; e >> 25
	pxor rE, sr2				; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21

	pslld sr2, 5				; e << 26
	pxor rE, sr3				; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21 ^ e >> 25

	pxor rE, sr2				; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21 ^ e >> 25 ^ e << 26
	movdqa sr2, rB				; sr2 = rB

	paddd sr1, rE				; sr1 = h + BIGSIGMA1_256(e) + Ch (e, f, g) + sha256_consts_m128i[i] + w;
	movdqa rE, rD				; rE = rD

	movdqa rD, rC				; rD = rC
	paddd rE, sr1				; rE = rD + T1

	movdqa sr3, rC				; sr3 = rC
	pand rC, rA				; rC = rC & rA

	pand sr3, rB				; sr3 = rB & rC
	pand sr2, rA				; sr2 = rB & rA

	pxor sr2, rC				; sr2 = (rB & rA) ^ (rC & rA)
	movdqa rC, rB				; rC = rB

	pxor sr2, sr3				; sr2 = (rB & rA) ^ (rC & rA) ^ (rB & rC)
	movdqa rB, rA				; rB = rA

	paddd sr1, sr2				; sr1 = T1 + (rB & rA) ^ (rC & rA) ^ (rB & rC)
	lea rax, [rax+16]			; advance to the next round's w / constant

	movdqa sr3, rA				; sr3 = rA
	psrld rA, 2				; a >> 2

	pslld sr3, 10				; a << 10
	movdqa sr2, rA				; a >> 2

	pxor rA, sr3				; a >> 2 ^ a << 10
	psrld sr2, 11				; a >> 13

	pxor rA, sr2				; a >> 2 ^ a << 10 ^ a >> 13
	pslld sr3, 9				; a << 19

	pxor rA, sr3				; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19
	psrld sr2, 9				; a >> 22

	pxor rA, sr2				; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19 ^ a >> 22
	pslld sr3, 11				; a << 30

	pxor rA, sr3				; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19 ^ a >> 22 ^ a << 30
	paddd rA, sr1				; T1 + BIGSIGMA0_256(a) + Maj(a, b, c);
%endmacro
|
||||||
|
|
||||||
; sha_calc_w_blk idx
;
; Message-schedule step. Produces two consecutive schedule entries,
;   W[I]   = s0(W[I-15])   + W[I-16]   + W[I-7]   + s1(W[I-2])
;   W[I+1] = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1] + s1(W[I-2+1])
; and stores them at [r11 + %1*16] and [r11 + (%1+1)*16].
;
;   %1  - index of the first entry to produce, counted in 16-byte slots
;         relative to r11.
;   r11 - base address; the older entries are read at negative slot offsets
;         from it. All memory accesses are movdqa/paddd with memory operands,
;         so r11 has to be 16-byte aligned.
;
; Each entry is a full XMM vector handled with packed-dword instructions
; (psrld / pslld / paddd), so four independent 32-bit lanes are processed
; at once.
; The rotates inside s0 and s1 are assembled from a right shift XORed with
; the complementary left shift (e.g. ">> 7" together with "<< 25").
; The first entry lives in xmm0-xmm3, the second in xmm4-xmm7; the two
; instruction streams are interleaved line by line.
; Clobbers xmm0-xmm7.
%macro sha_calc_w_blk 1 |
|
||||||
; --- s0 part: load W[I-15] / W[I-15+1] and keep a copy for the left shifts
movdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15] |
|
||||||
movdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1] |
|
||||||
movdqa xmm2, xmm0 ; xmm2 = W[I-15] |
|
||||||
movdqa xmm6, xmm4 ; xmm6 = W[I-15+1] |
|
||||||
psrld xmm0, 3 ; xmm0 = W[I-15] >> 3 |
|
||||||
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3 |
|
||||||
movdqa xmm1, xmm0 ; xmm1 = W[I-15] >> 3 |
|
||||||
movdqa xmm5, xmm4 ; xmm5 = W[I-15+1] >> 3 |
|
||||||
pslld xmm2, 14 ; xmm2 = W[I-15] << 14 |
|
||||||
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14 |
|
||||||
; shifts are applied incrementally: 3 + 4 = 7, then 7 + 11 = 18
psrld xmm1, 4 ; xmm1 = W[I-15] >> 7 |
|
||||||
psrld xmm5, 4 ; xmm5 = W[I-15+1] >> 7 |
|
||||||
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) |
|
||||||
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) |
|
||||||
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18 |
|
||||||
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18 |
|
||||||
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) |
|
||||||
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) |
|
||||||
; left shift continues from 14 to 14 + 11 = 25
pslld xmm2, 11 ; xmm2 = W[I-15] << 25 |
|
||||||
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25 |
|
||||||
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) |
|
||||||
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) |
|
||||||
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25) |
|
||||||
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25) |
|
||||||
|
|
||||||
; --- fetch the s1 inputs; for the second entry this is slot %1-1, which is
; already in memory, so it does not depend on the first entry of this pair
movdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2] |
|
||||||
movdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1] |
|
||||||
|
|
||||||
paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] |
|
||||||
paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] |
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;; |
|
||||||
|
|
||||||
; --- s1 part: same shift-pair technique, accumulated in xmm3 / xmm7
movdqa xmm2, xmm3 ; xmm2 = W[I-2] |
|
||||||
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1] |
|
||||||
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10 |
|
||||||
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10 |
|
||||||
movdqa xmm1, xmm3 ; xmm1 = W[I-2] >> 10 |
|
||||||
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1] >> 10 |
|
||||||
|
|
||||||
paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7] |
|
||||||
|
|
||||||
pslld xmm2, 13 ; xmm2 = W[I-2] << 13 |
|
||||||
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13 |
|
||||||
; right shifts continue incrementally: 10 + 7 = 17, then 17 + 2 = 19
psrld xmm1, 7 ; xmm1 = W[I-2] >> 17 |
|
||||||
psrld xmm5, 7 ; xmm5 = W[I-2+1] >> 17 |
|
||||||
|
|
||||||
paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1] |
|
||||||
|
|
||||||
pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) |
|
||||||
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) |
|
||||||
psrld xmm1, 2 ; xmm1 = W[I-2] >> 19 |
|
||||||
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19 |
|
||||||
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) |
|
||||||
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) |
|
||||||
; left shift continues from 13 to 13 + 2 = 15
pslld xmm2, 2 ; xmm2 = W[I-2] << 15 |
|
||||||
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15 |
|
||||||
pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) |
|
||||||
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) |
|
||||||
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15) |
|
||||||
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15) |
|
||||||
|
|
||||||
; --- combine both halves and write the two new entries back
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7] |
|
||||||
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1] |
|
||||||
movdqa [r11+(%1*16)], xmm0 |
|
||||||
movdqa [r11+((%1+1)*16)], xmm4 |
|
||||||
%endmacro |
|
||||||
|
|
||||||
; _sha256_sse2_64_new hash(rdi), hash1(rsi), data(rdx), init(rcx), |
|
||||||
; Entry point and prologue.
; The register names in the line above are the original author's; the
; hash/hash1/data/init aliases themselves are defined outside this part of
; the file, so check them there (they may differ per output format).
; The routine runs the SHA_256 macro twice: first over `data` with the state
; at `init`, then over the result (written to `hash1`) with the state at
; sha256_init.
|
|
||||||
; Mach-O 64 builds export the symbol with a leading underscore.
%ifidn __YASM_OBJFMT__, macho64 |
|
||||||
_sha256_sse2_64_new: |
|
||||||
%else |
|
||||||
sha256_sse2_64_new: |
|
||||||
%endif |
|
||||||
|
|
||||||
; rbx is used as a loop bound by SHA_256, so preserve the caller's value.
push rbx |
|
||||||
; Win64 only: spill xmm6-xmm10 and xmm13 to a 96-byte stack area; they are
; reloaded at LAB_RET. (xmm6-xmm15 are callee-saved in the Microsoft x64
; calling convention.) Presumably these are the callee-saved registers the
; rA..rH / sr1..sr4 aliases map to - the alias definitions are not visible
; here, TODO confirm.
; movdqa needs a 16-byte aligned address, so this relies on rsp being
; 16-byte aligned after the push above.
%ifidn __OUTPUT_FORMAT__,win64 |
|
||||||
sub rsp, 16 * 6 |
|
||||||
movdqa [rsp + 16*0], xmm6 |
|
||||||
movdqa [rsp + 16*1], xmm7 |
|
||||||
movdqa [rsp + 16*2], xmm8 |
|
||||||
movdqa [rsp + 16*3], xmm9 |
|
||||||
movdqa [rsp + 16*4], xmm10 |
|
||||||
movdqa [rsp + 16*5], xmm13 |
|
||||||
%endif |
|
||||||
|
|
||||||
; SHA_256 (no parameters)
;
; One complete pass, in three phases:
;   1. expand the schedule in place: entries 16..63 of the buffer at `data`
;      are computed from entries 0..15 (each entry is 16 bytes, so the
;      buffer must be 64*16 = 1024 bytes, writable and 16-byte aligned);
;   2. load the eight 32-bit state words from [init] and broadcast each one
;      across all four lanes of rA..rH, then run the round loop;
;   3. add the state words from [init] to rA..rH again (feed-forward).
; Result is left in rA..rH; nothing is stored by the macro itself.
;
; Uses rax, rbx, r11 (and rcx on macho64), sr1..sr4, plus whatever
; sha_calc_w_blk and sha_round_blk clobber. rbx is pushed/popped only to
; carry a constant across phase 1 - its incoming value is NOT preserved.
; Labels use the %% prefix so that each expansion gets its own copies.
%macro SHA_256 0 |
|
||||||
; 64*4 is scaled by 4 again in the lea below: 64 entries * 16 bytes = 1024
mov rbx, 64*4 ; rbx is # of SHA-2 rounds |
|
||||||
; 16*4 scaled by 4 in the lea below: 16 entries * 16 bytes = 256
mov rax, 16*4 ; rax is where we expand to |
|
||||||
|
|
||||||
; keep the unscaled 64*4 for phase 2
push rbx |
|
||||||
; rbx = data + 1024: one past the last schedule entry
lea rbx, qword [data+rbx*4] |
|
||||||
; r11 = data + 256: address of entry 16, the first one to compute
lea r11, qword [data+rax*4] |
|
||||||
|
|
||||||
; Phase 1: each pass emits SHA_CALC_W_UNROLL copies of sha_calc_w_blk, the
; index stepping by SHA_CALC_W_PARA, then advances r11 by the bytes written.
%%SHA_CALC_W: |
|
||||||
%assign i 0 |
|
||||||
%rep SHA_CALC_W_UNROLL |
|
||||||
sha_calc_w_blk i |
|
||||||
%assign i i+SHA_CALC_W_PARA |
|
||||||
%endrep |
|
||||||
add r11, SHA_CALC_W_UNROLL*SHA_CALC_W_PARA*16 |
|
||||||
; unsigned compare: repeat until r11 reaches the end of the buffer
cmp r11, rbx |
|
||||||
jb %%SHA_CALC_W |
|
||||||
|
|
||||||
; Phase 2 setup: rax = 0, rbx = 64*4*4 = 1024 is the bound for rax
pop rbx |
|
||||||
mov rax, 0 |
|
||||||
lea rbx, [rbx*4] |
|
||||||
|
|
||||||
; Load state words 0..3 and splat each to all four lanes. rA is the source
; of the other three shuffles, so it is broadcast last.
movdqa rA, [init] |
|
||||||
pshufd rB, rA, 0x55 ; rB == B |
|
||||||
pshufd rC, rA, 0xAA ; rC == C |
|
||||||
pshufd rD, rA, 0xFF ; rD == D |
|
||||||
pshufd rA, rA, 0 ; rA == A |
|
||||||
|
|
||||||
; Same for state words 4..7 (byte offset 16); rE is the source, done last.
movdqa rE, [init+4*4] |
|
||||||
pshufd rF, rE, 0x55 ; rF == F |
|
||||||
pshufd rG, rE, 0xAA ; rG == G |
|
||||||
pshufd rH, rE, 0xFF ; rH == H |
|
||||||
pshufd rE, rE, 0 ; rE == E |
|
||||||
|
|
||||||
; macho64: take the address of the constant table RIP-relatively into rcx.
; NOTE(review): the header comment of this routine names rcx as the register
; holding `init`. If that alias also applies to macho64 builds, this lea
; overwrites `init` before the [init] loads in phase 3 below - confirm
; against the alias definitions and sha_round_blk.
%ifidn __YASM_OBJFMT__, macho64 |
|
||||||
lea rcx, [_sha256_consts_m128i wrt rip] |
|
||||||
%endif |
|
||||||
|
|
||||||
; Round loop: SHA_ROUND_LOOP_UNROLL copies of sha_round_blk per pass, repeated
; while rax < rbx (unsigned). rax is not modified in this macro after being
; zeroed, so it is presumably advanced inside sha_round_blk - its full
; definition is outside this view.
%%SHAROUND_LOOP: |
|
||||||
%assign i 0 |
|
||||||
%rep SHA_ROUND_LOOP_UNROLL |
|
||||||
sha_round_blk |
|
||||||
%assign i i+1 |
|
||||||
%endrep |
|
||||||
cmp rax, rbx |
|
||||||
jb %%SHAROUND_LOOP |
|
||||||
|
|
||||||
; Finished the 64 rounds, calculate hash and save |
|
||||||
|
|
||||||
; Phase 3: re-read state words 0..3, splat them into the scratch registers
; and add them to the working variables (B, C, D first, A last because sr1
; is the shuffle source).
movdqa sr1, [init] |
|
||||||
pshufd sr2, sr1, 0x55 |
|
||||||
pshufd sr3, sr1, 0xAA |
|
||||||
pshufd sr4, sr1, 0xFF |
|
||||||
pshufd sr1, sr1, 0 |
|
||||||
|
|
||||||
paddd rB, sr2 |
|
||||||
paddd rC, sr3 |
|
||||||
paddd rD, sr4 |
|
||||||
paddd rA, sr1 |
|
||||||
|
|
||||||
; Same feed-forward for state words 4..7.
movdqa sr1, [init+4*4] |
|
||||||
pshufd sr2, sr1, 0x55 |
|
||||||
pshufd sr3, sr1, 0xAA |
|
||||||
pshufd sr4, sr1, 0xFF |
|
||||||
pshufd sr1, sr1, 0 |
|
||||||
|
|
||||||
paddd rF, sr2 |
|
||||||
paddd rG, sr3 |
|
||||||
paddd rH, sr4 |
|
||||||
paddd rE, sr1 |
|
||||||
%endmacro |
|
||||||
|
|
||||||
; First pass: hash the buffer at `data` starting from the state at `init`.
SHA_256 |
|
||||||
; Store the eight result vectors as entries 0..7 (16 bytes each) of hash1.
movdqa [hash1+0*16], rA |
|
||||||
movdqa [hash1+1*16], rB |
|
||||||
movdqa [hash1+2*16], rC |
|
||||||
movdqa [hash1+3*16], rD |
|
||||||
movdqa [hash1+4*16], rE |
|
||||||
movdqa [hash1+5*16], rF |
|
||||||
movdqa [hash1+6*16], rG |
|
||||||
movdqa [hash1+7*16], rH |
|
||||||
|
|
||||||
; Second pass: hash1 becomes the input buffer and the state restarts from
; sha256_init. Only entries 0..7 of hash1 are written above; entries 8..15
; are presumably pre-filled by the caller (padding) - not shown here, verify
; at the call site. The pass also writes entries 16..63 into hash1.
mov data, hash1 |
|
||||||
; NOTE(review): this uses the unprefixed symbol as an absolute immediate on
; every output format, whereas the macho64 code elsewhere in this file refers
; to an underscore-prefixed symbol RIP-relatively - confirm this assembles
; and links as intended on macho64.
mov init, sha256_init |
|
||||||
|
|
||||||
SHA_256 |
|
||||||
|
|
||||||
; Only the last state vector (H) of the second pass is written to the output;
; entries 0..6 of `hash` are left untouched by this routine.
movdqa [hash+7*16], rH |
|
||||||
|
|
||||||
; Epilogue: undo the prologue in reverse order.
LAB_RET: |
|
||||||
; Win64 only: reload the spilled xmm registers and release the 96-byte area.
%ifidn __OUTPUT_FORMAT__,win64 |
|
||||||
movdqa xmm6, [rsp + 16*0] |
|
||||||
movdqa xmm7, [rsp + 16*1] |
|
||||||
movdqa xmm8, [rsp + 16*2] |
|
||||||
movdqa xmm9, [rsp + 16*3] |
|
||||||
movdqa xmm10, [rsp + 16*4] |
|
||||||
movdqa xmm13, [rsp + 16*5] |
|
||||||
add rsp, 16 * 6 |
|
||||||
%endif |
|
||||||
; restore the caller's rbx saved at entry
pop rbx |
|
||||||
ret |
|
||||||
|
|
||||||
; For ELF output (32- and 64-bit) emit an empty .note.GNU-stack section with
; the noexec flag; GNU toolchains conventionally read this as "this object
; does not need an executable stack".
%ifidn __OUTPUT_FORMAT__,elf |
|
||||||
section .note.GNU-stack noalloc noexec nowrite progbits |
|
||||||
%endif |
|
||||||
; same marker for the elf64 output format
%ifidn __OUTPUT_FORMAT__,elf64 |
|
||||||
section .note.GNU-stack noalloc noexec nowrite progbits |
|
||||||
%endif |
|
Loading…
Reference in new issue