Browse Source

Remove all CPU mining code.

nfactor-troky
Con Kolivas 12 years ago
parent
commit
f4b2790761
  1. 23
      Makefile.am
  2. 186
      api.c
  3. 154
      cgminer.c
  4. 69
      configure.ac
  5. 863
      driver-cpu.c
  6. 65
      driver-cpu.h
  7. 5
      miner.h
  8. 488
      sha256_4way.c
  9. 469
      sha256_altivec_4way.c
  10. 609
      sha256_cryptopp.c
  11. 274
      sha256_generic.c
  12. 133
      sha256_sse2_amd64.c
  13. 125
      sha256_sse2_i386.c
  14. 132
      sha256_sse4_amd64.c
  15. 85
      sha256_via.c
  16. 1
      x86_32/.gitignore
  17. 8
      x86_32/Makefile.am
  18. 259
      x86_32/sha256_xmm.asm
  19. 1
      x86_64/.gitignore
  20. 8
      x86_64/Makefile.am
  21. 292
      x86_64/sha256_sse4_amd64.asm
  22. 354
      x86_64/sha256_xmm_amd64.asm

23
Makefile.am

@ -57,29 +57,6 @@ if HAS_SCRYPT @@ -57,29 +57,6 @@ if HAS_SCRYPT
cgminer_SOURCES += scrypt.c scrypt.h
endif
if HAS_CPUMINE
# original CPU related sources, unchanged
cgminer_SOURCES += \
sha256_generic.c sha256_4way.c sha256_via.c \
sha256_cryptopp.c sha256_sse2_amd64.c \
sha256_sse4_amd64.c sha256_sse2_i386.c \
sha256_altivec_4way.c
# the CPU portion extracted from original main.c
cgminer_SOURCES += driver-cpu.h driver-cpu.c
if HAS_YASM
AM_CFLAGS = -DHAS_YASM
if HAVE_x86_64
SUBDIRS += x86_64
cgminer_LDADD += x86_64/libx8664.a
else # HAVE_x86_64
SUBDIRS += x86_32
cgminer_LDADD += x86_32/libx8632.a
endif # HAVE_x86_64
endif # HAS_YASM
endif # HAS_CPUMINE
if NEED_FPGAUTILS
cgminer_SOURCES += fpgautils.c fpgautils.h
endif

186
api.c

@ -27,7 +27,6 @@ @@ -27,7 +27,6 @@
#include "compat.h"
#include "miner.h"
#include "util.h"
#include "driver-cpu.h" /* for algo_names[], TODO: re-factor dependency */
#if defined(USE_BFLSC) || defined(USE_AVALON)
#define HAVE_AN_ASIC 1
@ -38,7 +37,7 @@ @@ -38,7 +37,7 @@
#endif
// Big enough for largest API request
// though a PC with 100s of PGAs/CPUs may exceed the size ...
// though a PC with 100s of PGAs may exceed the size ...
// data is truncated at the end of the last record that fits
// but still closed correctly for JSON
// Current code assumes it can socket send this size + JSON_CLOSE + JSON_END
@ -187,9 +186,6 @@ static const char *DEVICECODE = "" @@ -187,9 +186,6 @@ static const char *DEVICECODE = ""
#endif
#ifdef USE_MODMINER
"MMQ "
#endif
#ifdef WANT_CPUMINE
"CPU "
#endif
"";
@ -224,13 +220,8 @@ static const char *OSINFO = @@ -224,13 +220,8 @@ static const char *OSINFO =
#define _PGA "PGA"
#endif
#ifdef WANT_CPUMINE
#define _CPU "CPU"
#endif
#define _GPUS "GPUS"
#define _PGAS "PGAS"
#define _CPUS "CPUS"
#define _NOTIFY "NOTIFY"
#define _DEVDETAILS "DEVDETAILS"
#define _BYE "BYE"
@ -265,13 +256,8 @@ static const char ISJSON = '{'; @@ -265,13 +256,8 @@ static const char ISJSON = '{';
#define JSON_PGA JSON1 _PGA JSON2
#endif
#ifdef WANT_CPUMINE
#define JSON_CPU JSON1 _CPU JSON2
#endif
#define JSON_GPUS JSON1 _GPUS JSON2
#define JSON_PGAS JSON1 _PGAS JSON2
#define JSON_CPUS JSON1 _CPUS JSON2
#define JSON_NOTIFY JSON1 _NOTIFY JSON2
#define JSON_DEVDETAILS JSON1 _DEVDETAILS JSON2
#define JSON_BYE JSON1 _BYE JSON1
@ -306,14 +292,8 @@ static const char *JSON_PARAMETER = "parameter"; @@ -306,14 +292,8 @@ static const char *JSON_PARAMETER = "parameter";
#define MSG_MISID 15
#define MSG_GPUDEV 17
#ifdef WANT_CPUMINE
#define MSG_CPUNON 16
#define MSG_CPUDEV 18
#define MSG_INVCPU 19
#endif
#define MSG_NUMGPU 20
#define MSG_NUMCPU 21
#define MSG_VERSION 22
#define MSG_INVJSON 23
#define MSG_MISCMD 24
@ -420,11 +400,9 @@ enum code_severity { @@ -420,11 +400,9 @@ enum code_severity {
enum code_parameters {
PARAM_GPU,
PARAM_PGA,
PARAM_CPU,
PARAM_PID,
PARAM_GPUMAX,
PARAM_PGAMAX,
PARAM_CPUMAX,
PARAM_PMAX,
PARAM_POOLMAX,
@ -474,11 +452,8 @@ struct CODES { @@ -474,11 +452,8 @@ struct CODES {
#ifdef HAVE_AN_FPGA
"%d PGA(s)"
#endif
#if defined(WANT_CPUMINE) && (defined(HAVE_OPENCL) || defined(HAVE_AN_ASIC) || defined(HAVE_AN_FPGA))
#if (defined(HAVE_OPENCL) || defined(HAVE_AN_ASIC) || defined(HAVE_AN_FPGA))
" - "
#endif
#ifdef WANT_CPUMINE
"%d CPU(s)"
#endif
},
@ -488,9 +463,6 @@ struct CODES { @@ -488,9 +463,6 @@ struct CODES {
#endif
#ifdef HAVE_AN_FPGA
"/PGAs"
#endif
#ifdef WANT_CPUMINE
"/CPUs"
#endif
},
@ -513,15 +485,9 @@ struct CODES { @@ -513,15 +485,9 @@ struct CODES {
{ SEVERITY_INFO, MSG_PGAENA, PARAM_PGA, "PGA %d sent enable message" },
{ SEVERITY_INFO, MSG_PGADIS, PARAM_PGA, "PGA %d set disable flag" },
{ SEVERITY_ERR, MSG_PGAUNW, PARAM_PGA, "PGA %d is not flagged WELL, cannot enable" },
#endif
#ifdef WANT_CPUMINE
{ SEVERITY_ERR, MSG_CPUNON, PARAM_NONE, "No CPUs" },
{ SEVERITY_SUCC, MSG_CPUDEV, PARAM_CPU, "CPU%d" },
{ SEVERITY_ERR, MSG_INVCPU, PARAM_CPUMAX, "Invalid CPU id %d - range is 0 - %d" },
#endif
{ SEVERITY_SUCC, MSG_NUMGPU, PARAM_NONE, "GPU count" },
{ SEVERITY_SUCC, MSG_NUMPGA, PARAM_NONE, "PGA count" },
{ SEVERITY_SUCC, MSG_NUMCPU, PARAM_NONE, "CPU count" },
{ SEVERITY_SUCC, MSG_VERSION, PARAM_NONE, "CGMiner versions" },
{ SEVERITY_ERR, MSG_INVJSON, PARAM_NONE, "Invalid JSON" },
{ SEVERITY_ERR, MSG_MISCMD, PARAM_CMD, "Missing JSON '%s'" },
@ -1300,9 +1266,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p @@ -1300,9 +1266,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p
#endif
#ifdef HAVE_AN_FPGA
int pga;
#endif
#ifdef WANT_CPUMINE
int cpu;
#endif
int i;
@ -1333,7 +1296,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p @@ -1333,7 +1296,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p
switch(codes[i].params) {
case PARAM_GPU:
case PARAM_PGA:
case PARAM_CPU:
case PARAM_PID:
case PARAM_INT:
sprintf(buf, codes[i].description, paramid);
@ -1351,15 +1313,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p @@ -1351,15 +1313,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p
pga = numpgas();
sprintf(buf, codes[i].description, paramid, pga - 1);
break;
#endif
#ifdef WANT_CPUMINE
case PARAM_CPUMAX:
if (opt_n_threads > 0)
cpu = num_processors;
else
cpu = 0;
sprintf(buf, codes[i].description, paramid, cpu - 1);
break;
#endif
case PARAM_PMAX:
sprintf(buf, codes[i].description, total_pools);
@ -1374,12 +1327,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p @@ -1374,12 +1327,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p
#ifdef HAVE_AN_FPGA
pga = numpgas();
#endif
#ifdef WANT_CPUMINE
if (opt_n_threads > 0)
cpu = num_processors;
else
cpu = 0;
#endif
sprintf(buf, codes[i].description
#ifdef HAVE_OPENCL
@ -1390,9 +1337,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p @@ -1390,9 +1337,6 @@ static void message(struct io_data *io_data, int messageid, int paramid, char *p
#endif
#ifdef HAVE_AN_FPGA
, pga
#endif
#ifdef WANT_CPUMINE
, cpu
#endif
);
break;
@ -1470,7 +1414,6 @@ static void minerconfig(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __ @@ -1470,7 +1414,6 @@ static void minerconfig(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __
int gpucount = 0;
int asccount = 0;
int pgacount = 0;
int cpucount = 0;
char *adlinuse = (char *)NO;
#ifdef HAVE_ADL
const char *adl = YES;
@ -1498,17 +1441,12 @@ static void minerconfig(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __ @@ -1498,17 +1441,12 @@ static void minerconfig(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __
pgacount = numpgas();
#endif
#ifdef WANT_CPUMINE
cpucount = opt_n_threads > 0 ? num_processors : 0;
#endif
message(io_data, MSG_MINECONFIG, 0, NULL, isjson);
io_open = io_add(io_data, isjson ? COMSTR JSON_MINECONFIG : _MINECONFIG COMSTR);
root = api_add_int(root, "GPU Count", &gpucount, false);
root = api_add_int(root, "ASC Count", &asccount, false);
root = api_add_int(root, "PGA Count", &pgacount, false);
root = api_add_int(root, "CPU Count", &cpucount, false);
root = api_add_int(root, "Pool Count", &total_pools, false);
root = api_add_const(root, "ADL", (char *)adl, false);
root = api_add_string(root, "ADL in use", adlinuse, false);
@ -1759,43 +1697,6 @@ static void pgastatus(struct io_data *io_data, int pga, bool isjson, bool precom @@ -1759,43 +1697,6 @@ static void pgastatus(struct io_data *io_data, int pga, bool isjson, bool precom
}
#endif
#ifdef WANT_CPUMINE
static void cpustatus(struct io_data *io_data, int cpu, bool isjson, bool precom)
{
struct api_data *root = NULL;
char buf[TMPBUFSIZ];
if (opt_n_threads > 0 && cpu >= 0 && cpu < num_processors) {
struct cgpu_info *cgpu = &cpus[cpu];
cgpu->utility = cgpu->accepted / ( total_secs ? total_secs : 1 ) * 60;
root = api_add_int(root, "CPU", &cpu, false);
double mhs = cgpu->total_mhashes / total_secs;
root = api_add_mhs(root, "MHS av", &mhs, false);
char mhsname[27];
sprintf(mhsname, "MHS %ds", opt_log_interval);
root = api_add_mhs(root, mhsname, &(cgpu->rolling), false);
root = api_add_int(root, "Accepted", &(cgpu->accepted), false);
root = api_add_int(root, "Rejected", &(cgpu->rejected), false);
root = api_add_utility(root, "Utility", &(cgpu->utility), false);
int last_share_pool = cgpu->last_share_pool_time > 0 ?
cgpu->last_share_pool : -1;
root = api_add_int(root, "Last Share Pool", &last_share_pool, false);
root = api_add_time(root, "Last Share Time", &(cgpu->last_share_pool_time), false);
root = api_add_mhtotal(root, "Total MH", &(cgpu->total_mhashes), false);
root = api_add_int(root, "Diff1 Work", &(cgpu->diff1), false);
root = api_add_diff(root, "Difficulty Accepted", &(cgpu->diff_accepted), false);
root = api_add_diff(root, "Difficulty Rejected", &(cgpu->diff_rejected), false);
root = api_add_diff(root, "Last Share Difficulty", &(cgpu->last_share_diff), false);
root = api_add_time(root, "Last Valid Work", &(cgpu->last_device_valid_work), false);
root = print_data(root, buf, isjson, precom);
io_add(io_data, buf);
}
}
#endif
static void devstatus(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __maybe_unused char *param, bool isjson, __maybe_unused char group)
{
bool io_open = false;
@ -1817,7 +1718,7 @@ static void devstatus(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __ma @@ -1817,7 +1718,7 @@ static void devstatus(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __ma
numpga = numpgas();
#endif
if (numgpu == 0 && opt_n_threads == 0 && numpga == 0 && numasc == 0) {
if (numgpu == 0 && numpga == 0 && numasc == 0) {
message(io_data, MSG_NODEVS, 0, NULL, isjson);
return;
}
@ -1854,16 +1755,6 @@ static void devstatus(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __ma @@ -1854,16 +1755,6 @@ static void devstatus(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __ma
}
#endif
#ifdef WANT_CPUMINE
if (opt_n_threads > 0) {
for (i = 0; i < num_processors; i++) {
cpustatus(io_data, i, isjson, isjson && devcount > 0);
devcount++;
}
}
#endif
if (isjson && io_open)
io_close(io_data);
}
@ -2088,40 +1979,6 @@ static void pgaidentify(struct io_data *io_data, __maybe_unused SOCKETTYPE c, ch @@ -2088,40 +1979,6 @@ static void pgaidentify(struct io_data *io_data, __maybe_unused SOCKETTYPE c, ch
}
#endif
#ifdef WANT_CPUMINE
static void cpudev(struct io_data *io_data, __maybe_unused SOCKETTYPE c, char *param, bool isjson, __maybe_unused char group)
{
bool io_open = false;
int id;
if (opt_n_threads == 0) {
message(io_data, MSG_CPUNON, 0, NULL, isjson);
return;
}
if (param == NULL || *param == '\0') {
message(io_data, MSG_MISID, 0, NULL, isjson);
return;
}
id = atoi(param);
if (id < 0 || id >= num_processors) {
message(io_data, MSG_INVCPU, id, NULL, isjson);
return;
}
message(io_data, MSG_CPUDEV, id, NULL, isjson);
if (isjson)
io_open = io_add(io_data, COMSTR JSON_CPU);
cpustatus(io_data, id, isjson, false);
if (isjson && io_open)
io_close(io_data);
}
#endif
static void poolstatus(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __maybe_unused char *param, bool isjson, __maybe_unused char group)
{
struct api_data *root = NULL;
@ -2219,12 +2076,6 @@ static void summary(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __mayb @@ -2219,12 +2076,6 @@ static void summary(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __mayb
bool io_open;
double utility, mhs, work_utility;
#ifdef WANT_CPUMINE
char *algo = (char *)(algo_names[opt_algo]);
if (algo == NULL)
algo = (char *)NULLSTR;
#endif
message(io_data, MSG_SUMM, 0, NULL, isjson);
io_open = io_add(io_data, isjson ? COMSTR JSON_SUMMARY : _SUMMARY COMSTR);
@ -2236,9 +2087,6 @@ static void summary(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __mayb @@ -2236,9 +2087,6 @@ static void summary(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __mayb
work_utility = total_diff1 / ( total_secs ? total_secs : 1 ) * 60;
root = api_add_elapsed(root, "Elapsed", &(total_secs), true);
#ifdef WANT_CPUMINE
root = api_add_string(root, "Algorithm", algo, false);
#endif
root = api_add_mhs(root, "MHS av", &(mhs), false);
root = api_add_uint(root, "Found Blocks", &(found_blocks), true);
root = api_add_int(root, "Getworks", &(total_getworks), true);
@ -2419,28 +2267,6 @@ static void pgacount(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __may @@ -2419,28 +2267,6 @@ static void pgacount(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __may
io_close(io_data);
}
static void cpucount(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __maybe_unused char *param, bool isjson, __maybe_unused char group)
{
struct api_data *root = NULL;
char buf[TMPBUFSIZ];
bool io_open;
int count = 0;
#ifdef WANT_CPUMINE
count = opt_n_threads > 0 ? num_processors : 0;
#endif
message(io_data, MSG_NUMCPU, 0, NULL, isjson);
io_open = io_add(io_data, isjson ? COMSTR JSON_CPUS : _CPUS COMSTR);
root = api_add_int(root, "Count", &count, false);
root = print_data(root, buf, isjson, false);
io_add(io_data, buf);
if (isjson && io_open)
io_close(io_data);
}
static void switchpool(struct io_data *io_data, __maybe_unused SOCKETTYPE c, char *param, bool isjson, __maybe_unused char group)
{
struct pool *pool;
@ -3540,13 +3366,9 @@ struct CMDS { @@ -3540,13 +3366,9 @@ struct CMDS {
{ "pgaenable", pgaenable, true },
{ "pgadisable", pgadisable, true },
{ "pgaidentify", pgaidentify, true },
#endif
#ifdef WANT_CPUMINE
{ "cpu", cpudev, false },
#endif
{ "gpucount", gpucount, false },
{ "pgacount", pgacount, false },
{ "cpucount", cpucount, false },
{ "switchpool", switchpool, true },
{ "addpool", addpool, true },
{ "poolpriority", poolpriority, true },

154
cgminer.c

@ -44,7 +44,6 @@ @@ -44,7 +44,6 @@
#include "miner.h"
#include "findnonce.h"
#include "adl.h"
#include "driver-cpu.h"
#include "driver-opencl.h"
#include "bench_block.h"
#include "scrypt.h"
@ -89,7 +88,6 @@ int opt_log_interval = 5; @@ -89,7 +88,6 @@ int opt_log_interval = 5;
int opt_queue = 1;
int opt_scantime = 60;
int opt_expiry = 120;
int opt_bench_algo = -1;
static const bool opt_time = true;
unsigned long long global_hashrate;
@ -113,7 +111,6 @@ static bool opt_removedisabled; @@ -113,7 +111,6 @@ static bool opt_removedisabled;
int total_devices;
struct cgpu_info **devices;
bool have_opencl;
int opt_n_threads = -1;
int mining_threads;
int num_processors;
#ifdef HAVE_CURSES
@ -869,36 +866,6 @@ static char *set_null(const char __maybe_unused *arg) @@ -869,36 +866,6 @@ static char *set_null(const char __maybe_unused *arg)
/* These options are available from config file or commandline */
static struct opt_table opt_config_table[] = {
#ifdef WANT_CPUMINE
OPT_WITH_ARG("--algo|-a",
set_algo, show_algo, &opt_algo,
"Specify sha256 implementation for CPU mining:\n"
"\tauto\t\tBenchmark at startup and pick fastest algorithm"
"\n\tc\t\tLinux kernel sha256, implemented in C"
#ifdef WANT_SSE2_4WAY
"\n\t4way\t\ttcatm's 4-way SSE2 implementation"
#endif
#ifdef WANT_VIA_PADLOCK
"\n\tvia\t\tVIA padlock implementation"
#endif
"\n\tcryptopp\tCrypto++ C/C++ implementation"
#ifdef WANT_CRYPTOPP_ASM32
"\n\tcryptopp_asm32\tCrypto++ 32-bit assembler implementation"
#endif
#ifdef WANT_X8632_SSE2
"\n\tsse2_32\t\tSSE2 32 bit implementation for i386 machines"
#endif
#ifdef WANT_X8664_SSE2
"\n\tsse2_64\t\tSSE2 64 bit implementation for x86_64 machines"
#endif
#ifdef WANT_X8664_SSE4
"\n\tsse4_64\t\tSSE4.1 64 bit implementation for x86_64 machines"
#endif
#ifdef WANT_ALTIVEC_4WAY
"\n\taltivec_4way\tAltivec implementation for PowerPC G4 and G5 machines"
#endif
),
#endif
OPT_WITH_ARG("--api-allow",
set_api_allow, NULL, NULL,
"Allow API access only to the given list of [G:]IP[/Prefix] addresses[/subnets]"),
@ -936,20 +903,10 @@ static struct opt_table opt_config_table[] = { @@ -936,20 +903,10 @@ static struct opt_table opt_config_table[] = {
opt_set_bool, &opt_bfl_noncerange,
"Use nonce range on bitforce devices if supported"),
#endif
#ifdef WANT_CPUMINE
OPT_WITH_ARG("--bench-algo|-b",
set_int_0_to_9999, opt_show_intval, &opt_bench_algo,
opt_hidden),
#endif
#ifdef HAVE_CURSES
OPT_WITHOUT_ARG("--compact",
opt_set_bool, &opt_compact,
"Use compact display without per device statistics"),
#endif
#ifdef WANT_CPUMINE
OPT_WITH_ARG("--cpu-threads|-t",
force_nthreads_int, opt_show_intval, &opt_n_threads,
"Number of miner CPU threads"),
#endif
OPT_WITHOUT_ARG("--debug|-D",
enable_debug, &opt_debug,
@ -968,11 +925,6 @@ static struct opt_table opt_config_table[] = { @@ -968,11 +925,6 @@ static struct opt_table opt_config_table[] = {
OPT_WITHOUT_ARG("--disable-rejecting",
opt_set_bool, &opt_disable_pool,
"Automatically disable pools that continually reject shares"),
#if defined(WANT_CPUMINE) && (defined(HAVE_OPENCL) || defined(USE_FPGA))
OPT_WITHOUT_ARG("--enable-cpu|-C",
opt_set_bool, &opt_usecpu,
"Enable CPU mining with other mining (default: no CPU mining if other devices exist)"),
#endif
OPT_WITH_ARG("--expiry|-E",
set_int_0_to_9999, opt_show_intval, &opt_expiry,
"Upper bound on how many seconds after getting work we consider a share from it stale"),
@ -1375,9 +1327,6 @@ static char *opt_verusage_and_exit(const char *extra) @@ -1375,9 +1327,6 @@ static char *opt_verusage_and_exit(const char *extra)
#ifdef HAVE_OPENCL
"GPU "
#endif
#ifdef WANT_CPUMINE
"CPU "
#endif
#ifdef USE_BITFORCE
"bitforce "
#endif
@ -1905,7 +1854,6 @@ static int statusy; @@ -1905,7 +1854,6 @@ static int statusy;
#ifdef HAVE_OPENCL
struct cgpu_info gpus[MAX_GPUDEVICES]; /* Maximum number apparently possible */
#endif
struct cgpu_info *cpus;
#ifdef HAVE_CURSES
static inline void unlock_curses(void)
@ -2039,10 +1987,6 @@ static void curses_print_status(void) @@ -2039,10 +1987,6 @@ static void curses_print_status(void)
wattron(statuswin, A_BOLD);
mvwprintw(statuswin, 0, 0, " " PACKAGE " version " VERSION " - Started: %s", datestamp);
#ifdef WANT_CPUMINE
if (opt_n_threads)
wprintw(statuswin, " CPU Algo: %s", algo_names[opt_algo]);
#endif
wattroff(statuswin, A_BOLD);
mvwhline(statuswin, 1, 0, '-', 80);
mvwprintw(statuswin, 2, 0, " %s", statusline);
@ -4059,9 +4003,6 @@ void write_config(FILE *fcfg) @@ -4059,9 +4003,6 @@ void write_config(FILE *fcfg)
if (opt_reorder)
fprintf(fcfg, ",\n\"gpu-reorder\" : true");
#endif
#ifdef WANT_CPUMINE
fprintf(fcfg, ",\n\"algo\" : \"%s\"", algo_names[opt_algo]);
#endif
/* Simple bool and int options */
struct opt_table *opt;
@ -6382,10 +6323,6 @@ static void *watchdog_thread(void __maybe_unused *userdata) @@ -6382,10 +6323,6 @@ static void *watchdog_thread(void __maybe_unused *userdata)
if (thr->getwork || *denable == DEV_DISABLED)
continue;
#ifdef WANT_CPUMINE
if (cgpu->drv->drv_id == DRIVER_CPU)
continue;
#endif
if (cgpu->status != LIFE_WELL && (now.tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME)) {
if (cgpu->status != LIFE_INIT)
applog(LOG_ERR, "%s: Recovered, declaring WELL!", dev_str);
@ -6459,10 +6396,6 @@ void print_summary(void) @@ -6459,10 +6396,6 @@ void print_summary(void)
applog(LOG_WARNING, "Started at %s", datestamp);
if (total_pools == 1)
applog(LOG_WARNING, "Pool: %s", pools[0]->rpc_url);
#ifdef WANT_CPUMINE
if (opt_n_threads)
applog(LOG_WARNING, "CPU hasher algorithm used: %s", algo_names[opt_algo]);
#endif
applog(LOG_WARNING, "Runtime: %d hrs : %d mins : %d secs", hours, mins, secs);
displayed_hashes = total_mhashes_done / total_secs;
if (displayed_hashes < 1) {
@ -6542,9 +6475,6 @@ static void clean_up(void) @@ -6542,9 +6475,6 @@ static void clean_up(void)
if (!opt_realquiet && successful_connect)
print_summary();
if (opt_n_threads)
free(cpus);
curl_global_cleanup();
}
@ -6798,15 +6728,6 @@ void enable_curses(void) { @@ -6798,15 +6728,6 @@ void enable_curses(void) {
}
#endif
/* TODO: fix need a dummy CPU device_drv even if no support for CPU mining */
#ifndef WANT_CPUMINE
struct device_drv cpu_drv;
struct device_drv cpu_drv = {
.drv_id = DRIVER_CPU,
.name = "CPU",
};
#endif
#ifdef USE_BFLSC
extern struct device_drv bflsc_drv;
#endif
@ -7183,10 +7104,6 @@ int main(int argc, char *argv[]) @@ -7183,10 +7104,6 @@ int main(int argc, char *argv[])
sprintf(packagename, "%s %s", PACKAGE, VERSION);
#ifdef WANT_CPUMINE
init_max_name_len();
#endif
handler.sa_handler = &sighandler;
handler.sa_flags = 0;
sigemptyset(&handler.sa_mask);
@ -7202,15 +7119,6 @@ int main(int argc, char *argv[]) @@ -7202,15 +7119,6 @@ int main(int argc, char *argv[])
strcpy(cgminer_path, dirname(s));
free(s);
strcat(cgminer_path, "/");
#ifdef WANT_CPUMINE
// Hack to make cgminer silent when called recursively on WIN32
int skip_to_bench = 0;
#if defined(WIN32)
char buf[32];
if (GetEnvironmentVariable("CGMINER_BENCH_ALGO", buf, 16))
skip_to_bench = 1;
#endif // defined(WIN32)
#endif
devcursor = 8;
logstart = devcursor + 1;
@ -7295,51 +7203,6 @@ int main(int argc, char *argv[]) @@ -7295,51 +7203,6 @@ int main(int argc, char *argv[])
usb_initialise();
#endif
#ifdef WANT_CPUMINE
#ifdef USE_SCRYPT
if (opt_scrypt)
set_scrypt_algo(&opt_algo);
else
#endif
if (0 <= opt_bench_algo) {
double rate = bench_algo_stage3(opt_bench_algo);
if (!skip_to_bench)
printf("%.5f (%s)\n", rate, algo_names[opt_bench_algo]);
else {
// Write result to shared memory for parent
#if defined(WIN32)
char unique_name[64];
if (GetEnvironmentVariable("CGMINER_SHARED_MEM", unique_name, 32)) {
HANDLE map_handle = CreateFileMapping(
INVALID_HANDLE_VALUE, // use paging file
NULL, // default security attributes
PAGE_READWRITE, // read/write access
0, // size: high 32-bits
4096, // size: low 32-bits
unique_name // name of map object
);
if (NULL != map_handle) {
void *shared_mem = MapViewOfFile(
map_handle, // object to map view of
FILE_MAP_WRITE, // read/write access
0, // high offset: map from
0, // low offset: beginning
0 // default: map entire file
);
if (NULL != shared_mem)
CopyMemory(shared_mem, &rate, sizeof(rate));
(void)UnmapViewOfFile(shared_mem);
}
(void)CloseHandle(map_handle);
}
#endif
}
exit(0);
}
#endif
#ifdef HAVE_OPENCL
if (!opt_nogpu)
opencl_drv.drv_detect();
@ -7376,10 +7239,6 @@ int main(int argc, char *argv[]) @@ -7376,10 +7239,6 @@ int main(int argc, char *argv[])
ztex_drv.drv_detect();
#endif
#ifdef WANT_CPUMINE
cpu_drv.drv_detect();
#endif
if (devices_enabled == -1) {
applog(LOG_ERR, "Devices detected:");
for (i = 0; i < total_devices; ++i) {
@ -7400,12 +7259,8 @@ int main(int argc, char *argv[]) @@ -7400,12 +7259,8 @@ int main(int argc, char *argv[])
quit (1, "Command line options set a device that doesn't exist");
enable_device(devices[i]);
} else if (i < total_devices) {
if (opt_removedisabled) {
if (devices[i]->drv->drv_id == DRIVER_CPU)
--opt_n_threads;
} else {
if (!opt_removedisabled)
enable_device(devices[i]);
}
devices[i]->deven = DEV_DISABLED;
}
}
@ -7609,13 +7464,6 @@ begin_bench: @@ -7609,13 +7464,6 @@ begin_bench:
pause_dynamic_threads(i);
#endif
#ifdef WANT_CPUMINE
applog(LOG_INFO, "%d cpu miner threads started, "
"using SHA256 '%s' algorithm.",
opt_n_threads,
algo_names[opt_algo]);
#endif
cgtime(&total_tv_start);
cgtime(&total_tv_end);

69
configure.ac

@ -126,14 +126,6 @@ if test -n "$CGMINER_SDK"; then @@ -126,14 +126,6 @@ if test -n "$CGMINER_SDK"; then
LDFLAGS="-L$CGMINER_SDK/lib/$target $LDFLAGS"
fi
cpumining="no"
AC_ARG_ENABLE([cpumining],,[cpumining=$enableval] )
if test "x$cpumining" = xyes; then
AC_DEFINE_UNQUOTED([WANT_CPUMINE], [1], [Enable CPUMINING])
fi
AM_CONDITIONAL([HAS_CPUMINE], [test x$cpumining = xyes])
opencl="yes"
AC_ARG_ENABLE([opencl],
@ -329,54 +321,6 @@ else @@ -329,54 +321,6 @@ else
JANSSON_LIBS=-ljansson
fi
dnl Find YASM
has_yasm=false
AC_PATH_PROG([YASM],[yasm],[false])
if test "x$YASM" != "xfalse" ; then
AC_MSG_CHECKING([if yasm version is greater than 1.0.1])
yasmver=`"$YASM" --version | head -1 | cut -d\ -f2`
yamajor=`echo $yasmver | cut -d. -f1`
yaminor=`echo $yasmver | cut -d. -f2`
yamini=`echo $yasmver | cut -d. -f3`
if test "$yamajor" -ge "1" ; then
if test "$yamajor" -eq "1" ; then
if test "$yaminor" -ge "0" ; then
if test "$yaminor" -eq "0"; then
if test "$yamini" -ge "1"; then
has_yasm=true
fi
else
has_yasm=true
fi
fi
fi
else
has_yasm=false
fi
if test "x$has_yasm" = "xtrue" ; then
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
fi
fi
if test "x$has_yasm" = "xfalse" ; then
AC_MSG_NOTICE([yasm is required for the assembly algorithms. They will be skipped.])
else
if test "x$have_x86_64" = xtrue; then
if test "x$have_win32" = xtrue; then
YASM_FMT="win64"
else
YASM_FMT="elf64"
fi
elif test "x$have_win32" = xtrue; then
YASM_FMT="coff"
else
YASM_FMT="elf32"
fi
fi
AM_CONDITIONAL([HAS_YASM], [test x$has_yasm = xtrue])
if test "x$icarus" != xno; then
AC_ARG_WITH([libudev], [AC_HELP_STRING([--without-libudev], [Autodetect FPGAs using libudev (default enabled)])],
[libudev=$withval],
@ -498,15 +442,12 @@ AC_SUBST(WS2_LIBS) @@ -498,15 +442,12 @@ AC_SUBST(WS2_LIBS)
AC_SUBST(MM_LIBS)
AC_SUBST(MATH_LIBS)
AC_SUBST(UDEV_LIBS)
AC_SUBST(YASM_FMT)
AC_SUBST(ADL_CPPFLAGS)
AC_CONFIG_FILES([
Makefile
compat/Makefile
compat/jansson/Makefile
x86_64/Makefile
x86_32/Makefile
ccan/Makefile
lib/Makefile
])
@ -537,14 +478,14 @@ if test "x$opencl" != xno; then @@ -537,14 +478,14 @@ if test "x$opencl" != xno; then
else
echo " OpenCL...............: NOT FOUND. GPU mining support DISABLED"
if test "x$cpumining$bitforce$avalon$icarus$ztex$modminer$bflsc" = xnonononononono; then
if test "x$bitforce$avalon$icarus$ztex$modminer$bflsc" = xnonononononono; then
AC_MSG_ERROR([No mining configured in])
fi
echo " scrypt...............: Disabled (needs OpenCL)"
fi
else
echo " OpenCL...............: Detection overrided. GPU mining support DISABLED"
if test "x$cpumining$bitforce$icarus$avalon$ztex$modminer$bflsc" = xnonononononono; then
if test "x$bitforce$icarus$avalon$ztex$modminer$bflsc" = xnonononononono; then
AC_MSG_ERROR([No mining configured in])
fi
echo " scrypt...............: Disabled (needs OpenCL)"
@ -601,12 +542,6 @@ if test "x$icarus" != xno; then @@ -601,12 +542,6 @@ if test "x$icarus" != xno; then
echo " libudev.detection....: $libudev"
fi
if test "x$cpumining" = xyes; then
echo
echo " CPU Mining...........: Enabled"
echo " ASM.(for CPU mining).: $has_yasm"
fi
echo
echo "Compilation............: make (or gmake)"
echo " CPPFLAGS.............: $CPPFLAGS"

863
driver-cpu.c

@ -1,863 +0,0 @@ @@ -1,863 +0,0 @@
/*
* Copyright 2011-2012 Con Kolivas
* Copyright 2011-2012 Luke Dashjr
* Copyright 2010 Jeff Garzik
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version. See COPYING for more details.
*/
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifndef WIN32
#include <sys/wait.h>
#include <sys/resource.h>
#endif
#include <libgen.h>
#include "compat.h"
#include "miner.h"
#include "bench_block.h"
#include "driver-cpu.h"
#if defined(unix)
#include <errno.h>
#include <fcntl.h>
#endif
#if defined(__linux) && defined(cpu_set_t) /* Linux specific policy and affinity management */
#include <sched.h>
static inline void drop_policy(void)
{
struct sched_param param;
#ifdef SCHED_BATCH
#ifdef SCHED_IDLE
if (unlikely(sched_setscheduler(0, SCHED_IDLE, &param) == -1))
#endif
sched_setscheduler(0, SCHED_BATCH, &param);
#endif
}
static inline void affine_to_cpu(int id, int cpu)
{
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(cpu, &set);
sched_setaffinity(0, sizeof(&set), &set);
applog(LOG_INFO, "Binding cpu mining thread %d to cpu %d", id, cpu);
}
#else
static inline void drop_policy(void)
{
}
static inline void affine_to_cpu(int __maybe_unused id, int __maybe_unused cpu)
{
}
#endif
/* TODO: resolve externals */
extern char *set_int_range(const char *arg, int *i, int min, int max);
extern int dev_from_id(int thr_id);
/* chipset-optimized hash functions */
extern bool ScanHash_4WaySSE2(struct thr_info*, const unsigned char *pmidstate,
unsigned char *pdata, unsigned char *phash1, unsigned char *phash,
const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
extern bool ScanHash_altivec_4way(struct thr_info*, const unsigned char *pmidstate,
unsigned char *pdata,
unsigned char *phash1, unsigned char *phash,
const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
extern bool scanhash_via(struct thr_info*, const unsigned char *pmidstate,
unsigned char *pdata,
unsigned char *phash1, unsigned char *phash,
const unsigned char *target,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
extern bool scanhash_c(struct thr_info*, const unsigned char *midstate, unsigned char *data,
unsigned char *hash1, unsigned char *hash,
const unsigned char *target,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
extern bool scanhash_cryptopp(struct thr_info*, const unsigned char *midstate,unsigned char *data,
unsigned char *hash1, unsigned char *hash,
const unsigned char *target,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
extern bool scanhash_asm32(struct thr_info*, const unsigned char *midstate,unsigned char *data,
unsigned char *hash1, unsigned char *hash,
const unsigned char *target,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
extern bool scanhash_sse2_64(struct thr_info*, const unsigned char *pmidstate, unsigned char *pdata,
unsigned char *phash1, unsigned char *phash,
const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce,
uint32_t nonce);
extern bool scanhash_sse4_64(struct thr_info*, const unsigned char *pmidstate, unsigned char *pdata,
unsigned char *phash1, unsigned char *phash,
const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce,
uint32_t nonce);
extern bool scanhash_sse2_32(struct thr_info*, const unsigned char *pmidstate, unsigned char *pdata,
unsigned char *phash1, unsigned char *phash,
const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce,
uint32_t nonce);
extern bool scanhash_scrypt(struct thr_info *thr, int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
const unsigned char *ptarget,
uint32_t max_nonce, unsigned long *hashes_done);
#ifdef WANT_CPUMINE
/* Longest name in algo_names[]; used to pad benchmark log output. */
static size_t max_name_len = 0;
/* Scratch buffer of spaces (length max_name_len+16) for log alignment. */
static char *name_spaces_pad = NULL;
/* Human-readable name of each CPU hashing implementation, indexed by
 * enum sha256_algos.  Entries for kernels not compiled in stay NULL. */
const char *algo_names[] = {
	[ALGO_C] = "c",
#ifdef WANT_SSE2_4WAY
	[ALGO_4WAY] = "4way",
#endif
#ifdef WANT_VIA_PADLOCK
	[ALGO_VIA] = "via",
#endif
	[ALGO_CRYPTOPP] = "cryptopp",
#ifdef WANT_CRYPTOPP_ASM32
	[ALGO_CRYPTOPP_ASM32] = "cryptopp_asm32",
#endif
#ifdef WANT_X8632_SSE2
	[ALGO_SSE2_32] = "sse2_32",
#endif
#ifdef WANT_X8664_SSE2
	[ALGO_SSE2_64] = "sse2_64",
#endif
#ifdef WANT_X8664_SSE4
	[ALGO_SSE4_64] = "sse4_64",
#endif
#ifdef WANT_ALTIVEC_4WAY
	[ALGO_ALTIVEC_4WAY] = "altivec_4way",
#endif
#ifdef WANT_SCRYPT
	[ALGO_SCRYPT] = "scrypt",
#endif
};
/* Scanhash entry point of each implementation, indexed identically to
 * algo_names[] so the two tables can be used interchangeably. */
static const sha256_func sha256_funcs[] = {
	[ALGO_C] = (sha256_func)scanhash_c,
#ifdef WANT_SSE2_4WAY
	[ALGO_4WAY] = (sha256_func)ScanHash_4WaySSE2,
#endif
#ifdef WANT_ALTIVEC_4WAY
	[ALGO_ALTIVEC_4WAY] = (sha256_func) ScanHash_altivec_4way,
#endif
#ifdef WANT_VIA_PADLOCK
	[ALGO_VIA] = (sha256_func)scanhash_via,
#endif
	[ALGO_CRYPTOPP] = (sha256_func)scanhash_cryptopp,
#ifdef WANT_CRYPTOPP_ASM32
	[ALGO_CRYPTOPP_ASM32] = (sha256_func)scanhash_asm32,
#endif
#ifdef WANT_X8632_SSE2
	[ALGO_SSE2_32] = (sha256_func)scanhash_sse2_32,
#endif
#ifdef WANT_X8664_SSE2
	[ALGO_SSE2_64] = (sha256_func)scanhash_sse2_64,
#endif
#ifdef WANT_X8664_SSE4
	[ALGO_SSE4_64] = (sha256_func)scanhash_sse4_64,
#endif
#ifdef WANT_SCRYPT
	[ALGO_SCRYPT] = (sha256_func)scanhash_scrypt
#endif
};
#endif
#ifdef WANT_CPUMINE
/* Default algorithm: best kernel the compiler itself can target,
 * falling back to the portable C implementation. */
#if defined(WANT_X8664_SSE4) && defined(__SSE4_1__)
enum sha256_algos opt_algo = ALGO_SSE4_64;
#elif defined(WANT_X8664_SSE2) && defined(__SSE2__)
enum sha256_algos opt_algo = ALGO_SSE2_64;
#elif defined(WANT_X8632_SSE2) && defined(__SSE2__)
enum sha256_algos opt_algo = ALGO_SSE2_32;
#else
enum sha256_algos opt_algo = ALGO_C;
#endif
/* Whether CPU mining was explicitly requested on the command line. */
bool opt_usecpu = false;
/* Index of the CPU restart thread in control_thr[]. */
static int cpur_thr_id;
/* True once --threads forced a specific CPU thread count. */
static bool forced_n_threads;
#endif
#ifdef WANT_CPUMINE
// Algo benchmark, crash-prone, system independent stage
// Runs one scanhash pass of `algo` over a canned benchmark block and
// returns the measured rate in MH/s (nonces per elapsed microsecond),
// or -1.0 if no time elapsed.  "Crash-prone" because an unsupported
// instruction set will fault; callers isolate it in a child process
// (see bench_algo_stage2).
double bench_algo_stage3(
	enum sha256_algos algo
)
{
	// Use a random work block pulled from a pool
	static uint8_t bench_block[] = { CGMINER_BENCHMARK_BLOCK };
	struct work work __attribute__((aligned(128)));
	unsigned char hash1[64];
	// Copy only as much of the canned block as fits in struct work
	size_t bench_size = sizeof(work);
	size_t work_size = sizeof(bench_block);
	size_t min_size = (work_size < bench_size ? work_size : bench_size);
	memset(&work, 0, sizeof(work));
	memcpy(&work, &bench_block, min_size);
	struct thr_info dummy = {0};
	struct timeval end;
	struct timeval start;
	uint32_t max_nonce = (1<<22);
	uint32_t last_nonce = 0;
	// Fixed SHA-256 padding block for the second hashing stage
	hex2bin(hash1, "00000000000000000000000000000000000000000000000000000000000000000000008000000000000000000000000000000000000000000000000000010000", 64);
	gettimeofday(&start, 0);
	{
		sha256_func func = sha256_funcs[algo];
		(*func)(
			&dummy,
			work.midstate,
			work.data,
			hash1,
			work.hash,
			work.target,
			max_nonce,
			&last_nonce,
			work.blk.nonce
		);
	}
	gettimeofday(&end, 0);
	uint64_t usec_end = ((uint64_t)end.tv_sec)*1000*1000 + end.tv_usec;
	uint64_t usec_start = ((uint64_t)start.tv_sec)*1000*1000 + start.tv_usec;
	uint64_t usec_elapsed = usec_end - usec_start;
	double rate = -1.0;
	if (0<usec_elapsed) {
		// nonces per microsecond == millions of hashes per second
		rate = (1.0*(last_nonce+1))/usec_elapsed;
	}
	return rate;
}
#if defined(unix)
/* Toggle the O_NONBLOCK flag on a file descriptor.
 * yes != 0 enables non-blocking mode, yes == 0 disables it.
 * Exits the whole process if fcntl() fails. */
static void set_non_blocking(
	int fd,
	int yes
)
{
	int cur = fcntl(fd, F_GETFL, 0);

	if (cur < 0) {
		perror("fcntl(GET) failed");
		exit(1);
	}
	if (yes)
		cur |= O_NONBLOCK;
	else
		cur &= ~O_NONBLOCK;
	if (fcntl(fd, F_SETFL, cur) < 0) {
		perror("fcntl(SET) failed");
		exit(1);
	}
}
#endif // defined(unix)
// Algo benchmark, crash-safe, system-dependent stage
// Runs bench_algo_stage3(algo) in an isolated child (fork+pipe on unix,
// a debugged copy of ourselves plus shared memory on WIN32, inline
// elsewhere) so a crashing kernel cannot take down the main process.
// Returns the measured rate in MH/s, or a negative value when the
// benchmark crashed or produced no result.
static double bench_algo_stage2(
	enum sha256_algos algo
)
{
	// Here, the gig is to safely run a piece of code that potentially
	// crashes. Unfortunately, the Right Way (tm) to do this is rather
	// heavily platform dependent :(
	double rate = -1.23457;
#if defined(unix)
	// Make a pipe: [readFD, writeFD]
	int pfd[2];
	int r = pipe(pfd);
	if (r<0) {
		perror("pipe - failed to create pipe for --algo auto");
		exit(1);
	}
	// Make pipe non blocking
	set_non_blocking(pfd[0], 1);
	set_non_blocking(pfd[1], 1);
	// Don't allow a crashing child to kill the main process
	// NOTE(review): both calls ignore SIGPIPE; the second looks like a
	// copy/paste duplicate (SIGCHLD intended?) — preserved as-is.
	sighandler_t sr0 = signal(SIGPIPE, SIG_IGN);
	sighandler_t sr1 = signal(SIGPIPE, SIG_IGN);
	if (SIG_ERR==sr0 || SIG_ERR==sr1) {
		perror("signal - failed to edit signal mask for --algo auto");
		exit(1);
	}
	// Fork a child to do the actual benchmarking
	pid_t child_pid = fork();
	if (child_pid<0) {
		perror("fork - failed to create a child process for --algo auto");
		exit(1);
	}
	// Do the dangerous work in the child, knowing we might crash
	if (0==child_pid) {
		// TODO: some umask trickery to prevent coredumps
		// Benchmark this algorithm
		double r = bench_algo_stage3(algo);
		// We survived, send result to parent and bail
		int loop_count = 0;
		while (1) {
			ssize_t bytes_written = write(pfd[1], &r, sizeof(r));
			int try_again = (0==bytes_written || (bytes_written<0 && EAGAIN==errno));
			int success = (sizeof(r)==(size_t)bytes_written);
			if (success)
				break;
			if (!try_again) {
				perror("write - child failed to write benchmark result to pipe");
				exit(1);
			}
			if (5<loop_count) {
				applog(LOG_ERR, "child tried %d times to communicate with parent, giving up", loop_count);
				exit(1);
			}
			++loop_count;
			sleep(1);
		}
		exit(0);
	}
	// Parent waits for a result from child
	int loop_count = 0;
	while (1) {
		// Wait for child to die
		int status;
		int r = waitpid(child_pid, &status, WNOHANG);
		if ((child_pid==r) || (r<0 && ECHILD==errno)) {
			// Child died somehow. Grab result and bail
			double tmp;
			ssize_t bytes_read = read(pfd[0], &tmp, sizeof(tmp));
			if (sizeof(tmp)==(size_t)bytes_read)
				rate = tmp;
			break;
		} else if (r<0) {
			perror("bench_algo: waitpid failed. giving up.");
			exit(1);
		}
		// Give up on child after a ~60s
		if (60<loop_count) {
			kill(child_pid, SIGKILL);
			waitpid(child_pid, &status, 0);
			break;
		}
		// Wait a bit longer
		++loop_count;
		sleep(1);
	}
	// Close pipe
	r = close(pfd[0]);
	if (r<0) {
		perror("close - failed to close read end of pipe for --algo auto");
		exit(1);
	}
	r = close(pfd[1]);
	if (r<0) {
		// Fixed copy/paste bug: this closes the write end, and the old
		// message wrongly said "read end".
		perror("close - failed to close write end of pipe for --algo auto");
		exit(1);
	}
#elif defined(WIN32)
	// Get handle to current exe
	HINSTANCE module = GetModuleHandle(0);
	if (!module) {
		applog(LOG_ERR, "failed to retrieve module handle");
		exit(1);
	}
	// Create a unique name
	char unique_name[32];
	snprintf(
		unique_name,
		sizeof(unique_name)-1,
		"cgminer-%p",
		(void*)module
	);
	// Create and init a chunked of shared memory
	HANDLE map_handle = CreateFileMapping(
		INVALID_HANDLE_VALUE,   // use paging file
		NULL,                   // default security attributes
		PAGE_READWRITE,         // read/write access
		0,                      // size: high 32-bits
		4096,                   // size: low 32-bits
		unique_name             // name of map object
	);
	if (NULL==map_handle) {
		applog(LOG_ERR, "could not create shared memory");
		exit(1);
	}
	void *shared_mem = MapViewOfFile(
		map_handle,     // object to map view of
		FILE_MAP_WRITE, // read/write access
		0,              // high offset:  map from
		0,              // low offset:   beginning
		0               // default: map entire file
	);
	if (NULL==shared_mem) {
		applog(LOG_ERR, "could not map shared memory");
		exit(1);
	}
	SetEnvironmentVariable("CGMINER_SHARED_MEM", unique_name);
	CopyMemory(shared_mem, &rate, sizeof(rate));
	// Get path to current exe
	char cmd_line[256 + MAX_PATH];
	const size_t n = sizeof(cmd_line)-200;
	DWORD size = GetModuleFileName(module, cmd_line, n);
	if (0==size) {
		applog(LOG_ERR, "failed to retrieve module path");
		exit(1);
	}
	// Construct new command line based on that
	char *p = strlen(cmd_line) + cmd_line;
	sprintf(p, " --bench-algo %d", algo);
	SetEnvironmentVariable("CGMINER_BENCH_ALGO", "1");
	// Launch a debug copy of cgminer
	STARTUPINFO startup_info;
	PROCESS_INFORMATION process_info;
	ZeroMemory(&startup_info, sizeof(startup_info));
	ZeroMemory(&process_info, sizeof(process_info));
	startup_info.cb = sizeof(startup_info);
	BOOL ok = CreateProcess(
		NULL,                   // No module name (use command line)
		cmd_line,               // Command line
		NULL,                   // Process handle not inheritable
		NULL,                   // Thread handle not inheritable
		FALSE,                  // Set handle inheritance to FALSE
		DEBUG_ONLY_THIS_PROCESS,// We're going to debug the child
		NULL,                   // Use parent's environment block
		NULL,                   // Use parent's starting directory
		&startup_info,          // Pointer to STARTUPINFO structure
		&process_info           // Pointer to PROCESS_INFORMATION structure
	);
	if (!ok) {
		applog(LOG_ERR, "CreateProcess failed with error %d\n", GetLastError() );
		exit(1);
	}
	// Debug the child (only clean way to catch exceptions)
	while (1) {
		// Wait for child to do something
		DEBUG_EVENT debug_event;
		ZeroMemory(&debug_event, sizeof(debug_event));
		BOOL ok = WaitForDebugEvent(&debug_event, 60 * 1000);
		if (!ok)
			break;
		// Decide if event is "normal"
		int go_on =
			CREATE_PROCESS_DEBUG_EVENT== debug_event.dwDebugEventCode ||
			CREATE_THREAD_DEBUG_EVENT == debug_event.dwDebugEventCode ||
			EXIT_THREAD_DEBUG_EVENT   == debug_event.dwDebugEventCode ||
			EXCEPTION_DEBUG_EVENT     == debug_event.dwDebugEventCode ||
			LOAD_DLL_DEBUG_EVENT      == debug_event.dwDebugEventCode ||
			OUTPUT_DEBUG_STRING_EVENT == debug_event.dwDebugEventCode ||
			UNLOAD_DLL_DEBUG_EVENT    == debug_event.dwDebugEventCode;
		if (!go_on)
			break;
		// Some exceptions are also "normal", apparently.
		if (EXCEPTION_DEBUG_EVENT== debug_event.dwDebugEventCode) {
			int go_on =
				EXCEPTION_BREAKPOINT== debug_event.u.Exception.ExceptionRecord.ExceptionCode;
			if (!go_on)
				break;
		}
		// If nothing unexpected happened, let child proceed
		ContinueDebugEvent(
			debug_event.dwProcessId,
			debug_event.dwThreadId,
			DBG_CONTINUE
		);
	}
	// Clean up child process
	TerminateProcess(process_info.hProcess, 1);
	CloseHandle(process_info.hProcess);
	CloseHandle(process_info.hThread);
	// Reap return value and cleanup
	CopyMemory(&rate, shared_mem, sizeof(rate));
	(void)UnmapViewOfFile(shared_mem);
	(void)CloseHandle(map_handle);
#else
	// Not linux, not unix, not WIN32 ... do our best
	rate = bench_algo_stage3(algo);
#endif // defined(unix)
	// Done
	return rate;
}
// Benchmark one algorithm and record it in *best_rate/*best_algo if it
// beats the fastest implementation seen so far.  Logs the outcome with
// name padding so the columns line up.
static void bench_algo(
	double *best_rate,
	enum sha256_algos *best_algo,
	enum sha256_algos algo
)
{
	size_t pad_len = max_name_len - strlen(algo_names[algo]);
	double mhs;

	memset(name_spaces_pad, ' ', pad_len);
	name_spaces_pad[pad_len] = 0;

	applog(
		LOG_ERR,
		"\"%s\"%s : benchmarking algorithm ...",
		algo_names[algo],
		name_spaces_pad
	);

	mhs = bench_algo_stage2(algo);
	if (mhs < 0.0) {
		applog(
			LOG_ERR,
			"\"%s\"%s : algorithm fails on this platform",
			algo_names[algo],
			name_spaces_pad
		);
		return;
	}

	applog(
		LOG_ERR,
		"\"%s\"%s : algorithm runs at %.5f MH/s",
		algo_names[algo],
		name_spaces_pad,
		mhs
	);
	if (mhs > *best_rate) {
		*best_rate = mhs;
		*best_algo = algo;
	}
}
// Figure out the longest algorithm name
// Scans algo_names[] (skipping NULL slots for kernels not compiled in),
// records the longest length in max_name_len and allocates the padding
// buffer used to align benchmark log lines.  Exits on OOM.
void init_max_name_len(void)
{
	size_t i;
	size_t nb_names = sizeof(algo_names)/sizeof(algo_names[0]);
	for (i=0; i<nb_names; ++i) {
		const char *p = algo_names[i];
		size_t name_len = p ? strlen(p) : 0;
		if (max_name_len<name_len)
			max_name_len = name_len;
	}
	// Idiomatic C: no cast on malloc, pointer tested directly
	name_spaces_pad = malloc(max_name_len + 16);
	if (!name_spaces_pad) {
		perror("malloc failed");
		exit(1);
	}
}
// Pick the fastest CPU hasher
// Benchmarks every compiled-in implementation with bench_algo() and
// returns the enum value of the fastest one.
static enum sha256_algos pick_fastest_algo()
{
	double best_rate = -1.0;
	enum sha256_algos best_algo = 0;
	applog(LOG_ERR, "benchmarking all sha256 algorithms ...");
	// The portable C and Crypto++ kernels are always available; the
	// rest exist only when the matching WANT_* macro was defined.
	bench_algo(&best_rate, &best_algo, ALGO_C);
#if defined(WANT_SSE2_4WAY)
	bench_algo(&best_rate, &best_algo, ALGO_4WAY);
#endif
#if defined(WANT_VIA_PADLOCK)
	bench_algo(&best_rate, &best_algo, ALGO_VIA);
#endif
	bench_algo(&best_rate, &best_algo, ALGO_CRYPTOPP);
#if defined(WANT_CRYPTOPP_ASM32)
	bench_algo(&best_rate, &best_algo, ALGO_CRYPTOPP_ASM32);
#endif
#if defined(WANT_X8632_SSE2)
	bench_algo(&best_rate, &best_algo, ALGO_SSE2_32);
#endif
#if defined(WANT_X8664_SSE2)
	bench_algo(&best_rate, &best_algo, ALGO_SSE2_64);
#endif
#if defined(WANT_X8664_SSE4)
	bench_algo(&best_rate, &best_algo, ALGO_SSE4_64);
#endif
#if defined(WANT_ALTIVEC_4WAY)
	bench_algo(&best_rate, &best_algo, ALGO_ALTIVEC_4WAY);
#endif
	// Announce the winner, padded to line up with the per-algo output
	size_t n = max_name_len - strlen(algo_names[best_algo]);
	memset(name_spaces_pad, ' ', n);
	name_spaces_pad[n] = 0;
	applog(
		LOG_ERR,
		"\"%s\"%s : is fastest algorithm at %.5f MH/s",
		algo_names[best_algo],
		name_spaces_pad,
		best_rate
	);
	return best_algo;
}
/* FIXME: Use asprintf for better errors. */
char *set_algo(const char *arg, enum sha256_algos *algo)
{
enum sha256_algos i;
if (opt_scrypt)
return "Can only use scrypt algorithm";
if (!strcmp(arg, "auto")) {
*algo = pick_fastest_algo();
return NULL;
}
for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
if (algo_names[i] && !strcmp(arg, algo_names[i])) {
*algo = i;
return NULL;
}
}
return "Unknown algorithm";
}
#ifdef WANT_SCRYPT
/* Force the active algorithm to scrypt (used when --scrypt is given). */
void set_scrypt_algo(enum sha256_algos *algo)
{
	*algo = ALGO_SCRYPT;
}
#endif
/* Write the current algorithm's name into buf for --help/option display.
 * Uses snprintf instead of strncpy: strncpy does not NUL-terminate when
 * the source fills the buffer, whereas snprintf always terminates. */
void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo)
{
	snprintf(buf, OPT_SHOW_LEN, "%s", algo_names[*algo]);
}
#endif
#ifdef WANT_CPUMINE
/* Option parser for --threads: records that the user explicitly forced
 * a CPU thread count, then range-checks and stores it.  Returns NULL on
 * success or an error string from set_int_range(). */
char *force_nthreads_int(const char *arg, int *i)
{
	forced_n_threads = true;
	return set_int_range(arg, i, 0, 9999);
}
#endif
#ifdef WANT_CPUMINE
/* Detect logical CPUs, decide how many mining threads to run, and
 * register one cgpu_info per thread with the device core. */
static void cpu_detect()
{
	int i;
	// Reckon number of cores in the box
#if defined(WIN32)
	{
		DWORD_PTR system_am;
		DWORD_PTR process_am;
		BOOL ok = GetProcessAffinityMask(
			GetCurrentProcess(),
			&system_am,
			&process_am
		);
		if (!ok) {
			applog(LOG_ERR, "couldn't figure out number of processors :(");
			num_processors = 1;
		} else {
			/* Count set bits over the whole affinity mask.  The old
			 * code shifted a signed int (UB at bit 31) and ignored
			 * bits 32-63 of a 64-bit DWORD_PTR; shift an unsigned
			 * value of mask width instead. */
			size_t n = 8 * sizeof(process_am);
			num_processors = 0;
			while (n--)
				if (process_am & ((DWORD_PTR)1 << n))
					++num_processors;
		}
	}
#else
	num_processors = sysconf(_SC_NPROCESSORS_ONLN);
#endif /* !WIN32 */
	// Default thread count: none if other devices exist and CPU mining
	// wasn't requested, otherwise one thread per processor.
	if (opt_n_threads < 0 || !forced_n_threads) {
		if (total_devices && !opt_usecpu)
			opt_n_threads = 0;
		else
			opt_n_threads = num_processors;
	}
	if (num_processors < 1)
		return;
	cpus = calloc(opt_n_threads, sizeof(struct cgpu_info));
	if (unlikely(!cpus))
		quit(1, "Failed to calloc cpus");
	for (i = 0; i < opt_n_threads; ++i) {
		struct cgpu_info *cgpu;
		cgpu = &cpus[i];
		cgpu->drv = &cpu_drv;
		cgpu->deven = DEV_ENABLED;
		cgpu->threads = 1;
		cgpu->kname = algo_names[opt_algo];
		if (opt_scrypt)
			cgpu->drv->max_diff = 0xffffffff;
		add_cgpu(cgpu);
	}
}
/* Queue a CPU device on the restart thread's work queue so it gets
 * re-initialised asynchronously. */
static void reinit_cpu_device(struct cgpu_info *cpu)
{
	tq_push(control_thr[cpur_thr_id].q, cpu);
}
/* Driver thread-prepare hook: just report the thread in; nothing else
 * to set up before mining starts. */
static bool cpu_thread_prepare(struct thr_info *thr)
{
	thread_reportin(thr);
	return true;
}
/* Driver hook: maximum nonce range to hand a CPU thread per scanhash
 * call, keeping individual calls short so restarts stay responsive. */
static uint64_t cpu_can_limit_work(struct thr_info __maybe_unused *thr)
{
	return 0xffff;
}
/* Driver thread-init hook: lower the mining thread's priority and
 * optionally pin it to a CPU. */
static bool cpu_thread_init(struct thr_info *thr)
{
	const int thr_id = thr->id;
	/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
	 * and if that fails, then SCHED_BATCH. No need for this to be an
	 * error if it fails */
	setpriority(PRIO_PROCESS, 0, 19);
	drop_policy();
	/* Cpu affinity only makes sense if the number of threads is a multiple
	 * of the number of CPUs */
	if (!(opt_n_threads % num_processors))
		affine_to_cpu(dev_from_id(thr_id), dev_from_id(thr_id) % num_processors);
	return true;
}
/* Driver scanhash hook: run the selected sha256_func over the nonce
 * range starting at work->blk.nonce.  Submits every share found and
 * keeps scanning (via the CPUSearch label) until the kernel returns
 * without a hit.  Returns the number of nonces scanned, or 0 if the
 * kernel made no progress at all. */
static int64_t cpu_scanhash(struct thr_info *thr, struct work *work, int64_t max_nonce)
{
	const int thr_id = thr->id;
	unsigned char hash1[64];
	uint32_t first_nonce = work->blk.nonce;
	uint32_t last_nonce;
	bool rc;
	// Fixed SHA-256 padding block for the second hashing stage
	hex2bin(hash1, "00000000000000000000000000000000000000000000000000000000000000000000008000000000000000000000000000000000000000000000000000010000", 64);
CPUSearch:
	last_nonce = first_nonce;
	rc = false;
	/* scan nonces for a proof-of-work hash */
	{
		sha256_func func = sha256_funcs[opt_algo];
		rc = (*func)(
			thr,
			work->midstate,
			work->data,
			hash1,
			work->hash,
			work->target,
			max_nonce,
			&last_nonce,
			work->blk.nonce
		);
	}
	/* if nonce found, submit work */
	if (unlikely(rc)) {
		applog(LOG_DEBUG, "CPU %d found something?", dev_from_id(thr_id));
		submit_nonce(thr, work, last_nonce);
		work->blk.nonce = last_nonce + 1;
		goto CPUSearch;
	}
	else
	if (unlikely(last_nonce == first_nonce))
		return 0;
	work->blk.nonce = last_nonce + 1;
	return last_nonce - first_nonce + 1;
}
/* Driver vtable binding the CPU mining hooks into the device core. */
struct device_drv cpu_drv = {
	.drv_id = DRIVER_CPU,
	.dname = "cpu",
	.name = "CPU",
	.drv_detect = cpu_detect,
	.reinit_device = reinit_cpu_device,
	.thread_prepare = cpu_thread_prepare,
	.can_limit_work = cpu_can_limit_work,
	.thread_init = cpu_thread_init,
	.scanhash = cpu_scanhash,
};
#endif

65
driver-cpu.h

@@ -1,65 +0,0 @@
#ifndef __DEVICE_CPU_H__
#define __DEVICE_CPU_H__
#include "miner.h"
#include "config.h"
#include <stdbool.h>
#ifndef OPT_SHOW_LEN
#define OPT_SHOW_LEN 80
#endif
/* Compile-time selection of which CPU hashing kernels get built,
 * derived from compiler/architecture predefines and yasm availability. */
#ifdef __SSE2__
#define WANT_SSE2_4WAY 1
#endif
#ifdef __ALTIVEC__
#define WANT_ALTIVEC_4WAY 1
#endif
#if defined(__i386__) && defined(HAS_YASM) && defined(__SSE2__)
#define WANT_X8632_SSE2 1
#endif
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__APPLE__)
#define WANT_VIA_PADLOCK 1
#endif
#if defined(__x86_64__) && defined(HAS_YASM)
#define WANT_X8664_SSE2 1
#endif
#if defined(__x86_64__) && defined(HAS_YASM) && defined(__SSE4_1__)
#define WANT_X8664_SSE4 1
#endif
#ifdef USE_SCRYPT
#define WANT_SCRYPT
#endif
/* One value per hashing implementation; indexes both algo_names[] and
 * the sha256_funcs[] dispatch table. */
enum sha256_algos {
	ALGO_C,			/* plain C */
	ALGO_4WAY,		/* parallel SSE2 */
	ALGO_VIA,		/* VIA padlock */
	ALGO_CRYPTOPP,		/* Crypto++ (C) */
	ALGO_CRYPTOPP_ASM32,	/* Crypto++ 32-bit assembly */
	ALGO_SSE2_32,		/* SSE2 for x86_32 */
	ALGO_SSE2_64,		/* SSE2 for x86_64 */
	ALGO_SSE4_64,		/* SSE4 for x86_64 */
	ALGO_ALTIVEC_4WAY,	/* parallel Altivec */
	ALGO_SCRYPT,		/* scrypt */
};
extern const char *algo_names[];
extern bool opt_usecpu;
extern struct device_drv cpu_drv;
extern char *set_algo(const char *arg, enum sha256_algos *algo);
extern void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo);
extern char *force_nthreads_int(const char *arg, int *i);
extern void init_max_name_len();
extern double bench_algo_stage3(enum sha256_algos algo);
extern void set_scrypt_algo(enum sha256_algos *algo);
#endif /* __DEVICE_CPU_H__ */

5
miner.h

@ -210,7 +210,6 @@ enum drv_driver { @@ -210,7 +210,6 @@ enum drv_driver {
DRIVER_BITFORCE,
DRIVER_MODMINER,
DRIVER_ZTEX,
DRIVER_CPU,
DRIVER_BFLSC,
DRIVER_AVALON,
DRIVER_MAX
@ -927,7 +926,6 @@ extern bool hotplug_mode; @@ -927,7 +926,6 @@ extern bool hotplug_mode;
extern int hotplug_time;
extern struct list_head scan_devices;
extern int nDevs;
extern int opt_n_threads;
extern int num_processors;
extern int hw_errors;
extern bool use_syslog;
@ -943,13 +941,10 @@ extern bool opt_scrypt; @@ -943,13 +941,10 @@ extern bool opt_scrypt;
#endif
extern double total_secs;
extern int mining_threads;
extern struct cgpu_info *cpus;
extern int total_devices;
extern struct cgpu_info **devices;
extern int total_pools;
extern struct pool **pools;
extern const char *algo_names[];
extern enum sha256_algos opt_algo;
extern struct strategies strategies[];
extern enum pool_strategy pool_strategy;
extern int opt_rotate_period;

488
sha256_4way.c

@@ -1,488 +0,0 @@
// Copyright (c) 2010 Satoshi Nakamoto
// Distributed under the MIT/X11 software license, see the accompanying
// file license.txt or http://www.opensource.org/licenses/mit-license.php.
// tcatm's 4-way 128-bit SSE2 SHA-256
#include "driver-cpu.h"
#ifdef WANT_SSE2_4WAY
#include <string.h>
#include <assert.h>
#include <xmmintrin.h>
#include <stdint.h>
#include <stdio.h>
#define NPAR 32
static void DoubleBlockSHA256(const void* pin, void* pout, const void* pinit, unsigned int hash[8][NPAR], const void* init2);
/* The 64 SHA-256 round constants K[0..63] (FIPS 180-2), broadcast into
 * vector lanes by SHA256ROUND. */
static const unsigned int sha256_consts[] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /*  0 */
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /*  8 */
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* SHA-256 "choose": per bit, select from c where b is 1 and from d where
 * b is 0, across all four 32-bit lanes. */
static inline __m128i Ch(const __m128i b, const __m128i c, const __m128i d) {
	__m128i picked_c = _mm_and_si128(b, c);
	__m128i picked_d = _mm_andnot_si128(b, d);

	return _mm_xor_si128(picked_c, picked_d);
}
/* SHA-256 "majority": each output bit is the majority vote of the
 * corresponding bits of b, c and d, across all four lanes. */
static inline __m128i Maj(const __m128i b, const __m128i c, const __m128i d) {
	__m128i bc = _mm_and_si128(b, c);
	__m128i bd = _mm_and_si128(b, d);
	__m128i cd = _mm_and_si128(c, d);

	return _mm_xor_si128(bc, _mm_xor_si128(bd, cd));
}
/* Rotate each 32-bit lane right by n bits (valid for 0 < n < 32). */
static inline __m128i ROTR(__m128i x, const int n) {
	__m128i low_part = _mm_srli_epi32(x, n);
	__m128i wrapped = _mm_slli_epi32(x, 32 - n);

	return _mm_or_si128(low_part, wrapped);
}
/* Logical right shift of every 32-bit lane by n bits (zero fill). */
static inline __m128i SHR(__m128i v, const int n) {
	return _mm_srli_epi32(v, n);
}
/* SHA256 Functions */
/* The four FIPS 180-2 sigma functions, vectorised: big-sigma terms feed
 * the round function, small-sigma terms feed the message schedule. */
#define BIGSIGMA0_256(x)    (_mm_xor_si128(_mm_xor_si128(ROTR((x), 2),ROTR((x), 13)),ROTR((x), 22)))
#define BIGSIGMA1_256(x)    (_mm_xor_si128(_mm_xor_si128(ROTR((x), 6),ROTR((x), 11)),ROTR((x), 25)))
#define SIGMA0_256(x)       (_mm_xor_si128(_mm_xor_si128(ROTR((x), 7),ROTR((x), 18)), SHR((x), 3 )))
#define SIGMA1_256(x)       (_mm_xor_si128(_mm_xor_si128(ROTR((x),17),ROTR((x), 19)), SHR((x), 10)))
/* Extract 32-bit lane i (0 = lowest) from a vector via union punning. */
static inline unsigned int store32(const __m128i x, int i) {
	union { unsigned int lane[4]; __m128i vec; } u;

	u.vec = x;
	return u.lane[i];
}
/* Unpack the four 32-bit lanes of x, highest lane first:
 * *x0 = lane 3, *x1 = lane 2, *x2 = lane 1, *x3 = lane 0. */
static inline void store_epi32(const __m128i x, unsigned int *x0, unsigned int *x1, unsigned int *x2, unsigned int *x3) {
	union { unsigned int lane[4]; __m128i vec; } u;

	u.vec = x;
	*x0 = u.lane[3];
	*x1 = u.lane[2];
	*x2 = u.lane[1];
	*x3 = u.lane[0];
}
/* Four- and five-operand vector adds used by the message schedule. */
#define add4(x0, x1, x2, x3) _mm_add_epi32(_mm_add_epi32(x0, x1),_mm_add_epi32( x2,x3))
#define add5(x0, x1, x2, x3, x4) _mm_add_epi32(add4(x0, x1, x2, x3), x4)
/* One SHA-256 round for 4 lanes: T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + K[i] + w;
 * then d += T1 and h = T1 + BIGSIGMA0(a) + Maj(a,b,c).  The caller
 * rotates which registers are passed for a..h instead of shuffling. */
#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \
	T1 = add5(h, BIGSIGMA1_256(e), Ch(e, f, g), _mm_set1_epi32(sha256_consts[i]), w); \
	d = _mm_add_epi32(d, T1); \
	h = _mm_add_epi32(T1, _mm_add_epi32(BIGSIGMA0_256(a), Maj(a, b, c)));
/* Debug helper: print msg followed by the four lanes of a vector,
 * lowest lane first. */
static inline void dumpreg(__m128i x, char *msg) {
	union { unsigned int lane[4]; __m128i vec; } u;

	u.vec = x;
	printf("%s %08x %08x %08x %08x\n", msg, u.lane[0], u.lane[1], u.lane[2], u.lane[3]);
}
/* Debug macro: dump w0 plus the eight working registers for lane i. */
#if 1
#define dumpstate(i) printf("%s: %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", \
	__func__, store32(w0, i), store32(a, i), store32(b, i), store32(c, i), store32(d, i), store32(e, i), store32(f, i), store32(g, i), store32(h, i));
#else
#define dumpstate()
#endif
/* Standard SHA-256 initial hash values H0..H7 (FIPS 180-2). */
static const unsigned int pSHA256InitState[8] =
{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
/* Scan nonces NPAR at a time with the 4-way SSE2 double-SHA256 kernel.
 * Returns true (with *last_nonce and the nonce field inside pdata
 * updated) when a hash passes the full target test; returns false once
 * max_nonce is exceeded or a work restart is requested. */
bool ScanHash_4WaySSE2(struct thr_info*thr, const unsigned char *pmidstate,
	unsigned char *pdata,
	unsigned char *phash1, unsigned char *phash,
	const unsigned char *ptarget,
	uint32_t max_nonce, uint32_t *last_nonce,
	uint32_t nonce)
{
	// Nonce lives at byte offset 76 of the 80-byte block header
	unsigned int *nNonce_p = (unsigned int*)(pdata + 76);
	pdata += 64;
	for (;;)
	{
		unsigned int thash[9][NPAR] __attribute__((aligned(128)));
		int j;
		nonce += NPAR;
		*nNonce_p = nonce;
		DoubleBlockSHA256(pdata, phash1, pmidstate, thash, pSHA256InitState);
		for (j = 0; j < NPAR; j++)
		{
			// Cheap prefilter: candidate only if hash word 7 is zero
			if (unlikely(thash[7][j] == 0))
			{
				int i;
				for (i = 0; i < 32/4; i++)
					((unsigned int*)phash)[i] = thash[i][j];
				if (fulltest(phash, ptarget)) {
					nonce += j;
					*last_nonce = nonce;
					*nNonce_p = nonce;
					return true;
				}
			}
		}
		if ((nonce >= max_nonce) || thr->work_restart)
		{
			*last_nonce = nonce;
			return false;
		}
	}
}
/* Compute double SHA-256 for NPAR consecutive nonces, four lanes per
 * iteration.  pin is the second 64-byte chunk of the header (nonce at
 * In[3]), pad the fixed padding block, pre the midstate after the first
 * header chunk, init the standard initial state.  Results land in
 * thash[0..7][lane]; thash[8][lane] receives the nonce of that lane. */
static void DoubleBlockSHA256(const void* pin, void* pad, const void *pre, unsigned int thash[9][NPAR], const void *init)
{
	unsigned int* In = (unsigned int*)pin;
	unsigned int* Pad = (unsigned int*)pad;
	unsigned int* hPre = (unsigned int*)pre;
	unsigned int* hInit = (unsigned int*)init;
	unsigned int /* i, j, */ k;
	/* vectors used in calculation */
	__m128i w0, w1, w2, w3, w4, w5, w6, w7;
	__m128i w8, w9, w10, w11, w12, w13, w14, w15;
	__m128i T1;
	__m128i a, b, c, d, e, f, g, h;
	__m128i nonce, preNonce;
	/* nonce offset for vector */
	__m128i offset = _mm_set_epi32(0x00000003, 0x00000002, 0x00000001, 0x00000000);
	preNonce = _mm_add_epi32(_mm_set1_epi32(In[3]), offset);
	for(k = 0; k<NPAR; k+=4) {
		/* Load the 16 message words, broadcasting each into 4 lanes */
		w0 = _mm_set1_epi32(In[0]);
		w1 = _mm_set1_epi32(In[1]);
		w2 = _mm_set1_epi32(In[2]);
		//w3 = _mm_set1_epi32(In[3]); nonce will be later hacked into the hash
		w4 = _mm_set1_epi32(In[4]);
		w5 = _mm_set1_epi32(In[5]);
		w6 = _mm_set1_epi32(In[6]);
		w7 = _mm_set1_epi32(In[7]);
		w8 = _mm_set1_epi32(In[8]);
		w9 = _mm_set1_epi32(In[9]);
		w10 = _mm_set1_epi32(In[10]);
		w11 = _mm_set1_epi32(In[11]);
		w12 = _mm_set1_epi32(In[12]);
		w13 = _mm_set1_epi32(In[13]);
		w14 = _mm_set1_epi32(In[14]);
		w15 = _mm_set1_epi32(In[15]);
		/* hack nonce into lowest byte of w3 */
		nonce = _mm_add_epi32(preNonce, _mm_set1_epi32(k));
		w3 = nonce;
		/* First block: start from the midstate of the header's first chunk */
		a = _mm_set1_epi32(hPre[0]);
		b = _mm_set1_epi32(hPre[1]);
		c = _mm_set1_epi32(hPre[2]);
		d = _mm_set1_epi32(hPre[3]);
		e = _mm_set1_epi32(hPre[4]);
		f = _mm_set1_epi32(hPre[5]);
		g = _mm_set1_epi32(hPre[6]);
		h = _mm_set1_epi32(hPre[7]);
		SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0);
		SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1);
		SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2);
		SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3);
		SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4);
		SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5);
		SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6);
		SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7);
		SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8);
		SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9);
		SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10);
		SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11);
		SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12);
		SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13);
		SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14);
		SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15);
		/* Rounds 16-63: message schedule expansion interleaved with rounds */
		w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
		SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0);
		w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
		SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1);
		w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
		SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2);
		w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
		SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3);
		w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
		SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4);
		w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
		SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5);
		w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
		SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6);
		w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
		SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7);
		w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
		SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8);
		w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
		SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9);
		w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
		SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10);
		w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
		SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11);
		w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
		SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12);
		w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
		SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13);
		w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
		SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14);
		w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
		SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15);
		w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
		SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0);
		w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
		SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1);
		w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
		SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2);
		w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
		SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3);
		w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
		SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4);
		w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
		SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5);
		w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
		SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6);
		w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
		SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7);
		w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
		SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8);
		w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
		SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9);
		w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
		SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10);
		w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
		SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11);
		w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
		SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12);
		w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
		SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13);
		w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
		SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14);
		w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
		SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15);
		w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
		SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0);
		w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
		SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1);
		w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
		SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2);
		w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
		SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3);
		w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
		SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4);
		w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
		SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5);
		w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
		SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6);
		w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
		SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7);
		w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
		SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8);
		w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
		SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9);
		w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
		SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10);
		w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
		SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11);
		w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
		SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12);
		w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
		SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13);
		w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
		SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14);
		w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
		SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15);
		/* Feed-forward of the first hash becomes the first 8 words of
		 * the second block's message. */
#define store_load(x, i, dest) \
		T1 = _mm_set1_epi32((hPre)[i]); \
		dest = _mm_add_epi32(T1, x);
		store_load(a, 0, w0);
		store_load(b, 1, w1);
		store_load(c, 2, w2);
		store_load(d, 3, w3);
		store_load(e, 4, w4);
		store_load(f, 5, w5);
		store_load(g, 6, w6);
		store_load(h, 7, w7);
		/* Remaining message words come from the fixed padding block */
		w8 = _mm_set1_epi32(Pad[8]);
		w9 = _mm_set1_epi32(Pad[9]);
		w10 = _mm_set1_epi32(Pad[10]);
		w11 = _mm_set1_epi32(Pad[11]);
		w12 = _mm_set1_epi32(Pad[12]);
		w13 = _mm_set1_epi32(Pad[13]);
		w14 = _mm_set1_epi32(Pad[14]);
		w15 = _mm_set1_epi32(Pad[15]);
		/* Second block: hash the first digest from the standard IV */
		a = _mm_set1_epi32(hInit[0]);
		b = _mm_set1_epi32(hInit[1]);
		c = _mm_set1_epi32(hInit[2]);
		d = _mm_set1_epi32(hInit[3]);
		e = _mm_set1_epi32(hInit[4]);
		f = _mm_set1_epi32(hInit[5]);
		g = _mm_set1_epi32(hInit[6]);
		h = _mm_set1_epi32(hInit[7]);
		SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0);
		SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1);
		SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2);
		SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3);
		SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4);
		SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5);
		SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6);
		SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7);
		SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8);
		SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9);
		SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10);
		SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11);
		SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12);
		SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13);
		SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14);
		SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15);
		w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
		SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0);
		w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
		SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1);
		w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
		SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2);
		w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
		SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3);
		w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
		SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4);
		w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
		SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5);
		w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
		SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6);
		w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
		SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7);
		w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
		SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8);
		w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
		SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9);
		w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
		SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10);
		w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
		SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11);
		w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
		SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12);
		w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
		SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13);
		w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
		SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14);
		w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
		SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15);
		w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
		SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0);
		w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
		SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1);
		w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
		SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2);
		w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
		SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3);
		w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
		SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4);
		w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
		SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5);
		w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
		SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6);
		w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
		SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7);
		w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
		SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8);
		w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
		SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9);
		w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
		SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10);
		w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
		SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11);
		w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
		SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12);
		w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
		SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13);
		w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
		SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14);
		w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
		SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15);
		w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
		SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0);
		w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
		SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1);
		w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
		SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2);
		w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
		SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3);
		w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
		SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4);
		w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
		SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5);
		w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
		SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6);
		w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
		SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7);
		w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
		SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8);
		w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
		SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9);
		w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
		SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10);
		w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
		SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11);
		w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
		SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12);
		/* Skip last 3-rounds; not necessary for H==0 */
#if 0
		w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
		SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13);
		w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
		SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14);
		w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
		SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15);
#endif
		/* store resulsts directly in thash */
#define store_2(x,i)  \
		w0 = _mm_set1_epi32(hInit[i]); \
		*(__m128i *)&(thash)[i][0+k] = _mm_add_epi32(w0, x);
		store_2(a, 0);
		store_2(b, 1);
		store_2(c, 2);
		store_2(d, 3);
		store_2(e, 4);
		store_2(f, 5);
		store_2(g, 6);
		store_2(h, 7);
		/* row 8 remembers which nonce produced each lane's hash */
		*(__m128i *)&(thash)[8][0+k] = nonce;
	}
}
#endif /* WANT_SSE2_4WAY */

469
sha256_altivec_4way.c

@@ -1,469 +0,0 @@
// Copyright (c) 2010 Satoshi Nakamoto
// Copyright (c) 2011 Gilles Risch
// Distributed under the MIT/X11 software license, see the accompanying
// file license.txt or http://www.opensource.org/licenses/mit-license.php.
// 4-way 128-bit Altivec SHA-256,
// based on tcatm's 4-way 128-bit SSE2 SHA-256
//
#include "driver-cpu.h"
#ifdef WANT_ALTIVEC_4WAY
#include <string.h>
#include <assert.h>
//#include <altivec.h>
#include <stdint.h>
#include <stdio.h>
/* Number of nonces hashed per call; must be a multiple of 4 (SIMD width). */
#define NPAR 32
/* Forward declaration.  Parameter names and array dimensions now match the
 * definition below: thash has 9 rows — 8 hash words plus a row holding the
 * nonce of each lane.  (The first array dimension decays, so this is
 * type-compatible with the old declaration.) */
static void DoubleBlockSHA256(const void* pin, void* pad, const void* pre, unsigned int thash[9][NPAR], const void* init);
/* The 64 SHA-256 round constants K[0..63] (FIPS 180-2: the first 32 bits of
 * the fractional parts of the cube roots of the first 64 primes). */
static const unsigned int sha256_consts[] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /* 0 */
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /* 8 */
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* SHA-256 "choose": Ch(b,c,d) = (b & c) | (~b & d).  vec_sel picks each bit
 * from c where the selector b is 1 and from d where it is 0 — exactly Ch. */
static inline vector unsigned int Ch(const vector unsigned int b, const vector unsigned int c, const vector unsigned int d) {
return vec_sel(d,c,b);
}
/* SHA-256 "majority": bitwise majority of b, c, d.  With selector b^d,
 * vec_sel yields b where b and d agree (the majority is then b == d) and c
 * where they differ (c casts the deciding vote). */
static inline vector unsigned int Maj(const vector unsigned int b, const vector unsigned int c, const vector unsigned int d) {
return vec_sel(b,c, vec_xor(b,d));
}
/* RotateRight(x, n) := RotateLeft(x, 32-n) */
/* SHA256 Functions */
/* Altivec vec_rl rotates LEFT, so every rotr(x,n) below is written as
 * vec_rl(x, 32-n); vec_sr is a plain logical right shift. */
#define BIGSIGMA0_256(x) (vec_xor(vec_xor(vec_rl((x), (vector unsigned int)(32-2)),vec_rl((x), (vector unsigned int)(32-13))),vec_rl((x), (vector unsigned int)(32-22))))
#define BIGSIGMA1_256(x) (vec_xor(vec_xor(vec_rl((x), (vector unsigned int)(32-6)),vec_rl((x), (vector unsigned int)(32-11))),vec_rl((x), (vector unsigned int)(32-25))))
#define SIGMA0_256(x) (vec_xor(vec_xor(vec_rl((x), (vector unsigned int)(32- 7)),vec_rl((x), (vector unsigned int)(32-18))), vec_sr((x), (vector unsigned int)(3 ))))
#define SIGMA1_256(x) (vec_xor(vec_xor(vec_rl((x), (vector unsigned int)(32-17)),vec_rl((x), (vector unsigned int)(32-19))), vec_sr((x), (vector unsigned int)(10))))
/* 4- and 5-operand vector additions, built from pairwise vec_add */
#define add4(x0, x1, x2, x3) vec_add(vec_add(x0, x1),vec_add( x2,x3))
#define add5(x0, x1, x2, x3, x4) vec_add(add4(x0, x1, x2, x3), x4)
/* One SHA-256 round across 4 lanes.  T1 = h + BigSigma1(e) + Ch(e,f,g) + K[i]
 * + w; then d += T1 and h = T1 + BigSigma0(a) + Maj(a,b,c).  The usual
 * register rotation (h<-g<-f...) is done by the CALLER permuting the macro
 * arguments each round, so no data moves are needed here. */
#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \
T1 = add5(h, BIGSIGMA1_256(e), Ch(e, f, g), (vector unsigned int)(sha256_consts[i],sha256_consts[i],sha256_consts[i],sha256_consts[i]), w); \
d = vec_add(d, T1); \
h = vec_add(T1, vec_add(BIGSIGMA0_256(a), Maj(a, b, c)));
/* SHA-256 initial hash value H(0) (FIPS 180-2); used to seed the second
 * transform of the double-SHA256. */
static const unsigned int pSHA256InitState[8] =
{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
/* Scan nonces with the 4-way Altivec double-SHA256 kernel.
 * pmidstate: SHA-256 midstate over the first 64 bytes of the header;
 * pdata: the 80-byte (padded) block header; phash1/phash: scratch and output
 * hash buffers; ptarget: share target for fulltest().
 * Returns true when a candidate hash (top word zero and passing fulltest)
 * is found, with *last_nonce and the header nonce set to the winning nonce;
 * returns false (with *last_nonce updated) when max_nonce is reached or a
 * work restart is requested. */
bool ScanHash_altivec_4way(struct thr_info*thr, const unsigned char *pmidstate,
unsigned char *pdata,
unsigned char *phash1, unsigned char *phash,
const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce,
uint32_t nonce)
{
/* the nonce lives at byte offset 76 of the 80-byte header */
unsigned int *nNonce_p = (unsigned int*)(pdata + 76);
/* only the second 64-byte block is hashed here; the midstate covers the first */
pdata += 64;
for (;;)
{
/* rows 0..7: hash words for each of the NPAR lanes; row 8: the lane's nonce */
unsigned int thash[9][NPAR] __attribute__((aligned(128)));
int j;
/* the nonce must be written into the header BEFORE hashing */
*nNonce_p = nonce;
DoubleBlockSHA256(pdata, phash1, pmidstate, thash, pSHA256InitState);
for (j = 0; j < NPAR; j++)
{
/* cheap pre-filter: a share always has H[7] == 0 */
if (unlikely(thash[7][j] == 0))
{
int i;
for (i = 0; i < 32/4; i++)
((unsigned int*)phash)[i] = thash[i][j];
if (fulltest(phash, ptarget)) {
/* lane j hashed base-nonce + j */
nonce += j;
*last_nonce = nonce;
*nNonce_p = nonce;
return true;
}
}
}
if ((nonce >= max_nonce) || thr->work_restart)
{
*last_nonce = nonce;
return false;
}
nonce += NPAR;
}
}
/* Compute double-SHA256 for NPAR consecutive nonces, four at a time in
 * Altivec lanes.
 * pin: the second 64-byte header block (16 words, host order); pad: the
 * padding block for the second transform (words 8..15 are read from it);
 * pre: the 256-bit midstate of the first header block; init: SHA-256 H(0).
 * Results land in thash rows 0..7 (one hash word per row, one lane per
 * column); row 8 records the nonce each column hashed.  The last three
 * rounds of the second transform are skipped because only H[7] == 0
 * candidates matter to the caller. */
static void DoubleBlockSHA256(const void* pin, void* pad, const void *pre, unsigned int thash[9][NPAR], const void *init)
{
unsigned int* In = (unsigned int*)pin;
unsigned int* Pad = (unsigned int*)pad;
unsigned int* hPre = (unsigned int*)pre;
unsigned int* hInit = (unsigned int*)init;
unsigned int /* i, j, */ k;
/* vectors used in calculation */
vector unsigned int w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
vector unsigned int T1;
vector unsigned int a, b, c, d, e, f, g, h;
vector unsigned int nonce, preNonce;
/* nonce offset for vector */
vector unsigned int offset = (vector unsigned int)(0, 1, 2, 3);
preNonce = vec_add((vector unsigned int)(In[3],In[3],In[3],In[3]), offset);
/* each pass hashes four consecutive nonces in the four SIMD lanes */
for(k = 0; k<NPAR; k+=4)
{
/* broadcast the 16 message words of the block into all four lanes */
w0 = (vector unsigned int)(In[0],In[0],In[0],In[0]);
w1 = (vector unsigned int)(In[1],In[1],In[1],In[1]);
w2 = (vector unsigned int)(In[2],In[2],In[2],In[2]);
//w3 = (vector unsigned int)(In[3],In[3],In[3],In[3]); nonce will be later hacked into the hash
w4 = (vector unsigned int)(In[4],In[4],In[4],In[4]);
w5 = (vector unsigned int)(In[5],In[5],In[5],In[5]);
w6 = (vector unsigned int)(In[6],In[6],In[6],In[6]);
w7 = (vector unsigned int)(In[7],In[7],In[7],In[7]);
w8 = (vector unsigned int)(In[8],In[8],In[8],In[8]);
w9 = (vector unsigned int)(In[9],In[9],In[9],In[9]);
w10 = (vector unsigned int)(In[10],In[10],In[10],In[10]);
w11 = (vector unsigned int)(In[11],In[11],In[11],In[11]);
w12 = (vector unsigned int)(In[12],In[12],In[12],In[12]);
w13 = (vector unsigned int)(In[13],In[13],In[13],In[13]);
w14 = (vector unsigned int)(In[14],In[14],In[14],In[14]);
w15 = (vector unsigned int)(In[15],In[15],In[15],In[15]);
/* hack nonce into lowest byte of w3 */
nonce = vec_add(preNonce, (vector unsigned int)(k,k,k,k));
w3 = nonce;
//printf ("W3: %08vlx\n", w3);
/* load the 256-bit midstate (hash of the first 64-byte block) */
a = (vector unsigned int)(hPre[0],hPre[0],hPre[0],hPre[0]);
b = (vector unsigned int)(hPre[1],hPre[1],hPre[1],hPre[1]);
c = (vector unsigned int)(hPre[2],hPre[2],hPre[2],hPre[2]);
d = (vector unsigned int)(hPre[3],hPre[3],hPre[3],hPre[3]);
e = (vector unsigned int)(hPre[4],hPre[4],hPre[4],hPre[4]);
f = (vector unsigned int)(hPre[5],hPre[5],hPre[5],hPre[5]);
g = (vector unsigned int)(hPre[6],hPre[6],hPre[6],hPre[6]);
h = (vector unsigned int)(hPre[7],hPre[7],hPre[7],hPre[7]);
/* first transform: all 64 rounds, hand-unrolled; the argument permutation
 * of SHA256ROUND performs the a..h register rotation */
SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0);
SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1);
SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2);
SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3);
SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4);
SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5);
SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6);
SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7);
SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8);
SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9);
SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10);
SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11);
SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12);
SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13);
SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14);
SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15);
/* rounds 16..63: extend the message schedule in a 16-word ring buffer */
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0);
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1);
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2);
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3);
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4);
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5);
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6);
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7);
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8);
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9);
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10);
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11);
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12);
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13);
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14);
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15);
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0);
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1);
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2);
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3);
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4);
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5);
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6);
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7);
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8);
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9);
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10);
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11);
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12);
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13);
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14);
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15);
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0);
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1);
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2);
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3);
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4);
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5);
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6);
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7);
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8);
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9);
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10);
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11);
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12);
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13);
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14);
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15);
/* feed-forward: add the midstate back in, leaving the first digest in
 * w0..w7 as the first 8 message words of the second transform */
#define store_load(x, i, dest) \
T1 = (vector unsigned int)((hPre)[i],(hPre)[i],(hPre)[i],(hPre)[i]); \
dest = vec_add(T1, x);
store_load(a, 0, w0);
store_load(b, 1, w1);
store_load(c, 2, w2);
store_load(d, 3, w3);
store_load(e, 4, w4);
store_load(f, 5, w5);
store_load(g, 6, w6);
store_load(h, 7, w7);
/* end of first SHA256 round */
/* words 8..15 of the second block come from the fixed padding */
w8 = (vector unsigned int)(Pad[8],Pad[8],Pad[8],Pad[8]);
w9 = (vector unsigned int)(Pad[9],Pad[9],Pad[9],Pad[9]);
w10 = (vector unsigned int)(Pad[10],Pad[10],Pad[10],Pad[10]);
w11 = (vector unsigned int)(Pad[11],Pad[11],Pad[11],Pad[11]);
w12 = (vector unsigned int)(Pad[12],Pad[12],Pad[12],Pad[12]);
w13 = (vector unsigned int)(Pad[13],Pad[13],Pad[13],Pad[13]);
w14 = (vector unsigned int)(Pad[14],Pad[14],Pad[14],Pad[14]);
w15 = (vector unsigned int)(Pad[15],Pad[15],Pad[15],Pad[15]);
/* second transform starts from the standard initial hash value */
a = (vector unsigned int)(hInit[0],hInit[0],hInit[0],hInit[0]);
b = (vector unsigned int)(hInit[1],hInit[1],hInit[1],hInit[1]);
c = (vector unsigned int)(hInit[2],hInit[2],hInit[2],hInit[2]);
d = (vector unsigned int)(hInit[3],hInit[3],hInit[3],hInit[3]);
e = (vector unsigned int)(hInit[4],hInit[4],hInit[4],hInit[4]);
f = (vector unsigned int)(hInit[5],hInit[5],hInit[5],hInit[5]);
g = (vector unsigned int)(hInit[6],hInit[6],hInit[6],hInit[6]);
h = (vector unsigned int)(hInit[7],hInit[7],hInit[7],hInit[7]);
SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0);
SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1);
SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2);
SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3);
SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4);
SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5);
SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6);
SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7);
SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8);
SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9);
SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10);
SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11);
SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12);
SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13);
SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14);
SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15);
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0);
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1);
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2);
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3);
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4);
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5);
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6);
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7);
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8);
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9);
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10);
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11);
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12);
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13);
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14);
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15);
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0);
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1);
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2);
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3);
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4);
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5);
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6);
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7);
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8);
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9);
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10);
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11);
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12);
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13);
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14);
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15);
w0 = add4(SIGMA1_256(w14), w9, SIGMA0_256(w1), w0);
SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0);
w1 = add4(SIGMA1_256(w15), w10, SIGMA0_256(w2), w1);
SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1);
w2 = add4(SIGMA1_256(w0), w11, SIGMA0_256(w3), w2);
SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2);
w3 = add4(SIGMA1_256(w1), w12, SIGMA0_256(w4), w3);
SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3);
w4 = add4(SIGMA1_256(w2), w13, SIGMA0_256(w5), w4);
SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4);
w5 = add4(SIGMA1_256(w3), w14, SIGMA0_256(w6), w5);
SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5);
w6 = add4(SIGMA1_256(w4), w15, SIGMA0_256(w7), w6);
SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6);
w7 = add4(SIGMA1_256(w5), w0, SIGMA0_256(w8), w7);
SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7);
w8 = add4(SIGMA1_256(w6), w1, SIGMA0_256(w9), w8);
SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8);
w9 = add4(SIGMA1_256(w7), w2, SIGMA0_256(w10), w9);
SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9);
w10 = add4(SIGMA1_256(w8), w3, SIGMA0_256(w11), w10);
SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10);
w11 = add4(SIGMA1_256(w9), w4, SIGMA0_256(w12), w11);
SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11);
w12 = add4(SIGMA1_256(w10), w5, SIGMA0_256(w13), w12);
SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12);
/* Skip last 3-rounds; not necessary for H==0 */
/*#if 0
w13 = add4(SIGMA1_256(w11), w6, SIGMA0_256(w14), w13);
SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13);
w14 = add4(SIGMA1_256(w12), w7, SIGMA0_256(w15), w14);
SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14);
w15 = add4(SIGMA1_256(w13), w8, SIGMA0_256(w0), w15);
SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15);
#endif*/
/* store resulsts directly in thash */
#define store_2(x,i) \
w0 = (vector unsigned int)(hInit[i],hInit[i],hInit[i],hInit[i]); \
vec_st(vec_add(w0, x), 0 ,&thash[i][k]);
store_2(a, 0);
store_2(b, 1);
store_2(c, 2);
store_2(d, 3);
store_2(e, 4);
store_2(f, 5);
store_2(g, 6);
store_2(h, 7);
vec_st(nonce, 0 ,&thash[8][k]);
/* writing the results into the array is time intensive */
/* -> try if it´s faster to compare the results with the target inside this function */
}
}
#endif /* WANT_ALTIVEC_4WAY */

609
sha256_cryptopp.c

@@ -1,609 +0,0 @@
#include "config.h"
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include "miner.h"
typedef uint32_t word32;

/* Rotate a 32-bit word right by n bits.  n must be in 1..31: a shift count
 * of 0 or 32 would make one of the component shifts undefined behaviour. */
static uint32_t rotrFixed(uint32_t x, unsigned int n)
{
	return (x << (32 - n)) | (x >> n);
}
/* Rounds 0..15 read the message words directly (caching them in W). */
#define blk0(i) (W[i] = data[i])
/* The 64 SHA-256 round constants (FIPS 180-2). */
static const word32 SHA256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Rounds 16..63: extend the message schedule in-place in the 16-word ring
 * buffer W (W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]). */
#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
/* Ch and Maj in their 3-operation forms */
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) (y^((x^y)&(y^z)))
/* Working variables a..h live in T[]; the (n-i)&7 indexing rotates the
 * register names each round instead of moving data. */
#define a(i) T[(0-i)&7]
#define b(i) T[(1-i)&7]
#define c(i) T[(2-i)&7]
#define d(i) T[(3-i)&7]
#define e(i) T[(4-i)&7]
#define f(i) T[(5-i)&7]
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]
/* One SHA-256 round; relies on an enclosing loop variable j that is a
 * multiple of 16 (j==0 selects blk0, otherwise blk2). */
#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
// for SHA256
/* big Sigma (round) and small sigma (schedule) functions */
#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
#define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25))
#define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3))
#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10))
/* One SHA-256 compression of a single 64-byte block already laid out as 16
 * host-order words in data[0..15]; updates state[0..7] in place.  Uses the
 * R()/blk0/blk2 round macros and the rotating a(i)..h(i) register scheme
 * defined above. */
static void SHA256_Transform(word32 *state, const word32 *data)
{
	word32 W[16] = {0};	/* {0} instead of the GNU-only "{ }" empty initializer */
	word32 T[8];
	unsigned int j;

	/* Copy context->state[] to working vars */
	memcpy(T, state, sizeof(T));
	/* 64 operations, partially loop unrolled */
	for (j = 0; j < 64; j += 16)
	{
		R( 0); R( 1); R( 2); R( 3);
		R( 4); R( 5); R( 6); R( 7);
		R( 8); R( 9); R(10); R(11);
		R(12); R(13); R(14); R(15);
	}
	/* Add the working vars back into context.state[] */
	state[0] += a(0);
	state[1] += b(0);
	state[2] += c(0);
	state[3] += d(0);
	state[4] += e(0);
	state[5] += f(0);
	state[6] += g(0);
	state[7] += h(0);
}
static void runhash(void *state, const void *input, const void *init)
{
memcpy(state, init, 32);
SHA256_Transform(state, input);
}
/* suspiciously similar to ScanHash* from bitcoin */
/* suspiciously similar to ScanHash* from bitcoin */
/* Scan nonces with the portable C double-SHA256.
 * midstate: hash of the first 64 header bytes; data: the 80-byte header;
 * hash1/hash: scratch and output digests; target: share target.
 * Returns true with *last_nonce set when a hash passes fulltest(); false
 * (with *last_nonce updated) on reaching max_nonce or a work restart. */
bool scanhash_cryptopp(struct thr_info*thr, const unsigned char *midstate,
unsigned char *data,
unsigned char *hash1, unsigned char *hash,
const unsigned char *target,
uint32_t max_nonce, uint32_t *last_nonce,
uint32_t n)
{
uint32_t *hash32 = (uint32_t *) hash;
/* nonce field sits at byte offset 76 of the header */
uint32_t *nonce = (uint32_t *)(data + 76);
/* only the second 64-byte block is hashed; midstate covers the first */
data += 64;
while (1) {
n++;
/* the nonce must be stored into the header before hashing */
*nonce = n;
runhash(hash1, data, midstate);
runhash(hash, hash1, sha256_init_state);
/* cheap pre-filter (top hash word zero) before the full target test */
if (unlikely((hash32[7] == 0) && fulltest(hash, target))) {
*last_nonce = n;
return true;
}
if ((n >= max_nonce) || thr->work_restart) {
*last_nonce = n;
return false;
}
}
}
/* Crypto++-derived portability layer for the hand-written SHA-256 core
 * below: the AS*()/ASL()/ASJ() macros emit the same assembly text for MASM,
 * MSVC inline asm, or GNU extended asm, and the AS_REG_*/WORD_* macros
 * abstract the x86 vs x86-64 register set and word size. */
#if defined(WANT_CRYPTOPP_ASM32)
#define CRYPTOPP_FASTCALL
/* This build is fixed to 32-bit x86 without the SSE2 asm path. */
#define CRYPTOPP_BOOL_X86 1
#define CRYPTOPP_BOOL_X64 0
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define AS1(x) x*newline*
#define AS2(x, y) x, y*newline*
#define AS3(x, y, z) x, y, z*newline*
#define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline*
#define ASL(x) label##x:*newline*
#define ASJ(x, y, z) x label##y*newline*
#define ASC(x, y) x label##y*newline*
#define AS_HEX(y) 0##y##h
#elif defined(_MSC_VER) || defined(__BORLANDC__)
#define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y}
#define AS3(x, y, z) __asm {x, y, z}
#define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)}
#define ASL(x) __asm {label##x:}
#define ASJ(x, y, z) __asm {x label##y}
#define ASC(x, y) __asm {x label##y}
#define CRYPTOPP_NAKED __declspec(naked)
#define AS_HEX(y) 0x##y
#else
/* GNU path: each AS*() stringizes its operands into a fragment of one big
 * asm() string literal. */
#define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
// define these in two steps to allow arguments to be expanded
#define GNU_AS1(x) #x ";"
#define GNU_AS2(x, y) #x ", " #y ";"
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
#define GNU_ASL(x) "\n" #x ":"
#define GNU_ASJ(x, y, z) #x " " #y #z ";"
#define AS1(x) GNU_AS1(x)
#define AS2(x, y) GNU_AS2(x, y)
#define AS3(x, y, z) GNU_AS3(x, y, z)
#define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
#define ASL(x) GNU_ASL(x)
#define ASJ(x, y, z) GNU_ASJ(x, y, z)
#define ASC(x, y) #x " " #y ";"
#define CRYPTOPP_NAKED
#define AS_HEX(y) 0x##y
#endif
#define IF0(y)
#define IF1(y) y
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define ASM_MOD(x, y) ((x) MOD (y))
#define XMMWORD_PTR XMMWORD PTR
#else
// GNU assembler doesn't seem to have mod operator
#define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
// GAS 2.15 doesn't support XMMWORD PTR. it seems necessary only for MASM
#define XMMWORD_PTR
#endif
#if CRYPTOPP_BOOL_X86
/* 32-bit register set: AS_REG_n and their "d" (dword) aliases coincide. */
#define AS_REG_1 ecx
#define AS_REG_2 edx
#define AS_REG_3 esi
#define AS_REG_4 edi
#define AS_REG_5 eax
#define AS_REG_6 ebx
#define AS_REG_7 ebp
#define AS_REG_1d ecx
#define AS_REG_2d edx
#define AS_REG_3d esi
#define AS_REG_4d edi
#define AS_REG_5d eax
#define AS_REG_6d ebx
#define AS_REG_7d ebp
#define WORD_SZ 4
#define WORD_REG(x) e##x
#define WORD_PTR DWORD PTR
#define AS_PUSH_IF86(x) AS1(push e##x)
#define AS_POP_IF86(x) AS1(pop e##x)
#define AS_JCXZ jecxz
#elif CRYPTOPP_BOOL_X64
/* 64-bit register sets differ between the Microsoft and System V calling
 * conventions, hence the two mappings. */
#ifdef CRYPTOPP_GENERATE_X64_MASM
#define AS_REG_1 rcx
#define AS_REG_2 rdx
#define AS_REG_3 r8
#define AS_REG_4 r9
#define AS_REG_5 rax
#define AS_REG_6 r10
#define AS_REG_7 r11
#define AS_REG_1d ecx
#define AS_REG_2d edx
#define AS_REG_3d r8d
#define AS_REG_4d r9d
#define AS_REG_5d eax
#define AS_REG_6d r10d
#define AS_REG_7d r11d
#else
#define AS_REG_1 rdi
#define AS_REG_2 rsi
#define AS_REG_3 rdx
#define AS_REG_4 rcx
#define AS_REG_5 r8
#define AS_REG_6 r9
#define AS_REG_7 r10
#define AS_REG_1d edi
#define AS_REG_2d esi
#define AS_REG_3d edx
#define AS_REG_4d ecx
#define AS_REG_5d r8d
#define AS_REG_6d r9d
#define AS_REG_7d r10d
#endif
#define WORD_SZ 8
#define WORD_REG(x) r##x
#define WORD_PTR QWORD PTR
#define AS_PUSH_IF86(x)
#define AS_POP_IF86(x)
#define AS_JCXZ jrcxz
#endif
/* Crypto++-derived x86 assembly SHA-256 core.  Compresses len bytes of
 * message data (whole 64-byte blocks, words already byte-swapped to big
 * endian) into state[0..7].  The working state and schedule live in a
 * stack frame addressed via the H()/Wt() macros below; the SSE2 load/store
 * path is compiled out here (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE == 0). */
static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len
#if defined(_MSC_VER) && (_MSC_VER == 1200)
, ... // VC60 workaround: prevent VC 6 from inlining this function
#endif
)
{
#if defined(_MSC_VER) && (_MSC_VER == 1200)
AS2(mov ecx, [state])
AS2(mov edx, [data])
#endif
/* Stack-frame layout: 8 state words, 16 schedule words, 4 saved pointers.
 * H(i)..A(i) and Wt(i) use ASM_MOD to rotate register/schedule names per
 * round instead of moving data. */
#define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ
#define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4]
#define G(i) H(i+1)
#define F(i) H(i+2)
#define E(i) H(i+3)
#define D(i) H(i+4)
#define C(i) H(i+5)
#define B(i) H(i+6)
#define A(i) H(i+7)
#define Wt(i) BASE+8*4+ASM_MOD(1024+15-(i),16)*4
#define Wt_2(i) Wt((i)-2)
#define Wt_15(i) Wt((i)-15)
#define Wt_7(i) Wt((i)-7)
#define K_END [BASE+8*4+16*4+0*WORD_SZ]
#define STATE_SAVE [BASE+8*4+16*4+1*WORD_SZ]
#define DATA_SAVE [BASE+8*4+16*4+2*WORD_SZ]
#define DATA_END [BASE+8*4+16*4+3*WORD_SZ]
#define Kt(i) WORD_REG(si)+(i)*4
#if CRYPTOPP_BOOL_X86
#define BASE esp+4
#elif defined(__GNUC__)
#define BASE r8
#else
#define BASE rsp
#endif
#define RA0(i, edx, edi) \
AS2( add edx, [Kt(i)] )\
AS2( add edx, [Wt(i)] )\
AS2( add edx, H(i) )\
#define RA1(i, edx, edi)
#define RB0(i, edx, edi)
#define RB1(i, edx, edi) \
AS2( mov AS_REG_7d, [Wt_2(i)] )\
AS2( mov edi, [Wt_15(i)])\
AS2( mov ebx, AS_REG_7d )\
AS2( shr AS_REG_7d, 10 )\
AS2( ror ebx, 17 )\
AS2( xor AS_REG_7d, ebx )\
AS2( ror ebx, 2 )\
AS2( xor ebx, AS_REG_7d )/* s1(W_t-2) */\
AS2( add ebx, [Wt_7(i)])\
AS2( mov AS_REG_7d, edi )\
AS2( shr AS_REG_7d, 3 )\
AS2( ror edi, 7 )\
AS2( add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\
AS2( xor AS_REG_7d, edi )\
AS2( add edx, [Kt(i)])\
AS2( ror edi, 11 )\
AS2( add edx, H(i) )\
AS2( xor AS_REG_7d, edi )/* s0(W_t-15) */\
AS2( add AS_REG_7d, ebx )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) W_t-16*/\
AS2( mov [Wt(i)], AS_REG_7d)\
AS2( add edx, AS_REG_7d )\
#define ROUND(i, r, eax, ecx, edi, edx)\
/* in: edi = E */\
/* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\
AS2( mov edx, F(i) )\
AS2( xor edx, G(i) )\
AS2( and edx, edi )\
AS2( xor edx, G(i) )/* Ch(E,F,G) = (G^(E&(F^G))) */\
AS2( mov AS_REG_7d, edi )\
AS2( ror edi, 6 )\
AS2( ror AS_REG_7d, 25 )\
RA##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
AS2( xor AS_REG_7d, edi )\
AS2( ror edi, 5 )\
AS2( xor AS_REG_7d, edi )/* S1(E) */\
AS2( add edx, AS_REG_7d )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\
RB##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
/* in: ecx = A, eax = B^C, edx = T1 */\
/* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\
AS2( mov ebx, ecx )\
AS2( xor ecx, B(i) )/* A^B */\
AS2( and eax, ecx )\
AS2( xor eax, B(i) )/* Maj(A,B,C) = B^((A^B)&(B^C) */\
AS2( mov AS_REG_7d, ebx )\
AS2( ror ebx, 2 )\
AS2( add eax, edx )/* T1 + Maj(A,B,C) */\
AS2( add edx, D(i) )\
AS2( mov D(i), edx )\
AS2( ror AS_REG_7d, 22 )\
AS2( xor AS_REG_7d, ebx )\
AS2( ror ebx, 11 )\
AS2( xor AS_REG_7d, ebx )\
AS2( add eax, AS_REG_7d )/* T1 + S0(A) + Maj(A,B,C) */\
AS2( mov H(i), eax )\
#define SWAP_COPY(i) \
AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
AS1( bswap WORD_REG(bx))\
AS2( mov [Wt(i*(1+CRYPTOPP_BOOL_X64)+CRYPTOPP_BOOL_X64)], WORD_REG(bx))
/* NOTE(review): the CRYPTOPP_BOOL_X64 branch below declares a C++ template
 * (FixedSizeAlignedSecBlock) in a C file — dead code here since
 * CRYPTOPP_BOOL_X64 is 0, but it would not compile if enabled. */
#if defined(__GNUC__)
#if CRYPTOPP_BOOL_X64
FixedSizeAlignedSecBlock<byte, LOCALS_SIZE> workspace;
#endif
__asm__ __volatile__
(
#if CRYPTOPP_BOOL_X64
"lea %4, %%r8;"
#endif
".intel_syntax noprefix;"
#elif defined(CRYPTOPP_GENERATE_X64_MASM)
ALIGN 8
X86_SHA256_HashBlocks PROC FRAME
rex_push_reg rsi
push_reg rdi
push_reg rbx
push_reg rbp
alloc_stack(LOCALS_SIZE+8)
.endprolog
mov rdi, r8
lea rsi, [?SHA256_K@CryptoPP@@3QBIB + 48*4]
#endif
#if CRYPTOPP_BOOL_X86
#ifndef __GNUC__
AS2( mov edi, [len])
AS2( lea WORD_REG(si), [SHA256_K+48*4])
#endif
#if !defined(_MSC_VER) || (_MSC_VER < 1400)
AS_PUSH_IF86(bx)
#endif
AS_PUSH_IF86(bp)
AS2( mov ebx, esp)
AS2( and esp, -16)
AS2( sub WORD_REG(sp), LOCALS_SIZE)
AS_PUSH_IF86(bx)
#endif
AS2( mov STATE_SAVE, WORD_REG(cx))
AS2( mov DATA_SAVE, WORD_REG(dx))
AS2( lea WORD_REG(ax), [WORD_REG(di) + WORD_REG(dx)])
AS2( mov DATA_END, WORD_REG(ax))
AS2( mov K_END, WORD_REG(si))
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_X86
AS2( test edi, 1)
ASJ( jnz, 2, f)
AS1( dec DWORD PTR K_END)
#endif
AS2( movdqa xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16])
AS2( movdqa xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16])
#endif
#if CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
ASJ( jmp, 0, f)
#endif
ASL(2) // non-SSE2
AS2( mov esi, ecx)
AS2( lea edi, A(0))
AS2( mov ecx, 8)
AS1( rep movsd)
AS2( mov esi, K_END)
ASJ( jmp, 3, f)
#endif
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
ASL(0)
AS2( movdqa E(0), xmm1)
AS2( movdqa A(0), xmm0)
#endif
#if CRYPTOPP_BOOL_X86
ASL(3)
#endif
AS2( sub WORD_REG(si), 48*4)
SWAP_COPY(0) SWAP_COPY(1) SWAP_COPY(2) SWAP_COPY(3)
SWAP_COPY(4) SWAP_COPY(5) SWAP_COPY(6) SWAP_COPY(7)
#if CRYPTOPP_BOOL_X86
SWAP_COPY(8) SWAP_COPY(9) SWAP_COPY(10) SWAP_COPY(11)
SWAP_COPY(12) SWAP_COPY(13) SWAP_COPY(14) SWAP_COPY(15)
#endif
/* rounds 0..15 use RA0/RB0 (schedule read straight from W) */
AS2( mov edi, E(0)) // E
AS2( mov eax, B(0)) // B
AS2( xor eax, C(0)) // B^C
AS2( mov ecx, A(0)) // A
ROUND(0, 0, eax, ecx, edi, edx)
ROUND(1, 0, ecx, eax, edx, edi)
ROUND(2, 0, eax, ecx, edi, edx)
ROUND(3, 0, ecx, eax, edx, edi)
ROUND(4, 0, eax, ecx, edi, edx)
ROUND(5, 0, ecx, eax, edx, edi)
ROUND(6, 0, eax, ecx, edi, edx)
ROUND(7, 0, ecx, eax, edx, edi)
ROUND(8, 0, eax, ecx, edi, edx)
ROUND(9, 0, ecx, eax, edx, edi)
ROUND(10, 0, eax, ecx, edi, edx)
ROUND(11, 0, ecx, eax, edx, edi)
ROUND(12, 0, eax, ecx, edi, edx)
ROUND(13, 0, ecx, eax, edx, edi)
ROUND(14, 0, eax, ecx, edi, edx)
ROUND(15, 0, ecx, eax, edx, edi)
/* rounds 16..63: loop of 16 RA1/RB1 rounds that also extend the schedule */
ASL(1)
AS2(add WORD_REG(si), 4*16)
ROUND(0, 1, eax, ecx, edi, edx)
ROUND(1, 1, ecx, eax, edx, edi)
ROUND(2, 1, eax, ecx, edi, edx)
ROUND(3, 1, ecx, eax, edx, edi)
ROUND(4, 1, eax, ecx, edi, edx)
ROUND(5, 1, ecx, eax, edx, edi)
ROUND(6, 1, eax, ecx, edi, edx)
ROUND(7, 1, ecx, eax, edx, edi)
ROUND(8, 1, eax, ecx, edi, edx)
ROUND(9, 1, ecx, eax, edx, edi)
ROUND(10, 1, eax, ecx, edi, edx)
ROUND(11, 1, ecx, eax, edx, edi)
ROUND(12, 1, eax, ecx, edi, edx)
ROUND(13, 1, ecx, eax, edx, edi)
ROUND(14, 1, eax, ecx, edi, edx)
ROUND(15, 1, ecx, eax, edx, edi)
AS2( cmp WORD_REG(si), K_END)
ASJ( jb, 1, b)
AS2( mov WORD_REG(dx), DATA_SAVE)
AS2( add WORD_REG(dx), 64)
AS2( mov AS_REG_7, STATE_SAVE)
AS2( mov DATA_SAVE, WORD_REG(dx))
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_X86
AS2( test DWORD PTR K_END, 1)
ASJ( jz, 4, f)
#endif
AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_7+1*16])
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_7+0*16])
AS2( paddd xmm1, E(0))
AS2( paddd xmm0, A(0))
AS2( movdqa [AS_REG_7+1*16], xmm1)
AS2( movdqa [AS_REG_7+0*16], xmm0)
AS2( cmp WORD_REG(dx), DATA_END)
ASJ( jb, 0, b)
#endif
#if CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
ASJ( jmp, 5, f)
ASL(4) // non-SSE2
#endif
AS2( add [AS_REG_7+0*4], ecx) // A
AS2( add [AS_REG_7+4*4], edi) // E
AS2( mov eax, B(0))
AS2( mov ebx, C(0))
AS2( mov ecx, D(0))
AS2( add [AS_REG_7+1*4], eax)
AS2( add [AS_REG_7+2*4], ebx)
AS2( add [AS_REG_7+3*4], ecx)
AS2( mov eax, F(0))
AS2( mov ebx, G(0))
AS2( mov ecx, H(0))
AS2( add [AS_REG_7+5*4], eax)
AS2( add [AS_REG_7+6*4], ebx)
AS2( add [AS_REG_7+7*4], ecx)
AS2( mov ecx, AS_REG_7d)
AS2( cmp WORD_REG(dx), DATA_END)
ASJ( jb, 2, b)
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
ASL(5)
#endif
#endif
/* epilogue: restore the saved stack pointer and callee-saved registers */
AS_POP_IF86(sp)
AS_POP_IF86(bp)
#if !defined(_MSC_VER) || (_MSC_VER < 1400)
AS_POP_IF86(bx)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
add rsp, LOCALS_SIZE+8
pop rbp
pop rbx
pop rdi
pop rsi
ret
X86_SHA256_HashBlocks ENDP
#endif
#ifdef __GNUC__
".att_syntax prefix;"
:
: "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len)
#if CRYPTOPP_BOOL_X64
, "m" (workspace[0])
#endif
: "memory", "cc", "%eax"
#if CRYPTOPP_BOOL_X64
, "%rbx", "%r8", "%r10"
#endif
);
#endif
}
/* SSE2 runtime-detection stub: this build path never takes the SSE2
 * branch, so unconditionally report the feature as unavailable. */
static inline bool HasSSE2(void)
{
	return false;
}
/*
 * Compress one 64-byte block into the eight-word SHA-256 state.
 * The input words are byte-swapped into host order first, then fed to
 * the Crypto++-derived block routine; len = 16 words * 4 bytes means
 * exactly one block is processed.
 */
static void SHA256_Transform32(word32 *state, const word32 *data)
{
	word32 W[16];
	int i;

	for (i = 0; i < 16; i++)
		W[i] = swab32(((word32 *)(data))[i]);

	X86_SHA256_HashBlocks(state, W, 16 * 4);
}
/*
 * One double-SHA256 half-step: seed @state with the 32-byte @init
 * chaining value, then compress the 64-byte block at @input into it.
 */
static void runhash32(void *state, const void *input, const void *init)
{
	memcpy(state, init, 32);
	SHA256_Transform32(state, input);
}
/* suspiciously similar to ScanHash* from bitcoin */
/*
 * Crypto++-asm nonce scanner: iterate nonces, double-SHA256 the
 * second half of the 128-byte padded header, and return true when a
 * candidate hash passes the full target test.
 *
 * @midstate: SHA-256 state after the first 64 header bytes.
 * @data:     128-byte padded header; the nonce lives at offset 76.
 * @hash1:    scratch buffer for the first-stage digest.
 * @hash:     receives the final double-SHA256 digest.
 * Returns true with *last_nonce set on a hit; false when max_nonce is
 * reached or a work restart was requested.
 */
bool scanhash_asm32(struct thr_info*thr, const unsigned char *midstate,
	unsigned char *data,
	unsigned char *hash1, unsigned char *hash,
	const unsigned char *target,
	uint32_t max_nonce, uint32_t *last_nonce,
	uint32_t n)
{
	uint32_t *hash32 = (uint32_t *) hash;
	uint32_t *nonce = (uint32_t *)(data + 76);

	data += 64;	/* only the second 64-byte block is hashed here */

	while (1) {
		n++;
		*nonce = n;

		/* double SHA-256: header block -> hash1 -> hash */
		runhash32(hash1, data, midstate);
		runhash32(hash, hash1, sha256_init_state);

		/* hash32[7] == 0 is a cheap pre-filter before the full
		 * 256-bit target comparison */
		if (unlikely((hash32[7] == 0) && fulltest(hash, target))) {
			*last_nonce = n;
			return true;
		}

		if ((n >= max_nonce) || thr->work_restart) {
			*last_nonce = n;
			return false;
		}
	}
}
#endif // #if defined(WANT_CRYPTOPP_ASM32)

274
sha256_generic.c

@@ -1,274 +0,0 @@
/*
* Cryptographic API.
*
* SHA-256, as specified in
* http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
*
* SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
*
* Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
* Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
* SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include "config.h"
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include "miner.h"
typedef uint32_t u32;
typedef uint8_t u8;

/*
 * Rotate a 32-bit word right by @shift bits.
 *
 * The previous form, (word >> shift) | (word << (32 - shift)), is
 * undefined behavior for shift == 0 (left shift equal to the type
 * width, C11 6.5.7p3).  Masking the complementary count with 31 makes
 * the rotate well-defined for every shift in [0, 31] while producing
 * identical results for the 1..31 range the SHA-256 code uses; modern
 * compilers still recognize it as a single ror instruction.
 */
static inline u32 ror32(u32 word, unsigned int shift)
{
	return (word >> shift) | (word << ((32 - shift) & 31));
}
/*
 * SHA-256 "choose" function: for each bit position, take the bit from
 * y when the corresponding bit of x is set, otherwise from z.
 * Written in the canonical (x & y) | (~x & z) form; equivalent to the
 * xor-masking expression z ^ (x & (y ^ z)).
 */
static inline u32 Ch(u32 x, u32 y, u32 z)
{
	return (x & y) | (~x & z);
}
/*
 * SHA-256 "majority" function: each output bit is set when at least
 * two of the corresponding bits of x, y, z are set.  Written as the
 * xor of the three pairwise ANDs; equivalent to (x & y) | (z & (x | y)).
 */
static inline u32 Maj(u32 x, u32 y, u32 z)
{
	return (x & y) ^ (x & z) ^ (y & z);
}
#define e0(x) (ror32(x, 2) ^ ror32(x,13) ^ ror32(x,22))
#define e1(x) (ror32(x, 6) ^ ror32(x,11) ^ ror32(x,25))
#define s0(x) (ror32(x, 7) ^ ror32(x,18) ^ (x >> 3))
#define s1(x) (ror32(x,17) ^ ror32(x,19) ^ (x >> 10))
/*
 * Copy input word I into the message schedule W.
 * NOTE(review): the u32 cast of @input type-puns the byte buffer; the
 * callers pass 4-byte-aligned buffers here, but strict-aliasing rules
 * make this formally unclean — confirm the build uses
 * -fno-strict-aliasing or equivalent.
 */
static inline void LOAD_OP(int I, u32 *W, const u8 *input)
{
	/* byteswap is commented out, because bitcoin input
	 * is already big-endian
	 */
	W[I] = /* ntohl */ ( ((u32*)(input))[I] );
}
/*
 * Message-schedule expansion step for word I (16 <= I < 64):
 * W[I] = sigma1(W[I-2]) + W[I-7] + sigma0(W[I-15]) + W[I-16],
 * using the s0/s1 small-sigma macros defined above.
 */
static inline void BLEND_OP(int I, u32 *W)
{
	u32 sig1 = s1(W[I - 2]);
	u32 sig0 = s0(W[I - 15]);

	W[I] = sig1 + W[I - 7] + sig0 + W[I - 16];
}
/*
 * SHA-256 round constants K[0..63] (FIPS 180-4 section 4.2.2: the
 * first 32 bits of the fractional parts of the cube roots of the
 * first 64 primes).  These values were previously spelled out inline
 * across 64 manually unrolled rounds; hoisting them into a table lets
 * the compression loop below be audited against the specification at
 * a glance.
 */
static const u32 sha256_round_k[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

/*
 * SHA-256 compression function: fold one 64-byte block (already in
 * host word order, see LOAD_OP) into the eight-word chaining state.
 *
 * Behaviorally identical to the former hand-unrolled version: each
 * iteration of the round loop performs exactly one of the 64 rounds,
 * with the a..h register rotation that the unrolled code expressed by
 * permuting variable names between consecutive rounds.
 */
static void sha256_transform(u32 *state, const u8 *input)
{
	u32 a, b, c, d, e, f, g, h, t1, t2;
	u32 W[64];
	int i;

	/* load the input */
	for (i = 0; i < 16; i++)
		LOAD_OP(i, W, input);

	/* now blend */
	for (i = 16; i < 64; i++)
		BLEND_OP(i, W);

	/* load the state into our registers */
	a = state[0]; b = state[1]; c = state[2]; d = state[3];
	e = state[4]; f = state[5]; g = state[6]; h = state[7];

	/* 64 rounds */
	for (i = 0; i < 64; i++) {
		t1 = h + e1(e) + Ch(e, f, g) + sha256_round_k[i] + W[i];
		t2 = e0(a) + Maj(a, b, c);
		h = g; g = f; f = e; e = d + t1;
		d = c; c = b; b = a; a = t1 + t2;
	}

	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
	state[4] += e; state[5] += f; state[6] += g; state[7] += h;
}
/*
 * One double-SHA256 half-step: seed @state with the 32-byte @init
 * chaining value, then compress the 64-byte block at @input into it.
 */
static void runhash(void *state, const void *input, const void *init)
{
	memcpy(state, init, 32);
	sha256_transform(state, input);
}
/* SHA-256 initial hash value H(0), FIPS 180-4 section 5.3.3 */
const uint32_t sha256_init_state[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
/* suspiciously similar to ScanHash* from bitcoin */
/*
 * Reference (portable C) nonce scanner.
 *
 * @midstate:   SHA-256 state after compressing the first 64 bytes of
 *              the block header.
 * @data:       128-byte padded header; only bytes 64..127 (the second
 *              block, containing the nonce at offset 76) are hashed.
 * @hash1:      scratch buffer for the first-stage digest.
 * @hash:       receives the final double-SHA256 digest.
 * @target:     difficulty target for fulltest().
 * @max_nonce:  stop scanning once n reaches this value.
 * @last_nonce: out: the last nonce tried.
 * @n:          nonce to resume from (pre-incremented before use).
 *
 * Returns true with *last_nonce set when a candidate hash is found,
 * false when max_nonce is reached or a work restart is requested.
 *
 * Change vs. original: the unused local stat_ctr (incremented but
 * never read) has been removed.
 */
bool scanhash_c(struct thr_info*thr, const unsigned char *midstate, unsigned char *data,
	        unsigned char *hash1, unsigned char *hash,
	        const unsigned char *target,
	        uint32_t max_nonce, uint32_t *last_nonce,
	        uint32_t n)
{
	uint32_t *hash32 = (uint32_t *) hash;
	uint32_t *nonce = (uint32_t *)(data + 76);

	data += 64;	/* hash only the second 64-byte block */

	while (1) {
		n++;
		*nonce = n;

		/* double SHA-256: header block -> hash1 -> hash */
		runhash(hash1, data, midstate);
		runhash(hash, hash1, sha256_init_state);

		/* hash32[7] == 0 is a cheap pre-filter before the full
		 * 256-bit target comparison */
		if (unlikely((hash32[7] == 0) && fulltest(hash, target))) {
			*last_nonce = n;
			return true;
		}

		if ((n >= max_nonce) || thr->work_restart) {
			*last_nonce = n;
			return false;
		}
	}
}

133
sha256_sse2_amd64.c

@@ -1,133 +0,0 @@
/*
* SHA-256 driver for ASM routine for x86_64 on Linux
* Copyright (c) Mark Crichton <crichton@gimp.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include "driver-cpu.h"
#ifdef WANT_X8664_SSE2
#include <string.h>
#include <assert.h>
#include <xmmintrin.h>
#include <stdint.h>
#include <stdio.h>
extern void sha256_sse2_64_new (__m128i *res, __m128i *res1, __m128i *data, const uint32_t init[8]);
static uint32_t g_sha256_k[]__attribute__((aligned(0x100))) = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /* 0 */
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /* 8 */
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
const uint32_t sha256_init[8]__attribute__((aligned(0x100))) =
{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
__m128i g_4sha256_k[64];
__m128i sha256_consts_m128i[64]__attribute__((aligned(0x1000)));
/*
 * x86-64 SSE2 nonce scanner: hashes four consecutive nonces per pass
 * through the external asm routine sha256_sse2_64_new(), each SSE
 * lane carrying one nonce (offset vector {0,1,2,3}).
 *
 * NOTE(review): on a hit this stores *last_nonce = nonce + 1, while
 * the SSE4 and i386 variants store nonce — confirm which the caller's
 * hashrate accounting expects.
 */
bool scanhash_sse2_64(struct thr_info*thr, const unsigned char *pmidstate,
	unsigned char *pdata,
	unsigned char *phash1, unsigned char *phash,
	const unsigned char *ptarget,
	uint32_t max_nonce, uint32_t *last_nonce,
	uint32_t nonce)
{
	uint32_t *nNonce_p = (uint32_t *)(pdata + 76);	/* nonce field in the header */
	uint32_t m_midstate[8], m_w[16], m_w1[16];
	__m128i m_4w[64] __attribute__ ((aligned (0x100)));
	__m128i m_4hash[64] __attribute__ ((aligned (0x100)));
	__m128i m_4hash1[64] __attribute__ ((aligned (0x100)));
	__m128i offset;
	int i;

	pdata += 64;	/* only the second 64-byte block is hashed */

	/* For debugging */
	union {
		__m128i m;
		uint32_t i[4];
	} mi;

	/* Message expansion */
	memcpy(m_midstate, pmidstate, sizeof(m_midstate));
	memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */
	memcpy(m_w1, phash1, sizeof(m_w1));
	memset(m_4hash, 0, sizeof(m_4hash));

	/* Transmongrify: broadcast each scalar word across all 4 lanes */
	for (i = 0; i < 16; i++)
		m_4w[i] = _mm_set1_epi32(m_w[i]);

	for (i = 0; i < 16; i++)
		m_4hash1[i] = _mm_set1_epi32(m_w1[i]);

	for (i = 0; i < 64; i++)
		sha256_consts_m128i[i] = _mm_set1_epi32(g_sha256_k[i]);

	offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0);

	for (;;)
	{
		int j;

		/* word 3 of the second block is the nonce: lanes get
		 * nonce+0 .. nonce+3 */
		m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce));

		sha256_sse2_64_new (m_4hash, m_4hash1, m_4w, m_midstate);

		/* cheap pre-filter: any lane with output word 7 == 0? */
		for (j = 0; j < 4; j++) {
			mi.m = m_4hash[7];
			if (unlikely(mi.i[j] == 0))
				break;
		}

		/* If j = true, we found a hit...so check it */
		/* Use the C version for a check... */
		if (unlikely(j != 4)) {
			/* extract lane j into the scalar hash buffer */
			for (i = 0; i < 8; i++) {
				mi.m = m_4hash[i];
				*(uint32_t *)&(phash)[i*4] = mi.i[j];
			}

			if (fulltest(phash, ptarget)) {
				nonce += j;
				*last_nonce = nonce + 1;
				*nNonce_p = nonce;
				return true;
			}
		}

		if (unlikely((nonce >= max_nonce) || thr->work_restart))
		{
			*last_nonce = nonce;
			return false;
		}

		nonce += 4;	/* four nonces consumed per iteration */
	}
}
#endif /* WANT_X8664_SSE2 */

125
sha256_sse2_i386.c

@ -1,125 +0,0 @@ @@ -1,125 +0,0 @@
/*
* SHA-256 driver for ASM routine for x86_64 on Linux
* Copyright (c) Mark Crichton <crichton@gimp.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include "driver-cpu.h"
#ifdef WANT_X8632_SSE2
#include <string.h>
#include <assert.h>
#include <xmmintrin.h>
#include <stdint.h>
#include <stdio.h>
extern void CalcSha256_x86 (__m128i *res, __m128i *data, const uint32_t init[8])__attribute__((fastcall));
static uint32_t g_sha256_k[]__attribute__((aligned(0x100))) = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /* 0 */
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /* 8 */
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
const uint32_t sha256_32init[8]__attribute__((aligned(0x100))) =
{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
__m128i g_4sha256_k[64];
__m128i sha256_consts_m128i[64]__attribute__((aligned(0x1000)));
/*
 * 32-bit x86 SSE2 nonce scanner: four nonces per pass, one per SSE
 * lane, using the external asm routine CalcSha256_x86() for both
 * stages of the double SHA-256.
 */
bool scanhash_sse2_32(struct thr_info*thr, const unsigned char *pmidstate,
	unsigned char *pdata,
	unsigned char *phash1, unsigned char *phash,
	const unsigned char *ptarget,
	uint32_t max_nonce, uint32_t *last_nonce,
	uint32_t nonce)
{
	uint32_t *nNonce_p = (uint32_t *)(pdata + 76);	/* nonce field in the header */
	uint32_t m_midstate[8], m_w[16], m_w1[16];
	__m128i m_4w[64] __attribute__ ((aligned (0x100)));
	__m128i m_4hash[64] __attribute__ ((aligned (0x100)));
	__m128i m_4hash1[64] __attribute__ ((aligned (0x100)));
	__m128i offset;
	int i;

	pdata += 64;	/* only the second 64-byte block is hashed */

	/* Message expansion */
	memcpy(m_midstate, pmidstate, sizeof(m_midstate));
	memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */
	memcpy(m_w1, phash1, sizeof(m_w1));
	memset(m_4hash, 0, sizeof(m_4hash));

	/* Transmongrify: broadcast each scalar word across all 4 lanes */
	for (i = 0; i < 16; i++)
		m_4w[i] = _mm_set1_epi32(m_w[i]);

	for (i = 0; i < 16; i++)
		m_4hash1[i] = _mm_set1_epi32(m_w1[i]);

	for (i = 0; i < 64; i++)
		sha256_consts_m128i[i] = _mm_set1_epi32(g_sha256_k[i]);

	offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0);

	for (;;)
	{
		int j;

		/* lanes carry nonce+0 .. nonce+3 */
		m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce));

		/* Some optimization can be done here W.R.T. precalculating some hash */
		CalcSha256_x86 (m_4hash1, m_4w, m_midstate);
		CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init);

		for (j = 0; j < 4; j++) {
			/* cheap pre-filter: lane output word 7 == 0 */
			if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) {
				/* We found a hit...so check it */
				/* Use the C version for a check... */
				for (i = 0; i < 8; i++) {
					*(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j];
				}

				if (fulltest(phash, ptarget)) {
					nonce += j;
					*last_nonce = nonce;
					*nNonce_p = nonce;
					return true;
				}
			}
		}

		if (unlikely((nonce >= max_nonce) || thr->work_restart)) {
			*last_nonce = nonce;
			return false;
		}

		nonce += 4;	/* four nonces consumed per iteration */
	}
}
#endif /* WANT_X8632_SSE2 */

132
sha256_sse4_amd64.c

@ -1,132 +0,0 @@ @@ -1,132 +0,0 @@
/*
* SHA-256 driver for ASM routine for x86_64 on Linux
* Copyright (c) Mark Crichton <crichton@gimp.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include "driver-cpu.h"
#ifdef WANT_X8664_SSE4
#include <string.h>
#include <assert.h>
#include <xmmintrin.h>
#include <stdint.h>
#include <stdio.h>
extern void CalcSha256_x64_sse4(__m128i *res, __m128i *data, uint32_t init[8]);
static uint32_t g_sha256_k[] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /* 0 */
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /* 8 */
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static uint32_t g_sha256_hinit[8] =
{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
__m128i g_4sha256_k[64];
/*
 * x86-64 SSE4 nonce scanner: four nonces per pass, one per SSE lane,
 * using the external asm routine CalcSha256_x64_sse4() for both
 * stages of the double SHA-256.
 */
bool scanhash_sse4_64(struct thr_info*thr, const unsigned char *pmidstate,
	unsigned char *pdata,
	unsigned char *phash1, unsigned char *phash,
	const unsigned char *ptarget,
	uint32_t max_nonce, uint32_t *last_nonce,
	uint32_t nonce)
{
	uint32_t *nNonce_p = (uint32_t *)(pdata + 76);	/* nonce field in the header */
	uint32_t m_midstate[8], m_w[16], m_w1[16];
	__m128i m_4w[64], m_4hash[64], m_4hash1[64];
	__m128i offset;
	int i;

	pdata += 64;	/* only the second 64-byte block is hashed */

	/* For debugging */
	union {
		__m128i m;
		uint32_t i[4];
	} mi;

	/* Message expansion */
	memcpy(m_midstate, pmidstate, sizeof(m_midstate));
	memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */
	memcpy(m_w1, phash1, sizeof(m_w1));
	memset(m_4hash, 0, sizeof(m_4hash));

	/* Transmongrify: broadcast each scalar word across all 4 lanes */
	for (i = 0; i < 16; i++)
		m_4w[i] = _mm_set1_epi32(m_w[i]);

	for (i = 0; i < 16; i++)
		m_4hash1[i] = _mm_set1_epi32(m_w1[i]);

	for (i = 0; i < 64; i++)
		g_4sha256_k[i] = _mm_set1_epi32(g_sha256_k[i]);

	offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0);

	for (;;)
	{
		int j;

		/* lanes carry nonce+0 .. nonce+3 */
		m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce));

		/* Some optimization can be done here W.R.T. precalculating some hash */
		CalcSha256_x64_sse4(m_4hash1, m_4w, m_midstate);
		CalcSha256_x64_sse4(m_4hash, m_4hash1, g_sha256_hinit);

		/* cheap pre-filter: any lane with output word 7 == 0? */
		for (j = 0; j < 4; j++) {
			mi.m = m_4hash[7];
			if (unlikely(mi.i[j] == 0))
				break;
		}

		/* If j = true, we found a hit...so check it */
		/* Use the C version for a check... */
		if (unlikely(j != 4)) {
			/* extract lane j into the scalar hash buffer */
			for (i = 0; i < 8; i++) {
				mi.m = m_4hash[i];
				*(uint32_t *)&(phash)[i*4] = mi.i[j];
			}

			if (fulltest(phash, ptarget)) {
				nonce += j;
				*last_nonce = nonce;
				*nNonce_p = nonce;
				return true;
			}
		}

		if (unlikely((nonce >= max_nonce) || thr->work_restart))
		{
			*last_nonce = nonce;
			return false;
		}

		nonce += 4;	/* four nonces consumed per iteration */
	}
}
#endif /* WANT_X8664_SSE4 */

85
sha256_via.c

@ -1,85 +0,0 @@ @@ -1,85 +0,0 @@
#include "driver-cpu.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <sys/time.h>
#include "miner.h"
#ifdef WANT_VIA_PADLOCK
/*
 * Invoke the VIA PadLock XSHA256 instruction (opcode bytes
 * f3 0f a6 d0, i.e. "rep xsha256"): hashes @len bytes at @buf (ESI)
 * into the digest buffer at @hash (EDI), with EAX carrying the
 * status word.  The "+S"/"+a" constraints reflect that the
 * instruction advances the source pointer and updates the status.
 */
static void via_sha256(void *hash, void *buf, unsigned len)
{
	unsigned stat = 0;
	asm volatile(".byte 0xf3, 0x0f, 0xa6, 0xd0"
			:"+S"(buf), "+a"(stat)
			:"c"(len), "D" (hash)
			:"memory");
}
/*
 * VIA PadLock nonce scanner.  The midstate/hash1/phash parameters of
 * the common scanhash interface are unused because the PadLock engine
 * hashes the whole (byte-swapped) header itself.
 */
bool scanhash_via(struct thr_info*thr, const unsigned char __maybe_unused *pmidstate,
	unsigned char *data_inout,
	unsigned char __maybe_unused *phash1, unsigned char __maybe_unused *phash,
	const unsigned char *target,
	uint32_t max_nonce, uint32_t *last_nonce,
	uint32_t n)
{
	unsigned char data[128] __attribute__((aligned(128)));
	unsigned char tmp_hash[32] __attribute__((aligned(128)));
	unsigned char tmp_hash1[32] __attribute__((aligned(128)));
	uint32_t *data32 = (uint32_t *) data;
	uint32_t *hash32 = (uint32_t *) tmp_hash;
	uint32_t *nonce = (uint32_t *)(data + 64 + 12);	/* nonce at offset 76 */
	unsigned long stat_ctr = 0;
	int i;

	/* bitcoin gives us big endian input, but via wants LE,
	 * so we reverse the swapping bitcoin has already done (extra work)
	 * in order to permit the hardware to swap everything
	 * back to BE again (extra work).
	 */
	for (i = 0; i < 128/4; i++)
		data32[i] = swab32(((uint32_t *)data_inout)[i]);

	while (1) {
		n++;
		*nonce = n;

		/* first SHA256 transform */
		memcpy(tmp_hash1, sha256_init_state, 32);
		via_sha256(tmp_hash1, data, 80);	/* or maybe 128? */

		/* swap the first-stage digest back to the byte order the
		 * second stage expects */
		for (i = 0; i < 32/4; i++)
			((uint32_t *)tmp_hash1)[i] =
				swab32(((uint32_t *)tmp_hash1)[i]);

		/* second SHA256 transform */
		memcpy(tmp_hash, sha256_init_state, 32);
		via_sha256(tmp_hash, tmp_hash1, 32);
		stat_ctr++;

		/* hash32[7] == 0 is a cheap pre-filter before the full
		 * 256-bit target comparison */
		if (unlikely((hash32[7] == 0) && fulltest(tmp_hash, target))) {
			/* swap nonce'd data back into original storage area;
			 * TODO: only swap back the nonce, rather than all data
			 */
			for (i = 0; i < 128/4; i++) {
				uint32_t *dout32 = (uint32_t *) data_inout;
				dout32[i] = swab32(data32[i]);
			}

			*last_nonce = n;
			return true;
		}

		if ((n >= max_nonce) || thr->work_restart) {
			*last_nonce = n;
			return false;
		}
	}
}
#endif /* WANT_VIA_PADLOCK */

1
x86_32/.gitignore vendored

@ -1 +0,0 @@ @@ -1 +0,0 @@
libx8632.a

8
x86_32/Makefile.am

@ -1,8 +0,0 @@ @@ -1,8 +0,0 @@
noinst_LIBRARIES = libx8632.a
SUFFIXES = .asm
libx8632_a_SOURCES = sha256_xmm.asm
.asm.o:
$(YASM) -f $(YASM_FMT) $<

259
x86_32/sha256_xmm.asm

@ -1,259 +0,0 @@ @@ -1,259 +0,0 @@
;; SHA-256 for X86 for Linux, based off of:
; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain
; SHA-256 CPU SSE cruncher for Bitcoin Miner
ALIGN 32
BITS 32
%define hash ecx
%define data edx
%define init esi
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 24
%define LAB_LOOP_UNROLL 64
extern _sha256_consts_m128i
global $@CalcSha256_x86@12
; CalcSha256 hash(ecx), data(edx), init([esp+4])
@CalcSha256_x86@12:
push esi
push edi
mov init, [esp+12]
LAB_SHA:
lea edi, qword [data+256] ; + 256
LAB_CALC:
%macro lab_calc_blk 1
movdqa xmm0, [edi-(15-%1)*16] ; xmm0 = W[I-15]
movdqa xmm4, [edi-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
movdqa xmm2, xmm0 ; xmm2 = W[I-15]
movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]
psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
movdqa xmm1, xmm0 ; xmm1 = W[I-15] >> 3
movdqa xmm5, xmm4 ; xmm5 = W[I-15+1] >> 3
pslld xmm2, 14 ; xmm2 = W[I-15] << 14
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14
psrld xmm1, 4 ; xmm1 = W[I-15] >> 7
psrld xmm5, 4 ; xmm5 = W[I-15+1] >> 7
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
pslld xmm2, 11 ; xmm2 = W[I-15] << 25
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
movdqa xmm3, [edi-(2-%1)*16] ; xmm3 = W[I-2]
movdqa xmm7, [edi-(2-(%1+1))*16] ; xmm7 = W[I-2+1]
paddd xmm0, [edi-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
paddd xmm4, [edi-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]
;;;;;;;;;;;;;;;;;;
movdqa xmm2, xmm3 ; xmm2 = W[I-2]
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10
movdqa xmm1, xmm3 ; xmm1 = W[I-2] >> 10
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1] >> 10
paddd xmm0, [edi-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
pslld xmm2, 13 ; xmm2 = W[I-2] << 13
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13
psrld xmm1, 7 ; xmm1 = W[I-2] >> 17
psrld xmm5, 7 ; xmm5 = W[I-2+1] >> 17
paddd xmm4, [edi-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]
pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
psrld xmm1, 2 ; xmm1 = W[I-2] >> 19
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
pslld xmm2, 2 ; xmm2 = W[I-2] << 15
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15
pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]
movdqa [edi+(%1*16)], xmm0
movdqa [edi+((%1+1)*16)], xmm4
%endmacro
%assign i 0
%rep LAB_CALC_UNROLL
lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep
; Load the init values of the message into the hash.
movdqa xmm7, [init]
pshufd xmm5, xmm7, 0x55 ; xmm5 == b
pshufd xmm4, xmm7, 0xAA ; xmm4 == c
pshufd xmm3, xmm7, 0xFF ; xmm3 == d
pshufd xmm7, xmm7, 0 ; xmm7 == a
movdqa xmm0, [init+4*4]
pshufd xmm1, xmm0, 0x55 ; [hash+0*16] == f
movdqa [hash+0*16], xmm1
pshufd xmm1, xmm0, 0xAA ; [hash+1*16] == g
movdqa [hash+1*16], xmm1
pshufd xmm1, xmm0, 0xFF ; [hash+2*16] == h
movdqa [hash+2*16], xmm1
pshufd xmm0, xmm0, 0 ; xmm0 == e
LAB_LOOP:
;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]
%macro lab_loop_blk 1
movdqa xmm6, [data+%1]
paddd xmm6, _sha256_consts_m128i[%1]
paddd xmm6, [hash+2*16] ; +h
movdqa xmm1, xmm0
movdqa xmm2, [hash+1*16]
pandn xmm1, xmm2 ; ~e & g
movdqa [hash+2*16], xmm2 ; h = g
movdqa xmm2, [hash+0*16] ; f
movdqa [hash+1*16], xmm2 ; g = f
pand xmm2, xmm0 ; e & f
pxor xmm1, xmm2 ; (e & f) ^ (~e & g)
movdqa [hash+0*16], xmm0 ; f = e
paddd xmm6, xmm1 ; Ch + h + w[i] + k[i]
movdqa xmm1, xmm0
psrld xmm0, 6
movdqa xmm2, xmm0
pslld xmm1, 7
psrld xmm2, 5
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 14
psrld xmm2, 14
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 5
pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
paddd xmm6, xmm0 ; xmm6 = t1
movdqa xmm0, xmm3 ; d
paddd xmm0, xmm6 ; e = d+t1
movdqa xmm1, xmm5 ; =b
movdqa xmm3, xmm4 ; d = c
movdqa xmm2, xmm4 ; c
pand xmm2, xmm5 ; b & c
pand xmm4, xmm7 ; a & c
pand xmm1, xmm7 ; a & b
pxor xmm1, xmm4
movdqa xmm4, xmm5 ; c = b
movdqa xmm5, xmm7 ; b = a
pxor xmm1, xmm2 ; (a & c) ^ (a & d) ^ (c & d)
paddd xmm6, xmm1 ; t1 + ((a & c) ^ (a & d) ^ (c & d))
movdqa xmm2, xmm7
psrld xmm7, 2
movdqa xmm1, xmm7
pslld xmm2, 10
psrld xmm1, 11
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 9
psrld xmm1, 9
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 11
pxor xmm7, xmm2
paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & c) ^ (a & d) ^ (c & d));
%endmacro
%assign i 0
%rep LAB_LOOP_UNROLL
lab_loop_blk i
%assign i i+16
%endrep
; Finished the 64 rounds, calculate hash and save
movdqa xmm1, [init+16]
pshufd xmm2, xmm1, 0xFF
movdqa xmm6, [hash+2*16]
paddd xmm2, xmm6
movdqa [hash+7*16], xmm2
pshufd xmm2, xmm1, 0xAA
movdqa xmm6, [hash+1*16]
paddd xmm2, xmm6
movdqa [hash+6*16], xmm2
pshufd xmm2, xmm1, 0x55
movdqa xmm6, [hash+0*16]
paddd xmm2, xmm6
movdqa [hash+5*16], xmm2
pshufd xmm1, xmm1, 0
paddd xmm0, xmm1
movdqa [hash+4*16], xmm0
movdqa xmm1, [init]
pshufd xmm2, xmm1, 0xFF
paddd xmm3, xmm2
movdqa [hash+3*16], xmm3
pshufd xmm2, xmm1, 0xAA
paddd xmm4, xmm2
movdqa [hash+2*16], xmm4
pshufd xmm2, xmm1, 0x55
paddd xmm5, xmm2
movdqa [hash+1*16], xmm5
pshufd xmm1, xmm1, 0
paddd xmm7, xmm1
movdqa [hash+0*16], xmm7
LAB_RET:
pop edi
pop esi
retn 4
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

1
x86_64/.gitignore vendored

@ -1 +0,0 @@ @@ -1 +0,0 @@
libx8664.a

8
x86_64/Makefile.am

@ -1,8 +0,0 @@ @@ -1,8 +0,0 @@
noinst_LIBRARIES = libx8664.a
SUFFIXES = .asm
libx8664_a_SOURCES = sha256_xmm_amd64.asm sha256_sse4_amd64.asm
.asm.o:
$(YASM) -f $(YASM_FMT) -o $@ $<

292
x86_64/sha256_sse4_amd64.asm

@ -1,292 +0,0 @@ @@ -1,292 +0,0 @@
;; SHA-256 for X86-64 for Linux, based off of:
; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain
; Significant re-write/optimisation and reordering by,
; Neil Kettle <mu-b@digit-labs.org>
; ~18% performance improvement
; SHA-256 CPU SSE cruncher for Bitcoin Miner
ALIGN 32
BITS 64
%ifidn __OUTPUT_FORMAT__,win64
%define hash rcx
%define data rdx
%define init r8
%define temp r9
%else
%define hash rdi
%define data rsi
%define init rdx
%define temp rcx
%endif
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8
%define LAB_LOOP_UNROLL 8
extern g_4sha256_k
global CalcSha256_x64_sse4
; CalcSha256 hash(rdi), data(rsi), init(rdx)
; CalcSha256 hash(rcx), data(rdx), init(r8)
CalcSha256_x64_sse4:
push rbx
%ifidn __OUTPUT_FORMAT__,win64
sub rsp, 16 * 6
movdqa [rsp + 16*0], xmm6
movdqa [rsp + 16*1], xmm7
movdqa [rsp + 16*2], xmm8
movdqa [rsp + 16*3], xmm9
movdqa [rsp + 16*4], xmm10
movdqa [rsp + 16*5], xmm11
%endif
LAB_NEXT_NONCE:
mov temp, 64*4 ; 256 - temp is # of SHA-2 rounds
mov rax, 16*4 ; 64 - rax is where we expand to
LAB_SHA:
push temp
lea temp, qword [data+temp*4] ; + 1024
lea r11, qword [data+rax*4] ; + 256
LAB_CALC:
%macro lab_calc_blk 1
movntdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15]
movdqa xmm2, xmm0 ; xmm2 = W[I-15]
movntdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]
psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
movdqa xmm1, xmm0 ; xmm1 = W[I-15] >> 3
pslld xmm2, 14 ; xmm2 = W[I-15] << 14
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
movdqa xmm5, xmm4 ; xmm5 = W[I-15+1] >> 3
psrld xmm5, 4 ; xmm5 = W[I-15+1] >> 7
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14
psrld xmm1, 4 ; xmm1 = W[I-15] >> 7
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
pslld xmm2, 11 ; xmm2 = W[I-15] << 25
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]
movntdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2]
movntdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1]
;;;;;;;;;;;;;;;;;;
movdqa xmm2, xmm3 ; xmm2 = W[I-2]
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
movdqa xmm1, xmm3 ; xmm1 = W[I-2] >> 10
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1] >> 10
paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]
pslld xmm2, 13 ; xmm2 = W[I-2] << 13
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13
psrld xmm1, 7 ; xmm1 = W[I-2] >> 17
psrld xmm5, 7 ; xmm5 = W[I-2+1] >> 17
pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
psrld xmm1, 2 ; xmm1 = W[I-2] >> 19
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
pslld xmm2, 2 ; xmm2 = W[I-2] << 15
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15
pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]
movdqa [r11+(%1*16)], xmm0
movdqa [r11+((%1+1)*16)], xmm4
%endmacro
%assign i 0
%rep LAB_CALC_UNROLL
lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep
add r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16
cmp r11, temp
jb LAB_CALC
pop temp
mov rax, 0
; Load the init values of the message into the hash.
movntdqa xmm7, [init]
pshufd xmm5, xmm7, 0x55 ; xmm5 == b
pshufd xmm4, xmm7, 0xAA ; xmm4 == c
pshufd xmm3, xmm7, 0xFF ; xmm3 == d
pshufd xmm7, xmm7, 0 ; xmm7 == a
movntdqa xmm0, [init+4*4]
pshufd xmm8, xmm0, 0x55 ; xmm8 == f
pshufd xmm9, xmm0, 0xAA ; xmm9 == g
pshufd xmm10, xmm0, 0xFF ; xmm10 == h
pshufd xmm0, xmm0, 0 ; xmm0 == e
LAB_LOOP:
;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]
%macro lab_loop_blk 0
movntdqa xmm6, [data+rax*4]
paddd xmm6, g_4sha256_k[rax*4]
add rax, 4
paddd xmm6, xmm10 ; +h
movdqa xmm1, xmm0
movdqa xmm2, xmm9
pandn xmm1, xmm2 ; ~e & g
movdqa xmm10, xmm2 ; h = g
movdqa xmm2, xmm8 ; f
movdqa xmm9, xmm2 ; g = f
pand xmm2, xmm0 ; e & f
pxor xmm1, xmm2 ; (e & f) ^ (~e & g)
movdqa xmm8, xmm0 ; f = e
paddd xmm6, xmm1 ; Ch + h + w[i] + k[i]
movdqa xmm1, xmm0
psrld xmm0, 6
movdqa xmm2, xmm0
pslld xmm1, 7
psrld xmm2, 5
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 14
psrld xmm2, 14
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 5
pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
paddd xmm6, xmm0 ; xmm6 = t1
movdqa xmm0, xmm3 ; d
paddd xmm0, xmm6 ; e = d+t1
movdqa xmm1, xmm5 ; =b
movdqa xmm3, xmm4 ; d = c
movdqa xmm2, xmm4 ; c
pand xmm2, xmm5 ; b & c
pand xmm4, xmm7 ; a & c
pand xmm1, xmm7 ; a & b
pxor xmm1, xmm4
movdqa xmm4, xmm5 ; c = b
movdqa xmm5, xmm7 ; b = a
pxor xmm1, xmm2 ; (a & c) ^ (a & d) ^ (c & d)
paddd xmm6, xmm1 ; t1 + ((a & c) ^ (a & d) ^ (c & d))
movdqa xmm2, xmm7
psrld xmm7, 2
movdqa xmm1, xmm7
pslld xmm2, 10
psrld xmm1, 11
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 9
psrld xmm1, 9
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 11
pxor xmm7, xmm2
paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & c) ^ (a & d) ^ (c & d));
%endmacro
%assign i 0
%rep LAB_LOOP_UNROLL
lab_loop_blk
%assign i i+1
%endrep
cmp rax, temp
jb LAB_LOOP
; Finished the 64 rounds, calculate hash and save
movntdqa xmm1, [init]
pshufd xmm2, xmm1, 0x55
paddd xmm5, xmm2
pshufd xmm6, xmm1, 0xAA
paddd xmm4, xmm6
pshufd xmm11, xmm1, 0xFF
paddd xmm3, xmm11
pshufd xmm1, xmm1, 0
paddd xmm7, xmm1
movntdqa xmm1, [init+4*4]
pshufd xmm2, xmm1, 0x55
paddd xmm8, xmm2
pshufd xmm6, xmm1, 0xAA
paddd xmm9, xmm6
pshufd xmm11, xmm1, 0xFF
paddd xmm10, xmm11
pshufd xmm1, xmm1, 0
paddd xmm0, xmm1
movdqa [hash+0*16], xmm7
movdqa [hash+1*16], xmm5
movdqa [hash+2*16], xmm4
movdqa [hash+3*16], xmm3
movdqa [hash+4*16], xmm0
movdqa [hash+5*16], xmm8
movdqa [hash+6*16], xmm9
movdqa [hash+7*16], xmm10
LAB_RET:
%ifidn __OUTPUT_FORMAT__,win64
movdqa xmm6, [rsp + 16*0]
movdqa xmm7, [rsp + 16*1]
movdqa xmm8, [rsp + 16*2]
movdqa xmm9, [rsp + 16*3]
movdqa xmm10, [rsp + 16*4]
movdqa xmm11, [rsp + 16*5]
add rsp, 16 * 6
%endif
pop rbx
ret
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

354
x86_64/sha256_xmm_amd64.asm

@ -1,354 +0,0 @@ @@ -1,354 +0,0 @@
;/*
; * Copyright (C) 2011 - Neil Kettle <neil@digit-labs.org>
; *
; * This file is part of cpuminer-ng.
; *
; * cpuminer-ng is free software: you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation, either version 3 of the License, or
; * (at your option) any later version.
; *
; * cpuminer-ng is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with cpuminer-ng. If not, see <http://www.gnu.org/licenses/>.
; */
; %rbp, %rbx, and %r12-%r15 - callee save
ALIGN 32
BITS 64
%ifidn __OUTPUT_FORMAT__,win64
%define hash rcx
%define hash1 rdx
%define data r8
%define init r9
%else
%define hash rdi
%define hash1 rsi
%define data rdx
%define init rcx
%endif
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
%define SHA_CALC_W_PARA 2
%define SHA_CALC_W_UNROLL 8
%define SHA_ROUND_LOOP_UNROLL 16
%ifidn __YASM_OBJFMT__, macho64
extern _sha256_consts_m128i
extern _sha256_init
%else
extern sha256_consts_m128i
extern sha256_init
%endif
%ifidn __YASM_OBJFMT__, macho64
global _sha256_sse2_64_new
%else
global sha256_sse2_64_new
%endif
%define sr1 xmm6
%define sr2 xmm1
%define sr3 xmm2
%define sr4 xmm13
%define rA xmm7
%define rB xmm5
%define rC xmm4
%define rD xmm3
%define rE xmm0
%define rF xmm8
%define rG xmm9
%define rH xmm10
%macro sha_round_blk 0
movdqa sr1, [data+rax] ; T1 = w;
;movdqa sr1, xmm11
movdqa sr2, rE ; sr2 = rE
pandn sr2, rG ; sr2 = ~rE & rG
movdqa sr3, rF ; sr3 = rF
paddd sr1, rH ; T1 = h + sha256_consts_m128i[i] + w;
movdqa rH, rG ; rH = rG
pand sr3, rE ; sr3 = rE & rF
movdqa rG, rF ; rG = rF
%ifidn __YASM_OBJFMT__, macho64
paddd sr1, [rcx+rax]
%else
paddd sr1, sha256_consts_m128i[rax] ; T1 = sha256_consts_m128i[i] + w;
%endif
pxor sr2, sr3 ; sr2 = (rE & rF) ^ (~rE & rG) = Ch (e, f, g)
movdqa rF, rE ; rF = rE
paddd sr1, sr2 ; T1 = h + Ch (e, f, g) + sha256_consts_m128i[i] + w;
movdqa sr2, rE ; sr2 = rE
psrld rE, 6 ; e >> 6
movdqa sr3, rE ; e >> 6
pslld sr2, 7 ; e << 7
psrld sr3, 5 ; e >> 11
pxor rE, sr2 ; e >> 6 ^ e << 7
pslld sr2, 14 ; e << 21
pxor rE, sr3 ; e >> 6 ^ e << 7 ^ e >> 11
psrld sr3, 14 ; e >> 25
pxor rE, sr2 ; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21
pslld sr2, 5 ; e << 26
pxor rE, sr3 ; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21 ^ e >> 25
pxor rE, sr2 ; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21 ^ e >> 25 ^ e << 26
movdqa sr2, rB ; sr2 = rB
paddd sr1, rE ; sr1 = h + BIGSIGMA1_256(e) + Ch (e, f, g) + sha256_consts_m128i[i] + w;
movdqa rE, rD ; rE = rD
movdqa rD, rC ; rD = rC
paddd rE, sr1 ; rE = rD + T1
movdqa sr3, rC ; sr3 = rC
pand rC, rA ; rC = rC & rA
pand sr3, rB ; sr3 = rB & rC
pand sr2, rA ; sr2 = rB & rA
pxor sr2, rC ; sr2 = (rB & rA) ^ (rC & rA)
movdqa rC, rB ; rC = rB
pxor sr2, sr3 ; sr2 = (rB & rA) ^ (rC & rA) ^ (rB & rC)
movdqa rB, rA ; rB = rA
paddd sr1, sr2 ; sr1 = T1 + (rB & rA) ^ (rC & rA) ^ (rB & rC)
lea rax, [rax+16]
movdqa sr3, rA ; sr3 = rA
psrld rA, 2 ; a >> 2
pslld sr3, 10 ; a << 10
movdqa sr2, rA ; a >> 2
pxor rA, sr3 ; a >> 2 ^ a << 10
psrld sr2, 11 ; a >> 13
pxor rA, sr2 ; a >> 2 ^ a << 10 ^ a >> 13
pslld sr3, 9 ; a << 19
pxor rA, sr3 ; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19
psrld sr2, 9 ; a >> 21
pxor rA, sr2 ; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19 ^ a >> 21
pslld sr3, 11 ; a << 30
pxor rA, sr3 ; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19 ^ a >> 21 ^ a << 30
paddd rA, sr1 ; T1 + BIGSIGMA0_256(a) + Maj(a, b, c);
%endmacro
%macro sha_calc_w_blk 1
movdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15]
movdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
movdqa xmm2, xmm0 ; xmm2 = W[I-15]
movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]
psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
movdqa xmm1, xmm0 ; xmm1 = W[I-15] >> 3
movdqa xmm5, xmm4 ; xmm5 = W[I-15+1] >> 3
pslld xmm2, 14 ; xmm2 = W[I-15] << 14
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14
psrld xmm1, 4 ; xmm1 = W[I-15] >> 7
psrld xmm5, 4 ; xmm5 = W[I-15+1] >> 7
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
pslld xmm2, 11 ; xmm2 = W[I-15] << 25
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
movdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2]
movdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1]
paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]
;;;;;;;;;;;;;;;;;;
movdqa xmm2, xmm3 ; xmm2 = W[I-2]
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10
movdqa xmm1, xmm3 ; xmm1 = W[I-2] >> 10
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1] >> 10
paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
pslld xmm2, 13 ; xmm2 = W[I-2] << 13
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13
psrld xmm1, 7 ; xmm1 = W[I-2] >> 17
psrld xmm5, 7 ; xmm5 = W[I-2+1] >> 17
paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]
pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
psrld xmm1, 2 ; xmm1 = W[I-2] >> 19
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
pslld xmm2, 2 ; xmm2 = W[I-2] << 15
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15
pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]
movdqa [r11+(%1*16)], xmm0
movdqa [r11+((%1+1)*16)], xmm4
%endmacro
; _sha256_sse2_64_new hash(rdi), hash1(rsi), data(rdx), init(rcx),
%ifidn __YASM_OBJFMT__, macho64
_sha256_sse2_64_new:
%else
sha256_sse2_64_new:
%endif
push rbx
%ifidn __OUTPUT_FORMAT__,win64
sub rsp, 16 * 6
movdqa [rsp + 16*0], xmm6
movdqa [rsp + 16*1], xmm7
movdqa [rsp + 16*2], xmm8
movdqa [rsp + 16*3], xmm9
movdqa [rsp + 16*4], xmm10
movdqa [rsp + 16*5], xmm13
%endif
%macro SHA_256 0
mov rbx, 64*4 ; rbx is # of SHA-2 rounds
mov rax, 16*4 ; rax is where we expand to
push rbx
lea rbx, qword [data+rbx*4]
lea r11, qword [data+rax*4]
%%SHA_CALC_W:
%assign i 0
%rep SHA_CALC_W_UNROLL
sha_calc_w_blk i
%assign i i+SHA_CALC_W_PARA
%endrep
add r11, SHA_CALC_W_UNROLL*SHA_CALC_W_PARA*16
cmp r11, rbx
jb %%SHA_CALC_W
pop rbx
mov rax, 0
lea rbx, [rbx*4]
movdqa rA, [init]
pshufd rB, rA, 0x55 ; rB == B
pshufd rC, rA, 0xAA ; rC == C
pshufd rD, rA, 0xFF ; rD == D
pshufd rA, rA, 0 ; rA == A
movdqa rE, [init+4*4]
pshufd rF, rE, 0x55 ; rF == F
pshufd rG, rE, 0xAA ; rG == G
pshufd rH, rE, 0xFF ; rH == H
pshufd rE, rE, 0 ; rE == E
%ifidn __YASM_OBJFMT__, macho64
lea rcx, [_sha256_consts_m128i wrt rip]
%endif
%%SHAROUND_LOOP:
%assign i 0
%rep SHA_ROUND_LOOP_UNROLL
sha_round_blk
%assign i i+1
%endrep
cmp rax, rbx
jb %%SHAROUND_LOOP
; Finished the 64 rounds, calculate hash and save
movdqa sr1, [init]
pshufd sr2, sr1, 0x55
pshufd sr3, sr1, 0xAA
pshufd sr4, sr1, 0xFF
pshufd sr1, sr1, 0
paddd rB, sr2
paddd rC, sr3
paddd rD, sr4
paddd rA, sr1
movdqa sr1, [init+4*4]
pshufd sr2, sr1, 0x55
pshufd sr3, sr1, 0xAA
pshufd sr4, sr1, 0xFF
pshufd sr1, sr1, 0
paddd rF, sr2
paddd rG, sr3
paddd rH, sr4
paddd rE, sr1
%endmacro
SHA_256
movdqa [hash1+0*16], rA
movdqa [hash1+1*16], rB
movdqa [hash1+2*16], rC
movdqa [hash1+3*16], rD
movdqa [hash1+4*16], rE
movdqa [hash1+5*16], rF
movdqa [hash1+6*16], rG
movdqa [hash1+7*16], rH
mov data, hash1
mov init, sha256_init
SHA_256
movdqa [hash+7*16], rH
LAB_RET:
%ifidn __OUTPUT_FORMAT__,win64
movdqa xmm6, [rsp + 16*0]
movdqa xmm7, [rsp + 16*1]
movdqa xmm8, [rsp + 16*2]
movdqa xmm9, [rsp + 16*3]
movdqa xmm10, [rsp + 16*4]
movdqa xmm13, [rsp + 16*5]
add rsp, 16 * 6
%endif
pop rbx
ret
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
Loading…
Cancel
Save