OpenCL GPU miner
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

789 lines
18 KiB

/*
* Copyright 2011-2012 Con Kolivas
* Copyright 2011-2012 Luke Dashjr
* Copyright 2010 Jeff Garzik
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version. See COPYING for more details.
*/
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifndef WIN32
#include <sys/resource.h>
#endif
#include <libgen.h>
#include "compat.h"
#include "miner.h"
#include "bench_block.h"
#include "device-cpu.h"
#if defined(unix)
#include <errno.h>
#include <fcntl.h>
#endif
#ifdef __linux /* Linux specific policy and affinity management */
#include <sched.h>
/*
 * Lower the scheduling policy of the caller (pid 0): try SCHED_IDLE first
 * when the headers define it, and fall back to SCHED_BATCH if that call
 * fails. Errors from the final call are deliberately ignored - this is a
 * best-effort courtesy to the rest of the system.
 */
static inline void drop_policy(void)
{
	struct sched_param param;

	/* SCHED_IDLE and SCHED_BATCH require sched_priority == 0; passing an
	 * uninitialised value makes sched_setscheduler() fail with EINVAL. */
	param.sched_priority = 0;
#ifdef SCHED_BATCH
#ifdef SCHED_IDLE
	if (unlikely(sched_setscheduler(0, SCHED_IDLE, &param) == -1))
#endif
		sched_setscheduler(0, SCHED_BATCH, &param);
#endif
}
/*
 * Pin the caller (pid 0) to a single CPU and log the binding.
 *
 * id:  mining thread number, used only in the log message
 * cpu: index of the CPU to bind to
 *
 * The result of sched_setaffinity() is not checked; the log line is emitted
 * whether or not the binding succeeded.
 */
static inline void affine_to_cpu(int id, int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	/* The second argument is the size of the mask itself; sizeof(&set)
	 * would pass the size of a pointer and truncate the mask. */
	sched_setaffinity(0, sizeof(set), &set);
	applog(LOG_INFO, "Binding cpu mining thread %d to cpu %d", id, cpu);
}
#else
/* Build without __linux defined: no scheduling policy is changed, so this
 * is a no-op kept only so callers need no #ifdef. */
static inline void drop_policy(void)
{
}
/* Build without __linux defined: CPU affinity is not set here, so both
 * arguments are ignored and nothing is logged. */
static inline void affine_to_cpu(int id, int cpu)
{
}
#endif
/* TODO: resolve externals */
/* These functions are defined outside this file and declared here by hand
 * rather than through a header. */
/* Called by cpu_scanhash() when the hash function reports a hit. */
extern bool submit_work_sync(struct thr_info *thr, const struct work *work_in);
/* Called by force_nthreads_int() to parse and range-check an integer option. */
extern char *set_int_range(const char *arg, int *i, int min, int max);
/* Maps a thread id to the device number used for logging and affinity. */
extern int dev_from_id(int thr_id);
#ifdef WANT_CPUMINE
/* Length of the longest entry in algo_names; computed by init_max_name_len(). */
static size_t max_name_len = 0;
/* Scratch buffer of spaces used to column-align benchmark log lines;
 * allocated by init_max_name_len(). */
static char *name_spaces_pad = NULL;
/*
 * Command-line names of the SHA-256 implementations, indexed by
 * enum sha256_algos. Entries for implementations that are not compiled in
 * are omitted, so the array may contain NULL holes - readers must check
 * for NULL before using an entry.
 */
const char *algo_names[] = {
[ALGO_C] = "c",
#ifdef WANT_SSE2_4WAY
[ALGO_4WAY] = "4way",
#endif
#ifdef WANT_VIA_PADLOCK
[ALGO_VIA] = "via",
#endif
[ALGO_CRYPTOPP] = "cryptopp",
#ifdef WANT_CRYPTOPP_ASM32
[ALGO_CRYPTOPP_ASM32] = "cryptopp_asm32",
#endif
#ifdef WANT_X8632_SSE2
[ALGO_SSE2_32] = "sse2_32",
#endif
#ifdef WANT_X8664_SSE2
[ALGO_SSE2_64] = "sse2_64",
#endif
#ifdef WANT_X8664_SSE4
[ALGO_SSE4_64] = "sse4_64",
#endif
#ifdef WANT_ALTIVEC_4WAY
[ALGO_ALTIVEC_4WAY] = "altivec_4way",
#endif
};
/*
 * Scan functions for each SHA-256 implementation, indexed by
 * enum sha256_algos in parallel with algo_names. Every entry is cast to the
 * common sha256_func pointer type; implementations that are not compiled in
 * have no entry (the slot is left NULL by the designated initialiser).
 */
static const sha256_func sha256_funcs[] = {
[ALGO_C] = (sha256_func)scanhash_c,
#ifdef WANT_SSE2_4WAY
[ALGO_4WAY] = (sha256_func)ScanHash_4WaySSE2,
#endif
#ifdef WANT_ALTIVEC_4WAY
[ALGO_ALTIVEC_4WAY] = (sha256_func) ScanHash_altivec_4way,
#endif
#ifdef WANT_VIA_PADLOCK
[ALGO_VIA] = (sha256_func)scanhash_via,
#endif
[ALGO_CRYPTOPP] = (sha256_func)scanhash_cryptopp,
#ifdef WANT_CRYPTOPP_ASM32
[ALGO_CRYPTOPP_ASM32] = (sha256_func)scanhash_asm32,
#endif
#ifdef WANT_X8632_SSE2
[ALGO_SSE2_32] = (sha256_func)scanhash_sse2_32,
#endif
#ifdef WANT_X8664_SSE2
[ALGO_SSE2_64] = (sha256_func)scanhash_sse2_64,
#endif
#ifdef WANT_X8664_SSE4
[ALGO_SSE4_64] = (sha256_func)scanhash_sse4_64
#endif
};
#endif
#ifdef WANT_CPUMINE
/* Default hashing implementation: prefer the 64-bit SSE2 code, then the
 * 32-bit SSE2 code, when both built in and enabled by the compiler
 * (__SSE2__); otherwise fall back to the plain C implementation. */
#if defined(WANT_X8664_SSE2) && defined(__SSE2__)
enum sha256_algos opt_algo = ALGO_SSE2_64;
#elif defined(WANT_X8632_SSE2) && defined(__SSE2__)
enum sha256_algos opt_algo = ALGO_SSE2_32;
#else
enum sha256_algos opt_algo = ALGO_C;
#endif
/* When false, cpu_detect() sets the CPU thread count to zero if other
 * devices were already detected (unless a thread count was forced). */
bool opt_usecpu = false;
/* Index into thr_info used by reinit_cpu_device(); not assigned anywhere in
 * the code visible here - NOTE(review): confirm where it is set. */
static int cpur_thr_id;
/* Set by force_nthreads_int() once the user supplies an explicit thread count. */
static bool forced_n_threads;
#endif
#ifdef WANT_CPUMINE
// Algo benchmark, crash-prone, system independent stage.
//
// Runs the scan function registered for `algo` once over a canned work block
// with a nonce limit of 1<<22 and times it with gettimeofday().
//
// Returns nonces scanned per microsecond (logged by the caller as MH/s), or
// -1.0 if no measurable time elapsed. The scan function may crash on
// hardware that lacks the required instructions, which is why callers run
// this in a sacrificial process where possible.
double bench_algo_stage3(
	enum sha256_algos algo
)
{
	// Use a random work block pulled from a pool
	static uint8_t bench_block[] = { CGMINER_BENCHMARK_BLOCK };
	struct work work __attribute__((aligned(128)));

	// Copy as much of the canned block as fits; zero-fill the remainder
	size_t bench_size = sizeof(work);
	size_t work_size = sizeof(bench_block);
	size_t min_size = (work_size < bench_size ? work_size : bench_size);
	memset(&work, 0, sizeof(work));
	memcpy(&work, &bench_block, min_size);

	// Publish a private restart record for the duration of the run. It is
	// zeroed first so nothing reachable through the global pointer is read
	// uninitialised (presumably the scan functions poll it - confirm).
	struct work_restart dummy;
	memset(&dummy, 0, sizeof(dummy));
	work_restart = &dummy;

	struct timeval end;
	struct timeval start;
	uint32_t max_nonce = (1<<22);
	uint32_t last_nonce = 0;

	gettimeofday(&start, 0);
	{
		sha256_func func = sha256_funcs[algo];
		(*func)(
			0,
			work.midstate,
			work.data,
			work.hash1,
			work.hash,
			work.target,
			max_nonce,
			&last_nonce,
			work.blk.nonce
		);
	}
	gettimeofday(&end, 0);

	// dummy goes out of scope on return; do not leave a dangling pointer
	work_restart = NULL;

	uint64_t usec_end = ((uint64_t)end.tv_sec)*1000*1000 + end.tv_usec;
	uint64_t usec_start = ((uint64_t)start.tv_sec)*1000*1000 + start.tv_usec;
	uint64_t usec_elapsed = usec_end - usec_start;

	double rate = -1.0;
	if (0<usec_elapsed) {
		// last_nonce is the final nonce tried, so last_nonce+1 were scanned
		rate = (1.0*(last_nonce+1))/usec_elapsed;
	}
	return rate;
}
#if defined(unix)
// Switch O_NONBLOCK on (yes != 0) or off (yes == 0) for a file descriptor.
// Any fcntl() failure is reported with perror() and terminates the process
// with exit status 1.
static void set_non_blocking(
	int fd,
	int yes
)
{
	int status_flags = fcntl(fd, F_GETFL, 0);

	if (status_flags < 0) {
		perror("fcntl(GET) failed");
		exit(1);
	}

	// Touch only the O_NONBLOCK bit; every other status flag is preserved
	if (yes)
		status_flags |= O_NONBLOCK;
	else
		status_flags &= ~O_NONBLOCK;

	if (fcntl(fd, F_SETFL, status_flags) < 0) {
		perror("fcntl(SET) failed");
		exit(1);
	}
}
#endif // defined(unix)
// Algo benchmark, crash-safe, system-dependent stage.
//
// Runs bench_algo_stage3() for `algo` somewhere a crash cannot take the
// miner down with it:
//   - unix:  in a fork()ed child that sends the rate back through a pipe
//   - WIN32: in a second copy of this executable run under the debug API,
//            which sends the rate back through named shared memory
//   - other: directly in this process, with no protection
//
// Returns the measured rate, or a negative value when the benchmark crashed,
// timed out (about 60s) or produced no result. Failures to set up the
// sandbox itself are fatal: they are reported and the process exits.
static double bench_algo_stage2(
	enum sha256_algos algo
)
{
	// Here, the gig is to safely run a piece of code that potentially
	// crashes. Unfortunately, the Right Way (tm) to do this is rather
	// heavily platform dependent :(

	// Negative sentinel: stays in place unless a real result is received
	double rate = -1.23457;

#if defined(unix)

	// Make a pipe: [readFD, writeFD]
	int pfd[2];
	int r = pipe(pfd);
	if (r<0) {
		perror("pipe - failed to create pipe for --algo auto");
		exit(1);
	}

	// Make pipe non blocking
	set_non_blocking(pfd[0], 1);
	set_non_blocking(pfd[1], 1);

	// Don't allow a crashing child to kill the main process.
	// NOTE: the previous SIGPIPE disposition is not restored afterwards.
	sighandler_t prev_sigpipe = signal(SIGPIPE, SIG_IGN);
	if (SIG_ERR==prev_sigpipe) {
		perror("signal - failed to edit signal mask for --algo auto");
		exit(1);
	}

	// Fork a child to do the actual benchmarking
	pid_t child_pid = fork();
	if (child_pid<0) {
		perror("fork - failed to create a child process for --algo auto");
		exit(1);
	}

	// Do the dangerous work in the child, knowing we might crash
	if (0==child_pid) {

		// TODO: some umask trickery to prevent coredumps

		// Benchmark this algorithm
		double result = bench_algo_stage3(algo);

		// We survived, send result to parent and bail
		int loop_count = 0;
		while (1) {
			ssize_t bytes_written = write(pfd[1], &result, sizeof(result));
			int try_again = (0==bytes_written || (bytes_written<0 && EAGAIN==errno));
			int success = (sizeof(result)==(size_t)bytes_written);

			if (success)
				break;

			if (!try_again) {
				perror("write - child failed to write benchmark result to pipe");
				exit(1);
			}

			if (5<loop_count) {
				applog(LOG_ERR, "child tried %d times to communicate with parent, giving up", loop_count);
				exit(1);
			}
			++loop_count;
			sleep(1);
		}
		exit(0);
	}

	// Parent waits for a result from child
	int loop_count = 0;
	while (1) {

		// Wait for child to die
		int status;
		int waited = waitpid(child_pid, &status, WNOHANG);
		if ((child_pid==waited) || (waited<0 && ECHILD==errno)) {

			// Child died somehow. Grab result and bail.
			// A short or failed read leaves the negative sentinel in place.
			double tmp;
			ssize_t bytes_read = read(pfd[0], &tmp, sizeof(tmp));
			if (sizeof(tmp)==(size_t)bytes_read)
				rate = tmp;
			break;

		} else if (waited<0) {
			perror("bench_algo: waitpid failed. giving up.");
			exit(1);
		}

		// Give up on child after a ~60s
		if (60<loop_count) {
			kill(child_pid, SIGKILL);
			waitpid(child_pid, &status, 0);
			break;
		}

		// Wait a bit longer
		++loop_count;
		sleep(1);
	}

	// Close pipe
	r = close(pfd[0]);
	if (r<0) {
		perror("close - failed to close read end of pipe for --algo auto");
		exit(1);
	}
	r = close(pfd[1]);
	if (r<0) {
		perror("close - failed to close write end of pipe for --algo auto");
		exit(1);
	}

#elif defined(WIN32)

	// Get handle to current exe
	HINSTANCE module = GetModuleHandle(0);
	if (!module) {
		applog(LOG_ERR, "failed to retrieve module handle");
		exit(1);
	}

	// Create a unique name
	char unique_name[32];
	snprintf(
		unique_name,
		sizeof(unique_name)-1,
		"cgminer-%p",
		(void*)module
	);

	// Create and init a chunked of shared memory
	HANDLE map_handle = CreateFileMapping(
		INVALID_HANDLE_VALUE,   // use paging file
		NULL,                   // default security attributes
		PAGE_READWRITE,         // read/write access
		0,                      // size: high 32-bits
		4096,                   // size: low 32-bits
		unique_name             // name of map object
	);
	if (NULL==map_handle) {
		applog(LOG_ERR, "could not create shared memory");
		exit(1);
	}

	void *shared_mem = MapViewOfFile(
		map_handle,     // object to map view of
		FILE_MAP_WRITE, // read/write access
		0,              // high offset: map from
		0,              // low offset: beginning
		0               // default: map entire file
	);
	if (NULL==shared_mem) {
		applog(LOG_ERR, "could not map shared memory");
		exit(1);
	}

	// Tell the child where to find the mapping, and pre-load it with the
	// negative sentinel so a child that never reports reads back as failure
	SetEnvironmentVariable("CGMINER_SHARED_MEM", unique_name);
	CopyMemory(shared_mem, &rate, sizeof(rate));

	// Get path to current exe
	// (200 bytes of the buffer are held back for the appended arguments)
	char cmd_line[256 + MAX_PATH];
	const size_t n = sizeof(cmd_line)-200;
	DWORD size = GetModuleFileName(module, cmd_line, n);
	if (0==size) {
		applog(LOG_ERR, "failed to retrieve module path");
		exit(1);
	}

	// Construct new command line based on that
	char *p = strlen(cmd_line) + cmd_line;
	sprintf(p, " --bench-algo %d", algo);
	SetEnvironmentVariable("CGMINER_BENCH_ALGO", "1");

	// Launch a debug copy of cgminer
	STARTUPINFO startup_info;
	PROCESS_INFORMATION process_info;
	ZeroMemory(&startup_info, sizeof(startup_info));
	ZeroMemory(&process_info, sizeof(process_info));
	startup_info.cb = sizeof(startup_info);

	BOOL ok = CreateProcess(
		NULL,                   // No module name (use command line)
		cmd_line,               // Command line
		NULL,                   // Process handle not inheritable
		NULL,                   // Thread handle not inheritable
		FALSE,                  // Set handle inheritance to FALSE
		DEBUG_ONLY_THIS_PROCESS,// We're going to debug the child
		NULL,                   // Use parent's environment block
		NULL,                   // Use parent's starting directory
		&startup_info,          // Pointer to STARTUPINFO structure
		&process_info           // Pointer to PROCESS_INFORMATION structure
	);
	if (!ok) {
		applog(LOG_ERR, "CreateProcess failed with error %d\n", GetLastError() );
		exit(1);
	}

	// Debug the child (only clean way to catch exceptions)
	while (1) {

		// Wait for child to do something (gives up after 60s of silence)
		DEBUG_EVENT debug_event;
		ZeroMemory(&debug_event, sizeof(debug_event));

		BOOL got_event = WaitForDebugEvent(&debug_event, 60 * 1000);
		if (!got_event)
			break;

		// Decide if event is "normal"
		int go_on =
			CREATE_PROCESS_DEBUG_EVENT== debug_event.dwDebugEventCode ||
			CREATE_THREAD_DEBUG_EVENT == debug_event.dwDebugEventCode ||
			EXIT_THREAD_DEBUG_EVENT   == debug_event.dwDebugEventCode ||
			EXCEPTION_DEBUG_EVENT     == debug_event.dwDebugEventCode ||
			LOAD_DLL_DEBUG_EVENT      == debug_event.dwDebugEventCode ||
			OUTPUT_DEBUG_STRING_EVENT == debug_event.dwDebugEventCode ||
			UNLOAD_DLL_DEBUG_EVENT    == debug_event.dwDebugEventCode;
		if (!go_on)
			break;

		// Some exceptions are also "normal", apparently.
		if (EXCEPTION_DEBUG_EVENT== debug_event.dwDebugEventCode) {

			int benign =
				EXCEPTION_BREAKPOINT== debug_event.u.Exception.ExceptionRecord.ExceptionCode;
			if (!benign)
				break;
		}

		// If nothing unexpected happened, let child proceed
		ContinueDebugEvent(
			debug_event.dwProcessId,
			debug_event.dwThreadId,
			DBG_CONTINUE
		);
	}

	// Clean up child process
	TerminateProcess(process_info.hProcess, 1);
	CloseHandle(process_info.hProcess);
	CloseHandle(process_info.hThread);

	// Reap return value and cleanup
	CopyMemory(&rate, shared_mem, sizeof(rate));
	(void)UnmapViewOfFile(shared_mem);
	(void)CloseHandle(map_handle);

#else

	// Not linux, not unix, not WIN32 ... do our best
	rate = bench_algo_stage3(algo);

#endif // defined(unix)

	// Done
	return rate;
}
// Benchmark one algorithm, log the outcome, and record it as the best so far
// if it beats *best_rate.
//
// best_rate: in/out, highest rate seen so far
// best_algo: in/out, algorithm that achieved *best_rate
// algo:      algorithm to benchmark; must have an entry in algo_names
//
// Relies on init_max_name_len() having run: it uses max_name_len and the
// name_spaces_pad buffer to line up the log columns.
static void bench_algo(
	double *best_rate,
	enum sha256_algos *best_algo,
	enum sha256_algos algo
)
{
	const char *label = algo_names[algo];

	// Build a run of spaces that pads this name out to the longest one
	size_t pad_len = max_name_len - strlen(label);
	memset(name_spaces_pad, ' ', pad_len);
	name_spaces_pad[pad_len] = 0;

	applog(LOG_ERR, "\"%s\"%s : benchmarking algorithm ...", label, name_spaces_pad);

	double measured = bench_algo_stage2(algo);

	// A negative rate is the "crashed / timed out / unsupported" signal
	if (measured < 0.0) {
		applog(LOG_ERR, "\"%s\"%s : algorithm fails on this platform", label, name_spaces_pad);
		return;
	}

	applog(LOG_ERR, "\"%s\"%s : algorithm runs at %.5f MH/s", label, name_spaces_pad, measured);

	if (*best_rate < measured) {
		*best_rate = measured;
		*best_algo = algo;
	}
}
// Figure out the longest algorithm name
void init_max_name_len()
{
size_t i;
size_t nb_names = sizeof(algo_names)/sizeof(algo_names[0]);
for (i=0; i<nb_names; ++i) {
const char *p = algo_names[i];
size_t name_len = p ? strlen(p) : 0;
if (max_name_len<name_len)
max_name_len = name_len;
}
name_spaces_pad = (char*) malloc(max_name_len+16);
if (0==name_spaces_pad) {
perror("malloc failed");
exit(1);
}
}
// Pick the fastest CPU hasher.
// Benchmarks every implementation that was compiled in, logs the winner and
// returns it. If no benchmark reports a non-negative rate, the initial
// values (algorithm 0, rate -1.0) are logged and returned unchanged.
static enum sha256_algos pick_fastest_algo()
{
	double best_rate = -1.0;
	enum sha256_algos best_algo = 0;

	applog(LOG_ERR, "benchmarking all sha256 algorithms ...");

	bench_algo(&best_rate, &best_algo, ALGO_C);
#ifdef WANT_SSE2_4WAY
	bench_algo(&best_rate, &best_algo, ALGO_4WAY);
#endif
#ifdef WANT_VIA_PADLOCK
	bench_algo(&best_rate, &best_algo, ALGO_VIA);
#endif
	bench_algo(&best_rate, &best_algo, ALGO_CRYPTOPP);
#ifdef WANT_CRYPTOPP_ASM32
	bench_algo(&best_rate, &best_algo, ALGO_CRYPTOPP_ASM32);
#endif
#ifdef WANT_X8632_SSE2
	bench_algo(&best_rate, &best_algo, ALGO_SSE2_32);
#endif
#ifdef WANT_X8664_SSE2
	bench_algo(&best_rate, &best_algo, ALGO_SSE2_64);
#endif
#ifdef WANT_X8664_SSE4
	bench_algo(&best_rate, &best_algo, ALGO_SSE4_64);
#endif
#ifdef WANT_ALTIVEC_4WAY
	bench_algo(&best_rate, &best_algo, ALGO_ALTIVEC_4WAY);
#endif

	// Pad the winner's name to the common column width for the summary line
	const char *winner = algo_names[best_algo];
	size_t pad_len = max_name_len - strlen(winner);
	memset(name_spaces_pad, ' ', pad_len);
	name_spaces_pad[pad_len] = 0;

	applog(LOG_ERR, "\"%s\"%s : is fastest algorithm at %.5f MH/s", winner, name_spaces_pad, best_rate);

	return best_algo;
}
/* FIXME: Use asprintf for better errors. */
char *set_algo(const char *arg, enum sha256_algos *algo)
{
enum sha256_algos i;
if (!strcmp(arg, "auto")) {
*algo = pick_fastest_algo();
return NULL;
}
for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
if (algo_names[i] && !strcmp(arg, algo_names[i])) {
*algo = i;
return NULL;
}
}
return "Unknown algorithm";
}
/*
 * Write the name of *algo into buf, copying at most OPT_SHOW_LEN bytes.
 * As with any strncpy(), buf is not NUL-terminated if the name is
 * OPT_SHOW_LEN bytes or longer. *algo must index a non-NULL entry of
 * algo_names.
 */
void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo)
{
	const char *name = algo_names[*algo];

	strncpy(buf, name, OPT_SHOW_LEN);
}
#endif
#ifdef WANT_CPUMINE
/*
 * Option parser for an explicit CPU thread count.
 * Records that the user forced the count (even if parsing then fails) and
 * delegates parsing to set_int_range() with bounds 0..9999, returning
 * whatever that returns.
 */
char *force_nthreads_int(const char *arg, int *i)
{
	char *parse_result;

	forced_n_threads = true;
	parse_result = set_int_range(arg, i, 0, 9999);

	return parse_result;
}
#endif
#ifdef WANT_CPUMINE
/*
 * Detect CPUs and register one mining device per CPU thread.
 *
 * Sets num_processors, settles opt_n_threads (zero when other devices exist
 * and CPU mining was not requested; otherwise the processor count, unless
 * the user forced a value), clamps it so the device table cannot overflow
 * MAX_DEVICES, then allocates the cpus array and appends its entries to
 * devices[].
 */
static void cpu_detect()
{
	int i;

	// Reckon number of cores in the box
#if defined(WIN32)
	{
		/* GetProcessAffinityMask() fills (process mask, system mask), in
		 * that order, through DWORD_PTR out-parameters. Plain DWORDs would
		 * be overrun on 64-bit builds. */
		DWORD_PTR process_am;
		DWORD_PTR system_am;
		BOOL ok = GetProcessAffinityMask(
			GetCurrentProcess(),
			&process_am,
			&system_am
		);
		if (!ok) {
			applog(LOG_ERR, "couldn't figure out number of processors :(");
			num_processors = 1;
		} else {
			/* Count the bits of the system mask, which is the value the
			 * previous code effectively counted as well. The shift is done
			 * in an unsigned type so the top bit is well defined. */
			size_t n = 8 * sizeof(system_am);
			num_processors = 0;
			while (n--)
				if (system_am & ((DWORD_PTR)1 << n))
					++num_processors;
		}
	}
#else
	num_processors = sysconf(_SC_NPROCESSORS_ONLN);
#endif /* !WIN32 */

	if (opt_n_threads < 0 || !forced_n_threads) {
		if (total_devices && !opt_usecpu)
			opt_n_threads = 0;
		else
			opt_n_threads = num_processors;
	}
	if (num_processors < 1)
		return;

	if (total_devices + opt_n_threads > MAX_DEVICES)
		opt_n_threads = MAX_DEVICES - total_devices;

	cpus = calloc(opt_n_threads, sizeof(struct cgpu_info));
	/* calloc(0, ...) may legitimately return NULL; only a failed non-empty
	 * request is an error. */
	if (unlikely(!cpus && opt_n_threads != 0))
		quit(1, "Failed to calloc cpus");

	for (i = 0; i < opt_n_threads; ++i) {
		struct cgpu_info *cgpu;

		cgpu = devices[total_devices + i] = &cpus[i];
		cgpu->api = &cpu_api;
		cgpu->enabled = true;
		cgpu->device_id = i;
		cgpu->threads = 1;
	}
	total_devices += opt_n_threads;
}
/* Request re-initialisation of a CPU device by pushing it onto the queue of
 * the thread at index cpur_thr_id. The result of tq_push() is ignored.
 * NOTE(review): cpur_thr_id is never assigned in the code visible here. */
static void reinit_cpu_device(struct cgpu_info *cpu)
{
tq_push(thr_info[cpur_thr_id].q, cpu);
}
/* thread_prepare hook for the CPU driver: calls thread_reportin() for the
 * thread and always reports success. */
static bool cpu_thread_prepare(struct thr_info *thr)
{
thread_reportin(thr);
return true;
}
/* can_limit_work hook for the CPU driver: always returns the constant
 * 0xfffff, regardless of which thread asks. */
static uint64_t cpu_can_limit_work(struct thr_info *thr)
{
	const uint64_t work_limit = 0xfffff;

	(void)thr;	/* the limit does not depend on the thread */
	return work_limit;
}
/*
 * thread_init hook for the CPU driver: make the mining thread as polite as
 * possible and, where it makes sense, pin it to one CPU. Always returns
 * true; none of the scheduling calls are treated as fatal.
 */
static bool cpu_thread_init(struct thr_info *thr)
{
	const int thr_id = thr->id;

	/* Nice 19 first, then SCHED_IDLE with SCHED_BATCH as the fallback
	 * (see drop_policy). Failure of either is not an error. */
	setpriority(PRIO_PROCESS, 0, 19);
	drop_policy();

	/* Pinning only makes sense when the thread count is a whole multiple
	 * of the CPU count, so that threads spread evenly */
	if ((opt_n_threads % num_processors) == 0) {
		affine_to_cpu(dev_from_id(thr_id),
			      dev_from_id(thr_id) % num_processors);
	}

	return true;
}
/*
 * scanhash hook for the CPU driver.
 *
 * Runs the selected hash function over the work item starting at
 * work->blk.nonce. Each time it reports a hit, the work is submitted and
 * the scan resumes from the nonce after the hit, until a pass ends without
 * one. A failed submission is logged but does not stop the scan.
 *
 * Returns the number of nonces covered (last nonce - first nonce + 1), or 0
 * if the final pass made no progress. On a non-zero return work->blk.nonce
 * is left pointing at the next nonce to try.
 */
static uint64_t cpu_scanhash(struct thr_info *thr, struct work *work, uint64_t max_nonce)
{
	const int thr_id = thr->id;
	const uint32_t first_nonce = work->blk.nonce;
	uint32_t last_nonce;
	bool found;

	for (;;) {
		/* Looked up on every pass, as the selected algorithm is a global */
		sha256_func scan = sha256_funcs[opt_algo];

		last_nonce = first_nonce;

		/* scan nonces for a proof-of-work hash */
		found = (*scan)(
			thr_id,
			work->midstate,
			work->data,
			work->hash1,
			work->hash,
			work->target,
			max_nonce,
			&last_nonce,
			work->blk.nonce
		);

		if (!found)
			break;

		/* nonce found: submit work, then carry on after it */
		if (opt_debug)
			applog(LOG_DEBUG, "CPU %d found something?", dev_from_id(thr_id));
		if (unlikely(!submit_work_sync(thr, work))) {
			applog(LOG_ERR, "Failed to submit_work_sync in miner_thread %d", thr_id);
		}
		work->blk.nonce = last_nonce + 1;
	}

	/* The scan function never advanced: nothing was hashed this pass */
	if (unlikely(last_nonce == first_nonce))
		return 0;

	work->blk.nonce = last_nonce + 1;
	return last_nonce - first_nonce + 1;
}
/* Driver table for CPU mining: wires the functions defined in this file
 * into the generic device API. cpu_detect() stores a pointer to this table
 * in every CPU device it registers. */
struct device_api cpu_api = {
.name = "CPU",
.api_detect = cpu_detect,
.reinit_device = reinit_cpu_device,
.thread_prepare = cpu_thread_prepare,
.can_limit_work = cpu_can_limit_work,
.thread_init = cpu_thread_init,
.scanhash = cpu_scanhash,
};
#endif