mirror of https://github.com/GOSTSec/sgminer
Noel Maersk
10 years ago
18 changed files with 1479 additions and 496 deletions
@ -1,19 +1,63 @@
@@ -1,19 +1,63 @@
|
||||
Version 4.0.0 - 15th January 2014 |
||||
# Release news |
||||
|
||||
## Version 4.1.0 - 7th February 2014 |
||||
|
||||
* Writing configuration file from NCurses interface is broken! |
||||
* Commandline option parsing may be broken! |
||||
* MinGW building may be broken! |
||||
* Bug reporting documentation: `doc/BUGS.md`. |
||||
* Kernel selection and integration: `doc/KERNEL.md`. |
||||
* Several updates to other documentation files. |
||||
* Warn if `GPU_*` environment variables are not set. |
||||
* Maximum `intensity` lowered to 31 (anything above that gives an overflow |
||||
anyway). |
||||
* Experimental `xintensity` setting (by _Kalroth_), see |
||||
[commit message](https://github.com/veox/sgminer/commit/7aeae40af22e6108aab8b68a229eea25a639d650). |
||||
* Experimental `rawintensity` setting (by _Kalroth_), see |
||||
[commit message](https://github.com/veox/sgminer/commit/d11df698d141988491494aa1f29c7d3595e9712b). |
||||
* `-v` is now a shorthand for `--verbose`, not `--vectors`. |
||||
* Default `scantime` and `expiry` changed to 7 and 28 (tests run by |
||||
_MissedOutOnBTC_). |
||||
* Increased precision of `API_MHS`; added `API_KHS`. |
||||
* Pulled in kernels: `alexkarold`, `alexkarnew`, `psw`, `zuikkis`; renamed |
||||
`scrypt` to `ckolivas`; all kernels now reside in directory `kernel`. |
||||
Kernels can be chosen at startup only, by specifying `kernel`. |
||||
* Small optimisation to `ckolivas` kernel (by _gdevenyi_). |
||||
* Named pools via `poolname` (by _Kalroth_). |
||||
* Failover pool switching back delay is configurable via |
||||
`failover-switch-delay` (by _Kalroth_). |
||||
* Pool `state`: `enabled`, `disabled`, and `hidden` (by _Joe4782_). |
||||
* Allow all pools to be set `disabled`. |
||||
* Use RPM in ADL `get-fanspeed` requests (from `bfgminer`, by _luke-jr_). |
||||
* Verbose ADL failure messages (by _Joe4782_ and _deba12_). |
||||
* Use `git` version string if available. |
||||
* Allow bypassing ADL checks during build with `--disable-adl-checks`. |
||||
* MinGW build checks (by _tonobitc_). |
||||
* Experimental Microsoft Visual Studio 2010 building support in branch |
||||
`build-msvs2010-upd` (by _troky_). |
||||
|
||||
|
||||
## Version 4.0.0 - 15th January 2014 |
||||
|
||||
* Fork `veox/sgminer` from `ckolivas/cgminer` version 3.7.2. |
||||
* Remove code referencing SHA256d mining, FPGAs and ASICS. Leftovers most probably still remain. |
||||
* Remove code referencing SHA256d mining, FPGAs and ASICS. Leftovers most |
||||
probably still remain. |
||||
* AMD ADL crash fix on R9 chipsets by Benjamin Herrenschmidt. |
||||
* Maximum allowed intensity increased to 42. |
||||
* Move documentation to directory `doc`. |
||||
* `--gpu-threads` support for comma-separated values by Martin "Kalroth" Danielsen. |
||||
* AMD ADL SDK 5 mandatory, preparation for ADL Overdrive 6 support by Martin. |
||||
* `--gpu-threads` support for comma-separated values by Martin Danielsen |
||||
(_Kalroth_). |
||||
* AMD ADL SDK 5 mandatory, preparation for ADL Overdrive 6 support by |
||||
_Kalroth_. |
||||
* Allow changing TCP keepalive packet idle time using `--tcp-keepalive`. |
||||
* Automatic library presence detection by `configure`. |
||||
* `--scrypt` option removed (no other choice now). |
||||
* `--vectors` option removed (current kernel only supports 1). |
||||
* Display per-GPU reject percentage instead of absolute values by Martin. |
||||
* Display per-GPU reject percentage instead of absolute values by _Kalroth_. |
||||
* Do not show date in log by default (switch with `--log-show-date`). |
||||
* Fix network difficulty display to resemble that of cgminer 3.1.1. |
||||
* Forward-port relevant bugfixes from `ckolivas/cgminer`, up to cgminer version 3.10.0. |
||||
* Fix network difficulty display to resemble that of `cgminer` 3.1.1. |
||||
* Forward-port relevant bugfixes from `ckolivas/cgminer`, up to `cgminer` |
||||
version 3.10.0. |
||||
|
||||
|
||||
Previous NEWS file available [here](https://github.com/veox/sgminer/blob/829f0687bfd0ddb0cf12a9a8588ae2478dfe8d99/NEWS). |
||||
|
@ -1,154 +0,0 @@
@@ -1,154 +0,0 @@
|
||||
FAQ |
||||
|
||||
Q: Can I mine on servers from different networks (eg smartcoin and bitcoin) at |
||||
the same time? |
||||
A: No, sgminer keeps a database of the block it's working on to ensure it does |
||||
not work on stale blocks, and having different blocks from two networks would |
||||
make it invalidate the work from each other. |
||||
|
||||
Q: Can I configure sgminer to mine with different login credentials or pools |
||||
for each separate device? |
||||
A: No. |
||||
|
||||
Q: Can I put multiple pools in the config file? |
||||
A: Yes, check the example.conf file. Alternatively, set up everything either on |
||||
the command line or via the menu after startup and choose settings->write |
||||
config file and the file will be loaded on each startup. |
||||
|
||||
Q: The build fails with gcc is unable to build a binary. |
||||
A: Remove the "-march=native" component of your CFLAGS as your version of gcc |
||||
does not support it. |
||||
|
||||
Q: Can you implement feature X? |
||||
A: I can, but time is limited, and people who donate are more likely to get |
||||
their feature requests implemented. |
||||
|
||||
Q: Work keeps going to my backup pool even though my primary pool hasn't |
||||
failed? |
||||
A: sgminer checks for conditions where the primary pool is lagging and will |
||||
pass some work to the backup servers under those conditions. The reason for |
||||
doing this is to try its absolute best to keep the GPUs working on something |
||||
useful and not risk idle periods. You can disable this behaviour with the |
||||
option --failover-only. |
||||
|
||||
Q: Is this a virus? |
||||
A: sgminer is being packaged with other trojan scripts and some antivirus |
||||
software is falsely accusing sgminer.exe as being the actual virus, rather |
||||
than whatever it is being packaged with. If you installed sgminer yourself, |
||||
then you do not have a virus on your computer. Complain to your antivirus |
||||
software company. They seem to be flagging even source code now from sgminer |
||||
as viruses, even though text source files can't do anything by themselves. |
||||
|
||||
Q: Can you modify the display to include more of one thing in the output and |
||||
less of another, or can you change the quiet mode or can you add yet another |
||||
output mode? |
||||
A: Everyone will always have their own view of what's important to monitor. |
||||
The defaults are very sane and I have very little interest in changing this |
||||
any further. |
||||
|
||||
Q: What are the best parameters to pass for X pool/hardware/device. |
||||
A: Virtually always, the DEFAULT parameters give the best results. Most user |
||||
defined settings lead to worse performance. The ONLY thing most users should |
||||
need to set is the Intensity for GPUs. |
||||
|
||||
Q: What happened to CPU mining? |
||||
A: Being increasingly irrelevant for most users, and a maintenance issue, it is |
||||
no longer under active development and will not be supported. No binary builds |
||||
supporting CPU mining will be released. Virtually all remaining users of CPU |
||||
mining are as back ends for illegal botnets. The main reason sgminer is being |
||||
inappropriately tagged as a virus by antivirus software is due to the trojans |
||||
packaging a CPU mining capable version of it. There is no longer ANY CPU mining |
||||
code in sgminer. If you are mining bitcoin with CPU today, you are spending |
||||
1000x more in electricity costs than you are earning in bitcoin. |
||||
|
||||
Q: GUI version? |
||||
A: No. The RPC interface makes it possible for someone else to write one |
||||
though. |
||||
|
||||
Q: I'm having an issue. What debugging information should I provide in the |
||||
bug report? |
||||
A: Start sgminer with your regular commands and add -D -T --verbose and provide |
||||
the full startup output and a summary of your hardware, operating system, AMD |
||||
driver version and AMD APP SDK version (if installed separately). |
||||
|
||||
Q: Why don't you provide win64 builds? |
||||
A: Win32 builds work everywhere and there is precisely zero advantage to a |
||||
64 bit build on windows. |
||||
|
||||
Q: Is it faster to mine on windows or linux? |
||||
A: It makes no difference. It comes down to choice of operating system for |
||||
their various features. Linux offers much better long term stability and |
||||
remote monitoring and security, while windows offers you overclocking tools |
||||
that can achieve much more than sgminer can do on linux. |
||||
|
||||
Q: Can I mine with sgminer on a MAC? |
||||
A: sgminer will compile on OSX, but the performance of GPU mining is |
||||
compromised due to the opencl implementation on OSX, there is no temperature |
||||
or fanspeed monitoring, and the cooling design of most MACs, despite having |
||||
powerful GPUs, will usually not cope with constant usage leading to a high |
||||
risk of thermal damage. It is highly recommended not to mine on a MAC. |
||||
|
||||
Q: I switch users on windows and my mining stops working? |
||||
A: That's correct, it does. It's a permissions issue that there is no known |
||||
fix for due to monitoring of GPU fanspeeds and temperatures. If you disable |
||||
the monitoring with --no-adl it should switch okay. |
||||
|
||||
Q: My network gets slower and slower and then dies for a minute? |
||||
A: Try the --net-delay option. |
||||
|
||||
Q: How do I tune for p2pool? |
||||
A: p2pool has very rapid expiration of work and new blocks, it is suggested you |
||||
decrease intensity by 1 from your optimal value, and decrease GPU threads to 1 |
||||
with -g 1. It is also recommended to use --failover-only since the work is |
||||
effectively like a different block chain. |
||||
|
||||
Q: Are OpenCL kernels from other mining software useable in sgminer? |
||||
A: No, the APIs are slightly different between the different software and they |
||||
will not work. |
||||
|
||||
Q: I run PHP on windows to access the API with the example miner.php. Why does |
||||
it fail when php is installed properly but I only get errors about Sockets not |
||||
working in the logs? |
||||
A: http://us.php.net/manual/en/sockets.installation.php |
||||
|
||||
Q: Can I mine scrypt with FPGAs or ASICs? |
||||
A: As of Jan 15 2014, no. |
||||
|
||||
Q: What is stratum and how do I use it? |
||||
A: Stratum is a protocol designed for pooled mining in such a way as to |
||||
minimise the amount of network communications, yet scale to hardware of any |
||||
speed. With versions of sgminer 2.8.0+, if a pool has stratum support, sgminer |
||||
will automatically detect it and switch to the support as advertised if it can. |
||||
If you input the stratum port directly into your configuration, or use the |
||||
special prefix "stratum+tcp://" instead of "http://", sgminer will ONLY try to |
||||
use stratum protocol mining. The advantages of stratum to the miner are no |
||||
delays in getting more work for the miner, less rejects across block changes, |
||||
and far less network communications for the same amount of mining hashrate. If |
||||
you do NOT wish sgminer to automatically switch to stratum protocol even if it |
||||
is detected, add the --fix-protocol option. |
||||
|
||||
Q: Why don't the statistics add up: Accepted, Rejected, Stale, Hardware Errors, |
||||
Diff1 Work, etc. when mining greater than 1 difficulty shares? |
||||
A: As an example, if you look at 'Difficulty Accepted' in the RPC API, the number |
||||
of difficulty shares accepted does not usually exactly equal the amount of work |
||||
done to find them. If you are mining at 8 difficulty, then you would expect on |
||||
average to find one 8 difficulty share, per 8 single difficulty shares found. |
||||
However, the number is actually random and converges over time, it is an average, |
||||
not an exact value, thus you may find more or less than the expected average. |
||||
|
||||
Q: Why do the scrypt diffs not match with the current difficulty target? |
||||
A: The current scrypt block difficulty is expressed in terms of how many |
||||
multiples of the BTC difficulty it currently is (eg 28) whereas the shares of |
||||
"difficulty 1" are actually 65536 times smaller than the BTC ones. The diff |
||||
expressed by sgminer is as multiples of difficulty 1 shares. |
||||
|
||||
Q: Can I make a donation? |
||||
A: Yes, see AUTHORS.md for authors' donation addresses. |
||||
|
||||
Q: What should my Work Utility (WU) be? |
||||
A: Work utility is the product of hashrate * luck and only stabilises over a |
||||
very long period of time. Assuming all your work is valid work, bitcoin mining |
||||
should produce a work utility of approximately 1 per 71.6MH. This means at |
||||
5GH you should have a WU of 5000 / 71.6 or ~ 69. You cannot make your machine |
||||
do "better WU" than this - it is luck related. However you can make it much |
||||
worse if your machine produces a lot of hardware errors producing invalid work. |
@ -0,0 +1,170 @@
@@ -0,0 +1,170 @@
|
||||
# FAQ |
||||
|
||||
Q: Why is the network difficulty wrong? |
||||
A: It is not wrong. Sharediff of 1 (historically) corresponds to a |
||||
network difficulty of 1/65536. Throughout the interface, share difficulty |
||||
is displayed as whole numbers, not fractionals. Pools use the same |
||||
convention (with the notable exception of P2Pool). Until pools start |
||||
using true network difficulty to display share difficulty, there is no |
||||
reason to display difficulty differently. This is a vicious cycle and a |
||||
remnant of Bitcoin mining on GPUs. |
||||
|
||||
Q: Can I mine on servers from different networks (eg litecoin and |
||||
dogecoin) at the same time? |
||||
A: No. `sgminer` keeps a database of the block it's working on to ensure |
||||
it does not work on stale blocks, and having different blocks from two |
||||
networks would make it invalidate the work from each other. |
||||
|
||||
Q: Can I configure sgminer to mine with different login credentials or |
||||
pools for each separate device? |
||||
A: No. |
||||
|
||||
Q: Can I put multiple pools in the config file? |
||||
A: Yes, check the `example.conf` file. Alternatively, set up everything |
||||
either on the command line or via the menu after startup and choose |
||||
`Settings -> Write config file`. |
||||
|
||||
Q: The build fails with `gcc is unable to build a binary`. |
||||
A: Remove the "-march=native" component of your `CFLAGS` as your version |
||||
of gcc does not support it. |
||||
|
||||
Q: Can you implement feature X? |
||||
A: I can, but time is limited, and people who donate are more likely to |
||||
get their feature requests implemented. |
||||
|
||||
Q: Work keeps going to my backup pool even though my primary pool |
||||
hasn't failed? |
||||
A: sgminer checks for conditions where the primary pool is lagging and |
||||
will pass some work to the backup servers under those conditions. The |
||||
reason for doing this is to try its absolute best to keep the GPUs |
||||
working on something useful and not risk idle periods. You can disable |
||||
this behaviour with the option --failover-only. |
||||
|
||||
Q: Is this a virus? |
||||
A: sgminer is being packaged with other trojan |
||||
scripts and some antivirus software is falsely accusing sgminer.exe as |
||||
being the actual virus, rather than whatever it is being packaged with. |
||||
If you had built sgminer yourself, then you do not have a virus on your |
||||
computer. Complain to your antivirus software company. |
||||
|
||||
Q: Can you modify the display to include more of one thing in the output |
||||
and less of another, or can you change the quiet mode or can you add |
||||
yet another output mode? |
||||
A: Everyone will always have their own view of what's important to |
||||
monitor. The shipped NCurses TUI is intentionally ascetic, and is only |
||||
provided as a fallback. It is recommended to use an API client if you |
||||
want to customise the display. |
||||
|
||||
Q: GUI version? |
||||
A: No. The API makes it possible for someone else to write one though. |
||||
|
||||
Q: What are the best parameters to pass for pool / hardware / device? |
||||
A: See `doc/MINING.md` in your source distribution directory, or |
||||
[doc/MINING.md](https://github.com/veox/sgminer/blob/master/doc/MINING.md) |
||||
for an online version. Note that the latter is for the latest |
||||
development version, and arguments listed there are not necessarily |
||||
available in your local version. |
||||
|
||||
Q: Is CPU mining supported? |
||||
A: No. Consider using [cpuminer](https://github.com/pooler/cpuminer). |
||||
|
||||
Q: I'm having an issue. What debugging information should I provide in |
||||
the bug report? |
||||
A: See `doc/BUGS.md` in your source distribution directory, or |
||||
[doc/BUGS.md](https://github.com/veox/sgminer/blob/master/doc/BUGS.md) |
||||
for an online version. |
||||
|
||||
Q: Why don't you provide binaries? |
||||
A: Binaries are a hassle to maintain. On Linux, they should be provided |
||||
by your distribution's package manager anyway. Running an unoptimised |
||||
binary gives a minor performance penalty. Running binaries from |
||||
untrusted providers is a security risk. There has not been sufficient |
||||
community interest to provide distributed deterministic builds. |
||||
|
||||
Q: Is it better to mine on Linux or Windows? |
||||
A: It comes down to choice of operating system for their various |
||||
features. Linux offers specialised mining distributions, much better |
||||
long term stability, remote monitoring and security, while Windows |
||||
offers overclocking tools that can achieve much more than sgminer can do |
||||
on Linux. YMMV. |
||||
|
||||
Q: Can I mine with sgminer on a Mac? |
||||
A: sgminer will compile on OSX, but the performance of GPU mining |
||||
is compromised due to the OpenCL implementation on OSX, there is no |
||||
temperature or fanspeed monitoring, and the cooling design will usually |
||||
not cope with constant usage leading to a high risk of thermal damage. |
||||
It is highly recommended not to mine on a Mac. |
||||
|
||||
Q: I switch users on Windows and my mining stops working? |
||||
A: That's correct, it does. It's a permissions issue that there is no |
||||
known fix for due to monitoring of GPU fanspeeds and temperatures. If |
||||
you disable the monitoring with `--no-adl` it should switch okay. |
||||
|
||||
Q: My network gets slower and slower and then dies for a minute? |
||||
A: Try the `--net-delay` option. |
||||
|
||||
Q: How do I tune for P2Pool? |
||||
A: P2Pool has very rapid expiration of work and new blocks, it is |
||||
suggested you decrease intensity, decrease `scantime` and `expiry`, |
||||
and/or decrease GPU threads to 1 with `-g 1`. It is also recommended to |
||||
use `--failover-only` since the work is effectively a separate |
||||
blockchain. |
||||
|
||||
Q: Are OpenCL kernels from other mining software usable in sgminer? |
||||
A: Most often no. |
||||
|
||||
Q: How do I add my own kernel? |
||||
A: See `doc/KERNEL.md` in your source distribution directory, or |
||||
[doc/KERNEL.md](https://github.com/veox/sgminer/blob/master/doc/KERNEL.md) |
||||
for an online version. |
||||
|
||||
Q: I run PHP on Windows to access the API with the example |
||||
`miner.php`. Why does it fail when PHP is installed properly but |
||||
I only get errors about Sockets not working in the logs? |
||||
A: http://us.php.net/manual/en/sockets.installation.php |
||||
|
||||
Q: Will sgminer support FPGAs or ASICs? |
||||
A: No. sgminer will only support GPUs. It is bad software design |
||||
practice to try and support every gadget out there. Developers |
||||
for dedicated hardware products are better off creating standalone |
||||
software. |
||||
|
||||
Q: What is stratum and how do I use it? |
||||
A: Stratum is a protocol designed for pooled mining in such a way as to |
||||
minimise the amount of network communications, yet scale to hardware |
||||
of any speed. If a pool has stratum support (and most public ones do), |
||||
sgminer will automatically detect it and switch to the support as |
||||
advertised if it can. If you input the stratum port directly into your |
||||
configuration, or use the special prefix `stratum+tcp://` instead of |
||||
`http://`, sgminer will ONLY try to use stratum protocol mining. The |
||||
advantages of stratum to the miner are no delays in getting more work |
||||
for the miner, fewer rejects across block changes, and far less network |
||||
communications for the same amount of mining hashrate. If you do not |
||||
wish sgminer to automatically switch to stratum protocol even if it is |
||||
detected, add the `--fix-protocol` option. |
||||
|
||||
Q: Why don't the statistics add up: Accepted, Rejected, Stale, Hardware |
||||
Errors, Diff1 Work, etc. when mining greater than 1 difficulty shares? |
||||
A: As an example, if you look at 'Difficulty Accepted' in the RPC API, |
||||
the number of difficulty shares accepted does not usually exactly equal |
||||
the amount of work done to find them. If you are mining at 8 difficulty, |
||||
then you would expect on average to find one 8 difficulty share, per 8 |
||||
single difficulty shares found. However, the number is actually random |
||||
and converges over time, it is an average, not an exact value, thus you |
||||
may find more or less than the expected average. |
||||
|
||||
Q: Why do the scrypt diffs not match with the current difficulty target? |
||||
A: The current scrypt block difficulty is expressed in terms of how |
||||
many multiples of the BTC difficulty it currently is (eg 28) whereas |
||||
the shares of "difficulty 1" are actually 65536 times smaller than the |
||||
BTC ones. The diff expressed by sgminer is as multiples of difficulty 1 |
||||
shares. |
||||
|
||||
Q: Can I make a donation? |
||||
A: Yes, see AUTHORS.md for authors' donation addresses. |
||||
|
||||
Q: What is Work Utility (WU)? |
||||
A: Work utility is the product of hashrate * luck and only stabilises |
||||
over a very long period of time. Luck includes hardware error rate, |
||||
share reject rate and other parameters. Therefore, it is often a better |
||||
indicator of hardware or software misconfiguration. |
@ -1,237 +0,0 @@
@@ -1,237 +0,0 @@
|
||||
While BTC donations are preferred, if you wish to donate to the author, Con |
||||
Kolivas, in LTC, please submit your donations to: |
||||
|
||||
Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH |
||||
|
||||
Otherwise, please donate in BTC as per the main README. |
||||
|
||||
--- |
||||
|
||||
Scrypt mining, AKA litecoin mining, for GPU is completely different to sha256 |
||||
used for bitcoin mining. The algorithm was originally developed in a manner |
||||
that it was anticipated would make it suitable for mining on CPU but NOT GPU. |
||||
Thanks to some innovative work by Artforz and mtrlt, this was proven to be |
||||
wrong. However, it has very different requirements to bitcoin mining and is a |
||||
lot more complicated to get working well. Note that it is a ram dependent |
||||
workload, and requires you to have enough system ram as well as fast enough |
||||
GPU ram. If you have less system ram than your GPU has, it may not be possible |
||||
to mine at any reasonable rate. |
||||
|
||||
There are 5 main parameters to tuning scrypt, all of which are optional for |
||||
further fine tuning. When you start mining, sgminer may fail IN RANDOM WAYS. |
||||
They are all due to parameters being outside what the GPU can cope with. |
||||
|
||||
NOTE that if it does not fail at startup, the presence of hardware errors (HW) |
||||
is a sure sign that you have set the parameters too high. |
||||
|
||||
|
||||
DRIVERS AND OPENCL SDK |
||||
|
||||
The choice of driver version for your GPU is critical, as some are known to |
||||
break scrypt mining entirely while others give poor hashrates. As for the |
||||
OpenCL SDK installed, for AMD it must be version 2.6 or later. |
||||
|
||||
|
||||
Step 1 on Linux: |
||||
export GPU_MAX_ALLOC_PERCENT=100 |
||||
If you do not do this, you may find it impossible to scrypt mine. You may find |
||||
a value of 40 is enough and increasing this further has little effect. |
||||
|
||||
export GPU_USE_SYNC_OBJECTS=1 |
||||
may help CPU usage a little as well. |
||||
|
||||
On windows the same commands can be passed via a batch file if the following |
||||
lines are in the .bat before starting sgminer: |
||||
setx GPU_MAX_ALLOC_PERCENT 100 |
||||
setx GPU_USE_SYNC_OBJECTS 1 |
||||
|
||||
--intensity XX (-I XX) |
||||
|
||||
The scale goes from 0 to 42. The reason this is crucial is that too |
||||
high an intensity can actually be disastrous with scrypt because it CAN |
||||
run out of ram. High intensities start writing over the same ram and it |
||||
is highly dependent on the GPU, but they can start actually DECREASING |
||||
your hashrate, or even worse, start producing garbage with HW errors |
||||
skyrocketing, or locking up the system altogether. Note that if you do |
||||
NOT specify an intensity, sgminer uses dynamic mode which is designed |
||||
to minimise the harm to a running desktop and performance WILL be poor. |
||||
The lower limit to intensity with scrypt is usually 8 and sgminer will |
||||
prevent it going too low. |
||||
SUMMARY: Setting this for reasonable hashrates is mandatory. |
||||
|
||||
--shaders XXX |
||||
|
||||
is an option where you tell sgminer how many shaders your GPU has. This |
||||
helps sgminer try to choose some meaningful baseline parameters. Use |
||||
this table below to determine how many shaders your GPU has, and note |
||||
that there are some variants of these cards, and nvidia shaders are much |
||||
much lower and virtually pointless trying to mine on. If this is not |
||||
set, sgminer will query the device for how much memory it supports and |
||||
will try to set a value based on that. |
||||
SUMMARY: This will get you started but fine tuning for optimal performance is |
||||
required. Using --thread-concurrency is recommended instead. |
||||
|
||||
GPU Shaders |
||||
7750 512 |
||||
7770 640 |
||||
7850 1024 |
||||
7870 1280 |
||||
7950 1792 |
||||
7970 2048 |
||||
|
||||
6850 960 |
||||
6870 1120 |
||||
6950 1408 |
||||
6970 1536 |
||||
6990 (6970x2) |
||||
|
||||
6570 480 |
||||
6670 480 |
||||
6790 800 |
||||
|
||||
6450 160 |
||||
|
||||
5670 400 |
||||
5750 720 |
||||
5770 800 |
||||
5830 1120 |
||||
5850 1440 |
||||
5870 1600 |
||||
5970 (5870x2) |
||||
|
||||
These are only used as a rough guide for sgminer, and it is rare that this is |
||||
all you will need to set. |
||||
|
||||
|
||||
Optional parameters to tune: |
||||
-g, --thread-concurrency, --lookup-gap |
||||
|
||||
--thread-concurrency: |
||||
This tunes the optimal size of work that scrypt can do. It is internally tuned |
||||
by sgminer to be the highest reasonable multiple of shaders that it can |
||||
allocate on your GPU. Ideally it should be a multiple of your shader count. |
||||
vliw5 architecture (R5XXX) would be best at 5x shaders, while VLIW4 (R6xxx and |
||||
R7xxx) are best at 4x. Setting thread concurrency overrides anything you put |
||||
into --shaders and is ultimately a BETTER way to tune performance. |
||||
SUMMARY: Spend lots of time finding the highest value that your device likes |
||||
and increases hashrate. |
||||
|
||||
-g: |
||||
Once you have found the optimal shaders and intensity, you can start increasing |
||||
the -g value till sgminer fails to start. This is really only of value if you |
||||
want to run low intensities as you will be unable to run more than 1. |
||||
SUMMARY: Don't touch this. |
||||
|
||||
--lookup-gap |
||||
This tunes a compromise between ram usage and performance. Performance peaks |
||||
at a gap of 2, but increasing the gap can save you some GPU ram, but almost |
||||
always at the cost of significant loss of hashrate. Setting lookup gap |
||||
overrides the default of 2, but sgminer will use the --shaders value to choose |
||||
a thread-concurrency if you haven't chosen one. |
||||
SUMMARY: Don't touch this. |
||||
|
||||
|
||||
Related parameters: |
||||
--worksize XX (-w XX) |
||||
Has a minor effect, should be a multiple of 64 up to 256 maximum. |
||||
SUMMARY: Worth playing with once everything else has been tried but will |
||||
probably do nothing. |
||||
|
||||
|
||||
Overclocking for scrypt mining: |
||||
First of all, do not underclock your memory initially. Scrypt mining requires |
||||
memory speed and on most, but not all, GPUs, lowering memory speed lowers |
||||
mining performance. |
||||
|
||||
Second, absolute engine clock speeds do NOT correlate with hashrate. The ratio |
||||
of engine clock speed to memory matters, so if you set your memory to the |
||||
default value, and then start overclocking as you are running it, you should |
||||
find a sweet spot where the hashrate peaks and then it might actually drop if |
||||
you increase the engine clock speed further. |
||||
|
||||
Third, the combination of motherboard, CPU and system ram ALSO makes a |
||||
difference, so values that work for a GPU on one system may not work for the |
||||
same GPU on a different system. A decent amount of system ram is actually |
||||
required for scrypt mining, and 4GB is suggested. |
||||
|
||||
Finally, the power consumption while mining at high engine clocks, very high |
||||
memory clocks can be far in excess of what you might imagine. |
||||
For example, a 7970 running with the following settings: |
||||
--thread-concurrency 22392 --gpu-engine 1135 --gpu-memclock 1890 |
||||
was using 305W! |
||||
|
||||
--- |
||||
TUNING AN AMD RADEON 7970 |
||||
Example tuning a 7970 for Scrypt mining: |
||||
|
||||
On linux run this command: |
||||
export GPU_MAX_ALLOC_PERCENT=100 |
||||
or on windows this: |
||||
setx GPU_MAX_ALLOC_PERCENT 100 |
||||
in the same console/bash/dos prompt/bat file/whatever you want to call it, |
||||
before running sgminer. |
||||
|
||||
First, find the highest thread concurrency that you can start it at. They should |
||||
all start at 8192 but some will go up to 3 times that. Don't go too high on the |
||||
intensity while testing and don't change gpu threads. If you cannot go above |
||||
8192, don't fret as you can still get a high hashrate. |
||||
|
||||
Delete any .bin files so you're starting from scratch and see what bins get |
||||
generated. |
||||
|
||||
First try without any thread concurrency or even shaders, as sgminer will try to |
||||
find an optimal value: |
||||
sgminer -I 13 |
||||
|
||||
If that starts mining, see what bin was generated, it is likely the largest |
||||
meaningful TC you can set. |
||||
Starting it on mine I get: |
||||
scrypt130302Tahitiglg2tc22392w64l8.bin |
||||
|
||||
See tc22392 that's telling you what thread concurrency it was. It should start |
||||
without TC parameters, but you never know. So if it doesn't, start with |
||||
--thread-concurrency 8192 and add 2048 to it at a time till you find the highest |
||||
value it will start successfully at. |
||||
|
||||
Then start overclocking the eyeballs off your memory, as 7970s are exquisitely |
||||
sensitive to memory speed and amazingly overclockable but please make sure it |
||||
keeps adequately cooled with --auto-fan! Do it while it's running from the GPU |
||||
menu. Go up by 25 at a time every 30 seconds or so until your GPU crashes. Then |
||||
reboot and start it 25 lower as a rough start. Mine runs stable at 1900 memory |
||||
without overvolting. Overvolting is the only thing that can actually damage your |
||||
GPU so I wouldn't recommend it at all. |
||||
|
||||
Then once you find the maximum memory clock speed, you need to find the sweet |
||||
spot engine clock speed that matches it. It's a fine line where one more MHz |
||||
will make the hashrate drop by 20%. It's somewhere in the 0.57 - 0.6 ratio range. |
||||
Start your engine clock speed at half your memory clock speed and then increase |
||||
it by 5 at a time. The hashrate should climb a little each rise in engine speed |
||||
and then suddenly drop above a certain value. Decrease it by 1 then until you |
||||
find it climbs dramatically. If your engine clock speed cannot get that high |
||||
without crashing the GPU, you will have to use a lower memclock. |
||||
|
||||
Then, and only then, bother trying to increase intensity further. |
||||
|
||||
My final settings were: |
||||
--gpu-engine 1141 --gpu-memclock 1875 -I 20 |
||||
for a hashrate of 745kH. |
||||
|
||||
Note I did not bother setting a thread concurrency. Once you have the magic |
||||
endpoint, look at what tc was chosen by the bin file generated and then hard |
||||
code that in next time (eg --thread-concurrency 22392) as slight changes in |
||||
thread concurrency will happen every time if you don't specify one, and the tc |
||||
to clock ratios are critical! |
||||
|
||||
Good luck, and if this doesn't work for you, well same old magic discussion |
||||
applies, I cannot debug every hardware combo out there. |
||||
|
||||
Your numbers will be your numbers depending on your hardware combination and OS, |
||||
so don't expect to get exactly the same results! |
||||
|
||||
--- |
||||
While BTC donations are preferred, if you wish to donate to the author, Con |
||||
Kolivas, in LTC, please submit your donations to: |
||||
|
||||
Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH |
||||
|
||||
Otherwise, please donate in BTC as per the main README. |
@ -0,0 +1,263 @@
@@ -0,0 +1,263 @@
|
||||
# Mining scrypt |
||||
|
||||
## Introduction |
||||
|
||||
Mining scrypt-based cryptocurrencies using GPUs is completely different |
||||
to mining SHA256d (used in Bitcoin). The former was intentionally |
||||
developed in a manner that (it was hoped) would make it suitable |
||||
for mining on CPUs, but not GPUs. Thanks to some innovative work by |
||||
_Artforz_ and _mtrlt_, this was proven to be wrong. |
||||
|
||||
However, it has very different requirements compared to SHA256d and |
||||
is a lot more complicated to get working well. It is a RAM-dependent |
||||
workload, and requires you to have enough system RAM as well as fast |
||||
enough GPU RAM. What is "enough" depends on setup specifics. |
||||
|
||||
|
||||
## Catalyst drivers and OpenCL SDK |
||||
|
||||
The choice of driver version for your GPU is critical, as some are known |
||||
to break scrypt mining entirely while others give poor hashrates. It is |
||||
recommended that you first try with the latest stable version available. |
||||
|
||||
Latest driver distribution versions may already include the AMD APP |
||||
SDK, therefore presenting an OpenCL vendor conflict when building or |
||||
running. Systems with NVidia cards and NVidia drivers may have a similar |
||||
conflict. If this is the case, check which OpenCL vendor is used, and |
||||
consider removing unneeded ones. |
||||
|
||||
|
||||
## Runtime environment |
||||
|
||||
Environment variables must be set to allow access from console / |
||||
terminal / screen. |
||||
|
||||
On Linux: |
||||
|
||||
export DISPLAY=:0 |
||||
export GPU_MAX_ALLOC_PERCENT=100 |
||||
export GPU_USE_SYNC_OBJECTS=1 |
||||
|
||||
On Windows: |
||||
|
||||
setx GPU_MAX_ALLOC_PERCENT 100 |
||||
setx GPU_USE_SYNC_OBJECTS 1 |
||||
|
||||
|
||||
## Tuning |
||||
|
||||
When mining is started, sgminer may fail in various ways. This is often |
||||
not a bug in the software, but rather misconfiguration. The failures may |
||||
occur due to parameters being outside what the GPU can cope with (both |
||||
too high and too low). |
||||
|
||||
All parameters are optional for fine tuning. |
||||
|
||||
**WARNING**: documentation below has not been reviewed to be up-to-date. |
||||
|
||||
|
||||
--intensity XX (-I XX) |
||||
|
||||
The scale goes from 0 to 31. The reason this is crucial is that too |
||||
high an intensity can actually be disastrous with scrypt because it CAN |
||||
run out of ram. High intensities start writing over the same ram and it |
||||
is highly dependent on the GPU, but they can start actually DECREASING |
||||
your hashrate, or even worse, start producing garbage with HW errors |
||||
skyrocketing, or locking up the system altogether. Note that if you do |
||||
NOT specify an intensity, sgminer uses dynamic mode which is designed |
||||
to minimise the harm to a running desktop and performance WILL be poor. |
||||
The lower limit to intensity with scrypt is usually 8 and sgminer will |
||||
prevent it going too low. |
||||
|
||||
SUMMARY: Setting this for reasonable hashrates is mandatory. |
||||
|
||||
|
||||
--shaders XXX |
||||
|
||||
is an option where you tell sgminer how many shaders your GPU has. This |
||||
helps sgminer try to choose some meaningful baseline parameters. Use |
||||
this table below to determine how many shaders your GPU has, and note |
||||
that there are some variants of these cards, and nvidia shaders are |
||||
much much lower and virtually pointless trying to mine on. If this is |
||||
not set, sgminer will query the device for how much memory it supports |
||||
and will try to set a value based on that. |
||||
|
||||
SUMMARY: This will get you started but fine tuning for optimal |
||||
performance is required. Using --thread-concurrency is recommended |
||||
instead. |
||||
|
||||
GPU Shaders |
||||
7750 512 |
||||
7770 640 |
||||
7850 1024 |
||||
7870 1280 |
||||
7950 1792 |
||||
7970 2048 |
||||
|
||||
6850 960 |
||||
6870 1120 |
||||
6950 1408 |
||||
6970 1536 |
||||
6990 (6970x2) |
||||
|
||||
6570 480 |
||||
6670 480 |
||||
6790 800 |
||||
|
||||
6450 160 |
||||
|
||||
5670 400 |
||||
5750 720 |
||||
5770 800 |
||||
5830 1120 |
||||
5850 1440 |
||||
5870 1600 |
||||
5970 (5870x2) |
||||
|
||||
These are only used as a rough guide for sgminer, and it is rare that |
||||
this is all you will need to set. |
||||
|
||||
|
||||
--thread-concurrency |
||||
|
||||
This tunes the optimal size of work that scrypt can do. It is internally |
||||
tuned by sgminer to be the highest reasonable multiple of shaders that |
||||
it can allocate on your GPU. Ideally it should be a multiple of your |
||||
shader count. VLIW5 architecture (R5xxx) would be best at 5x shaders, |
||||
while VLIW4 (R6xxx and R7xxx) are best at 4x. Setting thread concurrency |
||||
overrides anything you put into --shaders and is ultimately a BETTER way |
||||
to tune performance. |
||||
|
||||
SUMMARY: Spend lots of time finding the highest value that your device |
||||
likes and increases hashrate. |
||||
|
||||
|
||||
-g |
||||
|
||||
Once you have found the optimal shaders and intensity, you can start |
||||
increasing the -g value till sgminer fails to start. This is really only |
||||
of value if you want to run low intensities as you will be unable to run |
||||
more than 1. |
||||
|
||||
SUMMARY: Don't touch this. |
||||
|
||||
|
||||
--lookup-gap |
||||
|
||||
This tunes a compromise between ram usage and performance. Performance |
||||
peaks at a gap of 2, but increasing the gap can save you some GPU |
||||
ram, but almost always at the cost of significant loss of hashrate. |
||||
Setting lookup gap overrides the default of 2, but sgminer will use the |
||||
--shaders value to choose a thread-concurrency if you haven't chosen |
||||
one. |
||||
|
||||
SUMMARY: Don't touch this. |
||||
|
||||
|
||||
Related parameters: |
||||
--worksize XX (-w XX) |
||||
Has a minor effect, should be a multiple of 64 up to 256 maximum. |
||||
SUMMARY: Worth playing with once everything else has been tried but will |
||||
probably do nothing. |
||||
|
||||
|
||||
Overclocking for scrypt mining: First of all, do not underclock your |
||||
memory initially. Scrypt mining requires memory speed and on most, but |
||||
not all, GPUs, lowering memory speed lowers mining performance. |
||||
|
||||
|
||||
Second, absolute engine clock speeds do NOT correlate with hashrate. The |
||||
ratio of engine clock speed to memory matters, so if you set your memory |
||||
to the default value, and then start overclocking as you are running it, |
||||
you should find a sweet spot where the hashrate peaks and then it might |
||||
actually drop if you increase the engine clock speed further. |
||||
|
||||
|
||||
Third, the combination of motherboard, CPU and system ram ALSO makes a |
||||
difference, so values that work for a GPU on one system may not work for |
||||
the same GPU on a different system. A decent amount of system ram is |
||||
actually required for scrypt mining, and 4GB is suggested. |
||||
|
||||
|
||||
Finally, the power consumption while mining at high engine clocks, |
||||
and very high memory clocks can be far in excess of what you might |
||||
imagine. For example, a 7970 running with the following settings: |
||||
--thread-concurrency 22392 --gpu-engine 1135 --gpu-memclock 1890 was |
||||
using 305W! |
||||
|
||||
|
||||
## Example: tuning a 7970 |
||||
|
||||
On linux run this command: |
||||
|
||||
export GPU_MAX_ALLOC_PERCENT=100 |
||||
|
||||
or on windows this: |
||||
|
||||
setx GPU_MAX_ALLOC_PERCENT 100 |
||||
|
||||
in the same console/bash/dos prompt/bat file/whatever you want to call it, |
||||
before running sgminer. |
||||
|
||||
First, find the highest thread concurrency that you can start it at. |
||||
They should all start at 8192 but some will go up to 3 times that. Don't |
||||
go too high on the intensity while testing and don't change gpu threads. |
||||
If you cannot go above 8192, don't fret as you can still get a high |
||||
hashrate. |
||||
|
||||
Delete any .bin files so you're starting from scratch and see what bins |
||||
get generated. |
||||
|
||||
First try without any thread concurrency or even shaders, as sgminer |
||||
will try to find an optimal value: |
||||
|
||||
sgminer -I 13 |
||||
|
||||
If that starts mining, see what bin was generated, it is likely the |
||||
largest meaningful TC you can set. Starting it on mine I get: |
||||
|
||||
scrypt130302Tahitiglg2tc22392w64l8.bin |
||||
|
||||
The tc22392 part tells you what thread concurrency was used. It should |
||||
start without TC parameters, but you never know. So if it doesn't, start |
||||
with --thread-concurrency 8192 and add 2048 to it at a time till you |
||||
find the highest value it will start successfully at. |
||||
|
||||
Then start overclocking the eyeballs off your memory, as 7970s are |
||||
exquisitely sensitive to memory speed and amazingly overclockable but |
||||
please make sure it keeps adequately cooled with --auto-fan! Do it |
||||
while it's running from the GPU menu. Go up by 25 at a time every 30 |
||||
seconds or so until your GPU crashes. Then reboot and start it 25 lower |
||||
as a rough start. Mine runs stable at 1900 memory without overvolting. |
||||
Overvolting is the only thing that can actually damage your GPU so I |
||||
wouldn't recommend it at all. |
||||
|
||||
Then once you find the maximum memory clock speed, you need to find |
||||
the sweet spot engine clock speed that matches it. It's a fine line |
||||
where one more MHz will make the hashrate drop by 20%. It's somewhere in |
||||
the 0.57 - 0.6 ratio range. Start your engine clock speed at half your |
||||
memory clock speed and then increase it by 5 at a time. The hashrate |
||||
should climb a little each rise in engine speed and then suddenly drop |
||||
above a certain value. Decrease it by 1 then until you find it climbs |
||||
dramatically. If your engine clock speed cannot get that high without |
||||
crashing the GPU, you will have to use a lower memclock. |
||||
|
||||
Then, and only then, bother trying to increase intensity further. |
||||
|
||||
My final settings were: |
||||
|
||||
--gpu-engine 1141 --gpu-memclock 1875 -I 20 |
||||
|
||||
for a hashrate of 745kH. |
||||
|
||||
Note I did not bother setting a thread concurrency. Once you have the |
||||
magic endpoint, look at what tc was chosen by the bin file generated |
||||
and then hard code that in next time (eg --thread-concurrency 22392) as |
||||
slight changes in thread concurrency will happen every time if you don't |
||||
specify one, and the tc to clock ratios are critical! |
||||
|
||||
Good luck, and if this doesn't work for you, well same old magic |
||||
discussion applies, I cannot debug every hardware combo out there. |
||||
|
||||
Your numbers will be your numbers depending on your hardware combination |
||||
and OS, so don't expect to get exactly the same results! |
@ -0,0 +1,792 @@
@@ -0,0 +1,792 @@
|
||||
/*- |
||||
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt, |
||||
* 2013 Optimization by Pavel Semjanov, |
||||
* 2012-2013 Con Kolivas. |
||||
* All rights reserved. |
||||
* |
||||
* Redistribution and use in source and binary forms, with or without |
||||
* modification, are permitted provided that the following conditions |
||||
* are met: |
||||
* 1. Redistributions of source code must retain the above copyright |
||||
* notice, this list of conditions and the following disclaimer. |
||||
* 2. Redistributions in binary form must reproduce the above copyright |
||||
* notice, this list of conditions and the following disclaimer in the |
||||
* documentation and/or other materials provided with the distribution. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
||||
* SUCH DAMAGE. |
||||
* |
||||
* This file was originally written by Colin Percival as part of the Tarsnap |
||||
* online backup system. |
||||
*/ |
||||
|
||||
/*
 * Byte masks used by EndianSwap(): ES[0] keeps bytes 0 and 2 of a 32-bit
 * word, ES[1] keeps bytes 1 and 3.
 */
__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 }; |
||||
/*
 * Shared constant pool for the SHA256 routines in this file. The index
 * layout is load-bearing; do not reorder or insert entries:
 *   K[0..14]  - added in rounds 0..14 of SHA256() (rounds 4..14 of
 *               SHA256_fresh()).
 *   K[15..62] - added in rounds 16..63.
 *   K[76]     - added in round 15 (stored out of sequence).
 *   K[63..75] - operands of the first four hand-expanded steps of
 *               SHA256_fresh().
 *   K[73], K[77..79], K[66], K[67], K[80], K[81] - added to the state at the
 *               end of SHA256_fresh().
 *   K[82..88] - not referenced by SHA256() or SHA256_fresh().
 * NOTE(review): K[0..14], K[76], K[15..62] look like the 64 standard SHA-256
 * round constants with the 16th relocated to index 76 -- confirm against
 * FIPS 180-4 before relying on that.
 */
__constant uint K[] = { |
||||
0x428a2f98U, |
||||
0x71374491U, |
||||
0xb5c0fbcfU, |
||||
0xe9b5dba5U, |
||||
0x3956c25bU, |
||||
0x59f111f1U, |
||||
0x923f82a4U, |
||||
0xab1c5ed5U, |
||||
0xd807aa98U, |
||||
0x12835b01U, |
||||
0x243185beU, // 10 |
||||
0x550c7dc3U, |
||||
0x72be5d74U, |
||||
0x80deb1feU, |
||||
0x9bdc06a7U, |
||||
0xe49b69c1U, |
||||
0xefbe4786U, |
||||
0x0fc19dc6U, |
||||
0x240ca1ccU, |
||||
0x2de92c6fU, |
||||
0x4a7484aaU, // 20 |
||||
0x5cb0a9dcU, |
||||
0x76f988daU, |
||||
0x983e5152U, |
||||
0xa831c66dU, |
||||
0xb00327c8U, |
||||
0xbf597fc7U, |
||||
0xc6e00bf3U, |
||||
0xd5a79147U, |
||||
0x06ca6351U, |
||||
0x14292967U, // 30 |
||||
0x27b70a85U, |
||||
0x2e1b2138U, |
||||
0x4d2c6dfcU, |
||||
0x53380d13U, |
||||
0x650a7354U, |
||||
0x766a0abbU, |
||||
0x81c2c92eU, |
||||
0x92722c85U, |
||||
0xa2bfe8a1U, |
||||
0xa81a664bU, // 40 |
||||
0xc24b8b70U, |
||||
0xc76c51a3U, |
||||
0xd192e819U, |
||||
0xd6990624U, |
||||
0xf40e3585U, |
||||
0x106aa070U, |
||||
0x19a4c116U, |
||||
0x1e376c08U, |
||||
0x2748774cU, |
||||
0x34b0bcb5U, // 50 |
||||
0x391c0cb3U, |
||||
0x4ed8aa4aU, |
||||
0x5b9cca4fU, |
||||
0x682e6ff3U, |
||||
0x748f82eeU, |
||||
0x78a5636fU, |
||||
0x84c87814U, |
||||
0x8cc70208U, |
||||
0x90befffaU, |
||||
0xa4506cebU, // 60 |
||||
0xbef9a3f7U, |
||||
0xc67178f2U, |
||||
0x98c7e2a2U, |
||||
0xfc08884dU, |
||||
0xcd2a11aeU, |
||||
0x510e527fU, |
||||
0x9b05688cU, |
||||
0xC3910C8EU, |
||||
0xfb6feee7U, |
||||
0x2a01a605U, // 70 |
||||
0x0c2e12e0U, |
||||
0x4498517BU, |
||||
0x6a09e667U, |
||||
0xa4ce148bU, |
||||
0x95F61999U, |
||||
0xc19bf174U, |
||||
0xBB67AE85U, |
||||
0x3C6EF372U, |
||||
0xA54FF53AU, |
||||
0x1F83D9ABU, // 80 |
||||
0x5BE0CD19U, |
||||
0x5C5C5C5CU, |
||||
0x36363636U, |
||||
0x80000000U, |
||||
0x000003FFU, |
||||
0x00000280U, |
||||
0x000004a0U, |
||||
0x00000300U |
||||
}; |
||||
|
||||
/* Left rotation; alias for the OpenCL rotate() built-in. */
#define rotl(x,y) rotate(x,y) |
||||
/* Bitwise choose: each result bit is taken from y where x is 1, from z where x is 0. */
#define Ch(x,y,z) bitselect(z,y,x) |
||||
/* Bitwise majority built on Ch: where x and z differ the result is y, otherwise z. */
#define Maj(x,y,z) Ch((x^z),y,z) |
||||
|
||||
/*
 * Reverses the byte order of a 32-bit word: bytes 0 and 2 (mask ES[0]) are
 * rotated left by 24, bytes 1 and 3 (mask ES[1]) by 8, then recombined.
 * The argument is expanded twice and is not parenthesised.
 */
#define EndianSwap(n) (rotl(n & ES[0], 24U)|rotl(n & ES[1], 8U)) |
||||
|
||||
/*
 * Mixing functions. Tr1/Tr2 XOR three left rotations of x and are applied to
 * state words inside RND; Wr1/Wr2 XOR two left rotations with a right shift
 * and are applied to message-schedule words.
 * NOTE(review): presumably the SHA-256 Sigma0 (Tr2), Sigma1 (Tr1),
 * sigma0 (Wr2) and sigma1 (Wr1) functions written as left rotations
 * (rotl by 32-r instead of rotr by r) -- confirm against FIPS 180-4.
 */
#define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U)) |
||||
#define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U)) |
||||
#define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U)) |
||||
#define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U)) |
||||
|
||||
/*
 * One round step. Only h and d are modified:
 *   h accumulates Tr1(e) + Ch(e,f,g) + k, that sum is added into d, and then
 *   Tr2(a) + Maj(a,b,c) is added into h.
 * k is the caller-supplied addend for the round (message word plus table
 * constant at the call sites in this file). Callers rotate the argument
 * names by one position per round instead of moving values between
 * registers.
 * The macro expands to several bare statements (no do { } while (0)), so it
 * must only be used where a statement sequence is valid.
 */
#define RND(a, b, c, d, e, f, g, h, k) \ |
||||
h += Tr1(e); \ |
||||
h += Ch(e, f, g); \ |
||||
h += k; \ |
||||
d += h; \ |
||||
h += Tr2(a); \ |
||||
h += Maj(a, b, c); |
||||
|
||||
/*
 * Runs 64 RND steps over one 16-word input block and accumulates the result
 * into the eight-word state.
 *
 * state0, state1: in/out halves of the state. The lanes of *state0 are
 *                 referred to as A..D and the lanes of *state1 as E..H.
 * block0..block3: the sixteen input words, consumed in lane order x, y, z, w.
 *
 * All rounds operate on private copies (S0/S1); the caller's state is only
 * updated by the two additions at the very end.
 */
void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) |
||||
{ |
||||
uint4 S0 = *state0; |
||||
uint4 S1 = *state1; |
||||
|
||||
/* Working registers: aliases for the lanes of the private copies. */
#define A S0.x |
||||
#define B S0.y |
||||
#define C S0.z |
||||
#define D S0.w |
||||
#define E S1.x |
||||
#define F S1.y |
||||
#define G S1.z |
||||
#define H S1.w |
||||
|
||||
/* Rolling 16-word message schedule, stored as four 4-lane vectors. */
uint4 W[4]; |
||||
|
||||
/* Rounds 0..15: copy each input word into W and feed it straight to RND. */
W[ 0].x = block0.x; |
||||
RND(A,B,C,D,E,F,G,H, W[0].x+ K[0]); |
||||
W[ 0].y = block0.y; |
||||
RND(H,A,B,C,D,E,F,G, W[0].y+ K[1]); |
||||
W[ 0].z = block0.z; |
||||
RND(G,H,A,B,C,D,E,F, W[0].z+ K[2]); |
||||
W[ 0].w = block0.w; |
||||
RND(F,G,H,A,B,C,D,E, W[0].w+ K[3]); |
||||
|
||||
W[ 1].x = block1.x; |
||||
RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]); |
||||
W[ 1].y = block1.y; |
||||
RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]); |
||||
W[ 1].z = block1.z; |
||||
RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]); |
||||
W[ 1].w = block1.w; |
||||
RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]); |
||||
|
||||
W[ 2].x = block2.x; |
||||
RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]); |
||||
W[ 2].y = block2.y; |
||||
RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]); |
||||
W[ 2].z = block2.z; |
||||
RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]); |
||||
W[ 2].w = block2.w; |
||||
RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]); |
||||
|
||||
W[ 3].x = block3.x; |
||||
RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]); |
||||
W[ 3].y = block3.y; |
||||
RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]); |
||||
W[ 3].z = block3.z; |
||||
RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]); |
||||
W[ 3].w = block3.w; |
||||
/* Round 15 reads K[76], not K[15]: K[15..62] are used by rounds 16..63 below. */
RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]); |
||||
|
||||
|
||||
/*
 * WUpdate(i): recompute the four words of W[i%4] in place from the other
 * schedule vectors. tmp1 is the input to Wr2, tmp2 is added directly and
 * tmp3 is the input to Wr1. The last two lanes of tmp3 are zero because
 * their Wr1 inputs are lanes x and y of the vector being produced; the
 * second assignment adds Wr1 of those freshly written lanes into lanes z
 * and w (lanes x and y receive Wr1(0), which is 0).
 */
#define WUpdate(i) { uint4 tmp1, tmp2, tmp3; \ |
||||
tmp1 = (uint4) (W[(i+0)%4].y, W[(i+0)%4].z, W[(i+0)%4].w, W[(i+1)%4].x); \ |
||||
tmp2 = (uint4) (W[(i+2)%4].y, W[(i+2)%4].z, W[(i+2)%4].w, W[(i+3)%4].x); \ |
||||
tmp3 = (uint4) (W[(i+3)%4].z, W[(i+3)%4].w, 0, 0); \ |
||||
W[(i+0)%4] += tmp2 + Wr2 (tmp1) + Wr1 (tmp3); \ |
||||
tmp1 = (uint4) (0, 0, W[(i+0)%4].x, W[(i+0)%4].y); \ |
||||
W[(i+0)%4] += Wr1 (tmp1); \ |
||||
} |
||||
|
||||
/* Rounds 16..63: refresh one schedule vector, then run four rounds on it. */
WUpdate (0); |
||||
RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]); |
||||
RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]); |
||||
RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]); |
||||
RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]); |
||||
|
||||
WUpdate (1); |
||||
RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]); |
||||
RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]); |
||||
RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]); |
||||
RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]); |
||||
|
||||
WUpdate (2); |
||||
RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]); |
||||
RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]); |
||||
RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]); |
||||
RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]); |
||||
|
||||
WUpdate (3); |
||||
RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]); |
||||
RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]); |
||||
RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]); |
||||
RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]); |
||||
|
||||
WUpdate (0); |
||||
RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]); |
||||
RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]); |
||||
RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]); |
||||
RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]); |
||||
|
||||
WUpdate (1); |
||||
RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]); |
||||
RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]); |
||||
RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]); |
||||
RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]); |
||||
|
||||
WUpdate (2); |
||||
RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]); |
||||
RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]); |
||||
RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]); |
||||
RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]); |
||||
|
||||
WUpdate (3); |
||||
RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]); |
||||
RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]); |
||||
RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]); |
||||
RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]); |
||||
|
||||
WUpdate (0); |
||||
RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]); |
||||
RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]); |
||||
RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]); |
||||
RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]); |
||||
|
||||
WUpdate (1); |
||||
RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]); |
||||
RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]); |
||||
RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]); |
||||
RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]); |
||||
|
||||
WUpdate (2); |
||||
RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]); |
||||
RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]); |
||||
RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]); |
||||
RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]); |
||||
|
||||
WUpdate (3); |
||||
RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]); |
||||
RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]); |
||||
RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]); |
||||
RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]); |
||||
|
||||
#undef A |
||||
#undef B |
||||
#undef C |
||||
#undef D |
||||
#undef E |
||||
#undef F |
||||
#undef G |
||||
#undef H |
||||
|
||||
/* Feed-forward: add the round output to the state the caller passed in. */
*state0 += S0; |
||||
*state1 += S1; |
||||
} |
||||
|
||||
/*
 * Variant of SHA256() whose result does not depend on the incoming contents
 * of *state0 / *state1: the first four steps assign A..H outright from
 * block0 and the table constants K[63..75], and the closing additions use
 * table constants (K[73], K[77..79], K[66], K[67], K[80], K[81]) rather than
 * a saved copy of a previous state.
 * NOTE(review): presumably those constants encode the SHA-256 initial hash
 * value with the first rounds partly precomputed -- confirm against
 * FIPS 180-4.
 *
 * state0, state1: output halves of the state; written, never read before
 *                 being assigned.
 * block0..block3: the sixteen input words, consumed in lane order x, y, z, w.
 */
void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) |
||||
{ |
||||
/* Working registers alias the caller's vectors directly (no private copy). */
#define A (*state0).x |
||||
#define B (*state0).y |
||||
#define C (*state0).z |
||||
#define D (*state0).w |
||||
#define E (*state1).x |
||||
#define F (*state1).y |
||||
#define G (*state1).z |
||||
#define H (*state1).w |
||||
|
||||
/* Rolling 16-word message schedule, stored as four 4-lane vectors. */
uint4 W[4]; |
||||
|
||||
/* Steps 0..3 are written out by hand with constants instead of using RND. */
W[0].x = block0.x; |
||||
D= K[63] +W[0].x; |
||||
H= K[64] +W[0].x; |
||||
|
||||
W[0].y = block0.y; |
||||
C= K[65] +Tr1(D)+Ch(D, K[66], K[67])+W[0].y; |
||||
G= K[68] +C+Tr2(H)+Ch(H, K[69] ,K[70]); |
||||
|
||||
W[0].z = block0.z; |
||||
B= K[71] +Tr1(C)+Ch(C,D,K[66])+W[0].z; |
||||
F= K[72] +B+Tr2(G)+Maj(G,H, K[73]); |
||||
|
||||
W[0].w = block0.w; |
||||
A= K[74] +Tr1(B)+Ch(B,C,D)+W[0].w; |
||||
E= K[75] +A+Tr2(F)+Maj(F,G,H); |
||||
|
||||
/* Rounds 4..15: copy each input word into W and feed it straight to RND. */
W[1].x = block1.x; |
||||
RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]); |
||||
W[1].y = block1.y; |
||||
RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]); |
||||
W[1].z = block1.z; |
||||
RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]); |
||||
W[1].w = block1.w; |
||||
RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]); |
||||
|
||||
W[2].x = block2.x; |
||||
RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]); |
||||
W[2].y = block2.y; |
||||
RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]); |
||||
W[2].z = block2.z; |
||||
RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]); |
||||
W[2].w = block2.w; |
||||
RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]); |
||||
|
||||
W[3].x = block3.x; |
||||
RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]); |
||||
W[3].y = block3.y; |
||||
RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]); |
||||
W[3].z = block3.z; |
||||
RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]); |
||||
W[3].w = block3.w; |
||||
/* Round 15 reads K[76], not K[15]: K[15..62] are used by rounds 16..63 below. */
RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]); |
||||
|
||||
/*
 * Rounds 16..63: the schedule is expanded one word at a time, overwriting W
 * in place. Treating W as 16 slots, each step computes
 *   slot[t] += Wr1(slot[t-2]) + slot[t-7] + Wr2(slot[t-15])
 * with slot indices taken modulo 16, then runs one round on the new word.
 */
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); |
||||
RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]); |
||||
|
||||
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); |
||||
RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]); |
||||
|
||||
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); |
||||
RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]); |
||||
|
||||
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); |
||||
RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]); |
||||
|
||||
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); |
||||
RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]); |
||||
|
||||
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); |
||||
RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]); |
||||
|
||||
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); |
||||
RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]); |
||||
|
||||
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); |
||||
RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]); |
||||
|
||||
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); |
||||
RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]); |
||||
|
||||
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); |
||||
RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]); |
||||
|
||||
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); |
||||
RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]); |
||||
|
||||
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); |
||||
RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]); |
||||
|
||||
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); |
||||
RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]); |
||||
|
||||
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); |
||||
RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]); |
||||
|
||||
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); |
||||
RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]); |
||||
|
||||
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); |
||||
RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]); |
||||
|
||||
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); |
||||
RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]); |
||||
|
||||
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); |
||||
RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]); |
||||
|
||||
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); |
||||
RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]); |
||||
|
||||
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); |
||||
RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]); |
||||
|
||||
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); |
||||
RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]); |
||||
|
||||
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); |
||||
RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]); |
||||
|
||||
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); |
||||
RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]); |
||||
|
||||
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); |
||||
RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]); |
||||
|
||||
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); |
||||
RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]); |
||||
|
||||
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); |
||||
RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]); |
||||
|
||||
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); |
||||
RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]); |
||||
|
||||
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); |
||||
RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]); |
||||
|
||||
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); |
||||
RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]); |
||||
|
||||
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); |
||||
RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]); |
||||
|
||||
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); |
||||
RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]); |
||||
|
||||
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); |
||||
RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]); |
||||
|
||||
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); |
||||
RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]); |
||||
|
||||
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); |
||||
RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]); |
||||
|
||||
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); |
||||
RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]); |
||||
|
||||
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); |
||||
RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]); |
||||
|
||||
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); |
||||
RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]); |
||||
|
||||
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); |
||||
RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]); |
||||
|
||||
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); |
||||
RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]); |
||||
|
||||
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); |
||||
RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]); |
||||
|
||||
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); |
||||
RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]); |
||||
|
||||
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); |
||||
RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]); |
||||
|
||||
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); |
||||
RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]); |
||||
|
||||
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); |
||||
RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]); |
||||
|
||||
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); |
||||
RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]); |
||||
|
||||
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); |
||||
RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]); |
||||
|
||||
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); |
||||
RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]); |
||||
|
||||
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); |
||||
RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]); |
||||
|
||||
#undef A |
||||
#undef B |
||||
#undef C |
||||
#undef D |
||||
#undef E |
||||
#undef F |
||||
#undef G |
||||
#undef H |
||||
|
||||
/* Closing addition uses table constants in place of a previous state. */
*state0 += (uint4)(K[73], K[77], K[78], K[79]); |
||||
*state1 += (uint4)(K[66], K[67], K[80], K[81]); |
||||
} |
||||
|
||||
/*
 * Precomputed per-round addends for SHA256_fixed(): fixedW[i] is passed as
 * the k argument of RND in step i, so that function needs no message block
 * and no schedule expansion.
 * NOTE(review): presumably each entry is round constant + message-schedule
 * word for one fixed input block -- confirm how the table was generated
 * before editing any value.
 */
__constant uint fixedW[64] = |
||||
{ |
||||
0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, |
||||
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794, |
||||
0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f, |
||||
0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c, |
||||
0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa, |
||||
0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012, |
||||
0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4, |
||||
0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848, |
||||
}; |
||||
|
||||
/*
 * SHA256_fixed - apply 64 RND steps driven solely by the fixedW table.
 *
 * state0: in/out, first four state words  (x,y,z,w).
 * state1: in/out, last four state words   (x,y,z,w).
 *
 * The rounds run on private copies of the eight state words; the result is
 * then added back into *state0 / *state1.  No message words are taken as
 * arguments: every per-round addend is fixedW[round].
 */
void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)
{
	const uint4 in0 = *state0;
	const uint4 in1 = *state1;

	/* Eight scalar working variables, in state order. */
	uint ra = in0.x, rb = in0.y, rc = in0.z, rd = in0.w;
	uint re = in1.x, rf = in1.y, rg = in1.z, rh = in1.w;

	/*
	 * The argument order of RND rotates by one position each round and is
	 * back at its starting arrangement after eight rounds, so the 64
	 * rounds are eight repetitions of the same eight-call pattern.
	 */
	#pragma unroll
	for (uint r = 0; r < 64; r += 8)
	{
		RND(ra,rb,rc,rd,re,rf,rg,rh, fixedW[r]);
		RND(rh,ra,rb,rc,rd,re,rf,rg, fixedW[r+1]);
		RND(rg,rh,ra,rb,rc,rd,re,rf, fixedW[r+2]);
		RND(rf,rg,rh,ra,rb,rc,rd,re, fixedW[r+3]);
		RND(re,rf,rg,rh,ra,rb,rc,rd, fixedW[r+4]);
		RND(rd,re,rf,rg,rh,ra,rb,rc, fixedW[r+5]);
		RND(rc,rd,re,rf,rg,rh,ra,rb, fixedW[r+6]);
		RND(rb,rc,rd,re,rf,rg,rh,ra, fixedW[r+7]);
	}

	/* Feed the round output forward into the caller's state. */
	*state0 += (uint4)(ra, rb, rc, rd);
	*state1 += (uint4)(re, rf, rg, rh);
}
||||
|
||||
/*
 * shittify - rearrange B in place into the layout salsa() works on.
 *
 * B is treated as two independent groups of four vectors (B[0..3] and
 * B[4..7]).  Within each group, viewed as a 4x4 matrix of words with one
 * vector per row, the four (wrapping) diagonals are gathered into new
 * vectors and every word is passed through EndianSwap.  unshittify()
 * applies the reverse gather.
 */
void shittify(uint4 B[8])
{
	#pragma unroll
	for (uint base = 0; base < 8; base += 4)
	{
		/* Gather all four diagonals before any source row is overwritten. */
		const uint4 d0 = (uint4)(B[base+1].x, B[base+2].y, B[base+3].z, B[base].w);
		const uint4 d1 = (uint4)(B[base+2].x, B[base+3].y, B[base].z, B[base+1].w);
		const uint4 d2 = (uint4)(B[base+3].x, B[base].y, B[base+1].z, B[base+2].w);
		const uint4 d3 = (uint4)(B[base].x, B[base+1].y, B[base+2].z, B[base+3].w);

		B[base]   = EndianSwap(d0);
		B[base+1] = EndianSwap(d1);
		B[base+2] = EndianSwap(d2);
		B[base+3] = EndianSwap(d3);
	}
}
||||
|
||||
/*
 * unshittify - undo the word rearrangement performed by shittify().
 *
 * Each group of four vectors (B[0..3], then B[4..7]) is regathered so that
 * every word returns to the vector and lane it occupied before shittify(),
 * and every word is passed through EndianSwap again.
 */
void unshittify(uint4 B[8])
{
	#pragma unroll
	for (uint base = 0; base < 8; base += 4)
	{
		/* Read every source word first; the stores below alias the sources. */
		const uint4 r0 = (uint4)(B[base+3].x, B[base+2].y, B[base+1].z, B[base].w);
		const uint4 r1 = (uint4)(B[base].x, B[base+3].y, B[base+2].z, B[base+1].w);
		const uint4 r2 = (uint4)(B[base+1].x, B[base].y, B[base+3].z, B[base+2].w);
		const uint4 r3 = (uint4)(B[base+2].x, B[base+1].y, B[base].z, B[base+3].w);

		B[base]   = EndianSwap(r0);
		B[base+1] = EndianSwap(r1);
		B[base+2] = EndianSwap(r2);
		B[base+3] = EndianSwap(r3);
	}
}
||||
|
||||
/*
 * salsa_rounds - four iterations of the add/rotate/xor mixing sequence on
 * the four working vectors w[0..3], updated in place.
 *
 * Each iteration is eight dependent steps with rotation amounts
 * 7, 9, 13, 18 (lane-aligned) followed by 7, 9, 13, 18 again on swizzled
 * operands; the order of the steps must not be changed.
 */
void salsa_rounds(uint4 w[4])
{
	#pragma unroll
	for (uint pass = 0; pass < 4; ++pass)
	{
		w[0] ^= rotl(w[3] + w[2], 7U);
		w[1] ^= rotl(w[0] + w[3], 9U);
		w[2] ^= rotl(w[1] + w[0], 13U);
		w[3] ^= rotl(w[2] + w[1], 18U);

		w[2] ^= rotl(w[3].wxyz + w[0].zwxy, 7U);
		w[1] ^= rotl(w[2].wxyz + w[3].zwxy, 9U);
		w[0] ^= rotl(w[1].wxyz + w[2].zwxy, 13U);
		w[3] ^= rotl(w[0].wxyz + w[1].zwxy, 18U);
	}
}

/*
 * salsa - mix the two four-vector halves of B in place.
 *
 *   1. B[0..3] ^= B[4..7];  B[0..3] += rounds(B[0..3])
 *   2. B[4..7] ^= B[0..3];  B[4..7] += rounds(B[4..7])
 *
 * B is expected in the word order produced by shittify().
 */
void salsa(uint4 B[8])
{
	uint4 w[4];

	/* First half: fold the second half in, keep a working copy. */
	#pragma unroll
	for (uint k = 0; k < 4; ++k)
	{
		B[k] ^= B[k+4];
		w[k] = B[k];
	}
	salsa_rounds(w);

	/* Finish the first half, then start the second half from it. */
	#pragma unroll
	for (uint k = 0; k < 4; ++k)
	{
		B[k] += w[k];
		B[k+4] ^= B[k];
		w[k] = B[k+4];
	}
	salsa_rounds(w);

	#pragma unroll
	for (uint k = 0; k < 4; ++k)
	{
		B[k+4] += w[k];
	}
}
||||
|
||||
/*
 * Coord(a,b,c) - flatten a 3-D index: a + b*aSIZE + c*bSIZE*aSIZE.
 * The extents are found by pasting "SIZE" onto the *names* of the first
 * two arguments, so the arguments must be plain identifiers and matching
 * <name>SIZE variables must be in scope at the point of use.
 * The whole expansion is parenthesized so it can be used safely inside a
 * larger expression.
 */
#define Coord(x,y,z) ((x)+(y)*(x ## SIZE)+(z)*(y ## SIZE)*(x ## SIZE))
/* Index used by scrypt_core: z fastest, then x, then y (needs z, x, y, zSIZE, xSIZE in scope). */
#define CO Coord(z,x,y)
||||
|
||||
void scrypt_core(uint4 X[8], __global uint4*restrict lookup) |
||||
{ |
||||
shittify(X); |
||||
const uint zSIZE = 8; |
||||
const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0)); |
||||
const uint xSIZE = CONCURRENT_THREADS; |
||||
uint x = get_global_id(0)%xSIZE; |
||||
|
||||
for(uint y=0; y<1024/LOOKUP_GAP; ++y) |
||||
{ |
||||
#pragma unroll |
||||
for(uint z=0; z<zSIZE; ++z) |
||||
lookup[CO] = X[z]; |
||||
for(uint i=0; i<LOOKUP_GAP; ++i) |
||||
salsa(X); |
||||
} |
||||
#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8) |
||||
{ |
||||
uint y = (1024/LOOKUP_GAP); |
||||
#pragma unroll |
||||
for(uint z=0; z<zSIZE; ++z) |
||||
lookup[CO] = X[z]; |
||||
for(uint i=0; i<1024%LOOKUP_GAP; ++i) |
||||
salsa(X); |
||||
} |
||||
#endif |
||||
for (uint i=0; i<1024; ++i) |
||||
{ |
||||
uint4 V[8]; |
||||
uint j = X[7].x & K[85]; |
||||
uint y = (j/LOOKUP_GAP); |
||||
#pragma unroll |
||||
for(uint z=0; z<zSIZE; ++z) |
||||
V[z] = lookup[CO]; |
||||
|
||||
#if (LOOKUP_GAP == 1) |
||||
#elif (LOOKUP_GAP == 2) |
||||
if (j&1) |
||||
salsa(V); |
||||
#else |
||||
uint val = j%LOOKUP_GAP; |
||||
for (uint z=0; z<val; ++z) |
||||
salsa(V); |
||||
#endif |
||||
|
||||
#pragma unroll |
||||
for(uint z=0; z<zSIZE; ++z) |
||||
X[z] ^= V[z]; |
||||
salsa(X); |
||||
} |
||||
unshittify(X); |
||||
} |
||||
|
||||
/* Slot of `output` that holds the number of nonces stored so far. */
#define FOUND (0xFF)
/*
 * SETFOUND(nonce) - append a nonce to the `output` buffer in scope at the
 * point of use.  atomic_inc() returns the previous counter value and bumps
 * the counter in one indivisible step, so work-items that find a nonce at
 * the same time each get their own slot (a plain `output[FOUND]++` can hand
 * the same slot to two work-items and lose one result).
 * NOTE(review): slots are not bounds-checked; a counter reaching FOUND would
 * overwrite the counter slot itself -- confirm the host resets it per launch.
 */
#define SETFOUND(Xnonce) output[atomic_inc(&output[FOUND])] = (Xnonce)
||||
|
||||
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) |
||||
__kernel void search(__global const uint4 * restrict input, |
||||
volatile __global uint*restrict output, __global uint4*restrict padcache, |
||||
const uint4 midstate0, const uint4 midstate16, const uint target) |
||||
{ |
||||
uint gid = get_global_id(0); |
||||
uint4 X[8]; |
||||
uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1; |
||||
uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid); |
||||
uint4 pad0 = midstate0, pad1 = midstate16; |
||||
|
||||
SHA256(&pad0,&pad1, data, (uint4)(K[84],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[86])); |
||||
SHA256_fresh(&ostate0,&ostate1, pad0^ K[82], pad1^ K[82], K[82], K[82]); |
||||
SHA256_fresh(&tstate0,&tstate1, pad0^ K[83], pad1^ K[83], K[83], K[83]); |
||||
|
||||
tmp0 = tstate0; |
||||
tmp1 = tstate1; |
||||
SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]); |
||||
|
||||
#pragma unroll |
||||
for (uint i=0; i<4; i++) |
||||
{ |
||||
pad0 = tstate0; |
||||
pad1 = tstate1; |
||||
X[i*2 ] = ostate0; |
||||
X[i*2+1] = ostate1; |
||||
|
||||
SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87])); |
||||
SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88])); |
||||
} |
||||
scrypt_core(X,padcache); |
||||
SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]); |
||||
SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]); |
||||
SHA256_fixed(&tmp0,&tmp1); |
||||
SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88])); |
||||
|
||||
bool result = (EndianSwap(ostate1.w) <= target); |
||||
if (result) |
||||
SETFOUND(gid); |
||||
} |
Loading…
Reference in new issue