diff --git a/.gitignore b/.gitignore index 9ab93c0f..3e345162 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,13 @@ mingw32-config.cache ext_deps config.h.in config.h + +ccan/libccan.a +lib/arg-nonnull.h +lib/c++defs.h +lib/libgnu.a +lib/signal.h +lib/string.h +lib/warn-on-use.h + +mkinstalldirs diff --git a/API-README b/API-README index df905ef3..a62b6304 100644 --- a/API-README +++ b/API-README @@ -39,7 +39,7 @@ To give an IP address/subnet access to a group you use the group letter in front of the IP address instead of W: e.g. P:192.168.0/32 An IP address/subnet can only be a member of one group A sample API group would be: - --api-groups P:switchpool:enablepool:addpool:disablepool:removepool:* + --api-groups P:switchpool:enablepool:addpool:disablepool:removepool.poolpriority:* This would create a group 'P' that can do all current pool commands and all non-priviliged commands - the '*' means all non-priviledged commands Without the '*' the group would only have access to the pool commands @@ -174,6 +174,11 @@ The list of requests - a (*) means it requires privileged access - and replies a Use '\\' to get a '\' and '\,' to include a comma inside URL, USR or PASS + poolpriority|N,... (*) + none There is no reply section just the STATUS section + stating the results of changing pool priorities + See usage below + disablepool|N (*) none There is no reply section just the STATUS section stating the results of disabling pool N @@ -270,8 +275,18 @@ The list of requests - a (*) means it requires privileged access - and replies a When you enable, disable or restart a GPU or PGA, you will also get Thread messages in the cgminer status window -When you switch to a different pool to the current one, you will get a -'Switching to URL' message in the cgminer status windows +The 'poolpriority' command can be used to reset the priority order of multiple +pools with a single command - 'switchpool' only sets a single pool to first priority +Each pool should be listed by id number in order of preference (first = most +preferred) +Any pools not listed will be prioritised after the ones that are listed, in the +priority order they were originally +If the priority change affects the miner's preference for mining, it may switch +immediately + +When you switch to a different pool to the current one (including by priority +change), you will get a 'Switching to URL' message in the cgminer status +windows Obviously, the JSON format is simply just the names as given before the '=' with the values after the '=' @@ -309,7 +324,14 @@ miner.php - an example web page to access the API Feature Changelog for external applications using the API: -API V1.14 +API V1.15 + +Added API commands: + 'poolpriority' + +---------- + +API V1.14 (cgminer v2.5.0) Modified API commands: 'stats' - more icarus timing stats added diff --git a/FPGA-README b/FPGA-README index 7b9f004d..0c4da8a5 100644 --- a/FPGA-README +++ b/FPGA-README @@ -16,7 +16,25 @@ p2pool. Icarus -There is a hidden option in cgminer when Icarus support is compiled in: +There are two hidden options in cgminer when Icarus support is compiled in: + +--icarus-options Set specific FPGA board configurations - one set of values for all or comma separated + baud:work_division:fpga_count + + baud The Serial/USB baud rate - 115200 or 57600 only - default 115200 + work_division The fraction of work divided up for each FPGA chip - 1, 2, 4 or 8 + e.g. 
2 means each FPGA does half the nonce range - default 2 + fpga_count The actual number of FPGAs working - this would normally be the same + as work_division - range is from 1 up to 'work_division' + It defaults to the value of work_division - or 2 if you don't specify + work_division + +If you define fewer comma separated values than Icarus devices, the last values will be used +for all extra devices + +An example would be: --icarus-options 57600:2:1 +This would mean: use 57600 baud, the FPGA board divides the work in half, however +only 1 FPGA actually runs on the board (e.g. like an early CM1 Icarus copy bitstream) --icarus-timing Set how the Icarus timing is calculated - one setting/value for all or comma separated default[=N] Use the default Icarus hash time (2.6316ns) @@ -24,6 +42,9 @@ There is a hidden option in cgminer when Icarus support is compiled in: long Re-calculate the hash time continuously value[=N] Specify the hash time in nanoseconds (e.g. 2.6316) and abort time (e.g. 2.6316=80) +If you define fewer comma separated values than Icarus devices, the last values will be used +for all extra devices + Icarus timing is required for devices that do not exactly match a default Icarus Rev3 in processing speed If you have an Icarus Rev3 you should not normally need to use --icarus-timing since the @@ -55,9 +76,9 @@ bitstream to the default one, use 'long' mode and give it at least a few hundred 'short' mode and take note of the final hash time value (Hs) calculated You can also use the RPC API 'stats' command to see the current hash time (Hs) at any time -The Icarus code currently only works with a dual FPGA device that supports the same commands as +The Icarus code currently only works with an FPGA device that supports the same commands as Icarus Rev3 requires and also is less than ~840MH/s and greater than 2MH/s -If a dual FPGA device does hash faster than ~840MH/s it should work correctly if you supply the +If an FPGA device does hash faster than ~840MH/s it should work correctly if you supply the correct hash time nanoseconds value The timing code itself will affect the Icarus performance since it increases the delay after diff --git a/Makefile.am b/Makefile.am index d17414df..d9389101 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,7 +10,7 @@ endif EXTRA_DIST = example.conf m4/gnulib-cache.m4 linux-usb-cgminer \ ADL_SDK/readme.txt api-example.php miner.php \ API.class API.java api-example.c windows-build.txt \ - bitstreams/* API-README FPGA-README + bitstreams/* API-README FPGA-README SCRYPT-README SUBDIRS = lib compat ccan @@ -45,6 +45,10 @@ cgminer_SOURCES += ocl.c ocl.h findnonce.c findnonce.h cgminer_SOURCES += adl.c adl.h adl_functions.h cgminer_SOURCES += *.cl +if HAS_SCRYPT +cgminer_SOURCES += scrypt.c scrypt.h +endif + if HAS_CPUMINE # original CPU related sources, unchanged cgminer_SOURCES += \ diff --git a/NEWS b/NEWS index 06370090..7bb9524f 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,253 @@ +Version 2.6.4 - August 7, 2012 + +- Convert the serial autodetect functions to use int instead of char to +enumerate devices. +- Make the serial open timeout for BFL generically 1 second on windows. +- Deuglify windows autodetect code for BFL. +- There is no point zeroing temperature in BFL if we fail to get a response, and +we should register it as a HW error, suggesting throttling. +- Update SCRYPT README with information about HW errors. +- Use the scrypt CPU code to confirm results from OCL code, and mark failures as +HW errors, making it easier to tune scrypt parameters.
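The entry above about using the scrypt CPU code to confirm OCL results amounts to re-hashing every nonce the OpenCL kernel reports on the CPU before it is submitted, and counting mismatches as HW errors. A minimal sketch of that idea follows; cpu_scrypt_hash() and the hw_errors counter are illustrative placeholders, not the actual cgminer functions or fields:

    #include <stdbool.h>
    #include <stdint.h>

    /* placeholder: CPU scrypt of the block header with 'nonce' inserted,
     * writing the 256-bit result into hash[0..7] */
    void cpu_scrypt_hash(const unsigned char *data, uint32_t nonce, uint32_t *hash);

    static bool confirm_gpu_nonce(const unsigned char *data, const uint32_t *target,
                                  uint32_t nonce, unsigned int *hw_errors)
    {
        uint32_t hash[8];

        cpu_scrypt_hash(data, nonce, hash);
        /* simplified target test: compare only the most significant word */
        if (hash[7] > target[7]) {
            (*hw_errors)++;     /* GPU returned garbage - parameters likely too high */
            return false;
        }
        return true;            /* plausible share, safe to submit */
    }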
+- We may as well leave one curl still available per pool instead of reaping the +last one. +- Need to recheck the pool->curls count on regaining the pool lock after the +pthread conditional wait returns. +- Display reaped debug message outside mutex lock to avoid recursive locking. +- Add specific information when ADL detects error -10 saying the device is not +enabled. +- api.c update API start message and include port number +- miner.php ignore arg when readonly +- miner.php allow pool inputs: delete, addpool, poolpriority + + +Version 2.6.3 - August 5, 2012 + +- Count likely throttling episodes on bitforce devices as hardware errors. +- Style cleanups. +- Use FTD2XX.DLL on Windows to autodetect BitFORCE SHA256 devices. +- Make pool_disabled the first in the enums == 0, fixing the pool enabled count +which compares if value is not enabled before enabling it. +- Correct writing of scrypt parameters to config file based on command line +parameters only. +- Use different variables for command line specified lookup gap and thread +concurrency to differentiate user defined versus auto chosen values. +- Queue a request on pool switch in case we have no work from the new pool yet. +- Display failover only mode in pool menu and allow it to be toggled live. +- Reinstate check for system queueing lag when the current pool's queue is maxed +out, there is no staged work, and the work is needed now. +- There is no need for pool active testing to be mandatory any more with queue +request changes. +- Fix harmless warnings. +- Check the current staged and global queued as well before queueing requests. +Discard stales before ageing work in the watchdog thread. Queue requests after +discarding and ageing work in watchdog thread. Display accurate global queued in +curses output. Reuse variable in age_work(). +- The queueing mechanism has become a complex state machine that is no longer +predictable. Rewrite it from scratch watching only current queues in flight and +staged work available on a pool by pool basis. +- API remove unused warning in non-GPU compile +- api.c in linux allow to open a closed socket in TIME_WAIT +- Queue an extra request whenever staged work drops below mining thread count in +hash_pop. +- Update debian package configs to v2.6.2 + + +Version 2.6.2 - August 3, 2012 + +- Scrypt mining does not support block testing yet so don't try to print it. +- Clear the bitforce buffer whenever we get an unexpected result as it has +likely throttled and we are getting cached responses out of order, and use the +temperature monitoring as a kind of watchdog to flush unexpected results. +- It is not critical getting the temperature response in bitforce so don't +mandatorily wait on the mutex lock. +- Check there is a cutoff temp actually set in bitforce before using it as a cut +off value otherwise it may think it's set to zero degrees. +- We dropped the temporary stopping of curl recruiting on submit_fail by +mistake, reinstate it. +- Make threads report in either side of the scanhash function in case we miss +reporting in when restarting work. +- Don't make mandatory work and its clones last forever. +- Make test work for pool_active mandatory work items to smooth out staged work +counts when in failover-only mode. +- Add debugging output when work is found stale as to why. +- Print the 3 parameters that are passed to applog for a debug line in +bitforce.c +- Clear bitforce buffer on init as previously. 
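One of the 2.6.4 entries above - rechecking pool->curls after the pthread conditional wait returns - is the standard condition variable pattern, and the same logic appears later in this patch as the retry: loop in pop_curl_entry: the predicate must be tested again on every wakeup because another thread may have taken the resource first. A generic sketch, not the actual cgminer pool code:

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int idle_handles;            /* e.g. curl handles free on this pool */

    static void take_handle(void)
    {
        pthread_mutex_lock(&lock);
        while (idle_handles == 0)       /* recheck the count after every wakeup */
            pthread_cond_wait(&cond, &lock);
        idle_handles--;
        pthread_mutex_unlock(&lock);
    }

    static void return_handle(void)
    {
        pthread_mutex_lock(&lock);
        idle_handles++;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
    }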
+- Add some headroom to the number of curls available per pool to allow for +longpoll and sendwork curls. +- Revert "Revert "Change BFL driver thread initialising to a constant 100ms +delay between devices instead of a random arrangement."" +- Revert "Remove bitforce_thread_init" +- Show the correct base units on GPU summary. +- Differentiate between the send return value being a bool and the get return +value when managing them in bitforce scanhash. +- 23a8c60 Revert "bitforce: Skip out of sending work if work restart requested" + + +Version 2.6.1 - July 30, 2012 + +- Display scrypt as being built in as well. +- Fix build warning about KL_SCRYPT when built without scrypt support. +- Remove the low hash count determinant of hardware being sick. A low hash rate +can be for poor network connectivity or scrypt mining, neither of which are due +to a sick device. +- api.c poolpriority changes + + +Version 2.6.0 - July 29, 2012 + +- Display kilohash when suitable, but store the global mhash value still truly +in megahashes to not break the API output. +- Don't try and print curses output for devices that won't fit on the screen. +- Add scrypt documentation in the form of a separate readme. +- Fix build error without scrypt enabled. +- Limit total number of curls recruited per pool to the number of mining threads +to prevent blasting the network when we only have one pool to talk to. +- bitforce: Skip out of sending work if work restart requested +- Keep a counter of enabled pools and use that instead of iterating over the +pool list. Use that value to ensure we don't set the last remaining active pool +to the rejecting state. +- fpgautils: add support for 57.6 kBd serial +- miner.php add a socket RCV timeout for if cgminer is hung and the API thread +is still running +- Limit thread concurrency for scrypt to 5xshaders if shaders is specified. +- Simplify repeated use of gpus[gpu]. in ocl.c +- Find the nearest power of 2 maximum alloc size for the scrypt buffer that can +successfully be allocated and is large enough to accomodate the thread +concurrency chosen, thus mapping it to an intensity. +- Don't make opt_scrypt mandatory blocking with opencl code. +- Update kernel versions reflecting changes in the API. +- Make the thread concurrency and lookup gap options hidden on the command line +and autotune parameters with a newly parsed --shaders option. +- Fix target testing with scrypt kernel as it would have been missing shares +below target. +- Bugfix: Use a mutex to control non-curses output +- Simplify code to a single vprintf path for curses-less printing +- Move opt_quiet check to my_log_curses, so it works for curses-less builds +- Use log_generic for vapplog to cut down on code duplication +- Add space to log output now that there is more screen real estate available. +- BFL force all code to timeout to avoid hanging +- Bugfix: Copy argv[0] given to dirname() +- Always create the largest possible padbuffer for scrypt kernels even if not +needed for thread_concurrency, giving us some headroom for intensity levels. +- Use the detected maximum allocable memory on a GPU to determine the optimal +scrypt settings when lookup_gap and thread_concurrency parameters are not given. +- Check the maximum allocable memory size per opencl device. +- Add debugging output if buffer allocation fails for scrypt and round up +bufsize to a multiple of 256. +- Nonce testing for btc got screwed up, leading to no accepted shares. Fix it. +- Display size of scrypt buffer used in debug. 
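Several of the 2.6.0 entries above cover sizing the scrypt buffer from the chosen thread concurrency and lookup gap, rounding it up to a multiple of 256, and falling back to the largest power-of-two allocation the device can manage. A rough sketch of that arithmetic, assuming the standard scrypt (N=1024, r=1) scratchpad of 1024 entries of 128 bytes each; the function name and the fallback policy are illustrative only:

    #include <stddef.h>

    static size_t scrypt_buffer_size(unsigned int thread_concurrency,
                                     unsigned int lookup_gap,
                                     size_t max_alloc)
    {
        /* 1024 scratchpad entries of 128 bytes each, thinned by the lookup gap */
        size_t bufsize = (size_t)128 * (1024 / lookup_gap) * thread_concurrency;

        /* round up to a multiple of 256, as the entry above describes */
        bufsize = (bufsize + 255) & ~(size_t)255;

        /* if the device cannot allocate that much, drop back to the largest
         * power of two that fits, which in turn caps the usable intensity */
        if (bufsize > max_alloc) {
            size_t p2 = 256;
            while (p2 * 2 <= max_alloc)
                p2 *= 2;
            bufsize = p2;
        }
        return bufsize;
    }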
+- Allow intensities up to 20 if scrypt is compiled in. +- Add name to scrypt kernel copyright. +- Allow lookup gap and thread concurrency to be passed per device and store +details in kernel binary filename. +- Ignore negative intensities for scrypt. +- Change the scale of intensity for scrypt kernel and fix a build warning. +- Correct target value passed to scrypt kernel. +- Use 256 output slots for kernels to allow 1 for each worksize. +- Test the target in the actual scrypt kernel itself saving further +calculations. +- Reinstate GPU only opencl device detection. +- Decrease lookup gap to 1. Does not seem to help in any way being 2. +- Fix build. +- Make pad0 and pad1 local variable in scrypt kernel. +- Constify input variable in scrypt kernel. +- Send correct values to scrypt kernel to get it finally working. +- Create command queue before compiling program in opencl. +- Detach pthread from within the api thread in case it is terminated due to not +being instantiated before pthread_cancel is called from main, leading to a +segfault. +- Debug output per thread hashrate is out by a factor of 1000. +- Initialise mdplatform. +- Find the gpu platform with the most devices and use that if no platform option +is passed. +- Allow more platforms to be probed if first does not return GPUs. +- Fix external scrypt algo missing. +- Limit scrypt to 1 vector. +- Handle KL_SCRYPT in config write. +- Get rid of stuff. +- Don't enqueuewrite buffer at all for pad8 and pass work details around for +scrypt in dev_blk. +- Set the correct data for cldata and prepare for pad8 fixes. +- Bugfix: Fix build without curses but with OpenCL +- Find the gpu platform with the most devices and use that if no platform option +is passed. +- Allow more platforms to be probed if first does not return GPUs. +- Get rid of spaces in arrays in scrypt kernel. +- Start with smaller amount of hashes in cpu mining to enable scrypt to return +today sometime. +- Show Khash hashrates when scrypt is in use. +- Free the scratchbuf memory allocated in scrypt and don't check if CPUs are +sick since they can't be. Prepare for khash hash rates in display. +- Add cpumining capability for scrypt. +- Set scrypt settings and buffer size in ocl.c code to be future modifiable. +- Cope with when we cannot set intensity low enough to meet dynamic interval by +inducing a forced sleep. +- Make dynamic and scrypt opencl calls blocking. +- Calculate midstate in separate function and remove likely/unlikely macros +since they're dependent on pools, not code design. +- bitforce: Use "full work" vs "nonce range" for kernel name +- Display in debug mode when we're making the midstate locally. +- Fix nonce submission code for scrypt. +- Make sure goffset is set for scrypt and drop padbuffer8 to something +manageable for now. +- Set up buffer8 for scrypt. +- Build fix for opt scrypt. +- Don't check postcalc nonce with sha256 in scrypt. +- Don't test nonce with sha and various fixes for scrypt. +- Make scrypt buffers and midstate compatible with cgminer. +- Use cgminer specific output array entries in scrypt kernel. +- Provide initial support for the scrypt kernel to compile with and mine scrypt +with the --scrypt option. +- Enable completely compiling scrypt out. +- Begin import of scrypt opencl kernel from reaper. +- bitforce_get_result returns -1 on error now. 
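The entry above about inducing a forced sleep when intensity cannot be set low enough to meet the dynamic interval can be pictured as follows; the target constant, its units and the exact sleep policy are assumptions for illustration, not cgminer's actual values:

    #include <unistd.h>

    #define TARGET_KERNEL_MS 50         /* assumed dynamic-mode target, illustration only */

    static void dynamic_throttle(unsigned int kernel_ms, int intensity, int min_intensity)
    {
        /* normally dynamic mode just lowers the intensity ... */
        if (intensity > min_intensity || kernel_ms <= TARGET_KERNEL_MS)
            return;
        /* ... but at minimum intensity the only option left is to sleep off the
         * overshoot so the GPU gets some idle time for the desktop */
        usleep((kernel_ms - TARGET_KERNEL_MS) * 1000);
    }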
+- Check return value of read in BFgets +- Bugfix: Make our Windows nanosleep/sleep replacements standards-compliant +(which fixes nmsleep) and include compat.h for bitforce (for sleep) +- rpc: Use a single switch statement for both stringifications of cgpu->status +- Fix whitespace mangling. +- miner.php fix rig # when miners fail +- Only try to shut down work cleanly if we've successfully connected and started +mining. +- Use switch statement for cgpu->status and fix spelling. +- Abbrv. correction +- Bugfix: Don't declare devices SICK if they're just busy initialising +- Bugfix: Calculate nsec in nmsleep correctly +- Bugfix: Adapt OpenCL scanhash errors to driver API change (errors are now -1, +not 0) +- Remove superfluous ave_wait +- Put kname change for broken nonce-range back in +- Add average wait time to api stats +- Change BFL driver thread initialising to a constant 100ms delay between +devices instead of a random arrangement. +- Spelling typo. +- Time opencl work from start of queueing a kernel till it's flushed when +calculating dynamic intensity. +- Modify te scanhash API to use an int64_t and return -1 on error, allowing zero +to be a valid return value. +- Check for work restart after the hashmeter is invoked for we lose the hashes +otherwise contributed in the count. +- Remove disabled: label from mining thread function, using a separate +mt_disable function. +- Style changes. +- Missed one nonce-range disabling. +- Add average return time to api stats +- miner.php allow rig names in number buttons +- Remove bitforce_thread_init The delay thing does nothing useful... when long +poll comes around, all threads restart at the same time anyway. +- Change timeouts to time-vals for accuracy. +- fix API support for big endian machines +- Cope with signals interrupting the nanosleep of nmsleep. +- Use standard cfsetispeed/cfsetospeed to set baud rate on *nix +- miner.php split() flagged deprecated in PHP 5.3.0 +- More BFL tweaks. Add delay between closing and reopening port. Remove buffer +clear in re-init Add kernel type (mini-rig or single) +- Make long timeout 10seconds on bitforce for when usleep or nanosleep just +can't be accurate... + + Version 2.5.0 - July 6, 2012 - Fix --benchmark not working since the dynamic addition of pools and pool diff --git a/README b/README index bc1c59d6..5ff103a8 100644 --- a/README +++ b/README @@ -33,22 +33,32 @@ READ EXECUTIVE SUMMARY BELOW FOR FIRST TIME USERS! 
Dependencies: curl dev library http://curl.haxx.se/libcurl/ (libcurl4-openssl-dev) + curses dev library (libncurses5-dev or libpdcurses on WIN32) + pkg-config http://www.freedesktop.org/wiki/Software/pkg-config + libtool http://www.gnu.org/software/libtool/ + jansson http://www.digip.org/jansson/ (jansson is included in-tree and not necessary) + yasm 1.0.1+ http://yasm.tortall.net/ (yasm is optional, gives assembly routines for CPU mining) + AMD APP SDK http://developer.amd.com/sdks/AMDAPPSDK (This sdk is mandatory for GPU mining) + AMD ADL SDK http://developer.amd.com/sdks/ADLSDK (This sdk is mandatory for ATI GPU monitoring & clocking) + libudev headers (This is only required for FPGA auto-detection and is linux only) + libusb headers (This is only required for ZTEX support) + CGMiner specific configuration options: --enable-cpumining Build with cpu mining support(default disabled) --disable-opencl Override detection and disable building with opencl @@ -57,6 +67,9 @@ CGMiner specific configuration options: --enable-icarus Compile support for Icarus Board(default disabled) --enable-modminer Compile support for ModMiner FPGAs(default disabled) --enable-ztex Compile support for Ztex Board(default disabled) + --enable-scrypt Compile support for scrypt litecoin mining (default disabled) + --without-curses Compile support for curses TUI (default enabled) + --without-libudev Autodetect FPGAs using libudev (default enabled) Basic *nix build instructions: To build with GPU mining support: @@ -153,6 +166,7 @@ Options for both config file and command line: --scan-time|-s Upper bound on time spent scanning current work, in seconds (default: 60) --sched-start Set a time of day in HH:MM to start mining (a once off without a stop time) --sched-stop Set a time of day in HH:MM to stop mining (will quit without a start time) +--scrypt Use the scrypt algorithm for mining (litecoin only) --sharelog Append share log to file --shares Quit after mining N shares (default: unlimited) --socks-proxy Set socks4 proxy (host:port) @@ -197,6 +211,14 @@ GPU only options: --worksize|-w Override detected optimal worksize - one value or comma separated list +SCRYPT only options: + +--lookup-gap Set GPU lookup gap for scrypt mining, comma separated +--thread-concurrency Set GPU thread concurrency for scrypt mining, comma separated + +See SCRYPT-README for more information regarding litecoin mining. + + FPGA mining boards(BitForce, Icarus, ModMiner, Ztex) only options: --scan-serial|-S Serial port to probe for FPGA mining device @@ -722,7 +744,8 @@ A: Cgminer is being packaged with other trojan scripts and some antivirus software is falsely accusing cgminer.exe as being the actual virus, rather than whatever it is being packaged with. If you installed cgminer yourself, then you do not have a virus on your computer. Complain to your antivirus -software company. +software company. They seem to be flagging even source code now from cgminer +as viruses, even though text source files can't do anything by themself. Q: Can you modify the display to include more of one thing in the output and less of another, or can you change the quiet mode or can you add yet another diff --git a/SCRYPT-README b/SCRYPT-README new file mode 100644 index 00000000..02c9c44d --- /dev/null +++ b/SCRYPT-README @@ -0,0 +1,143 @@ +If you wish to donate to the author, Con Kolivas, in LTC, please submit your +donations to: + +Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH + +Otherwise, please donate in BTC as per the main README. 
+ +--- + + +Scrypt mining, AKA litecoin mining, for GPU is completely different to sha256 +used for bitcoin mining. The algorithm was originally developed in a manner +that was anticipated to make it suitable for mining on CPU but NOT GPU. +Thanks to some innovative work by Artforz and mtrlt, this was proven to be +wrong. However, it has very different requirements to bitcoin mining and is a +lot more complicated to get working well. Note that it is a ram dependent +workload, and requires you to have enough system ram as well as fast enough +GPU ram. If you have less system ram than your GPU has, it may not be possible +to mine at any reasonable rate. + +There are 5 main parameters for tuning scrypt, 2 of which you MUST set, and +the others are optional for further fine tuning. When you start scrypt mining +with the --scrypt option, cgminer will fail IN RANDOM WAYS. They are all due +to parameters being outside what the GPU can cope with. If you do not give +cgminer a hint as to your GPU type, it will hardly ever perform well. + +NOTE that if it does not fail at startup, the presence of hardware errors (HW) +is a sure sign that you have set the parameters too high. + + +Step 1 on linux: +export GPU_MAX_ALLOC_PERCENT=100 +If you do not do this, you may find it impossible to scrypt mine. You may find +a value of 40 is enough and increasing this further has little effect. + +export GPU_USE_SYNC_OBJECTS=1 +may help CPU usage a little as well. + +--shaders XXX + +is a new option where you tell cgminer how many shaders your GPU has. This +helps cgminer try to choose some meaningful baseline parameters. Use this table +below to determine how many shaders your GPU has, and note that there are some +variants of these cards, and nvidia shaders are much much lower and virtually +pointless trying to mine on. + +GPU Shaders +7750 512 +7770 640 +7850 1024 +7870 1280 +7950 1792 +7970 2048 + +6850 960 +6870 1120 +6950 1408 +6970 1536 +6990 (6970x2) + +6570 480 +6670 480 +6790 800 + +6450 160 + +5670 400 +5750 720 +5770 800 +5830 1120 +5850 1440 +5870 1600 +5970 (5870x2) + +These are only used as a rough guide for cgminer, and it is rare that this is +all you will need to set. + + +--intensity XX + +Just like in bitcoin mining, scrypt mining takes an intensity, however the +scale goes from 0 to 20 to mimic the "Aggression" used in mtrlt's reaper. The +reason this is crucial is that too high an intensity can actually be +disastrous with scrypt because it CAN run out of ram. Intensities over 13 +start writing over the same ram and, although it is highly dependent on the GPU, they +can actually start DECREASING your hashrate, or even worse, start producing +garbage with HW errors skyrocketing. The low level detail is that intensity is +only guaranteed up to the power of 2 that most closely matches the thread +concurrency. i.e. a thread concurrency of 6144 has 8192 as the nearest power +of two above it, thus as 2^13=8192, that is an intensity of 13. + + +Optional parameters to tune: +-g, --thread-concurrency, --lookup-gap + +-g: +Once you have found the optimal shaders and intensity, you can start increasing +the -g value till cgminer fails to start. Rarely will you be able to go over +about -g 4 and each increase in -g only increases hashrate slightly. + +--thread-concurrency: +This tunes the optimal size of work that scrypt can do. It is internally tuned +by cgminer to be the highest reasonable multiple of shaders that it can +allocate on your GPU. Ideally it should be a multiple of your shader count.
+vliw5 architecture (R5XXX) would be best at 5x shaders, while VLIW4 (R6xxx and +R7xxx) are best at 4x. Setting thread concurrency overrides anything you put +into --shaders. + +--lookup-gap +This tunes a compromise between ram usage and performance. Performance peaks +at a gap of 2, but increasing the gap can save you some GPU ram, but almost +always at the cost of significant loss of hashrate. Setting lookup gap +overrides the default of 2, but cgminer will use the --shaders value to choose +a thread-concurrency if you haven't chosen one. + + +Overclocking for scrypt mining: +First of all, do not underclock your memory initially. Scrypt mining requires +memory speed and on most, but not all, GPUs, lowering memory speed lowers +mining performance. + +Second, absolute engine clock speeds do NOT correlate with hashrate. The ratio +of engine clock speed to memory matters, so if you set your memory to the +default value, and then start overclocking as you are running it, you should +find a sweet spot where the hashrate peaks and then it might actually drop if +you increase the engine clock speed further. Unless you wish to run with a +dynamic intensity, do not go over 13 without testing it while it's running to +see that it increases hashrate AND utility WITHOUT increasing your HW errors. + + +Suggested values for 7970 for example: +export GPU_MAX_ALLOC_PERCENT=100 +--thread-concurrency 8192 -g 4 --gpu-engine 1135 --gpu-memclock 1375 + + +--- + +If you wish to donate to the author, Con Kolivas, in LTC, please submit your +donations to: + +Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH + +Otherwise, please donate in BTC as per the main README. diff --git a/adl.c b/adl.c index 8573e16f..69c06dee 100644 --- a/adl.c +++ b/adl.c @@ -33,6 +33,10 @@ #endif #include "adl_functions.h" +#ifndef HAVE_CURSES +#define wlogprint(...) applog(LOG_WARNING, __VA_ARGS__) +#endif + bool adl_active; bool opt_reorder = false; @@ -237,6 +241,8 @@ void init_adl(int nDevs) result = ADL_Adapter_ID_Get(iAdapterIndex, &lpAdapterID); if (result != ADL_OK) { applog(LOG_INFO, "Failed to ADL_Adapter_ID_Get. 
Error %d", result); + if (result == -10) + applog(LOG_INFO, "This error says the device is not enabled"); continue; } @@ -764,6 +770,7 @@ bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vdd return true; } +#ifdef HAVE_CURSES static void get_enginerange(int gpu, int *imin, int *imax) { struct gpu_adl *ga; @@ -776,6 +783,7 @@ static void get_enginerange(int gpu, int *imin, int *imax) *imin = ga->lpOdParameters.sEngineClock.iMin / 100; *imax = ga->lpOdParameters.sEngineClock.iMax / 100; } +#endif int set_engineclock(int gpu, int iEngineClock) { @@ -824,6 +832,7 @@ out: return ret; } +#ifdef HAVE_CURSES static void get_memoryrange(int gpu, int *imin, int *imax) { struct gpu_adl *ga; @@ -836,6 +845,7 @@ static void get_memoryrange(int gpu, int *imin, int *imax) *imin = ga->lpOdParameters.sMemoryClock.iMin / 100; *imax = ga->lpOdParameters.sMemoryClock.iMax / 100; } +#endif int set_memoryclock(int gpu, int iMemoryClock) { @@ -876,6 +886,7 @@ out: return ret; } +#ifdef HAVE_CURSES static void get_vddcrange(int gpu, float *imin, float *imax) { struct gpu_adl *ga; @@ -889,7 +900,6 @@ static void get_vddcrange(int gpu, float *imin, float *imax) *imax = (float)ga->lpOdParameters.sVddc.iMax / 1000; } -#ifdef HAVE_CURSES static float curses_float(const char *query) { float ret; @@ -997,6 +1007,7 @@ int set_fanspeed(int gpu, int iFanSpeed) return ret; } +#ifdef HAVE_CURSES static int set_powertune(int gpu, int iPercentage) { struct gpu_adl *ga; @@ -1018,6 +1029,7 @@ static int set_powertune(int gpu, int iPercentage) unlock_adl(); return ret; } +#endif /* Returns whether the fanspeed is optimal already or not. The fan_window bool * tells us whether the current fanspeed is in the target range for fanspeeds. diff --git a/api.c b/api.c index 943c0d5d..557aa5e8 100644 --- a/api.c +++ b/api.c @@ -166,7 +166,7 @@ static const char SEPARATOR = '|'; #define SEPSTR "|" static const char GPUSEP = ','; -static const char *APIVERSION = "1.14"; +static const char *APIVERSION = "1.15"; static const char *DEAD = "Dead"; static const char *SICK = "Sick"; static const char *NOSTART = "NoStart"; @@ -176,7 +176,9 @@ static const char *ALIVE = "Alive"; static const char *REJECTING = "Rejecting"; static const char *UNKNOWN = "Unknown"; #define _DYNAMIC "D" +#ifdef HAVE_OPENCL static const char *DYNAMIC = _DYNAMIC; +#endif static const char *YES = "Y"; static const char *NO = "N"; @@ -372,6 +374,8 @@ static const char *JSON_PARAMETER = "parameter"; #define MSG_MINESTATS 70 #define MSG_MISCHK 71 #define MSG_CHECK 72 +#define MSG_POOLPRIO 73 +#define MSG_DUPPID 74 enum code_severity { SEVERITY_ERR, @@ -385,6 +389,7 @@ enum code_parameters { PARAM_GPU, PARAM_PGA, PARAM_CPU, + PARAM_PID, PARAM_GPUMAX, PARAM_PGAMAX, PARAM_CPUMAX, @@ -501,6 +506,8 @@ struct CODES { { SEVERITY_ERR, MSG_ACCDENY, PARAM_STR, "Access denied to '%s' command" }, { SEVERITY_SUCC, MSG_ACCOK, PARAM_NONE, "Privileged access OK" }, { SEVERITY_SUCC, MSG_ENAPOOL, PARAM_POOL, "Enabling pool %d:'%s'" }, + { SEVERITY_SUCC, MSG_POOLPRIO,PARAM_NONE, "Changed pool priorities" }, + { SEVERITY_ERR, MSG_DUPPID, PARAM_PID, "Duplicate pool specified %d" }, { SEVERITY_SUCC, MSG_DISPOOL, PARAM_POOL, "Disabling pool %d:'%s'" }, { SEVERITY_INFO, MSG_ALRENAP, PARAM_POOL, "Pool %d:'%s' already enabled" }, { SEVERITY_INFO, MSG_ALRDISP, PARAM_POOL, "Pool %d:'%s' already disabled" }, @@ -1062,6 +1069,7 @@ static char *message(int messageid, int paramid, char *param2, bool isjson) case PARAM_GPU: case PARAM_PGA: case PARAM_CPU: + case PARAM_PID: 
sprintf(buf, codes[i].description, paramid); break; case PARAM_POOL: @@ -2128,6 +2136,74 @@ static void enablepool(__maybe_unused SOCKETTYPE c, char *param, bool isjson, __ strcpy(io_buffer, message(MSG_ENAPOOL, id, NULL, isjson)); } +static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, __maybe_unused char group) +{ + char *ptr, *next; + int i, pr, prio = 0; + + // TODO: all cgminer code needs a mutex added everywhere for change + // access to total_pools and also parts of the pools[] array, + // just copying total_pools here wont solve that + + if (total_pools == 0) { + strcpy(io_buffer, message(MSG_NOPOOL, 0, NULL, isjson)); + return; + } + + if (param == NULL || *param == '\0') { + strcpy(io_buffer, message(MSG_MISPID, 0, NULL, isjson)); + return; + } + + bool pools_changed[total_pools]; + int new_prio[total_pools]; + for (i = 0; i < total_pools; ++i) + pools_changed[i] = false; + + next = param; + while (next && *next) { + ptr = next; + next = strchr(ptr, ','); + if (next) + *(next++) = '\0'; + + i = atoi(ptr); + if (i < 0 || i >= total_pools) { + strcpy(io_buffer, message(MSG_INVPID, i, NULL, isjson)); + return; + } + + if (pools_changed[i]) { + strcpy(io_buffer, message(MSG_DUPPID, i, NULL, isjson)); + return; + } + + pools_changed[i] = true; + new_prio[i] = prio++; + } + + // Only change them if no errors + for (i = 0; i < total_pools; i++) { + if (pools_changed[i]) + pools[i]->prio = new_prio[i]; + } + + // In priority order, cycle through the unchanged pools and append them + for (pr = 0; pr < total_pools; pr++) + for (i = 0; i < total_pools; i++) { + if (!pools_changed[i] && pools[i]->prio == pr) { + pools[i]->prio = prio++; + pools_changed[i] = true; + break; + } + } + + if (current_pool()->prio) + switch_pools(NULL); + + strcpy(io_buffer, message(MSG_POOLPRIO, 0, NULL, isjson)); +} + static void disablepool(__maybe_unused SOCKETTYPE c, char *param, bool isjson, __maybe_unused char group) { struct pool *pool; @@ -2155,7 +2231,7 @@ static void disablepool(__maybe_unused SOCKETTYPE c, char *param, bool isjson, _ return; } - if (active_pools() <= 1) { + if (enabled_pools <= 1) { strcpy(io_buffer, message(MSG_DISLASTP, id, NULL, isjson)); return; } @@ -2659,6 +2735,7 @@ struct CMDS { { "cpucount", cpucount, false }, { "switchpool", switchpool, true }, { "addpool", addpool, true }, + { "poolpriority", poolpriority, true }, { "enablepool", enablepool, true }, { "disablepool", disablepool, true }, { "removepool", removepool, true }, @@ -3122,6 +3199,20 @@ void api(int api_thr_id) serv.sin_port = htons(port); +#ifndef WIN32 + // On linux with SO_REUSEADDR, bind will get the port if the previous + // socket is closed (even if it is still in TIME_WAIT) but fail if + // another program has it open - which is what we want + int optval = 1; + // If it doesn't work, we don't really care - just show a debug message + if (SOCKETFAIL(setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)(&optval), sizeof(optval)))) + applog(LOG_DEBUG, "API setsockopt SO_REUSEADDR failed (ignored): %s", SOCKERRMSG); +#else + // On windows a 2nd program can bind to a port>1024 already in use unless + // SO_EXCLUSIVEADDRUSE is used - however then the bind to a closed port + // in TIME_WAIT will fail until the timeout - so we leave the options alone +#endif + // try for more than 1 minute ... 
in case the old one hasn't completely gone yet bound = 0; bindstart = time(NULL); @@ -3150,12 +3241,12 @@ void api(int api_thr_id) } if (opt_api_allow) - applog(LOG_WARNING, "API running in IP access mode"); + applog(LOG_WARNING, "API running in IP access mode on port %d", port); else { if (opt_api_network) - applog(LOG_WARNING, "API running in UNRESTRICTED access mode"); + applog(LOG_WARNING, "API running in UNRESTRICTED read access mode on port %d", port); else - applog(LOG_WARNING, "API running in local access mode"); + applog(LOG_WARNING, "API running in local read access mode on port %d", port); } io_buffer = malloc(MYBUFSIZ+1); diff --git a/autogen.sh b/autogen.sh index bf564eb1..e922cfc9 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1,12 +1,17 @@ #!/bin/sh +cwd="$PWD" +bs_dir="$(dirname $(readlink -f $0))" +rm -rf "${bs_dir}"/autom4te.cache +rm -f "${bs_dir}"/aclocal.m4 "${bs_dir}"/ltmain.sh -# You need autoconf 2.5x, preferably 2.57 or later -# You need automake 1.7 or later. 1.6 might work. - -set -e - -aclocal -I m4 -autoheader -automake --add-missing --copy -autoconf - +echo 'Running autoreconf -if...' +autoreconf -if || exit 1 +if test -z "$NOCONFIGURE" ; then + echo 'Configuring...' + cd "${bs_dir}" &> /dev/null + test "$?" = "0" || e=1 + test "$cwd" != "$bs_dir" && cd "$bs_dir" &> /dev/null + ./configure $@ + test "$e" = "1" && exit 1 + cd "$cwd" +fi diff --git a/cgminer.c b/cgminer.c index b2f11aa3..2d44bac0 100644 --- a/cgminer.c +++ b/cgminer.c @@ -87,7 +87,7 @@ static bool opt_benchmark; static bool have_longpoll; static bool want_per_device_stats; bool use_syslog; -static bool opt_quiet; +bool opt_quiet; static bool opt_realquiet; bool opt_loginput; const int opt_cutofftemp = 95; @@ -107,6 +107,9 @@ int opt_dynamic_interval = 7; int nDevs; int opt_g_threads = 2; int gpu_threads; +#ifdef USE_SCRYPT +bool opt_scrypt; +#endif #endif bool opt_restart = true; static bool opt_nogpu; @@ -139,6 +142,7 @@ bool opt_api_listen; bool opt_api_network; bool opt_delaynet; bool opt_disable_pool = true; +char *opt_icarus_options = NULL; char *opt_icarus_timing = NULL; char *opt_kernel_path; @@ -164,9 +168,7 @@ static int total_threads; static pthread_mutex_t hash_lock; static pthread_mutex_t qd_lock; static pthread_mutex_t *stgd_lock; -#ifdef HAVE_CURSES -static pthread_mutex_t curses_lock; -#endif +pthread_mutex_t console_lock; static pthread_mutex_t ch_lock; static pthread_rwlock_t blk_lock; @@ -197,15 +199,16 @@ unsigned int total_go, total_ro; struct pool **pools; static struct pool *currentpool = NULL; -int total_pools; +int total_pools, enabled_pools; enum pool_strategy pool_strategy = POOL_FAILOVER; int opt_rotate_period; static int total_urls, total_users, total_passes, total_userpasses; +static #ifndef HAVE_CURSES const #endif -static bool curses_active; +bool curses_active; static char current_block[37]; static char *current_hash; @@ -243,7 +246,6 @@ struct thread_q *getq; static int total_work; struct work *staged_work = NULL; -static int staged_extras; struct schedtime { bool enable; @@ -708,6 +710,13 @@ static char *set_api_description(const char *arg) } #ifdef USE_ICARUS +static char *set_icarus_options(const char *arg) +{ + opt_set_charp(arg, &opt_icarus_options); + + return NULL; +} + static char *set_icarus_timing(const char *arg) { opt_set_charp(arg, &opt_icarus_timing); @@ -850,6 +859,11 @@ static struct opt_table opt_config_table[] = { OPT_WITH_ARG("--gpu-vddc", set_gpu_vddc, NULL, NULL, "Set the GPU voltage in Volts - one value for all or separate by commas for per 
card"), +#endif +#ifdef USE_SCRYPT + OPT_WITH_ARG("--lookup-gap", + set_lookup_gap, NULL, NULL, + "Set GPU lookup gap for scrypt mining, comma separated"), #endif OPT_WITH_ARG("--intensity|-I", set_intensity, NULL, NULL, @@ -863,9 +877,12 @@ static struct opt_table opt_config_table[] = { #ifdef HAVE_OPENCL OPT_WITH_ARG("--kernel|-k", set_kernel, NULL, NULL, - "Override kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"), + "Override sha256 kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"), #endif #ifdef USE_ICARUS + OPT_WITH_ARG("--icarus-options", + set_icarus_options, NULL, NULL, + opt_hidden), OPT_WITH_ARG("--icarus-timing", set_icarus_timing, NULL, NULL, opt_hidden), @@ -953,6 +970,14 @@ static struct opt_table opt_config_table[] = { OPT_WITH_ARG("--sched-stop", set_schedtime, NULL, &schedstop, "Set a time of day in HH:MM to stop mining (will quit without a start time)"), +#ifdef USE_SCRYPT + OPT_WITHOUT_ARG("--scrypt", + opt_set_bool, &opt_scrypt, + "Use the scrypt algorithm for mining (litecoin only)"), + OPT_WITH_ARG("--shaders", + set_shaders, NULL, NULL, + "GPU shaders per card for tuning scrypt, comma separated"), +#endif OPT_WITH_ARG("--sharelog", set_sharelog, NULL, NULL, "Append share log to file"), @@ -991,6 +1016,11 @@ static struct opt_table opt_config_table[] = { opt_hidden #endif ), +#ifdef USE_SCRYPT + OPT_WITH_ARG("--thread-concurrency", + set_thread_concurrency, NULL, NULL, + "Set GPU thread concurrency for scrypt mining, comma separated"), +#endif OPT_WITH_ARG("--url|-o", set_url, NULL, NULL, "URL for bitcoin JSON-RPC server"), @@ -1171,6 +1201,9 @@ static char *opt_verusage_and_exit(const char *extra) #endif #ifdef USE_ZTEX "ztex " +#endif +#ifdef USE_SCRYPT + "scrypt " #endif "mining support.\n" , packagename); @@ -1222,6 +1255,27 @@ static bool jobj_binary(const json_t *obj, const char *key, return true; } +static void calc_midstate(struct work *work) +{ + union { + unsigned char c[64]; + uint32_t i[16]; + } data; + int swapcounter; + + for (swapcounter = 0; swapcounter < 16; swapcounter++) + data.i[swapcounter] = swab32(((uint32_t*) (work->data))[swapcounter]); + sha2_context ctx; + sha2_starts( &ctx, 0 ); + sha2_update( &ctx, data.c, 64 ); + memcpy(work->midstate, ctx.state, sizeof(work->midstate)); +#if defined(__BIG_ENDIAN__) || defined(MIPSEB) + int i; + for (i = 0; i < 8; i++) + (((uint32_t*) (work->midstate))[i]) = swab32(((uint32_t*) (work->midstate))[i]); +#endif +} + static bool work_decode(const json_t *val, struct work *work) { if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data), true))) { @@ -1229,28 +1283,13 @@ static bool work_decode(const json_t *val, struct work *work) goto err_out; } - if (likely(!jobj_binary(val, "midstate", - work->midstate, sizeof(work->midstate), false))) { + if (!jobj_binary(val, "midstate", work->midstate, sizeof(work->midstate), false)) { // Calculate it ourselves - union { - unsigned char c[64]; - uint32_t i[16]; - } data; - int swapcounter; - for (swapcounter = 0; swapcounter < 16; swapcounter++) - data.i[swapcounter] = swab32(((uint32_t*) (work->data))[swapcounter]); - sha2_context ctx; - sha2_starts( &ctx, 0 ); - sha2_update( &ctx, data.c, 64 ); - memcpy(work->midstate, ctx.state, sizeof(work->midstate)); -#if defined(__BIG_ENDIAN__) || defined(MIPSEB) - int i; - for (i = 0; i < 8; i++) - (((uint32_t*) (work->midstate))[i]) = swab32(((uint32_t*) (work->midstate))[i]); -#endif + applog(LOG_DEBUG, "Calculating midstate locally"); + 
calc_midstate(work); } - if (likely(!jobj_binary(val, "hash1", work->hash1, sizeof(work->hash1), false))) { + if (!jobj_binary(val, "hash1", work->hash1, sizeof(work->hash1), false)) { // Always the same anyway memcpy(work->hash1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0", 64); } @@ -1295,7 +1334,7 @@ void decay_time(double *f, double fadd) *f = (fadd + *f * 0.58) / 1.58; } -static int requests_staged(void) +static int total_staged(void) { int ret; @@ -1305,6 +1344,23 @@ static int requests_staged(void) return ret; } +static int pool_staged(struct pool *pool) +{ + int ret; + + mutex_lock(stgd_lock); + ret = pool->staged; + mutex_unlock(stgd_lock); + return ret; +} + +static int current_staged(void) +{ + struct pool *pool = current_pool(); + + return pool_staged(pool); +} + #ifdef HAVE_CURSES WINDOW *mainwin, *statuswin, *logwin; #endif @@ -1312,8 +1368,10 @@ double total_secs = 0.1; static char statusline[256]; /* logstart is where the log window should start */ static int devcursor, logstart, logcursor; +#ifdef HAVE_CURSES /* statusy is where the status window goes up to in cases where it won't fit at startup */ static int statusy; +#endif #ifdef HAVE_OPENCL struct cgpu_info gpus[MAX_GPUDEVICES]; /* Maximum number apparently possible */ #endif @@ -1322,12 +1380,12 @@ struct cgpu_info *cpus; #ifdef HAVE_CURSES static inline void unlock_curses(void) { - mutex_unlock(&curses_lock); + mutex_unlock(&console_lock); } static inline void lock_curses(void) { - mutex_lock(&curses_lock); + mutex_lock(&console_lock); } static bool curses_active_locked(void) @@ -1353,15 +1411,26 @@ void tailsprintf(char *f, const char *fmt, ...) static void get_statline(char *buf, struct cgpu_info *cgpu) { + double displayed_hashes, displayed_rolling = cgpu->rolling; + bool mhash_base = true; + + displayed_hashes = cgpu->total_mhashes / total_secs; + if (displayed_hashes < 1) { + displayed_hashes *= 1000; + displayed_rolling *= 1000; + mhash_base = false; + } + sprintf(buf, "%s%d ", cgpu->api->name, cgpu->device_id); if (cgpu->api->get_statline_before) cgpu->api->get_statline_before(buf, cgpu); else tailsprintf(buf, " | "); - tailsprintf(buf, "(%ds):%.1f (avg):%.1f Mh/s | A:%d R:%d HW:%d U:%.1f/m", + tailsprintf(buf, "(%ds):%.1f (avg):%.1f %sh/s | A:%d R:%d HW:%d U:%.1f/m", opt_log_interval, - cgpu->rolling, - cgpu->total_mhashes / total_secs, + displayed_rolling, + displayed_hashes, + mhash_base ? 
"M" : "K", cgpu->accepted, cgpu->rejected, cgpu->hw_errors, @@ -1381,6 +1450,8 @@ static void text_print_status(int thr_id) } } +static int global_queued(void); + #ifdef HAVE_CURSES /* Must be called with curses mutex lock held and curses_active */ static void curses_print_status(void) @@ -1398,7 +1469,7 @@ static void curses_print_status(void) mvwprintw(statuswin, 2, 0, " %s", statusline); wclrtoeol(statuswin); mvwprintw(statuswin, 3, 0, " TQ: %d ST: %d SS: %d DW: %d NB: %d LW: %d GF: %d RF: %d", - total_queued, requests_staged(), total_stale, total_discarded, new_blocks, + global_queued(), total_staged(), total_stale, total_discarded, new_blocks, local_work, total_go, total_ro); wclrtoeol(statuswin); if (pool_strategy == POOL_LOADBALANCE && total_pools > 1) @@ -1427,13 +1498,16 @@ static void curses_print_devstatus(int thr_id) { static int awidth = 1, rwidth = 1, hwwidth = 1, uwidth = 1; struct cgpu_info *cgpu = thr_info[thr_id].cgpu; + double displayed_hashes, displayed_rolling; + bool mhash_base = true; char logline[255]; + if (devcursor + cgpu->cgminer_id > LINES - 2) + return; + cgpu->utility = cgpu->accepted / ( total_secs ? total_secs : 1 ) * 60; - /* Check this isn't out of the window size */ - if (wmove(statuswin,devcursor + cgpu->cgminer_id, 0) == ERR) - return; + wmove(statuswin,devcursor + cgpu->cgminer_id, 0); wprintw(statuswin, " %s %*d: ", cgpu->api->name, dev_width, cgpu->device_id); if (cgpu->api->get_statline_before) { logline[0] = '\0'; @@ -1443,6 +1517,14 @@ static void curses_print_devstatus(int thr_id) else wprintw(statuswin, " | "); + displayed_hashes = cgpu->total_mhashes / total_secs; + displayed_rolling = cgpu->rolling; + if (displayed_hashes < 1) { + displayed_hashes *= 1000; + displayed_rolling *= 1000; + mhash_base = false; + } + if (cgpu->status == LIFE_DEAD) wprintw(statuswin, "DEAD "); else if (cgpu->status == LIFE_SICK) @@ -1452,13 +1534,15 @@ static void curses_print_devstatus(int thr_id) else if (cgpu->deven == DEV_RECOVER) wprintw(statuswin, "REST "); else - wprintw(statuswin, "%5.1f", cgpu->rolling); + wprintw(statuswin, "%5.1f", displayed_rolling); adj_width(cgpu->accepted, &awidth); adj_width(cgpu->rejected, &rwidth); adj_width(cgpu->hw_errors, &hwwidth); adj_width(cgpu->utility, &uwidth); - wprintw(statuswin, "/%5.1fMh/s | A:%*d R:%*d HW:%*d U:%*.2f/m", - cgpu->total_mhashes / total_secs, + + wprintw(statuswin, "/%5.1f%sh/s | A:%*d R:%*d HW:%*d U:%*.2f/m", + displayed_hashes, + mhash_base ? "M" : "K", awidth, cgpu->accepted, rwidth, cgpu->rejected, hwwidth, cgpu->hw_errors, @@ -1559,13 +1643,10 @@ void wlogprint(const char *f, ...) 
#endif #ifdef HAVE_CURSES -void log_curses(int prio, const char *f, va_list ap) +bool log_curses_only(int prio, const char *f, va_list ap) { bool high_prio; - if (opt_quiet && prio != LOG_ERR) - return; - high_prio = (prio == LOG_WARNING || prio == LOG_ERR); if (curses_active_locked()) { @@ -1577,8 +1658,9 @@ void log_curses(int prio, const char *f, va_list ap) } } unlock_curses(); - } else - vprintf(f, ap); + return true; + } + return false; } void clear_logwin(void) @@ -1642,6 +1724,28 @@ bool regeneratehash(const struct work *work) return false; } +static void enable_pool(struct pool *pool) +{ + if (pool->enabled != POOL_ENABLED) { + enabled_pools++; + pool->enabled = POOL_ENABLED; + } +} + +static void disable_pool(struct pool *pool) +{ + if (pool->enabled == POOL_ENABLED) + enabled_pools--; + pool->enabled = POOL_DISABLED; +} + +static void reject_pool(struct pool *pool) +{ + if (pool->enabled == POOL_ENABLED) + enabled_pools--; + pool->enabled = POOL_REJECTING; +} + static bool submit_upstream_work(const struct work *work, CURL *curl) { char *hexstr = NULL; @@ -1695,8 +1799,12 @@ static bool submit_upstream_work(const struct work *work, CURL *curl) if (!QUIET) { hash32 = (uint32_t *)(work->hash); - sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]), - work->block? " BLOCK!" : ""); + if (opt_scrypt) + sprintf(hashshow, "%08lx.%08lx", (unsigned long)(hash32[7]), (unsigned long)(hash32[6])); + else { + sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]), + work->block? " BLOCK!" : ""); + } } /* Theoretically threads could race when modifying accepted and @@ -1732,7 +1840,7 @@ static bool submit_upstream_work(const struct work *work, CURL *curl) * longpoll */ if (unlikely(pool->enabled == POOL_REJECTING)) { applog(LOG_WARNING, "Rejecting pool %d now accepting shares, re-enabling!", pool->pool_no); - pool->enabled = POOL_ENABLED; + enable_pool(pool); switch_pools(NULL); } } else { @@ -1777,13 +1885,13 @@ static bool submit_upstream_work(const struct work *work, CURL *curl) * ensued. Do not do this if we know the share just happened to * be stale due to networking delays. */ - if (pool->seq_rejects > 10 && !work->stale && opt_disable_pool && total_pools > 1) { + if (pool->seq_rejects > 10 && !work->stale && opt_disable_pool && enabled_pools > 1) { double utility = total_accepted / ( total_secs ? total_secs : 1 ) * 60; if (pool->seq_rejects > utility * 3) { applog(LOG_WARNING, "Pool %d rejected %d sequential shares, disabling!", pool->pool_no, pool->seq_rejects); - pool->enabled = POOL_REJECTING; + reject_pool(pool); if (pool == current_pool()) switch_pools(NULL); pool->seq_rejects = 0; @@ -2084,18 +2192,23 @@ static void recruit_curl(struct pool *pool) /* Grab an available curl if there is one. If not, then recruit extra curls * unless we are in a submit_fail situation, or we have opt_delaynet enabled - * and there are already 5 curls in circulation */ + * and there are already 5 curls in circulation. Limit total number to the + * number of mining threads per pool as well to prevent blasting a pool during + * network delays/outages. */ static struct curl_ent *pop_curl_entry(struct pool *pool) { + int curl_limit = opt_delaynet ? 
5 : mining_threads * 4 / 3; struct curl_ent *ce; mutex_lock(&pool->pool_lock); +retry: if (!pool->curls) recruit_curl(pool); else if (list_empty(&pool->curlring)) { - if ((pool->submit_fail || opt_delaynet) && pool->curls > 4) + if (pool->submit_fail || pool->curls >= curl_limit) { pthread_cond_wait(&pool->cr_cond, &pool->pool_lock); - else + goto retry; + } else recruit_curl(pool); } ce = list_entry(pool->curlring.next, struct curl_ent, node); @@ -2114,6 +2227,51 @@ static void push_curl_entry(struct curl_ent *ce, struct pool *pool) mutex_unlock(&pool->pool_lock); } +/* This is overkill, but at least we'll know accurately how much work is + * queued to prevent ever being left without work */ +static void inc_queued(struct pool *pool) +{ + if (unlikely(!pool)) + return; + + mutex_lock(&qd_lock); + pool->queued++; + total_queued++; + mutex_unlock(&qd_lock); +} + +static void dec_queued(struct pool *pool) +{ + if (unlikely(!pool)) + return; + + mutex_lock(&qd_lock); + pool->queued--; + total_queued--; + mutex_unlock(&qd_lock); +} + +static int current_queued(void) +{ + struct pool *pool = current_pool(); + int ret; + + mutex_lock(&qd_lock); + ret = pool->queued; + mutex_unlock(&qd_lock); + return ret; +} + +static int global_queued(void) +{ + int ret; + + mutex_lock(&qd_lock); + ret = total_queued; + mutex_unlock(&qd_lock); + return ret; +} + /* ce and pool may appear uninitialised at push_curl_entry, but they're always * set when we don't have opt_benchmark enabled */ static void *get_work_thread(void *userdata) @@ -2137,6 +2295,7 @@ static void *get_work_thread(void *userdata) get_benchmark_work(ret_work); else { pool = ret_work->pool = select_pool(wc->lagging); + inc_queued(pool); ce = pop_curl_entry(pool); @@ -2156,6 +2315,8 @@ static void *get_work_thread(void *userdata) fail_pause += opt_fail_pause; } fail_pause = opt_fail_pause; + + dec_queued(pool); } applog(LOG_DEBUG, "Pushing work to requesting thread"); @@ -2195,9 +2356,6 @@ static bool stale_work(struct work *work, bool share) struct pool *pool; int getwork_delay; - if (work->mandatory) - return false; - if (share) { /* Technically the rolltime should be correct but some pools * advertise a broken expire= that is lower than a meaningful @@ -2223,14 +2381,20 @@ static bool stale_work(struct work *work, bool share) work_expiry = 5; gettimeofday(&now, NULL); - if ((now.tv_sec - work->tv_staged.tv_sec) >= work_expiry) + if ((now.tv_sec - work->tv_staged.tv_sec) >= work_expiry) { + applog(LOG_DEBUG, "Work stale due to expiry"); return true; + } - if (work->work_block != work_block) + if (work->work_block != work_block) { + applog(LOG_DEBUG, "Work stale due to block mismatch"); return true; + } - if (opt_fail_only && !share && pool != current_pool() && pool->enabled != POOL_REJECTING) + if (opt_fail_only && !share && pool != current_pool() && !work->mandatory) { + applog(LOG_DEBUG, "Work stale due to fail only pool mismatch"); return true; + } return false; } @@ -2407,14 +2571,11 @@ void switch_pools(struct pool *selected) if (pool != last_pool) applog(LOG_WARNING, "Switching to %s", pool->rpc_url); - /* Reset the queued amount to allow more to be queued for the new pool */ - mutex_lock(&qd_lock); - total_queued = 0; - mutex_unlock(&qd_lock); - mutex_lock(&lp_lock); pthread_cond_broadcast(&lp_cond); mutex_unlock(&lp_lock); + + queue_request(NULL, false); } static void discard_work(struct work *work) @@ -2429,72 +2590,30 @@ static void discard_work(struct work *work) free_work(work); } -/* This is overkill, but at least we'll know 
accurately how much work is - * queued to prevent ever being left without work */ -static void inc_queued(void) -{ - mutex_lock(&qd_lock); - total_queued++; - mutex_unlock(&qd_lock); -} - -static void dec_queued(struct work *work) -{ - if (work->clone) - return; - - mutex_lock(&qd_lock); - if (total_queued > 0) - total_queued--; - mutex_unlock(&qd_lock); -} - -static int requests_queued(void) -{ - int ret; - - mutex_lock(&qd_lock); - ret = total_queued; - mutex_unlock(&qd_lock); - return ret; -} - -static void subtract_queued(int work_units) -{ - mutex_lock(&qd_lock); - total_queued -= work_units; - if (total_queued < 0) - total_queued = 0; - mutex_unlock(&qd_lock); -} +bool queue_request(struct thr_info *thr, bool needed); static void discard_stale(void) { struct work *work, *tmp; - int stale = 0, nonclone = 0; + int stale = 0; mutex_lock(stgd_lock); HASH_ITER(hh, staged_work, work, tmp) { if (stale_work(work, false)) { HASH_DEL(staged_work, work); - if (work->clone) - --staged_extras; - else - nonclone++; + work->pool->staged--; discard_work(work); stale++; } } mutex_unlock(stgd_lock); - applog(LOG_DEBUG, "Discarded %d stales that didn't match current hash", stale); - - /* Dec queued outside the loop to not have recursive locks */ - subtract_queued(nonclone); + if (stale) { + applog(LOG_DEBUG, "Discarded %d stales that didn't match current hash", stale); + queue_request(NULL, false); + } } -bool queue_request(struct thr_info *thr, bool needed); - /* A generic wait function for threads that poll that will wait a specified * time tdiff waiting on the pthread conditional that is broadcast when a * work restart is required. Returns the value of pthread_cond_timedwait @@ -2653,9 +2772,8 @@ static bool hash_push(struct work *work) mutex_lock(stgd_lock); if (likely(!getq->frozen)) { HASH_ADD_INT(staged_work, id, work); + work->pool->staged++; HASH_SORT(staged_work, tv_sort); - if (work->clone) - ++staged_extras; } else rc = false; pthread_cond_signal(&getq->cond); @@ -2725,18 +2843,6 @@ int curses_int(const char *query) static bool input_pool(bool live); #endif -int active_pools(void) -{ - int ret = 0; - int i; - - for (i = 0; i < total_pools; i++) { - if ((pools[i])->enabled == POOL_ENABLED) - ret++; - } - return ret; -} - #ifdef HAVE_CURSES static void display_pool_summary(struct pool *pool) { @@ -2835,8 +2941,25 @@ void write_config(FILE *fcfg) case KL_DIABLO: fprintf(fcfg, "diablo"); break; + case KL_SCRYPT: + fprintf(fcfg, "scrypt"); + break; } } +#ifdef USE_SCRYPT + fputs("\",\n\"lookup-gap\" : \"", fcfg); + for(i = 0; i < nDevs; i++) + fprintf(fcfg, "%s%d", i > 0 ? "," : "", + (int)gpus[i].opt_lg); + fputs("\",\n\"thread-concurrency\" : \"", fcfg); + for(i = 0; i < nDevs; i++) + fprintf(fcfg, "%s%d", i > 0 ? "," : "", + (int)gpus[i].opt_tc); + fputs("\",\n\"shaders\" : \"", fcfg); + for(i = 0; i < nDevs; i++) + fprintf(fcfg, "%s%d", i > 0 ? 
"," : "", + (int)gpus[i].shaders); +#endif #ifdef HAVE_ADL fputs("\",\n\"gpu-engine\" : \"", fcfg); for(i = 0; i < nDevs; i++) @@ -2937,6 +3060,8 @@ void write_config(FILE *fcfg) fprintf(fcfg, ",\n\"api-description\" : \"%s\"", opt_api_description); if (opt_api_groups) fprintf(fcfg, ",\n\"api-groups\" : \"%s\"", opt_api_groups); + if (opt_icarus_options) + fprintf(fcfg, ",\n\"icarus-options\" : \"%s\"", opt_icarus_options); if (opt_icarus_timing) fprintf(fcfg, ",\n\"icarus-timing\" : \"%s\"", opt_icarus_timing); fputs("\n}", fcfg); @@ -2983,6 +3108,7 @@ retry: strategies[pool_strategy]); if (pool_strategy == POOL_ROTATE) wlogprint("Set to rotate every %d minutes\n", opt_rotate_period); + wlogprint("[F]ailover only %s\n", opt_fail_only ? "enabled" : "disabled"); wlogprint("[A]dd pool [R]emove pool [D]isable pool [E]nable pool\n"); wlogprint("[C]hange management strategy [S]witch pool [I]nformation\n"); wlogprint("Or press any other key to continue\n"); @@ -3008,7 +3134,7 @@ retry: wlogprint("Unable to remove pool due to activity\n"); goto retry; } - pool->enabled = POOL_DISABLED; + disable_pool(pool); remove_pool(pool); goto updated; } else if (!strncasecmp(&input, "s", 1)) { @@ -3018,11 +3144,11 @@ retry: goto retry; } pool = pools[selected]; - pool->enabled = POOL_ENABLED; + enable_pool(pool); switch_pools(pool); goto updated; } else if (!strncasecmp(&input, "d", 1)) { - if (active_pools() <= 1) { + if (enabled_pools <= 1) { wlogprint("Cannot disable last pool"); goto retry; } @@ -3032,7 +3158,7 @@ retry: goto retry; } pool = pools[selected]; - pool->enabled = POOL_DISABLED; + disable_pool(pool); if (pool == current_pool()) switch_pools(NULL); goto updated; @@ -3043,7 +3169,7 @@ retry: goto retry; } pool = pools[selected]; - pool->enabled = POOL_ENABLED; + enable_pool(pool); if (pool->prio < current_pool()->prio) switch_pools(pool); goto updated; @@ -3076,6 +3202,9 @@ retry: pool = pools[selected]; display_pool_summary(pool); goto retry; + } else if (!strncasecmp(&input, "f", 1)) { + opt_fail_only ^= true; + goto updated; } else clear_logwin(); @@ -3349,6 +3478,7 @@ static void *api_thread(void *userdata) { struct thr_info *mythr = userdata; + pthread_detach(pthread_self()); pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); api(api_thr_id); @@ -3380,9 +3510,11 @@ static void hashmeter(int thr_id, struct timeval *diff, double utility, efficiency = 0.0; static double local_mhashes_done = 0; static double rolling = 0; - double local_mhashes = (double)hashes_done / 1000000.0; + double local_mhashes, displayed_hashes, displayed_rolling; + bool mhash_base = true; bool showlog = false; + local_mhashes = (double)hashes_done / 1000000.0; /* Update the last time this thread reported in */ if (thr_id >= 0) { gettimeofday(&thr_info[thr_id].last, NULL); @@ -3398,8 +3530,8 @@ static void hashmeter(int thr_id, struct timeval *diff, double thread_rolling = 0.0; int i; - applog(LOG_DEBUG, "[thread %d: %llu hashes, %.0f khash/sec]", - thr_id, hashes_done, hashes_done / secs); + applog(LOG_DEBUG, "[thread %d: %llu hashes, %.1f khash/sec]", + thr_id, hashes_done, hashes_done / 1000 / secs); /* Rolling average for each thread and each device */ decay_time(&thr->rolling, local_mhashes / secs); @@ -3458,9 +3590,17 @@ static void hashmeter(int thr_id, struct timeval *diff, utility = total_accepted / ( total_secs ? total_secs : 1 ) * 60; efficiency = total_getworks ? 
total_accepted * 100.0 / total_getworks : 0.0; - sprintf(statusline, "%s(%ds):%.1f (avg):%.1f Mh/s | Q:%d A:%d R:%d HW:%d E:%.0f%% U:%.1f/m", + displayed_hashes = total_mhashes_done / total_secs; + displayed_rolling = rolling; + if (displayed_hashes < 1) { + displayed_hashes *= 1000; + displayed_rolling *= 1000; + mhash_base = false; + } + + sprintf(statusline, "%s(%ds):%.1f (avg):%.1f %sh/s | Q:%d A:%d R:%d HW:%d E:%.0f%% U:%.1f/m", want_per_device_stats ? "ALL " : "", - opt_log_interval, rolling, total_mhashes_done / total_secs, + opt_log_interval, displayed_rolling, displayed_hashes, mhash_base ? "M" : "K", total_getworks, total_accepted, total_rejected, hw_errors, efficiency, utility); @@ -3510,7 +3650,6 @@ static bool pool_active(struct pool *pool, bool pinging) tq_push(thr_info[stage_thr_id].q, work); total_getworks++; pool->getwork_requested++; - inc_queued(); ret = true; gettimeofday(&pool->tv_idle, NULL); } else { @@ -3591,79 +3730,40 @@ static void pool_resus(struct pool *pool) switch_pools(NULL); } -static time_t requested_tv_sec; - -static bool control_tset(bool *var) -{ - bool ret; - - mutex_lock(&control_lock); - ret = *var; - *var = true; - mutex_unlock(&control_lock); - - return ret; -} - -static void control_tclear(bool *var) -{ - mutex_lock(&control_lock); - *var = false; - mutex_unlock(&control_lock); -} - -static bool queueing; - bool queue_request(struct thr_info *thr, bool needed) { + int cq, cs, ts, tq, maxq = opt_queue + mining_threads; struct workio_cmd *wc; - struct timeval now; - time_t scan_post; - int rq, rs; - bool ret = true; - - /* Prevent multiple requests being executed at once */ - if (control_tset(&queueing)) - return ret; - - rq = requests_queued(); - rs = requests_staged(); - - /* Grab more work every 2/3 of the scan time to avoid all work expiring - * at the same time */ - scan_post = opt_scantime * 2 / 3; - if (scan_post < 5) - scan_post = 5; - - gettimeofday(&now, NULL); - - /* Test to make sure we have enough work for pools without rolltime - * and enough original work for pools with rolltime */ - if ((rq >= mining_threads || rs >= mining_threads) && - rq > staged_extras + opt_queue && - now.tv_sec - requested_tv_sec < scan_post) - goto out; - - requested_tv_sec = now.tv_sec; - - inc_queued(); + bool lag = false; + + cq = current_queued(); + cs = current_staged(); + ts = total_staged(); + tq = global_queued(); + + if (needed && cq >= maxq && !ts && !opt_fail_only) { + /* If we're queueing work faster than we can stage it, consider + * the system lagging and allow work to be gathered from + * another pool if possible */ + lag = true; + } else { + /* Test to make sure we have enough work for pools without rolltime + * and enough original work for pools with rolltime */ + if (((cs || cq >= opt_queue) && ts >= maxq) || + ((cs || cq) && tq >= maxq)) + return true; + } /* fill out work request message */ wc = calloc(1, sizeof(*wc)); if (unlikely(!wc)) { applog(LOG_ERR, "Failed to calloc wc in queue_request"); - ret = false; - goto out; + return false; } wc->cmd = WC_GET_WORK; wc->thr = thr; - - /* If we're queueing work faster than we can stage it, consider the - * system lagging and allow work to be gathered from another pool if - * possible */ - if (rq && needed && !rs && !opt_fail_only) - wc->lagging = true; + wc->lagging = lag; applog(LOG_DEBUG, "Queueing getwork request to work thread"); @@ -3671,18 +3771,16 @@ bool queue_request(struct thr_info *thr, bool needed) if (unlikely(!tq_push(thr_info[work_thr_id].q, wc))) { applog(LOG_ERR, "Failed to 
tq_push in queue_request"); workio_cmd_free(wc); - ret = false; + return false; } -out: - control_tclear(&queueing); - - return ret; + return true; } static struct work *hash_pop(const struct timespec *abstime) { struct work *work = NULL; + bool queue = false; int rc = 0; mutex_lock(stgd_lock); @@ -3692,11 +3790,15 @@ static struct work *hash_pop(const struct timespec *abstime) if (HASH_COUNT(staged_work)) { work = staged_work; HASH_DEL(staged_work, work); - if (work->clone) - --staged_extras; + work->pool->staged--; + if (HASH_COUNT(staged_work) < (unsigned int)mining_threads) + queue = true; } mutex_unlock(stgd_lock); + if (queue) + queue_request(NULL, false); + return work; } @@ -3750,6 +3852,7 @@ static struct work *make_clone(struct work *work) memcpy(work_clone, work, sizeof(struct work)); work_clone->clone = true; work_clone->longpoll = false; + work_clone->mandatory = false; /* Make cloned work appear slightly older to bias towards keeping the * master work item which can be further rolled */ work_clone->tv_staged.tv_sec -= 1; @@ -3762,7 +3865,7 @@ static struct work *make_clone(struct work *work) * the future */ static struct work *clone_work(struct work *work) { - int mrs = mining_threads - requests_staged(); + int mrs = mining_threads + opt_queue - total_staged(); struct work *work_clone; bool cloned; @@ -3802,8 +3905,8 @@ static bool get_work(struct work *work, bool requested, struct thr_info *thr, struct timespec abstime = {0, 0}; struct timeval now; struct work *work_heap; + int failures = 0, cq; struct pool *pool; - int failures = 0; /* Tell the watchdog thread this thread is waiting on getwork and * should not be restarted */ @@ -3814,9 +3917,11 @@ static bool get_work(struct work *work, bool requested, struct thr_info *thr, thread_reportin(thr); return true; } + + cq = current_queued(); retry: pool = current_pool(); - if (!requested || requests_queued() < opt_queue) { + if (!requested || cq < opt_queue) { if (unlikely(!queue_request(thr, true))) { applog(LOG_WARNING, "Failed to queue_request in get_work"); goto out; @@ -3829,7 +3934,7 @@ retry: goto out; } - if (!pool->lagging && requested && !newreq && !requests_staged() && requests_queued() >= mining_threads) { + if (!pool->lagging && requested && !newreq && !pool_staged(pool) && cq >= mining_threads + opt_queue) { struct cgpu_info *cgpu = thr->cgpu; bool stalled = true; int i; @@ -3866,7 +3971,6 @@ retry: } if (stale_work(work_heap, false)) { - dec_queued(work_heap); discard_work(work_heap); goto retry; } @@ -3881,7 +3985,6 @@ retry: work_heap = clone_work(work_heap); memcpy(work, work_heap, sizeof(struct work)); - dec_queued(work_heap); free_work(work_heap); ret = true; @@ -3963,6 +4066,13 @@ bool hashtest(const struct work *work) bool test_nonce(struct work *work, uint32_t nonce) { + if (opt_scrypt) { + uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12); + + *work_nonce = nonce; + return true; + } + work->data[64 + 12 + 0] = (nonce >> 0) & 0xff; work->data[64 + 12 + 1] = (nonce >> 8) & 0xff; work->data[64 + 12 + 2] = (nonce >> 16) & 0xff; @@ -4100,7 +4210,9 @@ void *miner_thread(void *userdata) } pool_stats->getwork_calls++; + thread_reportin(mythr); hashes = api->scanhash(mythr, work, work->blk.nonce + max_nonce); + thread_reportin(mythr); gettimeofday(&getwork_start, NULL); @@ -4251,10 +4363,8 @@ static void convert_to_work(json_t *val, int rolltime, struct pool *pool) if (unlikely(!stage_work(work))) free_work(work); - else { - inc_queued(); + else applog(LOG_DEBUG, "Converted longpoll data to work"); - } } 
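The inc_queued()/dec_queued()/global_queued() helpers introduced above keep a per-pool count and a global count behind the single qd_lock mutex so the work scheduler always has a consistent view of how many getwork requests are in flight. For reference, a minimal standalone sketch of the same counter pattern in plain pthreads (hypothetical demo_* names, not code from this patch):

#include <pthread.h>

struct demo_pool {
	int queued;			/* getwork requests in flight for this pool */
};

static pthread_mutex_t demo_qd_lock = PTHREAD_MUTEX_INITIALIZER;
static int demo_total_queued;		/* sum across all pools */

/* Bump both the per-pool and global counters under one lock so the
 * two can never be observed out of step. */
static void demo_inc_queued(struct demo_pool *pool)
{
	pthread_mutex_lock(&demo_qd_lock);
	pool->queued++;
	demo_total_queued++;
	pthread_mutex_unlock(&demo_qd_lock);
}

static void demo_dec_queued(struct demo_pool *pool)
{
	pthread_mutex_lock(&demo_qd_lock);
	pool->queued--;
	demo_total_queued--;
	pthread_mutex_unlock(&demo_qd_lock);
}

/* Readers take the same lock, so the snapshot they return is consistent
 * with whatever increments/decrements have completed. */
static int demo_global_queued(void)
{
	int ret;

	pthread_mutex_lock(&demo_qd_lock);
	ret = demo_total_queued;
	pthread_mutex_unlock(&demo_qd_lock);
	return ret;
}

Taking one lock for both counters is the simplest way to let queue_request() compare per-pool and global figures without racing against the get_work threads.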
/* If we want longpoll, enable it for the chosen default pool, or, if @@ -4399,12 +4509,15 @@ static void reap_curl(struct pool *pool) { struct curl_ent *ent, *iter; struct timeval now; + int reaped = 0; gettimeofday(&now, NULL); mutex_lock(&pool->pool_lock); list_for_each_entry_safe(ent, iter, &pool->curlring, node) { + if (pool->curls < 2) + break; if (now.tv_sec - ent->tv.tv_sec > 60) { - applog(LOG_DEBUG, "Reaped curl %d from pool %d", pool->curls, pool->pool_no); + reaped++; pool->curls--; list_del(&ent->node); curl_easy_cleanup(ent->curl); @@ -4412,6 +4525,8 @@ static void reap_curl(struct pool *pool) } } mutex_unlock(&pool->pool_lock); + if (reaped) + applog(LOG_DEBUG, "Reaped %d curl%s from pool %d", reaped, reaped > 1 ? "s" : "", pool->pool_no); } static void *watchpool_thread(void __maybe_unused *userdata) @@ -4451,12 +4566,12 @@ static void *watchpool_thread(void __maybe_unused *userdata) } /* Work is sorted according to age, so discard the oldest work items, leaving - * only 1 staged work item per mining thread */ + * only 1/3 more staged work item than mining threads */ static void age_work(void) { - int discarded = 0; + int discarded = 0, maxq = (mining_threads + opt_queue) * 4 / 3; - while (requests_staged() > mining_threads * 4 / 3 + opt_queue) { + while (total_staged() > maxq) { struct work *work = hash_pop(NULL); if (unlikely(!work)) @@ -4476,7 +4591,6 @@ static void age_work(void) #define WATCHDOG_DEAD_TIME 600 #define WATCHDOG_SICK_COUNT (WATCHDOG_SICK_TIME/WATCHDOG_INTERVAL) #define WATCHDOG_DEAD_COUNT (WATCHDOG_DEAD_TIME/WATCHDOG_INTERVAL) -#define WATCHDOG_LOW_HASH 1.0 /* consider < 1MH too low for any device */ static void *watchdog_thread(void __maybe_unused *userdata) { @@ -4493,11 +4607,13 @@ static void *watchdog_thread(void __maybe_unused *userdata) struct timeval now; sleep(interval); - if (requests_queued() < opt_queue) - queue_request(NULL, false); + + discard_stale(); age_work(); + queue_request(NULL, false); + hashmeter(-1, &zero_tv, 0); #ifdef HAVE_CURSES @@ -4557,9 +4673,6 @@ static void *watchdog_thread(void __maybe_unused *userdata) struct cgpu_info *cgpu = devices[i]; struct thr_info *thr = cgpu->thr[0]; enum dev_enable *denable; - bool dev_count_well; - bool dev_count_sick; - bool dev_count_dead; char dev_str[8]; int gpu; @@ -4587,21 +4700,16 @@ static void *watchdog_thread(void __maybe_unused *userdata) if (thr->getwork || *denable == DEV_DISABLED) continue; - if (cgpu->rolling < WATCHDOG_LOW_HASH) - cgpu->low_count++; - else - cgpu->low_count = 0; - - dev_count_well = (cgpu->low_count < WATCHDOG_SICK_COUNT); - dev_count_sick = (cgpu->low_count > WATCHDOG_SICK_COUNT); - dev_count_dead = (cgpu->low_count > WATCHDOG_DEAD_COUNT); - - if (cgpu->status != LIFE_WELL && (now.tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME) && dev_count_well) { +#ifdef WANT_CPUMINE + if (!strcmp(cgpu->api->dname, "cpu")) + continue; +#endif + if (cgpu->status != LIFE_WELL && (now.tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME)) { if (cgpu->status != LIFE_INIT) applog(LOG_ERR, "%s: Recovered, declaring WELL!", dev_str); cgpu->status = LIFE_WELL; cgpu->device_last_well = time(NULL); - } else if (cgpu->status == LIFE_WELL && ((now.tv_sec - thr->last.tv_sec > WATCHDOG_SICK_TIME) || dev_count_sick)) { + } else if (cgpu->status == LIFE_WELL && (now.tv_sec - thr->last.tv_sec > WATCHDOG_SICK_TIME)) { thr->rolling = cgpu->rolling = 0; cgpu->status = LIFE_SICK; applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str); @@ -4620,7 +4728,7 @@ static void 
*watchdog_thread(void __maybe_unused *userdata) applog(LOG_ERR, "%s: Attempting to restart", dev_str); reinit_device(cgpu); } - } else if (cgpu->status == LIFE_SICK && ((now.tv_sec - thr->last.tv_sec > WATCHDOG_DEAD_TIME) || dev_count_dead)) { + } else if (cgpu->status == LIFE_SICK && (now.tv_sec - thr->last.tv_sec > WATCHDOG_DEAD_TIME)) { cgpu->status = LIFE_DEAD; applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str); gettimeofday(&thr->sick, NULL); @@ -4658,7 +4766,8 @@ static void print_summary(void) { struct timeval diff; int hours, mins, secs, i; - double utility, efficiency = 0.0; + double utility, efficiency = 0.0, displayed_hashes; + bool mhash_base = true; timersub(&total_tv_end, &total_tv_start, &diff); hours = diff.tv_sec / 3600; @@ -4677,8 +4786,14 @@ static void print_summary(void) applog(LOG_WARNING, "CPU hasher algorithm used: %s", algo_names[opt_algo]); #endif applog(LOG_WARNING, "Runtime: %d hrs : %d mins : %d secs", hours, mins, secs); + displayed_hashes = total_mhashes_done / total_secs; + if (displayed_hashes < 1) { + displayed_hashes *= 1000; + mhash_base = false; + } + if (total_secs) - applog(LOG_WARNING, "Average hashrate: %.1f Megahash/s", total_mhashes_done / total_secs); + applog(LOG_WARNING, "Average hashrate: %.1f %shash/s", displayed_hashes, mhash_base? "Mega" : "Kilo"); applog(LOG_WARNING, "Solved blocks: %d", found_blocks); applog(LOG_WARNING, "Queued work requests: %d", total_getworks); applog(LOG_WARNING, "Share submissions: %d", total_accepted + total_rejected); @@ -4814,7 +4929,7 @@ void add_pool_details(bool live, char *url, char *user, char *pass) /* Test the pool is not idle if we're live running, otherwise * it will be tested separately */ - pool->enabled = POOL_ENABLED; + enable_pool(pool); if (live && !pool_active(pool, false)) pool->idle = true; } @@ -5039,6 +5154,7 @@ int main(int argc, char *argv[]) bool pools_active = false; struct sigaction handler; struct thr_info *thr; + char *s; unsigned int k; int i, j; @@ -5057,9 +5173,7 @@ int main(int argc, char *argv[]) mutex_init(&hash_lock); mutex_init(&qd_lock); -#ifdef HAVE_CURSES - mutex_init(&curses_lock); -#endif + mutex_init(&console_lock); mutex_init(&control_lock); mutex_init(&sharelog_lock); mutex_init(&ch_lock); @@ -5089,7 +5203,9 @@ int main(int argc, char *argv[]) opt_kernel_path = alloca(PATH_MAX); strcpy(opt_kernel_path, CGMINER_PREFIX); cgminer_path = alloca(PATH_MAX); - strcpy(cgminer_path, dirname(argv[0])); + s = strdup(argv[0]); + strcpy(cgminer_path, dirname(s)); + free(s); strcat(cgminer_path, "/"); #ifdef WANT_CPUMINE // Hack to make cgminer silent when called recursively on WIN32 @@ -5142,7 +5258,7 @@ int main(int argc, char *argv[]) strcpy(pool->rpc_url, "Benchmark"); pool->rpc_user = pool->rpc_url; pool->rpc_pass = pool->rpc_url; - pool->enabled = POOL_ENABLED; + enable_pool(pool); pool->idle = false; successful_connect = true; } @@ -5181,6 +5297,11 @@ int main(int argc, char *argv[]) opt_log_output = true; #ifdef WANT_CPUMINE +#ifdef USE_SCRYPT + if (opt_scrypt) + set_scrypt_algo(&opt_algo); + else +#endif if (0 <= opt_bench_algo) { double rate = bench_algo_stage3(opt_bench_algo); @@ -5380,7 +5501,7 @@ int main(int argc, char *argv[]) for (i = 0; i < total_pools; i++) { struct pool *pool = pools[i]; - pool->enabled = POOL_ENABLED; + enable_pool(pool); pool->idle = true; } @@ -5522,8 +5643,6 @@ begin_bench: thr = &thr_info[api_thr_id]; if (thr_info_create(thr, NULL, api_thread, thr)) quit(1, "API thread create failed"); - 
pthread_detach(thr->pth); - #ifdef HAVE_CURSES /* Create curses input thread for keyboard input. Create this last so diff --git a/configure.ac b/configure.ac index 08449fea..abc6d9fc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,8 +1,8 @@ ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [2]) -m4_define([v_min], [5]) -m4_define([v_mic], [0]) +m4_define([v_min], [6]) +m4_define([v_mic], [4]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_ver], [v_maj.v_min.v_mic]) m4_define([lt_rev], m4_eval(v_maj + v_min)) @@ -173,6 +173,8 @@ AC_ARG_ENABLE([adl], [adl=$enableval] ) +scrypt="no" + if test "$found_opencl" = 1; then if test "x$adl" != xno; then AC_CHECK_FILE([ADL_SDK/adl_sdk.h], have_adl=true, have_adl=false,) @@ -183,10 +185,20 @@ if test "$found_opencl" = 1; then DLOPEN_FLAGS="" fi fi + + AC_ARG_ENABLE([scrypt], + [AC_HELP_STRING([--enable-scrypt],[Compile support for scrypt litecoin mining (default disabled)])], + [scrypt=$enableval] + ) + if test "x$scrypt" = xyes; then + AC_DEFINE([USE_SCRYPT], [1], [Defined to 1 if scrypt support is wanted]) + fi else DLOPEN_FLAGS="" fi +AM_CONDITIONAL([HAS_SCRYPT], [test x$scrypt = xyes]) + bitforce="no" AC_ARG_ENABLE([bitforce], @@ -377,10 +389,11 @@ fi AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to cgminer install]) -AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120223"], [Filename for phatk kernel]) -AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120327"], [Filename for poclbm kernel]) -AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120427"], [Filename for diakgcn kernel]) -AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120328"], [Filename for diablo kernel]) +AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120724"], [Filename for phatk kernel]) +AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120724"], [Filename for poclbm kernel]) +AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120724"], [Filename for diakgcn kernel]) +AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120724"], [Filename for diablo kernel]) +AC_DEFINE_UNQUOTED([SCRYPT_KERNNAME], ["scrypt120724"], [Filename for scrypt kernel]) AC_SUBST(OPENCL_LIBS) @@ -425,17 +438,25 @@ echo " curses.TUI...........: $cursesmsg" if test "x$opencl" != xno; then if test $found_opencl = 1; then echo " OpenCL...............: FOUND. GPU mining support enabled" + if test "x$scrypt" != xno; then + echo " scrypt...............: Enabled" + else + echo " scrypt...............: Disabled" + fi + else echo " OpenCL...............: NOT FOUND. GPU mining support DISABLED" if test "x$cpumining$bitforce$icarus$ztex$modminer" = xnonononono; then AC_MSG_ERROR([No mining configured in]) fi + echo " scrypt...............: Disabled (needs OpenCL)" fi else echo " OpenCL...............: Detection overrided. GPU mining support DISABLED" if test "x$cpumining$bitforce$icarus$ztex$modminer" = xnonononono; then AC_MSG_ERROR([No mining configured in]) fi + echo " scrypt...............: Disabled (needs OpenCL)" fi if test "x$adl" != xno; then diff --git a/debian/changelog b/debian/changelog index 85339a96..df3ef2c3 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,112 +1,146 @@ +cgminer (2.6.2-1) precise; urgency=low + Version 2.6.2 - August 3, 2012 + + * Scrypt mining does not support block testing yet so don't try to print it. 
+ * Clear the bitforce buffer whenever we get an unexpected result as it has + likely throttled and we are getting cached responses out of order, and use the + temperature monitoring as a kind of watchdog to flush unexpected results. + * It is not critical getting the temperature response in bitforce so don't + mandatorily wait on the mutex lock. + * Check there is a cutoff temp actually set in bitforce before using it as a cut + off value otherwise it may think it's set to zero degrees. + * We dropped the temporary stopping of curl recruiting on submit_fail by + mistake, reinstate it. + * Make threads report in either side of the scanhash function in case we miss + reporting in when restarting work. + * Don't make mandatory work and its clones last forever. + * Make test work for pool_active mandatory work items to smooth out staged work + counts when in failover-only mode. + * Add debugging output when work is found stale as to why. + * Print the 3 parameters that are passed to applog for a debug line in + bitforce.c + * Clear bitforce buffer on init as previously. + * Add some headroom to the number of curls available per pool to allow for + longpoll and sendwork curls. + * Revert "Revert "Change BFL driver thread initialising to a constant 100ms + delay between devices instead of a random arrangement."" + * Revert "Remove bitforce_thread_init" + * Show the correct base units on GPU summary. + * Differentiate between the send return value being a bool and the get return + value when managing them in bitforce scanhash. + * 23a8c60 Revert "bitforce: Skip out of sending work if work restart requested" + + -- nushor Fri, 03 Aug 2012 11:27:44 -0500 + cgminer (2.4.2-1) stable; urgency=medium Version 2.4.2 - June 2, 2012 - - API.class compiled with Java SE 6.0_03 - works with Win7x64 - - miner.php highlight devs too slow finding shares (possibly failing) - - API update version to V1.11 and document changes - - API save default config file if none specified - - api.c save success incorrectly returns error - - api.c replace BUFSIZ (linux/windows have different values) - - Move RPC API content out of README to API-README - - Open a longpoll connection if a pool is in the REJECTING state as it's the + * API.class compiled with Java SE 6.0_03 - works with Win7x64 + * miner.php highlight devs too slow finding shares (possibly failing) + * API update version to V1.11 and document changes + * API save default config file if none specified + * api.c save success incorrectly returns error + * api.c replace BUFSIZ (linux/windows have different values) + * Move RPC API content out of README to API-README + * Open a longpoll connection if a pool is in the REJECTING state as it's the only way to re-enable it automatically. - - Use only one longpoll as much as possible by using a pthread conditional + * Use only one longpoll as much as possible by using a pthread conditional broadcast that each longpoll thread waits on and checks if it's the current pool before - - If shares are known stale, don't use them to decide to disable a pool for + * If shares are known stale, don't use them to decide to disable a pool for sequential rejects. - - Restarting cgminer from within after ADL has been corrupted only leads to a + * Restarting cgminer from within after ADL has been corrupted only leads to a crash. Display a warning only and disable fanspeed monitoring. 
- - Icarus: fix abort calculation/allow user specified abort - - Icarus: make --icarus-timing hidden and document it in FPGA-README - - Icarus: high accuracy timing and other bitstream speed support - - add-MIPSEB-to-icarus-for-BIG_ENDIAN - - work_decode only needs swab32 on midstate under BIG ENDIAN - - add compile command to api-example.c - - save config bugfix: writing an extra ',' when no gpus - - Add dpkg-source commits + * Icarus: fix abort calculation/allow user specified abort + * Icarus: make --icarus-timing hidden and document it in FPGA-README + * Icarus: high accuracy timing and other bitstream speed support + * add-MIPSEB-to-icarus-for-BIG_ENDIAN + * work_decode only needs swab32 on midstate under BIG ENDIAN + * add compile command to api-example.c + * save config bugfix: writing an extra ',' when no gpus + * Add dpkg-source commits -- nushor Sun, 03 Jun 2012 22:02:03 -0500 cgminer (2.4.1-1) stable; urgency=low Version 2.4.1-1 - May 6, 2012 - - In the unlikely event of finding a block, display the block solved count with + * In the unlikely event of finding a block, display the block solved count with the pool it came from for auditing. - - Display the device summary on exit even if a device has been disabled. - - Use correct pool enabled enums in api.c. - - Import Debian packaging configs - - Ensure we test for a pool recovering from idle so long as it's not set to + * Display the device summary on exit even if a device has been disabled. + * Use correct pool enabled enums in api.c. + * Import Debian packaging configs + * Ensure we test for a pool recovering from idle so long as it's not set to disabled. - - Fix pool number display. - - Give cgminer -T message only if curses is in use. - - Reinit_adl is no longer used. - - API 'stats' allow devices to add their own stats also for testing/debug - - API add getwork stats to cgminer - accesable from API 'stats' - - Don't initialise variables to zero when in global scope since they're already + * Fix pool number display. + * Give cgminer -T message only if curses is in use. + * Reinit_adl is no longer used. + * API 'stats' allow devices to add their own stats also for testing/debug + * API add getwork stats to cgminer - accesable from API 'stats' + * Don't initialise variables to zero when in global scope since they're already initialised. - - Get rid of unitialised variable warning when it's false. - - Move a pool to POOL_REJECTING to be disabled only after 3 minutes of + * Get rid of unitialised variable warning when it's false. + * Move a pool to POOL_REJECTING to be disabled only after 3 minutes of continuous rejected shares. - - Some tweaks to reporting and logging. - - Change FPGA detection order since BFL hangs on an ICA - - API support new pool status - - Add a temporarily disabled state for enabled pools called POOL_REJECTING and + * Some tweaks to reporting and logging. + * Change FPGA detection order since BFL hangs on an ICA + * API support new pool status + * Add a temporarily disabled state for enabled pools called POOL_REJECTING and use the work from each longpoll to help determine when a rejecting pool has started working again. Switch pools based on the multipool strategy once a pool is re-enabled. - - Removing extra debug - - Fix the benchmark feature by bypassing the new networking code. - - Reset sequential reject counter after a pool is disabled for when it is + * Removing extra debug + * Fix the benchmark feature by bypassing the new networking code. 
+ * Reset sequential reject counter after a pool is disabled for when it is re-enabled. - - Icarus - correct MH/s and U: with work restart set at 8 seconds - - ztex updateFreq was always reporting on fpga 0 - - Trying harder to get 1.15y working - - Specifying threads on multi fpga boards extra cgpu - - Missing the add cgpu per extra fpga on 1.15y boards - - API add last share time to each pool - - Don't try to reap curls if benchmarking is enabled. + * Icarus - correct MH/s and U: with work restart set at 8 seconds + * ztex updateFreq was always reporting on fpga 0 + * Trying harder to get 1.15y working + * Specifying threads on multi fpga boards extra cgpu + * Missing the add cgpu per extra fpga on 1.15y boards + * API add last share time to each pool + * Don't try to reap curls if benchmarking is enabled. -- nushor Sun, 06 May 2012 11:09:46 -0500 cgminer (2.4.0-1) stable; urgency=low Version 2.4.0 - May 3, 2012 - - Only show longpoll warning once when it has failed. - - Convert hashes to an unsigned long long as well. - - Detect pools that have issues represented by endless rejected shares and + * Only show longpoll warning once when it has failed. + * Convert hashes to an unsigned long long as well. + * Detect pools that have issues represented by endless rejected shares and disable them, with a parameter to optionally disable this feature. - - Bugfix: Use a 64-bit type for hashes_done (miner_thread) since it can overflow + * Bugfix: Use a 64-bit type for hashes_done (miner_thread) since it can overflow 32-bit on some FPGAs - - Implement an older header fix for a label existing before the pthread_cleanup + * Implement an older header fix for a label existing before the pthread_cleanup macro. - - Limit the number of curls we recruit on communication failures and with + * Limit the number of curls we recruit on communication failures and with delaynet enabled to 5 by maintaining a per-pool curl count, and using a pthread conditional that wakes up when one is returned to the ring buffer. - - Generalise add_pool() functions since they're repeated in add_pool_details. - - Bugfix: Return failure, rather than quit, if BFwrite fails - - Disable failing devices such that the user can attempt to re-enable them - - Bugfix: thread_shutdown shouldn't try to free the device, since it's needed + * Generalise add_pool() functions since they're repeated in add_pool_details. + * Bugfix: Return failure, rather than quit, if BFwrite fails + * Disable failing devices such that the user can attempt to re-enable them + * Bugfix: thread_shutdown shouldn't try to free the device, since it's needed afterward - - API bool's and 1TBS fixes - - Icarus - minimise code delays and name timer variables - - api.c V1.9 add 'restart' + redesign 'quit' so thread exits cleanly - - api.c bug - remove extra ']'s in notify command - - Increase pool watch interval to 30 seconds. - - Reap curls that are unused for over a minute. This allows connections to be + * API bool's and 1TBS fixes + * Icarus - minimise code delays and name timer variables + * api.c V1.9 add 'restart' + redesign 'quit' so thread exits cleanly + * api.c bug - remove extra ']'s in notify command + * Increase pool watch interval to 30 seconds. + * Reap curls that are unused for over a minute. This allows connections to be closed, thereby allowing the number of curl handles to always be the minimum necessary to not delay networking. 
- - Use the ringbuffer of curls from the same pool for submit as well as getwork + * Use the ringbuffer of curls from the same pool for submit as well as getwork threads. Since the curl handles were already connected to the same pool and are immediately available, share submission will not be delayed by getworks. - - Implement a scaleable networking framework designed to cope with any sized + * Implement a scaleable networking framework designed to cope with any sized network requirements, yet minimise the number of connections being reopened. Do this by create a ring buffer linked list of curl handles to be used by getwork, recruiting extra handles when none is immediately available. - - There is no need for the submit and getwork curls to be tied to the pool + * There is no need for the submit and getwork curls to be tied to the pool struct. - - Do not recruit extra connection threads if there have been connection errors + * Do not recruit extra connection threads if there have been connection errors to the pool in question. - - We should not retry submitting shares indefinitely or we may end up with a + * We should not retry submitting shares indefinitely or we may end up with a huge backlog during network outages, so discard stale shares if we failed to submit them and they've become stale in the interim. @@ -114,32 +148,32 @@ cgminer (2.4.0-1) stable; urgency=low cgminer (2.3.6-3) stable; urgency=low Version 2.3.6-3 - may 3, 2012 - - More bug fixes, Pre 2.4.1 release. + * More bug fixes, Pre 2.4.1 release. -- nushor Thurs, 03 May 2012 00:36:50 -0500 cgminer (2.3.6-2) stable; urgency=low Version 2.3.6-2 - May 2, 2012 - - Various bug fixes, latest build from repository. + * Various bug fixes, latest build from repository. -- nushor Wed, 02 May 2012 18:17:49 -0500 cgminer (2.3.6-1) stable; urgency=low Version 2.3.6 - April 29, 2012 - - Shorten stale share messages slightly. - - Protect the freeing of current_hash under mutex_lock to prevent racing on it + * Shorten stale share messages slightly. + * Protect the freeing of current_hash under mutex_lock to prevent racing on it when set_curblock is hit concurrently. - - Change default behaviour to submitting stale, removing the --submit-stale + * Change default behaviour to submitting stale, removing the --submit-stale option and adding a --no-submit-stale option. - - Make sure to start the getwork and submit threads when a pool is added on the + * Make sure to start the getwork and submit threads when a pool is added on the fly. This fixes a crash when a pool is added to running cgminer and then switched to. - - Faster hardware can easily outstrip the speed we can get work and submit + * Faster hardware can easily outstrip the speed we can get work and submit shares when using only one connection per pool. - - Test the queued list to see if any get/submits are already queued and if they + * Test the queued list to see if any get/submits are already queued and if they are, start recruiting extra connections by generating new threads. - - This allows us to reuse network connections at low loads but recuit new open + * This allows us to reuse network connections at low loads but recuit new open connections as they're needed, so that cgminer can scale to hardware of any size. 
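The "Show the correct base units" item corresponds to the hashmeter/print_summary changes earlier in this patch: when the average rate falls below 1 MH/s (typical for scrypt or CPU mining) the value is scaled up by 1000 and reported in KH/s instead of printing as 0.0. A minimal sketch of that scaling, with hypothetical names and assuming the rate is tracked internally in MH/s as in the patch:

#include <stdio.h>

/* Pick a display base for a hashrate kept internally in MH/s.  Rates
 * below 1 MH/s are shown in KH/s; names and threshold are illustrative. */
static void demo_show_hashrate(double mhashes_done, double secs)
{
	double rate = mhashes_done / secs;
	const char *base = "M";

	if (rate < 1) {
		rate *= 1000;	/* MH/s -> KH/s */
		base = "K";
	}
	printf("(avg):%.1f %sh/s\n", rate, base);
}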
diff --git a/debian/patches/series b/debian/patches/series new file mode 100644 index 00000000..539cc484 --- /dev/null +++ b/debian/patches/series @@ -0,0 +1,3 @@ +v2.4.1 +v2.4.2 +v2.6.2 diff --git a/debian/patches/v2.6.2 b/debian/patches/v2.6.2 new file mode 100644 index 00000000..2223a00e --- /dev/null +++ b/debian/patches/v2.6.2 @@ -0,0 +1,1275 @@ +--- a/FPGA-README ++++ b/FPGA-README +@@ -16,7 +16,25 @@ + + Icarus + +-There is a hidden option in cgminer when Icarus support is compiled in: ++There are two hidden options in cgminer when Icarus support is compiled in: ++ ++--icarus-options Set specific FPGA board configurations - one set of values for all or comma separated ++ baud:work_division:fpga_count ++ ++ baud The Serial/USB baud rate - 115200 or 57600 only - default 115200 ++ work_division The fraction of work divided up for each FPGA chip - 1, 2, 4 or 8 ++ e.g. 2 means each FPGA does half the nonce range - default 2 ++ fpga_count The actual number of FPGA working - this would normally be the same ++ as work_division - range is from 1 up to 'work_division' ++ It defaults to the value of work_division - or 2 if you don't specify ++ work_division ++ ++If you define fewer comma seperated values than Icarus devices, the last values will be used ++for all extra devices ++ ++An example would be: --icarus-options 57600:2:1 ++This would mean: use 57600 baud, the FPGA board divides the work in half however ++only 1 FPGA actually runs on the board (e.g. like an early CM1 Icarus copy bitstream) + + --icarus-timing Set how the Icarus timing is calculated - one setting/value for all or comma separated + default[=N] Use the default Icarus hash time (2.6316ns) +@@ -24,6 +42,9 @@ + long Re-calculate the hash time continuously + value[=N] Specify the hash time in nanoseconds (e.g. 2.6316) and abort time (e.g. 2.6316=80) + ++If you define fewer comma seperated values than Icarus devices, the last values will be used ++for all extra devices ++ + Icarus timing is required for devices that do not exactly match a default Icarus Rev3 in + processing speed + If you have an Icarus Rev3 you should not normally need to use --icarus-timing since the +@@ -55,9 +76,9 @@ + 'short' mode and take note of the final hash time value (Hs) calculated + You can also use the RPC API 'stats' command to see the current hash time (Hs) at any time + +-The Icarus code currently only works with a dual FPGA device that supports the same commands as ++The Icarus code currently only works with an FPGA device that supports the same commands as + Icarus Rev3 requires and also is less than ~840MH/s and greater than 2MH/s +-If a dual FPGA device does hash faster than ~840MH/s it should work correctly if you supply the ++If an FPGA device does hash faster than ~840MH/s it should work correctly if you supply the + correct hash time nanoseconds value + + The timing code itself will affect the Icarus performance since it increases the delay after +--- a/NEWS ++++ b/NEWS +@@ -1,7 +1,42 @@ ++Version 2.6.2 - August 3, 2012 ++ ++- Scrypt mining does not support block testing yet so don't try to print it. ++- Clear the bitforce buffer whenever we get an unexpected result as it has ++likely throttled and we are getting cached responses out of order, and use the ++temperature monitoring as a kind of watchdog to flush unexpected results. ++- It is not critical getting the temperature response in bitforce so don't ++mandatorily wait on the mutex lock. 
++- Check there is a cutoff temp actually set in bitforce before using it as a cut ++off value otherwise it may think it's set to zero degrees. ++- We dropped the temporary stopping of curl recruiting on submit_fail by ++mistake, reinstate it. ++- Make threads report in either side of the scanhash function in case we miss ++reporting in when restarting work. ++- Don't make mandatory work and its clones last forever. ++- Make test work for pool_active mandatory work items to smooth out staged work ++counts when in failover-only mode. ++- Add debugging output when work is found stale as to why. ++- Print the 3 parameters that are passed to applog for a debug line in ++bitforce.c ++- Clear bitforce buffer on init as previously. ++- Add some headroom to the number of curls available per pool to allow for ++longpoll and sendwork curls. ++- Revert "Revert "Change BFL driver thread initialising to a constant 100ms ++delay between devices instead of a random arrangement."" ++- Revert "Remove bitforce_thread_init" ++- Show the correct base units on GPU summary. ++- Differentiate between the send return value being a bool and the get return ++value when managing them in bitforce scanhash. ++- 23a8c60 Revert "bitforce: Skip out of sending work if work restart requested" ++ ++ + Version 2.6.1 - July 30, 2012 + ++- Display scrypt as being built in as well. ++- Fix build warning about KL_SCRYPT when built without scrypt support. + - Remove the low hash count determinant of hardware being sick. A low hash rate +-can be for poor network connectivity or scrypt mining, neither of which a ++can be for poor network connectivity or scrypt mining, neither of which are due ++to a sick device. + - api.c poolpriority changes + + +--- a/cgminer.c ++++ b/cgminer.c +@@ -142,6 +142,7 @@ + bool opt_api_network; + bool opt_delaynet; + bool opt_disable_pool = true; ++char *opt_icarus_options = NULL; + char *opt_icarus_timing = NULL; + + char *opt_kernel_path; +@@ -710,6 +711,13 @@ + } + + #ifdef USE_ICARUS ++static char *set_icarus_options(const char *arg) ++{ ++ opt_set_charp(arg, &opt_icarus_options); ++ ++ return NULL; ++} ++ + static char *set_icarus_timing(const char *arg) + { + opt_set_charp(arg, &opt_icarus_timing); +@@ -873,6 +881,9 @@ + "Override sha256 kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"), + #endif + #ifdef USE_ICARUS ++ OPT_WITH_ARG("--icarus-options", ++ set_icarus_options, NULL, NULL, ++ opt_hidden), + OPT_WITH_ARG("--icarus-timing", + set_icarus_timing, NULL, NULL, + opt_hidden), +@@ -1770,10 +1781,9 @@ + + if (!QUIET) { + hash32 = (uint32_t *)(work->hash); +- if (opt_scrypt) { +- sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[7]), (unsigned long)(hash32[6]), +- work->block? " BLOCK!" : ""); +- } else { ++ if (opt_scrypt) ++ sprintf(hashshow, "%08lx.%08lx", (unsigned long)(hash32[7]), (unsigned long)(hash32[6])); ++ else { + sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]), + work->block? " BLOCK!" : ""); + } +@@ -2169,14 +2179,14 @@ + * network delays/outages. */ + static struct curl_ent *pop_curl_entry(struct pool *pool) + { +- int curl_limit = opt_delaynet ? 5 : mining_threads; ++ int curl_limit = opt_delaynet ? 
5 : mining_threads * 4 / 3; + struct curl_ent *ce; + + mutex_lock(&pool->pool_lock); + if (!pool->curls) + recruit_curl(pool); + else if (list_empty(&pool->curlring)) { +- if (pool->curls >= curl_limit) ++ if (pool->submit_fail || pool->curls >= curl_limit) + pthread_cond_wait(&pool->cr_cond, &pool->pool_lock); + else + recruit_curl(pool); +@@ -2278,9 +2288,6 @@ + struct pool *pool; + int getwork_delay; + +- if (work->mandatory) +- return false; +- + if (share) { + /* Technically the rolltime should be correct but some pools + * advertise a broken expire= that is lower than a meaningful +@@ -2306,14 +2313,20 @@ + work_expiry = 5; + + gettimeofday(&now, NULL); +- if ((now.tv_sec - work->tv_staged.tv_sec) >= work_expiry) ++ if ((now.tv_sec - work->tv_staged.tv_sec) >= work_expiry) { ++ applog(LOG_DEBUG, "Work stale due to expiry"); + return true; ++ } + +- if (work->work_block != work_block) ++ if (work->work_block != work_block) { ++ applog(LOG_DEBUG, "Work stale due to block mismatch"); + return true; ++ } + +- if (opt_fail_only && !share && pool != current_pool() && pool->enabled != POOL_REJECTING) ++ if (opt_fail_only && !share && pool != current_pool() && !work->mandatory) { ++ applog(LOG_DEBUG, "Work stale due to fail only pool mismatch"); + return true; ++ } + + return false; + } +@@ -3011,6 +3024,8 @@ + fprintf(fcfg, ",\n\"api-description\" : \"%s\"", opt_api_description); + if (opt_api_groups) + fprintf(fcfg, ",\n\"api-groups\" : \"%s\"", opt_api_groups); ++ if (opt_icarus_options) ++ fprintf(fcfg, ",\n\"icarus-options\" : \"%s\"", opt_icarus_options); + if (opt_icarus_timing) + fprintf(fcfg, ",\n\"icarus-timing\" : \"%s\"", opt_icarus_timing); + fputs("\n}", fcfg); +@@ -3584,6 +3599,7 @@ + struct work *work = make_work(); + bool rc; + ++ work->mandatory = true; + rc = work_decode(json_object_get(val, "result"), work); + if (rc) { + applog(LOG_DEBUG, "Successfully retrieved and deciphered work from pool %u %s", +@@ -3835,6 +3851,7 @@ + memcpy(work_clone, work, sizeof(struct work)); + work_clone->clone = true; + work_clone->longpoll = false; ++ work_clone->mandatory = false; + /* Make cloned work appear slightly older to bias towards keeping the + * master work item which can be further rolled */ + work_clone->tv_staged.tv_sec -= 1; +@@ -4192,7 +4209,9 @@ + } + pool_stats->getwork_calls++; + ++ thread_reportin(mythr); + hashes = api->scanhash(mythr, work, work->blk.nonce + max_nonce); ++ thread_reportin(mythr); + + gettimeofday(&getwork_start, NULL); + +--- a/configure.ac ++++ b/configure.ac +@@ -2,7 +2,7 @@ + ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## + m4_define([v_maj], [2]) + m4_define([v_min], [6]) +-m4_define([v_mic], [1]) ++m4_define([v_mic], [2]) + ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## + m4_define([v_ver], [v_maj.v_min.v_mic]) + m4_define([lt_rev], m4_eval(v_maj + v_min)) +--- a/debian/changelog ++++ b/debian/changelog +@@ -1,112 +1,151 @@ ++cgminer (2.4.3-1) stable; urgency=medium ++ Version 2.4.3 - June 14, 2012 ++ ++ * can_roll and should_roll should have no bearing on the cycle period within the ++ miner_thread so remove it. ++ * Check for strategy being changed to load balance when enabling LPs. ++ * Check that all threads on the device that called get_work are waiting on getwork ++ before considering the pool lagging. ++ * Iterate over each thread belonging to each device in the hashmeter instead of ++ searching for them now that they're a list. 
++ * When using rotate pool strategy, ensure we only select from alive enabled pools. ++ * Start longpoll from every pool when load balance strategy is in use. ++ * Add mandatory and block fields to the work struct. Flag any shares that are ++ detected as blocks as mandatory to submit, along with longpoll work from a previously ++ rejecting pool. ++ * Consider the fan optimal if fanspeed is dropping but within the optimal speed window. ++ * Fix typo in some API messages (succeess/success) ++ * api.c MMQ stat bugs ++ * Bugfix: Fix warnings when built without libudev support ++ * Bugfix: slay a variety of warnings ++ * Bugfix: modminer: Fix unsigned/signed comparison and similar warnings ++ * API add ModMinerQuad support ++ * Bugfix: Honour forceauto parameter in serial_detect functions ++ * modminer: Temperature sensor improvements ++ * modminer: Make log messages more consistent in format ++ * Only adjust GPU speed up if the fanspeed is within the normal fanrange and hasn't been ++ turned to maximum speed under overheat conditions. ++ * ModMiner use valid .name ++ * New driver: BTCFPGA ModMiner ++ * Abstract generally useful FPGA code into fpgautils.c ++ * API add stats for pool getworks ++ * miner.php option to hide specific fields from the display ++ * miner.php add version numbers to the summary page ++ * Update debian configs to v2.4.2 ++ * Add API and FPGA READMEs into Makefile to be included in source distribution. ++ * Icarus - fix unit64_t printf warnings ++ ++ -- nushor Fri, 15 Jun 2012 11:31:51 -0500 ++ + cgminer (2.4.2-1) stable; urgency=medium + Version 2.4.2 - June 2, 2012 + +- - API.class compiled with Java SE 6.0_03 - works with Win7x64 +- - miner.php highlight devs too slow finding shares (possibly failing) +- - API update version to V1.11 and document changes +- - API save default config file if none specified +- - api.c save success incorrectly returns error +- - api.c replace BUFSIZ (linux/windows have different values) +- - Move RPC API content out of README to API-README +- - Open a longpoll connection if a pool is in the REJECTING state as it's the ++ * API.class compiled with Java SE 6.0_03 - works with Win7x64 ++ * miner.php highlight devs too slow finding shares (possibly failing) ++ * API update version to V1.11 and document changes ++ * API save default config file if none specified ++ * api.c save success incorrectly returns error ++ * api.c replace BUFSIZ (linux/windows have different values) ++ * Move RPC API content out of README to API-README ++ * Open a longpoll connection if a pool is in the REJECTING state as it's the + only way to re-enable it automatically. +- - Use only one longpoll as much as possible by using a pthread conditional ++ * Use only one longpoll as much as possible by using a pthread conditional + broadcast that each longpoll thread waits on and checks if it's the current pool + before +- - If shares are known stale, don't use them to decide to disable a pool for ++ * If shares are known stale, don't use them to decide to disable a pool for + sequential rejects. +- - Restarting cgminer from within after ADL has been corrupted only leads to a ++ * Restarting cgminer from within after ADL has been corrupted only leads to a + crash. Display a warning only and disable fanspeed monitoring. 
+- - Icarus: fix abort calculation/allow user specified abort +- - Icarus: make --icarus-timing hidden and document it in FPGA-README +- - Icarus: high accuracy timing and other bitstream speed support +- - add-MIPSEB-to-icarus-for-BIG_ENDIAN +- - work_decode only needs swab32 on midstate under BIG ENDIAN +- - add compile command to api-example.c +- - save config bugfix: writing an extra ',' when no gpus +- - Add dpkg-source commits ++ * Icarus: fix abort calculation/allow user specified abort ++ * Icarus: make --icarus-timing hidden and document it in FPGA-README ++ * Icarus: high accuracy timing and other bitstream speed support ++ * add-MIPSEB-to-icarus-for-BIG_ENDIAN ++ * work_decode only needs swab32 on midstate under BIG ENDIAN ++ * add compile command to api-example.c ++ * save config bugfix: writing an extra ',' when no gpus ++ * Add dpkg-source commits + + -- nushor Sun, 03 Jun 2012 22:02:03 -0500 + + cgminer (2.4.1-1) stable; urgency=low + Version 2.4.1-1 - May 6, 2012 +- - In the unlikely event of finding a block, display the block solved count with ++ * In the unlikely event of finding a block, display the block solved count with + the pool it came from for auditing. +- - Display the device summary on exit even if a device has been disabled. +- - Use correct pool enabled enums in api.c. +- - Import Debian packaging configs +- - Ensure we test for a pool recovering from idle so long as it's not set to ++ * Display the device summary on exit even if a device has been disabled. ++ * Use correct pool enabled enums in api.c. ++ * Import Debian packaging configs ++ * Ensure we test for a pool recovering from idle so long as it's not set to + disabled. +- - Fix pool number display. +- - Give cgminer -T message only if curses is in use. +- - Reinit_adl is no longer used. +- - API 'stats' allow devices to add their own stats also for testing/debug +- - API add getwork stats to cgminer - accesable from API 'stats' +- - Don't initialise variables to zero when in global scope since they're already ++ * Fix pool number display. ++ * Give cgminer -T message only if curses is in use. ++ * Reinit_adl is no longer used. ++ * API 'stats' allow devices to add their own stats also for testing/debug ++ * API add getwork stats to cgminer - accesable from API 'stats' ++ * Don't initialise variables to zero when in global scope since they're already + initialised. +- - Get rid of unitialised variable warning when it's false. +- - Move a pool to POOL_REJECTING to be disabled only after 3 minutes of ++ * Get rid of unitialised variable warning when it's false. ++ * Move a pool to POOL_REJECTING to be disabled only after 3 minutes of + continuous rejected shares. +- - Some tweaks to reporting and logging. +- - Change FPGA detection order since BFL hangs on an ICA +- - API support new pool status +- - Add a temporarily disabled state for enabled pools called POOL_REJECTING and ++ * Some tweaks to reporting and logging. ++ * Change FPGA detection order since BFL hangs on an ICA ++ * API support new pool status ++ * Add a temporarily disabled state for enabled pools called POOL_REJECTING and + use the work from each longpoll to help determine when a rejecting pool has + started working again. Switch pools based on the multipool strategy once a pool + is re-enabled. +- - Removing extra debug +- - Fix the benchmark feature by bypassing the new networking code. 
+- - Reset sequential reject counter after a pool is disabled for when it is ++ * Removing extra debug ++ * Fix the benchmark feature by bypassing the new networking code. ++ * Reset sequential reject counter after a pool is disabled for when it is + re-enabled. +- - Icarus - correct MH/s and U: with work restart set at 8 seconds +- - ztex updateFreq was always reporting on fpga 0 +- - Trying harder to get 1.15y working +- - Specifying threads on multi fpga boards extra cgpu +- - Missing the add cgpu per extra fpga on 1.15y boards +- - API add last share time to each pool +- - Don't try to reap curls if benchmarking is enabled. ++ * Icarus - correct MH/s and U: with work restart set at 8 seconds ++ * ztex updateFreq was always reporting on fpga 0 ++ * Trying harder to get 1.15y working ++ * Specifying threads on multi fpga boards extra cgpu ++ * Missing the add cgpu per extra fpga on 1.15y boards ++ * API add last share time to each pool ++ * Don't try to reap curls if benchmarking is enabled. + + -- nushor Sun, 06 May 2012 11:09:46 -0500 + + cgminer (2.4.0-1) stable; urgency=low + Version 2.4.0 - May 3, 2012 + +- - Only show longpoll warning once when it has failed. +- - Convert hashes to an unsigned long long as well. +- - Detect pools that have issues represented by endless rejected shares and ++ * Only show longpoll warning once when it has failed. ++ * Convert hashes to an unsigned long long as well. ++ * Detect pools that have issues represented by endless rejected shares and + disable them, with a parameter to optionally disable this feature. +- - Bugfix: Use a 64-bit type for hashes_done (miner_thread) since it can overflow ++ * Bugfix: Use a 64-bit type for hashes_done (miner_thread) since it can overflow + 32-bit on some FPGAs +- - Implement an older header fix for a label existing before the pthread_cleanup ++ * Implement an older header fix for a label existing before the pthread_cleanup + macro. +- - Limit the number of curls we recruit on communication failures and with ++ * Limit the number of curls we recruit on communication failures and with + delaynet enabled to 5 by maintaining a per-pool curl count, and using a pthread + conditional that wakes up when one is returned to the ring buffer. +- - Generalise add_pool() functions since they're repeated in add_pool_details. +- - Bugfix: Return failure, rather than quit, if BFwrite fails +- - Disable failing devices such that the user can attempt to re-enable them +- - Bugfix: thread_shutdown shouldn't try to free the device, since it's needed ++ * Generalise add_pool() functions since they're repeated in add_pool_details. ++ * Bugfix: Return failure, rather than quit, if BFwrite fails ++ * Disable failing devices such that the user can attempt to re-enable them ++ * Bugfix: thread_shutdown shouldn't try to free the device, since it's needed + afterward +- - API bool's and 1TBS fixes +- - Icarus - minimise code delays and name timer variables +- - api.c V1.9 add 'restart' + redesign 'quit' so thread exits cleanly +- - api.c bug - remove extra ']'s in notify command +- - Increase pool watch interval to 30 seconds. +- - Reap curls that are unused for over a minute. This allows connections to be ++ * API bool's and 1TBS fixes ++ * Icarus - minimise code delays and name timer variables ++ * api.c V1.9 add 'restart' + redesign 'quit' so thread exits cleanly ++ * api.c bug - remove extra ']'s in notify command ++ * Increase pool watch interval to 30 seconds. ++ * Reap curls that are unused for over a minute. 
This allows connections to be + closed, thereby allowing the number of curl handles to always be the minimum + necessary to not delay networking. +- - Use the ringbuffer of curls from the same pool for submit as well as getwork ++ * Use the ringbuffer of curls from the same pool for submit as well as getwork + threads. Since the curl handles were already connected to the same pool and are + immediately available, share submission will not be delayed by getworks. +- - Implement a scaleable networking framework designed to cope with any sized ++ * Implement a scaleable networking framework designed to cope with any sized + network requirements, yet minimise the number of connections being reopened. Do + this by create a ring buffer linked list of curl handles to be used by getwork, + recruiting extra handles when none is immediately available. +- - There is no need for the submit and getwork curls to be tied to the pool ++ * There is no need for the submit and getwork curls to be tied to the pool + struct. +- - Do not recruit extra connection threads if there have been connection errors ++ * Do not recruit extra connection threads if there have been connection errors + to the pool in question. +- - We should not retry submitting shares indefinitely or we may end up with a ++ * We should not retry submitting shares indefinitely or we may end up with a + huge backlog during network outages, so discard stale shares if we failed to + submit them and they've become stale in the interim. + +@@ -114,32 +153,32 @@ + + cgminer (2.3.6-3) stable; urgency=low + Version 2.3.6-3 - may 3, 2012 +- - More bug fixes, Pre 2.4.1 release. ++ * More bug fixes, Pre 2.4.1 release. + + -- nushor Thurs, 03 May 2012 00:36:50 -0500 + + cgminer (2.3.6-2) stable; urgency=low + Version 2.3.6-2 - May 2, 2012 +- - Various bug fixes, latest build from repository. ++ * Various bug fixes, latest build from repository. + + -- nushor Wed, 02 May 2012 18:17:49 -0500 + + cgminer (2.3.6-1) stable; urgency=low + + Version 2.3.6 - April 29, 2012 +- - Shorten stale share messages slightly. +- - Protect the freeing of current_hash under mutex_lock to prevent racing on it ++ * Shorten stale share messages slightly. ++ * Protect the freeing of current_hash under mutex_lock to prevent racing on it + when set_curblock is hit concurrently. +- - Change default behaviour to submitting stale, removing the --submit-stale ++ * Change default behaviour to submitting stale, removing the --submit-stale + option and adding a --no-submit-stale option. +- - Make sure to start the getwork and submit threads when a pool is added on the ++ * Make sure to start the getwork and submit threads when a pool is added on the + fly. This fixes a crash when a pool is added to running cgminer and then + switched to. +- - Faster hardware can easily outstrip the speed we can get work and submit ++ * Faster hardware can easily outstrip the speed we can get work and submit + shares when using only one connection per pool. +- - Test the queued list to see if any get/submits are already queued and if they ++ * Test the queued list to see if any get/submits are already queued and if they + are, start recruiting extra connections by generating new threads. +- - This allows us to reuse network connections at low loads but recuit new open ++ * This allows us to reuse network connections at low loads but recuit new open + connections as they're needed, so that cgminer can scale to hardware of any + size. 
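The driver-bitforce.c hunk below implements the "don't mandatorily wait on the mutex lock" changelog item by using a trylock for the temperature poll, so a busy device mutex simply skips that reading rather than stalling the thread (cgminer's own mutex_trylock wrapper is what the actual hunk calls). A standalone sketch of the pattern in plain pthreads, with hypothetical names and a placeholder for the device query:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t demo_device_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Non-critical poll: if another thread holds the device mutex (e.g. it
 * is busy sending work), give up immediately instead of blocking.  The
 * next periodic poll will simply try again. */
static bool demo_poll_temperature(double *temp_out)
{
	if (pthread_mutex_trylock(&demo_device_mutex) != 0)
		return false;	/* lock busy: skip this reading */

	/* ... issue the temperature query and parse the reply here ... */
	*temp_out = 0.0;	/* placeholder result */

	pthread_mutex_unlock(&demo_device_mutex);
	return true;
}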
+ +--- a/driver-bitforce.c ++++ b/driver-bitforce.c +@@ -157,7 +157,7 @@ + return true; + } + +-static void biforce_clear_buffer(struct cgpu_info *bitforce) ++static void bitforce_clear_buffer(struct cgpu_info *bitforce) + { + int fdDev = bitforce->device_fd; + char pdevbuf[0x100]; +@@ -185,6 +185,8 @@ + + applog(LOG_WARNING, "BFL%i: Re-initialising", bitforce->device_id); + ++ bitforce_clear_buffer(bitforce); ++ + mutex_lock(&bitforce->device_mutex); + if (fdDev) { + BFclose(fdDev); +@@ -239,7 +241,11 @@ + if (!fdDev) + return false; + +- mutex_lock(&bitforce->device_mutex); ++ /* It is not critical getting temperature so don't get stuck if we ++ * can't grab the mutex here */ ++ if (mutex_trylock(&bitforce->device_mutex)) ++ return false; ++ + BFwrite(fdDev, "ZLX", 3); + BFgets(pdevbuf, sizeof(pdevbuf), fdDev); + mutex_unlock(&bitforce->device_mutex); +@@ -255,7 +261,7 @@ + + if (temp > 0) { + bitforce->temp = temp; +- if (temp > bitforce->cutofftemp) { ++ if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) { + applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id); + bitforce->deven = DEV_RECOVER; + +@@ -264,7 +270,15 @@ + bitforce->dev_thermal_cutoff_count++; + } + } ++ } else { ++ /* Use the temperature monitor as a kind of watchdog for when ++ * our responses are out of sync and flush the buffer to ++ * hopefully recover */ ++ applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer"); ++ bitforce_clear_buffer(bitforce); ++ return false;; + } ++ + return true; + } + +@@ -287,8 +301,7 @@ + BFgets(pdevbuf, sizeof(pdevbuf), fdDev); + if (!pdevbuf[0] || !strncasecmp(pdevbuf, "B", 1)) { + mutex_unlock(&bitforce->device_mutex); +- if (!restart_wait(WORK_CHECK_INTERVAL_MS)) +- return false; ++ nmsleep(WORK_CHECK_INTERVAL_MS); + goto re_send; + } else if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { + mutex_unlock(&bitforce->device_mutex); +@@ -300,6 +313,7 @@ + goto re_send; + } + applog(LOG_ERR, "BFL%i: Error: Send work reports: %s", bitforce->device_id, pdevbuf); ++ bitforce_clear_buffer(bitforce); + return false; + } + +@@ -340,6 +354,7 @@ + + if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { + applog(LOG_ERR, "BFL%i: Error: Send block data reports: %s", bitforce->device_id, pdevbuf); ++ bitforce_clear_buffer(bitforce); + return false; + } + +@@ -414,7 +429,7 @@ + } + + if (delay_time_ms != bitforce->sleep_ms) +- applog(LOG_DEBUG, "BFL%i: Wait time changed to: %d", bitforce->device_id, bitforce->sleep_ms, bitforce->wait_ms); ++ applog(LOG_DEBUG, "BFL%i: Wait time changed to: %d, waited %u", bitforce->device_id, bitforce->sleep_ms, bitforce->wait_ms); + + /* Work out the average time taken. 
Float for calculation, uint for display */ + bitforce->avg_wait_f += (tv_to_ms(elapsed) - bitforce->avg_wait_f) / TIME_AVG_CONSTANT; +@@ -428,6 +443,7 @@ + return 0; /* Device idle */ + else if (strncasecmp(pdevbuf, "NONCE-FOUND", 11)) { + applog(LOG_WARNING, "BFL%i: Error: Get result reports: %s", bitforce->device_id, pdevbuf); ++ bitforce_clear_buffer(bitforce); + return 0; + } + +@@ -475,9 +491,10 @@ + { + struct cgpu_info *bitforce = thr->cgpu; + unsigned int sleep_time; ++ bool send_ret; + int64_t ret; + +- ret = bitforce_send_work(thr, work); ++ send_ret = bitforce_send_work(thr, work); + + if (!bitforce->nonce_range) { + /* Initially wait 2/3 of the average cycle time so we can request more +@@ -503,8 +520,10 @@ + bitforce->wait_ms = sleep_time; + } + +- if (ret) ++ if (send_ret) + ret = bitforce_get_result(thr, work); ++ else ++ ret = -1; + + if (ret == -1) { + ret = 0; +@@ -513,7 +532,7 @@ + bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR; + bitforce->dev_comms_error_count++; + /* empty read buffer */ +- biforce_clear_buffer(bitforce); ++ bitforce_clear_buffer(bitforce); + } + return ret; + } +@@ -523,6 +542,20 @@ + return bitforce_get_temp(bitforce); + } + ++static bool bitforce_thread_init(struct thr_info *thr) ++{ ++ struct cgpu_info *bitforce = thr->cgpu; ++ unsigned int wait; ++ ++ /* Pause each new thread at least 100ms between initialising ++ * so the devices aren't making calls all at the same time. */ ++ wait = thr->id * MAX_START_DELAY_US; ++ applog(LOG_DEBUG, "BFL%i: Delaying start by %dms", bitforce->device_id, wait / 1000); ++ usleep(wait); ++ ++ return true; ++} ++ + static struct api_data *bitforce_api_stats(struct cgpu_info *cgpu) + { + struct api_data *root = NULL; +@@ -546,6 +579,7 @@ + .get_statline_before = get_bitforce_statline_before, + .get_stats = bitforce_get_stats, + .thread_prepare = bitforce_thread_prepare, ++ .thread_init = bitforce_thread_init, + .scanhash = bitforce_scanhash, + .thread_shutdown = bitforce_shutdown, + .thread_enable = biforce_thread_enable +--- a/driver-icarus.c ++++ b/driver-icarus.c +@@ -65,7 +65,7 @@ + #define ASSERT1(condition) __maybe_unused static char sizeof_uint32_t_must_be_4[(condition)?1:-1] + ASSERT1(sizeof(uint32_t) == 4); + +-#define ICARUS_READ_TIME ((double)ICARUS_READ_SIZE * (double)8.0 / (double)ICARUS_IO_SPEED) ++#define ICARUS_READ_TIME(baud) ((double)ICARUS_READ_SIZE * (double)8.0 / (double)(baud)) + + // Fraction of a second, USB timeout is measured in + // i.e. 10 means 1/10 of a second +@@ -176,11 +176,36 @@ + // (which will only affect W) + uint64_t history_count; + struct timeval history_time; ++ ++ // icarus-options ++ int baud; ++ int work_division; ++ int fpga_count; ++ uint32_t nonce_mask; + }; + ++#define END_CONDITION 0x0000ffff ++ + // One for each possible device + static struct ICARUS_INFO **icarus_info; + ++// Looking for options in --icarus-timing and --icarus-options: ++// ++// Code increments this each time we start to look at a device ++// However, this means that if other devices are checked by ++// the Icarus code (e.g. 
BFL) they will count in the option offset ++// ++// This, however, is deterministic so that's OK ++// ++// If we were to increment after successfully finding an Icarus ++// that would be random since an Icarus may fail and thus we'd ++// not be able to predict the option order ++// ++// This also assumes that serial_detect() checks them sequentially ++// and in the order specified on the command line ++// ++static int option_offset = -1; ++ + struct device_api icarus_api; + + static void rev(unsigned char *s, size_t l) +@@ -195,8 +220,8 @@ + } + } + +-#define icarus_open2(devpath, purge) serial_open(devpath, 115200, ICARUS_READ_FAULT_DECISECONDS, purge) +-#define icarus_open(devpath) icarus_open2(devpath, false) ++#define icarus_open2(devpath, baud, purge) serial_open(devpath, baud, ICARUS_READ_FAULT_DECISECONDS, purge) ++#define icarus_open(devpath, baud) icarus_open2(devpath, baud, false) + + static int icarus_gets(unsigned char *buf, int fd, struct timeval *tv_finish, struct thr_info *thr, int read_count) + { +@@ -272,7 +297,7 @@ + } + } + +-static void set_timing_mode(struct cgpu_info *icarus) ++static void set_timing_mode(int this_option_offset, struct cgpu_info *icarus) + { + struct ICARUS_INFO *info = icarus_info[icarus->device_id]; + double Hs; +@@ -285,7 +310,7 @@ + buf[0] = '\0'; + else { + ptr = opt_icarus_timing; +- for (i = 0; i < icarus->device_id; i++) { ++ for (i = 0; i < this_option_offset; i++) { + comma = strchr(ptr, ','); + if (comma == NULL) + break; +@@ -354,11 +379,123 @@ + + applog(LOG_DEBUG, "Icarus: Init: %d mode=%s read_count=%d Hs=%e", + icarus->device_id, timing_mode_str(info->timing_mode), info->read_count, info->Hs); ++} ++ ++static uint32_t mask(int work_division) ++{ ++ char err_buf[BUFSIZ+1]; ++ uint32_t nonce_mask = 0x7fffffff; + ++ // yes we can calculate these, but this way it's easy to see what they are ++ switch (work_division) { ++ case 1: ++ nonce_mask = 0xffffffff; ++ break; ++ case 2: ++ nonce_mask = 0x7fffffff; ++ break; ++ case 4: ++ nonce_mask = 0x3fffffff; ++ break; ++ case 8: ++ nonce_mask = 0x1fffffff; ++ break; ++ default: ++ sprintf(err_buf, "Invalid2 icarus-options for work_division (%d) must be 1, 2, 4 or 8", work_division); ++ quit(1, err_buf); ++ } ++ ++ return nonce_mask; ++} ++ ++static void get_options(int this_option_offset, int *baud, int *work_division, int *fpga_count) ++{ ++ char err_buf[BUFSIZ+1]; ++ char buf[BUFSIZ+1]; ++ char *ptr, *comma, *colon, *colon2; ++ size_t max; ++ int i, tmp; ++ ++ if (opt_icarus_options == NULL) ++ buf[0] = '\0'; ++ else { ++ ptr = opt_icarus_options; ++ for (i = 0; i < this_option_offset; i++) { ++ comma = strchr(ptr, ','); ++ if (comma == NULL) ++ break; ++ ptr = comma + 1; ++ } ++ ++ comma = strchr(ptr, ','); ++ if (comma == NULL) ++ max = strlen(ptr); ++ else ++ max = comma - ptr; ++ ++ if (max > BUFSIZ) ++ max = BUFSIZ; ++ strncpy(buf, ptr, max); ++ buf[max] = '\0'; ++ } ++ ++ *baud = ICARUS_IO_SPEED; ++ *work_division = 2; ++ *fpga_count = 2; ++ ++ if (*buf) { ++ colon = strchr(buf, ':'); ++ if (colon) ++ *(colon++) = '\0'; ++ ++ if (*buf) { ++ tmp = atoi(buf); ++ switch (tmp) { ++ case 115200: ++ *baud = 115200; ++ break; ++ case 57600: ++ *baud = 57600; ++ break; ++ default: ++ sprintf(err_buf, "Invalid icarus-options for baud (%s) must be 115200 or 57600", buf); ++ quit(1, err_buf); ++ } ++ } ++ ++ if (colon && *colon) { ++ colon2 = strchr(colon, ':'); ++ if (colon2) ++ *(colon2++) = '\0'; ++ ++ if (*colon) { ++ tmp = atoi(colon); ++ if (tmp == 1 || tmp == 2 || tmp == 4 || tmp == 8) { ++ 
*work_division = tmp; ++ *fpga_count = tmp; // default to the same ++ } else { ++ sprintf(err_buf, "Invalid icarus-options for work_division (%s) must be 1, 2, 4 or 8", colon); ++ quit(1, err_buf); ++ } ++ } ++ ++ if (colon2 && *colon2) { ++ tmp = atoi(colon2); ++ if (tmp > 0 && tmp <= *work_division) ++ *fpga_count = tmp; ++ else { ++ sprintf(err_buf, "Invalid icarus-options for fpga_count (%s) must be >0 and <=work_division (%d)", colon2, *work_division); ++ quit(1, err_buf); ++ } ++ } ++ } ++ } + } + + static bool icarus_detect_one(const char *devpath) + { ++ int this_option_offset = ++option_offset; ++ + struct ICARUS_INFO *info; + struct timeval tv_start, tv_finish; + int fd; +@@ -379,9 +516,13 @@ + unsigned char ob_bin[64], nonce_bin[ICARUS_READ_SIZE]; + char *nonce_hex; + ++ int baud, work_division, fpga_count; ++ ++ get_options(this_option_offset, &baud, &work_division, &fpga_count); ++ + applog(LOG_DEBUG, "Icarus Detect: Attempting to open %s", devpath); + +- fd = icarus_open2(devpath, true); ++ fd = icarus_open2(devpath, baud, true); + if (unlikely(fd == -1)) { + applog(LOG_ERR, "Icarus Detect: Failed to open %s", devpath); + return false; +@@ -429,6 +570,9 @@ + applog(LOG_INFO, "Found Icarus at %s, mark as %d", + devpath, icarus->device_id); + ++ applog(LOG_DEBUG, "Icarus: Init: %d baud=%d work_division=%d fpga_count=%d", ++ icarus->device_id, baud, work_division, fpga_count); ++ + // Since we are adding a new device on the end it needs to always be allocated + icarus_info[icarus->device_id] = (struct ICARUS_INFO *)malloc(sizeof(struct ICARUS_INFO)); + if (unlikely(!(icarus_info[icarus->device_id]))) +@@ -439,10 +583,15 @@ + // Initialise everything to zero for a new device + memset(info, 0, sizeof(struct ICARUS_INFO)); + +- info->golden_hashes = (golden_nonce_val & 0x7fffffff) << 1; ++ info->baud = baud; ++ info->work_division = work_division; ++ info->fpga_count = fpga_count; ++ info->nonce_mask = mask(work_division); ++ ++ info->golden_hashes = (golden_nonce_val & info->nonce_mask) * fpga_count; + timersub(&tv_finish, &tv_start, &(info->golden_tv)); + +- set_timing_mode(icarus); ++ set_timing_mode(this_option_offset, icarus); + + return true; + } +@@ -458,7 +607,7 @@ + + struct timeval now; + +- int fd = icarus_open(icarus->device_path); ++ int fd = icarus_open(icarus->device_path, icarus_info[icarus->device_id]->baud); + if (unlikely(-1 == fd)) { + applog(LOG_ERR, "Failed to open Icarus on %s", + icarus->device_path); +@@ -565,11 +714,9 @@ + + submit_nonce(thr, work, nonce); + +- hash_count = (nonce & 0x7fffffff); +- if (hash_count++ == 0x7fffffff) +- hash_count = 0xffffffff; +- else +- hash_count <<= 1; ++ hash_count = (nonce & info->nonce_mask); ++ hash_count++; ++ hash_count *= info->fpga_count; + + if (opt_debug || info->do_icarus_timing) + timersub(&tv_finish, &tv_start, &elapsed); +@@ -580,7 +727,9 @@ + } + + // ignore possible end condition values +- if (info->do_icarus_timing && (nonce & 0x7fffffff) > 0x000fffff && (nonce & 0x7fffffff) < 0x7ff00000) { ++ if (info->do_icarus_timing ++ && ((nonce & info->nonce_mask) > END_CONDITION) ++ && ((nonce & info->nonce_mask) < (info->nonce_mask & ~END_CONDITION))) { + gettimeofday(&tv_history_start, NULL); + + history0 = &(info->history[0]); +@@ -590,7 +739,7 @@ + + Ti = (double)(elapsed.tv_sec) + + ((double)(elapsed.tv_usec))/((double)1000000) +- - ICARUS_READ_TIME; ++ - ((double)ICARUS_READ_TIME(info->baud)); + Xi = (double)hash_count; + history0->sumXiTi += Xi * Ti; + history0->sumXi += Xi; +@@ -700,6 +849,9 @@ + root = 
api_add_uint(root, "timing_values", &(info->history[0].values), false); + root = api_add_const(root, "timing_mode", timing_mode_str(info->timing_mode), false); + root = api_add_bool(root, "is_timing", &(info->do_icarus_timing), false); ++ root = api_add_int(root, "baud", &(info->baud), false); ++ root = api_add_int(root, "work_division", &(info->work_division), false); ++ root = api_add_int(root, "fpga_count", &(info->fpga_count), false); + + return root; + } +--- a/driver-opencl.c ++++ b/driver-opencl.c +@@ -660,9 +660,19 @@ + + for (gpu = 0; gpu < nDevs; gpu++) { + struct cgpu_info *cgpu = &gpus[gpu]; ++ double displayed_rolling, displayed_total; ++ bool mhash_base = true; + +- wlog("GPU %d: %.1f / %.1f Mh/s | A:%d R:%d HW:%d U:%.2f/m I:%d\n", +- gpu, cgpu->rolling, cgpu->total_mhashes / total_secs, ++ displayed_rolling = cgpu->rolling; ++ displayed_total = cgpu->total_mhashes / total_secs; ++ if (displayed_rolling < 1) { ++ displayed_rolling *= 1000; ++ displayed_total *= 1000; ++ mhash_base = false; ++ } ++ ++ wlog("GPU %d: %.1f / %.1f %sh/s | A:%d R:%d HW:%d U:%.2f/m I:%d\n", ++ gpu, displayed_rolling, displayed_total, mhash_base ? "M" : "K", + cgpu->accepted, cgpu->rejected, cgpu->hw_errors, + cgpu->utility, cgpu->intensity); + #ifdef HAVE_ADL +@@ -710,7 +720,10 @@ + if (thr->cgpu != cgpu) + continue; + get_datestamp(checkin, &thr->last); +- wlog("Thread %d: %.1f Mh/s %s ", i, thr->rolling, cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled"); ++ displayed_rolling = thr->rolling; ++ if (!mhash_base) ++ displayed_rolling *= 1000; ++ wlog("Thread %d: %.1f %sh/s %s ", i, displayed_rolling, mhash_base ? "M" : "K" , cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled"); + switch (cgpu->status) { + default: + case LIFE_WELL: +--- a/miner.h ++++ b/miner.h +@@ -500,6 +500,11 @@ + quit(1, "WTF MUTEX ERROR ON UNLOCK!"); + } + ++static inline int mutex_trylock(pthread_mutex_t *lock) ++{ ++ return pthread_mutex_trylock(lock); ++} ++ + static inline void wr_lock(pthread_rwlock_t *lock) + { + if (unlikely(pthread_rwlock_wrlock(lock))) +@@ -557,6 +562,7 @@ + extern bool opt_api_network; + extern bool opt_delaynet; + extern bool opt_restart; ++extern char *opt_icarus_options; + extern char *opt_icarus_timing; + #ifdef USE_BITFORCE + extern bool opt_bfl_noncerange; +--- a/miner.php ++++ b/miner.php +@@ -87,11 +87,13 @@ + 'DATE' => null, + 'RIGS' => null, + 'SUMMARY' => array('Elapsed', 'MHS av', 'Found Blocks=Blks', 'Accepted', 'Rejected=Rej', 'Utility'), +- 'DEVS' => array('ID', 'Name', 'GPU', 'Status', 'MHS av', 'Accepted', 'Rejected=Rej', 'Utility'), ++ 'DEVS+NOTIFY' => array('DEVS.Name=Name', 'DEVS.ID=ID', 'DEVS.Status=Status', 'DEVS.Temperature=Temp', ++ 'DEVS.MHS av=MHS av', 'DEVS.Accepted=Accept', 'DEVS.Rejected=Rej', ++ 'DEVS.Utility=Utility', 'NOTIFY.Last Not Well=Not Well'), + 'POOL' => array('POOL', 'Status', 'Accepted', 'Rejected=Rej', 'Last Share Time')); + $mobilesum = array( + 'SUMMARY' => array('MHS av', 'Found Blocks', 'Accepted', 'Rejected', 'Utility'), +- 'DEVS' => array('MHS av', 'Accepted', 'Rejected', 'Utility'), ++ 'DEVS+NOTIFY' => array('DEVS.MHS av', 'DEVS.Accepted', 'DEVS.Rejected', 'DEVS.Utility'), + 'POOL' => array('Accepted', 'Rejected')); + # + # customsummarypages is an array of these Custom Summary Pages +@@ -716,6 +718,9 @@ + if ($class == '' && ($rownum % 2) == 0) + $class = $c2class; + ++ if ($ret == '') ++ $ret = $b; ++ + return array($ret, $class); + } + # +@@ -1274,8 +1279,171 @@ + 'GPU' => 'devs', // You would normally use DEVS + 'PGA' => 'devs', // You would 
normally use DEVS + 'NOTIFY' => 'notify', ++ 'DEVDETAILS' => 'devdetails', ++ 'STATS' => 'stats', + 'CONFIG' => 'config'); + # ++function joinfields($section1, $section2, $join, $results) ++{ ++ global $sectionmap; ++ ++ $name1 = $sectionmap[$section1]; ++ $name2 = $sectionmap[$section2]; ++ $newres = array(); ++ ++ // foreach rig in section1 ++ foreach ($results[$name1] as $rig => $result) ++ { ++ $status = null; ++ ++ // foreach answer section in the rig api call ++ foreach ($result as $name1b => $fields1b) ++ { ++ if ($name1b == 'STATUS') ++ { ++ // remember the STATUS from section1 ++ $status = $result[$name1b]; ++ continue; ++ } ++ ++ // foreach answer section in the rig api call (for the other api command) ++ foreach ($results[$name2][$rig] as $name2b => $fields2b) ++ { ++ if ($name2b == 'STATUS') ++ continue; ++ ++ // If match the same field values of fields in $join ++ $match = true; ++ foreach ($join as $field) ++ if ($fields1b[$field] != $fields2b[$field]) ++ { ++ $match = false; ++ break; ++ } ++ ++ if ($match === true) ++ { ++ if ($status != null) ++ { ++ $newres[$rig]['STATUS'] = $status; ++ $status = null; ++ } ++ ++ $subsection = $section1.'+'.$section2; ++ $subsection .= preg_replace('/[^0-9]/', '', $name1b.$name2b); ++ ++ foreach ($fields1b as $nam => $val) ++ $newres[$rig][$subsection]["$section1.$nam"] = $val; ++ foreach ($fields2b as $nam => $val) ++ $newres[$rig][$subsection]["$section2.$nam"] = $val; ++ } ++ } ++ } ++ } ++ return $newres; ++} ++# ++function joinall($section1, $section2, $results) ++{ ++ global $sectionmap; ++ ++ $name1 = $sectionmap[$section1]; ++ $name2 = $sectionmap[$section2]; ++ $newres = array(); ++ ++ // foreach rig in section1 ++ foreach ($results[$name1] as $rig => $result) ++ { ++ // foreach answer section in the rig api call ++ foreach ($result as $name1b => $fields1b) ++ { ++ if ($name1b == 'STATUS') ++ { ++ // copy the STATUS from section1 ++ $newres[$rig][$name1b] = $result[$name1b]; ++ continue; ++ } ++ ++ // foreach answer section in the rig api call (for the other api command) ++ foreach ($results[$name2][$rig] as $name2b => $fields2b) ++ { ++ if ($name2b == 'STATUS') ++ continue; ++ ++ $subsection = $section1.'+'.$section2; ++ $subsection .= preg_replace('/[^0-9]/', '', $name1b.$name2b); ++ ++ foreach ($fields1b as $nam => $val) ++ $newres[$rig][$subsection]["$section1.$nam"] = $val; ++ foreach ($fields2b as $nam => $val) ++ $newres[$rig][$subsection]["$section2.$nam"] = $val; ++ } ++ } ++ } ++ return $newres; ++} ++# ++function joinsections($sections, $results, $errors) ++{ ++ global $sectionmap; ++ ++#echo "results['pools']=".print_r($results['pools'],true)."
"; ++ ++ // GPU's don't have Name,ID fields - so create them ++ foreach ($results as $section => $res) ++ foreach ($res as $rig => $result) ++ foreach ($result as $name => $fields) ++ { ++ $subname = preg_replace('/[0-9]/', '', $name); ++ if ($subname == 'GPU' and isset($result[$name]['GPU'])) ++ { ++ $results[$section][$rig][$name]['Name'] = 'GPU'; ++ $results[$section][$rig][$name]['ID'] = $result[$name]['GPU']; ++ } ++ } ++ ++ foreach ($sections as $section => $fields) ++ if ($section != 'DATE' && !isset($sectionmap[$section])) ++ { ++ $both = explode('+', $section, 2); ++ if (count($both) > 1) ++ { ++ switch($both[0]) ++ { ++ case 'SUMMARY': ++ switch($both[1]) ++ { ++ case 'POOL': ++ case 'DEVS': ++ case 'CONFIG': ++ $sectionmap[$section] = $section; ++ $results[$section] = joinall($both[0], $both[1], $results); ++ break; ++ } ++ break; ++ case 'DEVS': ++ $join = array('Name', 'ID'); ++ switch($both[1]) ++ { ++ case 'NOTIFY': ++ case 'DEVDETAILS': ++ $sectionmap[$section] = $section; ++ $results[$section] = joinfields($both[0], $both[1], $join, $results); ++ break; ++ } ++ break; ++ default: ++ $errors[] = "Error: Invalid section '$section'"; ++ break; ++ } ++ } ++ else ++ $errors[] = "Error: Invalid section '$section'"; ++ } ++ ++ return array($results, $errors); ++} ++# + function secmatch($section, $field) + { + if ($section == $field) +@@ -1335,7 +1503,14 @@ + $value = null; + } + +- list($showvalue, $class) = fmt($secname, $name, $value, $when, $row); ++ if (strpos($secname, '+') === false) ++ list($showvalue, $class) = fmt($secname, $name, $value, $when, $row); ++ else ++ { ++ $parts = explode('.', $name, 2); ++ list($showvalue, $class) = fmt($parts[0], $parts[1], $value, $when, $row); ++ } ++ + echo "$showvalue"; + } + endrow(); +@@ -1356,15 +1531,19 @@ + $errors = array(); + foreach ($sections as $section => $fields) + { +- if (isset($sectionmap[$section])) ++ $all = explode('+', $section); ++ foreach ($all as $section) + { +- $cmd = $sectionmap[$section]; +- if (!isset($cmds[$cmd])) +- $cmds[$cmd] = 1; ++ if (isset($sectionmap[$section])) ++ { ++ $cmd = $sectionmap[$section]; ++ if (!isset($cmds[$cmd])) ++ $cmds[$cmd] = 1; ++ } ++ else ++ if ($section != 'DATE') ++ $errors[] = "Error: unknown section '$section' in custom summary page '$pagename'"; + } +- else +- if ($section != 'DATE') +- $errors[] = "Error: unknown section '$section' in custom summary page '$pagename'"; + } + + $results = array(); +@@ -1399,6 +1578,7 @@ + $shownsomething = false; + if (count($results) > 0) + { ++ list($results, $errors) = joinsections($sections, $results, $errors); + $first = true; + foreach ($sections as $section => $fields) + { diff --git a/diablo120328.cl b/diablo120724.cl similarity index 99% rename from diablo120328.cl rename to diablo120724.cl index e9d2d87f..4b64c300 100644 --- a/diablo120328.cl +++ b/diablo120724.cl @@ -1242,8 +1242,8 @@ void search( ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]); -#define FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) #if defined(VECTORS4) bool result = any(ZA[924] == 0x136032EDU); diff --git a/diakgcn120427.cl b/diakgcn120724.cl similarity index 99% rename from diakgcn120427.cl rename to diakgcn120724.cl index 37d51c51..7dd73fb9 100644 --- a/diakgcn120427.cl +++ b/diakgcn120724.cl @@ -571,8 +571,8 @@ __kernel V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]); -#define FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) #ifdef VECTORS4 if ((V[7].x == 
0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU)) diff --git a/driver-bitforce.c b/driver-bitforce.c index 31892297..bacac16c 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -17,6 +18,32 @@ #include "config.h" +#ifdef WIN32 + +#include + +#define dlsym (void*)GetProcAddress +#define dlclose FreeLibrary + +typedef unsigned long FT_STATUS; +typedef PVOID FT_HANDLE; +__stdcall FT_STATUS (*FT_ListDevices)(PVOID pArg1, PVOID pArg2, DWORD Flags); +__stdcall FT_STATUS (*FT_Open)(int idx, FT_HANDLE*); +__stdcall FT_STATUS (*FT_GetComPortNumber)(FT_HANDLE, LPLONG lplComPortNumber); +__stdcall FT_STATUS (*FT_Close)(FT_HANDLE); +const uint32_t FT_OPEN_BY_DESCRIPTION = 2; +const uint32_t FT_LIST_ALL = 0x20000000; +const uint32_t FT_LIST_NUMBER_ONLY = 0x80000000; +enum { + FT_OK, +}; + +// Code must deal with a timeout. Make it 1 second on windows, 0.1 on linux. +#define BFopen(devpath) serial_open(devpath, 0, 10, true) +#else /* WIN32 */ +#define BFopen(devpath) serial_open(devpath, 0, 1, true) +#endif /* WIN32 */ + #include "compat.h" #include "fpgautils.h" #include "miner.h" @@ -32,15 +59,17 @@ #define tv_to_ms(tval) (tval.tv_sec * 1000 + tval.tv_usec / 1000) #define TIME_AVG_CONSTANT 8 -struct device_api bitforce_api; +#define KNAME_WORK "full work" +#define KNAME_RANGE "nonce range" -#define BFopen(devpath) serial_open(devpath, 0, -1, true) +struct device_api bitforce_api; static void BFgets(char *buf, size_t bufLen, int fd) { - do + do { + buf[0] = '\0'; --bufLen; - while (likely(bufLen && read(fd, buf, 1) == 1 && (buf++)[0] != '\n')); + } while (likely(bufLen && read(fd, buf, 1) == 1 && (buf++)[0] != '\n')); buf[0] = '\0'; } @@ -72,7 +101,7 @@ static bool bitforce_detect_one(const char *devpath) BFwrite(fdDev, "ZGX", 3); BFgets(pdevbuf, sizeof(pdevbuf), fdDev); if (unlikely(!pdevbuf[0])) { - applog(LOG_ERR, "BFL: Error reading (ZGX)"); + applog(LOG_ERR, "BFL: Error reading/timeout (ZGX)"); return 0; } @@ -93,10 +122,10 @@ static bool bitforce_detect_one(const char *devpath) if (opt_bfl_noncerange) { bitforce->nonce_range = true; bitforce->sleep_ms = BITFORCE_SLEEP_MS; - bitforce->kname = "Mini-rig"; + bitforce->kname = KNAME_RANGE; } else { bitforce->sleep_ms = BITFORCE_SLEEP_MS * 5; - bitforce->kname = "Single"; + bitforce->kname = KNAME_WORK; } if (likely((!memcmp(pdevbuf, ">>>ID: ", 7)) && (s = strstr(pdevbuf + 3, ">>>")))) { @@ -109,14 +138,96 @@ static bool bitforce_detect_one(const char *devpath) return add_cgpu(bitforce); } -static char bitforce_detect_auto() +#define LOAD_SYM(sym) do { \ + if (!(sym = dlsym(dll, #sym))) { \ + applog(LOG_DEBUG, "Failed to load " #sym ", not using FTDI bitforce autodetect"); \ + goto out; \ + } \ +} while(0) + +#ifdef WIN32 +static int bitforce_autodetect_ftdi(void) +{ + char devpath[] = "\\\\.\\COMnnnnn"; + char *devpathnum = &devpath[7]; + char **bufptrs; + char *buf; + int found = 0; + int i; + + FT_STATUS ftStatus; + DWORD numDevs; + HMODULE dll = LoadLibrary("FTD2XX.DLL"); + if (!dll) { + applog(LOG_DEBUG, "FTD2XX.DLL failed to load, not using FTDI bitforce autodetect"); + return 0; + } + LOAD_SYM(FT_ListDevices); + LOAD_SYM(FT_Open); + LOAD_SYM(FT_GetComPortNumber); + LOAD_SYM(FT_Close); + + ftStatus = FT_ListDevices(&numDevs, NULL, FT_LIST_NUMBER_ONLY); + if (ftStatus != FT_OK) { + applog(LOG_DEBUG, "FTDI device count failed, not using FTDI bitforce autodetect"); + goto out; + } + applog(LOG_DEBUG, "FTDI reports %u devices", 
(unsigned)numDevs); + + buf = alloca(65 * numDevs); + bufptrs = alloca(numDevs + 1); + + for (i = 0; i < numDevs; ++i) + bufptrs[i] = &buf[i * 65]; + bufptrs[numDevs] = NULL; + ftStatus = FT_ListDevices(bufptrs, &numDevs, FT_LIST_ALL | FT_OPEN_BY_DESCRIPTION); + if (ftStatus != FT_OK) { + applog(LOG_DEBUG, "FTDI device list failed, not using FTDI bitforce autodetect"); + goto out; + } + + for (i = numDevs; i > 0; ) { + --i; + bufptrs[i][64] = '\0'; + + if (!(strstr(bufptrs[i], "BitFORCE") && strstr(bufptrs[i], "SHA256"))) + continue; + + FT_HANDLE ftHandle; + if (FT_OK != FT_Open(i, &ftHandle)) + continue; + LONG lComPortNumber; + ftStatus = FT_GetComPortNumber(ftHandle, &lComPortNumber); + FT_Close(ftHandle); + if (FT_OK != ftStatus || lComPortNumber < 0) + continue; + + sprintf(devpathnum, "%d", (int)lComPortNumber); + + if (bitforce_detect_one(devpath)) + ++found; + } + +out: + dlclose(dll); + return found; +} +#else +static int bitforce_autodetect_ftdi(void) +{ + return 0; +} +#endif + +static int bitforce_detect_auto(void) { return (serial_autodetect_udev (bitforce_detect_one, "BitFORCE*SHA256") ?: serial_autodetect_devserial(bitforce_detect_one, "BitFORCE_SHA256") ?: + bitforce_autodetect_ftdi() ?: 0); } -static void bitforce_detect() +static void bitforce_detect(void) { serial_detect_auto(bitforce_api.dname, bitforce_detect_one, bitforce_detect_auto); } @@ -152,7 +263,7 @@ static bool bitforce_thread_prepare(struct thr_info *thr) return true; } -static void biforce_clear_buffer(struct cgpu_info *bitforce) +static void bitforce_clear_buffer(struct cgpu_info *bitforce) { int fdDev = bitforce->device_fd; char pdevbuf[0x100]; @@ -180,6 +291,8 @@ void bitforce_init(struct cgpu_info *bitforce) applog(LOG_WARNING, "BFL%i: Re-initialising", bitforce->device_id); + bitforce_clear_buffer(bitforce); + mutex_lock(&bitforce->device_mutex); if (fdDev) { BFclose(fdDev); @@ -200,7 +313,7 @@ void bitforce_init(struct cgpu_info *bitforce) if (unlikely(!pdevbuf[0])) { mutex_unlock(&bitforce->device_mutex); - applog(LOG_ERR, "BFL%i: Error reading (ZGX)", bitforce->device_id); + applog(LOG_ERR, "BFL%i: Error reading/timeout (ZGX)", bitforce->device_id); return; } @@ -234,14 +347,18 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) if (!fdDev) return false; - mutex_lock(&bitforce->device_mutex); + /* It is not critical getting temperature so don't get stuck if we + * can't grab the mutex here */ + if (mutex_trylock(&bitforce->device_mutex)) + return false; + BFwrite(fdDev, "ZLX", 3); BFgets(pdevbuf, sizeof(pdevbuf), fdDev); mutex_unlock(&bitforce->device_mutex); if (unlikely(!pdevbuf[0])) { - applog(LOG_ERR, "BFL%i: Error: Get temp returned empty string", bitforce->device_id); - bitforce->temp = 0; + applog(LOG_ERR, "BFL%i: Error: Get temp returned empty string/timed out", bitforce->device_id); + bitforce->hw_errors++; return false; } @@ -250,7 +367,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) if (temp > 0) { bitforce->temp = temp; - if (temp > bitforce->cutofftemp) { + if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) { applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id); bitforce->deven = DEV_RECOVER; @@ -259,7 +376,17 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) bitforce->dev_thermal_cutoff_count++; } } + } else { + /* Use the temperature monitor as a kind of watchdog for when + * our responses are out of sync and flush the buffer to + * hopefully recover */ + applog(LOG_WARNING, "BFL%i: Garbled 
response probably throttling, clearing buffer"); + /* Count throttling episodes as hardware errors */ + bitforce->hw_errors++; + bitforce_clear_buffer(bitforce); + return false;; } + return true; } @@ -290,10 +417,12 @@ re_send: applog(LOG_WARNING, "BFL%i: Does not support nonce range, disabling", bitforce->device_id); bitforce->nonce_range = false; bitforce->sleep_ms *= 5; - bitforce->kname = "Single"; + bitforce->kname = KNAME_WORK; goto re_send; } applog(LOG_ERR, "BFL%i: Error: Send work reports: %s", bitforce->device_id, pdevbuf); + bitforce->hw_errors++; + bitforce_clear_buffer(bitforce); return false; } @@ -328,12 +457,14 @@ re_send: } if (unlikely(!pdevbuf[0])) { - applog(LOG_ERR, "BFL%i: Error: Send block data returned empty string", bitforce->device_id); + applog(LOG_ERR, "BFL%i: Error: Send block data returned empty string/timed out", bitforce->device_id); return false; } if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { applog(LOG_ERR, "BFL%i: Error: Send block data reports: %s", bitforce->device_id, pdevbuf); + bitforce->hw_errors++; + bitforce_clear_buffer(bitforce); return false; } @@ -408,7 +539,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work) } if (delay_time_ms != bitforce->sleep_ms) - applog(LOG_DEBUG, "BFL%i: Wait time changed to: %d", bitforce->device_id, bitforce->sleep_ms, bitforce->wait_ms); + applog(LOG_DEBUG, "BFL%i: Wait time changed to: %d, waited %u", bitforce->device_id, bitforce->sleep_ms, bitforce->wait_ms); /* Work out the average time taken. Float for calculation, uint for display */ bitforce->avg_wait_f += (tv_to_ms(elapsed) - bitforce->avg_wait_f) / TIME_AVG_CONSTANT; @@ -421,7 +552,9 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work) else if (!strncasecmp(pdevbuf, "I", 1)) return 0; /* Device idle */ else if (strncasecmp(pdevbuf, "NONCE-FOUND", 11)) { + bitforce->hw_errors++; applog(LOG_WARNING, "BFL%i: Error: Get result reports: %s", bitforce->device_id, pdevbuf); + bitforce_clear_buffer(bitforce); return 0; } @@ -438,7 +571,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work) bitforce->nonce_range = false; work->blk.nonce = 0xffffffff; bitforce->sleep_ms *= 5; - bitforce->kname = "Single"; + bitforce->kname = KNAME_WORK; } submit_nonce(thr, work, nonce); @@ -469,9 +602,10 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_ { struct cgpu_info *bitforce = thr->cgpu; unsigned int sleep_time; + bool send_ret; int64_t ret; - ret = bitforce_send_work(thr, work); + send_ret = bitforce_send_work(thr, work); if (!bitforce->nonce_range) { /* Initially wait 2/3 of the average cycle time so we can request more @@ -497,8 +631,10 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_ bitforce->wait_ms = sleep_time; } - if (ret) + if (send_ret) ret = bitforce_get_result(thr, work); + else + ret = -1; if (ret == -1) { ret = 0; @@ -506,8 +642,9 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_ bitforce->device_last_not_well = time(NULL); bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR; bitforce->dev_comms_error_count++; + bitforce->hw_errors++; /* empty read buffer */ - biforce_clear_buffer(bitforce); + bitforce_clear_buffer(bitforce); } return ret; } @@ -517,6 +654,20 @@ static bool bitforce_get_stats(struct cgpu_info *bitforce) return bitforce_get_temp(bitforce); } +static bool bitforce_thread_init(struct thr_info *thr) +{ + struct cgpu_info *bitforce = thr->cgpu; + unsigned int 
wait; + + /* Pause each new thread at least 100ms between initialising + * so the devices aren't making calls all at the same time. */ + wait = thr->id * MAX_START_DELAY_US; + applog(LOG_DEBUG, "BFL%i: Delaying start by %dms", bitforce->device_id, wait / 1000); + usleep(wait); + + return true; +} + static struct api_data *bitforce_api_stats(struct cgpu_info *cgpu) { struct api_data *root = NULL; @@ -540,6 +691,7 @@ struct device_api bitforce_api = { .get_statline_before = get_bitforce_statline_before, .get_stats = bitforce_get_stats, .thread_prepare = bitforce_thread_prepare, + .thread_init = bitforce_thread_init, .scanhash = bitforce_scanhash, .thread_shutdown = bitforce_shutdown, .thread_enable = biforce_thread_enable diff --git a/driver-cpu.c b/driver-cpu.c index 09ca478f..8ffc7802 100644 --- a/driver-cpu.c +++ b/driver-cpu.c @@ -131,6 +131,9 @@ extern bool scanhash_sse2_32(struct thr_info*, const unsigned char *pmidstate, u uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce); +extern bool scanhash_scrypt(struct thr_info *thr, int thr_id, unsigned char *pdata, unsigned char *scratchbuf, + const unsigned char *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); @@ -161,6 +164,9 @@ const char *algo_names[] = { #ifdef WANT_ALTIVEC_4WAY [ALGO_ALTIVEC_4WAY] = "altivec_4way", #endif +#ifdef WANT_SCRYPT + [ALGO_SCRYPT] = "scrypt", +#endif }; static const sha256_func sha256_funcs[] = { @@ -185,7 +191,10 @@ static const sha256_func sha256_funcs[] = { [ALGO_SSE2_64] = (sha256_func)scanhash_sse2_64, #endif #ifdef WANT_X8664_SSE4 - [ALGO_SSE4_64] = (sha256_func)scanhash_sse4_64 + [ALGO_SSE4_64] = (sha256_func)scanhash_sse4_64, +#endif +#ifdef WANT_SCRYPT + [ALGO_SCRYPT] = (sha256_func)scanhash_scrypt #endif }; #endif @@ -662,6 +671,9 @@ char *set_algo(const char *arg, enum sha256_algos *algo) { enum sha256_algos i; + if (opt_scrypt) + return "Can only use scrypt algorithm"; + if (!strcmp(arg, "auto")) { *algo = pick_fastest_algo(); return NULL; @@ -676,6 +688,13 @@ char *set_algo(const char *arg, enum sha256_algos *algo) return "Unknown algorithm"; } +#ifdef WANT_SCRYPT +void set_scrypt_algo(enum sha256_algos *algo) +{ + *algo = ALGO_SCRYPT; +} +#endif + void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo) { strncpy(buf, algo_names[*algo], OPT_SHOW_LEN); @@ -758,7 +777,7 @@ static bool cpu_thread_prepare(struct thr_info *thr) static uint64_t cpu_can_limit_work(struct thr_info *thr) { - return 0xfffff; + return 0xffff; } static bool cpu_thread_init(struct thr_info *thr) diff --git a/driver-cpu.h b/driver-cpu.h index ced400a5..e4b44527 100644 --- a/driver-cpu.h +++ b/driver-cpu.h @@ -34,6 +34,10 @@ #define WANT_X8664_SSE4 1 #endif +#ifdef USE_SCRYPT +#define WANT_SCRYPT +#endif + enum sha256_algos { ALGO_C, /* plain C */ ALGO_4WAY, /* parallel SSE2 */ @@ -44,6 +48,7 @@ enum sha256_algos { ALGO_SSE2_64, /* SSE2 for x86_64 */ ALGO_SSE4_64, /* SSE4 for x86_64 */ ALGO_ALTIVEC_4WAY, /* parallel Altivec */ + ALGO_SCRYPT, /* scrypt */ }; extern const char *algo_names[]; @@ -55,5 +60,6 @@ extern void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo); extern char *force_nthreads_int(const char *arg, int *i); extern void init_max_name_len(); extern double bench_algo_stage3(enum sha256_algos algo); +extern void set_scrypt_algo(enum sha256_algos *algo); #endif /* __DEVICE_CPU_H__ */ diff --git a/driver-icarus.c b/driver-icarus.c index 5f2c78ad..f1cf9d17 100644 --- a/driver-icarus.c +++ b/driver-icarus.c @@ -65,7 +65,7 @@ #define ASSERT1(condition) __maybe_unused 
static char sizeof_uint32_t_must_be_4[(condition)?1:-1] ASSERT1(sizeof(uint32_t) == 4); -#define ICARUS_READ_TIME ((double)ICARUS_READ_SIZE * (double)8.0 / (double)ICARUS_IO_SPEED) +#define ICARUS_READ_TIME(baud) ((double)ICARUS_READ_SIZE * (double)8.0 / (double)(baud)) // Fraction of a second, USB timeout is measured in // i.e. 10 means 1/10 of a second @@ -176,11 +176,36 @@ struct ICARUS_INFO { // (which will only affect W) uint64_t history_count; struct timeval history_time; + + // icarus-options + int baud; + int work_division; + int fpga_count; + uint32_t nonce_mask; }; +#define END_CONDITION 0x0000ffff + // One for each possible device static struct ICARUS_INFO **icarus_info; +// Looking for options in --icarus-timing and --icarus-options: +// +// Code increments this each time we start to look at a device +// However, this means that if other devices are checked by +// the Icarus code (e.g. BFL) they will count in the option offset +// +// This, however, is deterministic so that's OK +// +// If we were to increment after successfully finding an Icarus +// that would be random since an Icarus may fail and thus we'd +// not be able to predict the option order +// +// This also assumes that serial_detect() checks them sequentially +// and in the order specified on the command line +// +static int option_offset = -1; + struct device_api icarus_api; static void rev(unsigned char *s, size_t l) @@ -195,8 +220,8 @@ static void rev(unsigned char *s, size_t l) } } -#define icarus_open2(devpath, purge) serial_open(devpath, 115200, ICARUS_READ_FAULT_DECISECONDS, purge) -#define icarus_open(devpath) icarus_open2(devpath, false) +#define icarus_open2(devpath, baud, purge) serial_open(devpath, baud, ICARUS_READ_FAULT_DECISECONDS, purge) +#define icarus_open(devpath, baud) icarus_open2(devpath, baud, false) static int icarus_gets(unsigned char *buf, int fd, struct timeval *tv_finish, struct thr_info *thr, int read_count) { @@ -272,7 +297,7 @@ static const char *timing_mode_str(enum timing_mode timing_mode) } } -static void set_timing_mode(struct cgpu_info *icarus) +static void set_timing_mode(int this_option_offset, struct cgpu_info *icarus) { struct ICARUS_INFO *info = icarus_info[icarus->device_id]; double Hs; @@ -285,7 +310,7 @@ static void set_timing_mode(struct cgpu_info *icarus) buf[0] = '\0'; else { ptr = opt_icarus_timing; - for (i = 0; i < icarus->device_id; i++) { + for (i = 0; i < this_option_offset; i++) { comma = strchr(ptr, ','); if (comma == NULL) break; @@ -354,11 +379,123 @@ static void set_timing_mode(struct cgpu_info *icarus) applog(LOG_DEBUG, "Icarus: Init: %d mode=%s read_count=%d Hs=%e", icarus->device_id, timing_mode_str(info->timing_mode), info->read_count, info->Hs); +} + +static uint32_t mask(int work_division) +{ + char err_buf[BUFSIZ+1]; + uint32_t nonce_mask = 0x7fffffff; + + // yes we can calculate these, but this way it's easy to see what they are + switch (work_division) { + case 1: + nonce_mask = 0xffffffff; + break; + case 2: + nonce_mask = 0x7fffffff; + break; + case 4: + nonce_mask = 0x3fffffff; + break; + case 8: + nonce_mask = 0x1fffffff; + break; + default: + sprintf(err_buf, "Invalid2 icarus-options for work_division (%d) must be 1, 2, 4 or 8", work_division); + quit(1, err_buf); + } + + return nonce_mask; +} + +static void get_options(int this_option_offset, int *baud, int *work_division, int *fpga_count) +{ + char err_buf[BUFSIZ+1]; + char buf[BUFSIZ+1]; + char *ptr, *comma, *colon, *colon2; + size_t max; + int i, tmp; + + if (opt_icarus_options == NULL) + 
buf[0] = '\0'; + else { + ptr = opt_icarus_options; + for (i = 0; i < this_option_offset; i++) { + comma = strchr(ptr, ','); + if (comma == NULL) + break; + ptr = comma + 1; + } + + comma = strchr(ptr, ','); + if (comma == NULL) + max = strlen(ptr); + else + max = comma - ptr; + if (max > BUFSIZ) + max = BUFSIZ; + strncpy(buf, ptr, max); + buf[max] = '\0'; + } + + *baud = ICARUS_IO_SPEED; + *work_division = 2; + *fpga_count = 2; + + if (*buf) { + colon = strchr(buf, ':'); + if (colon) + *(colon++) = '\0'; + + if (*buf) { + tmp = atoi(buf); + switch (tmp) { + case 115200: + *baud = 115200; + break; + case 57600: + *baud = 57600; + break; + default: + sprintf(err_buf, "Invalid icarus-options for baud (%s) must be 115200 or 57600", buf); + quit(1, err_buf); + } + } + + if (colon && *colon) { + colon2 = strchr(colon, ':'); + if (colon2) + *(colon2++) = '\0'; + + if (*colon) { + tmp = atoi(colon); + if (tmp == 1 || tmp == 2 || tmp == 4 || tmp == 8) { + *work_division = tmp; + *fpga_count = tmp; // default to the same + } else { + sprintf(err_buf, "Invalid icarus-options for work_division (%s) must be 1, 2, 4 or 8", colon); + quit(1, err_buf); + } + } + + if (colon2 && *colon2) { + tmp = atoi(colon2); + if (tmp > 0 && tmp <= *work_division) + *fpga_count = tmp; + else { + sprintf(err_buf, "Invalid icarus-options for fpga_count (%s) must be >0 and <=work_division (%d)", colon2, *work_division); + quit(1, err_buf); + } + } + } + } } static bool icarus_detect_one(const char *devpath) { + int this_option_offset = ++option_offset; + struct ICARUS_INFO *info; struct timeval tv_start, tv_finish; int fd; @@ -379,9 +516,13 @@ static bool icarus_detect_one(const char *devpath) unsigned char ob_bin[64], nonce_bin[ICARUS_READ_SIZE]; char *nonce_hex; + int baud, work_division, fpga_count; + + get_options(this_option_offset, &baud, &work_division, &fpga_count); + applog(LOG_DEBUG, "Icarus Detect: Attempting to open %s", devpath); - fd = icarus_open2(devpath, true); + fd = icarus_open2(devpath, baud, true); if (unlikely(fd == -1)) { applog(LOG_ERR, "Icarus Detect: Failed to open %s", devpath); return false; @@ -429,6 +570,9 @@ static bool icarus_detect_one(const char *devpath) applog(LOG_INFO, "Found Icarus at %s, mark as %d", devpath, icarus->device_id); + applog(LOG_DEBUG, "Icarus: Init: %d baud=%d work_division=%d fpga_count=%d", + icarus->device_id, baud, work_division, fpga_count); + // Since we are adding a new device on the end it needs to always be allocated icarus_info[icarus->device_id] = (struct ICARUS_INFO *)malloc(sizeof(struct ICARUS_INFO)); if (unlikely(!(icarus_info[icarus->device_id]))) @@ -439,10 +583,15 @@ static bool icarus_detect_one(const char *devpath) // Initialise everything to zero for a new device memset(info, 0, sizeof(struct ICARUS_INFO)); - info->golden_hashes = (golden_nonce_val & 0x7fffffff) << 1; + info->baud = baud; + info->work_division = work_division; + info->fpga_count = fpga_count; + info->nonce_mask = mask(work_division); + + info->golden_hashes = (golden_nonce_val & info->nonce_mask) * fpga_count; timersub(&tv_finish, &tv_start, &(info->golden_tv)); - set_timing_mode(icarus); + set_timing_mode(this_option_offset, icarus); return true; } @@ -458,7 +607,7 @@ static bool icarus_prepare(struct thr_info *thr) struct timeval now; - int fd = icarus_open(icarus->device_path); + int fd = icarus_open(icarus->device_path, icarus_info[icarus->device_id]->baud); if (unlikely(-1 == fd)) { applog(LOG_ERR, "Failed to open Icarus on %s", icarus->device_path); @@ -565,11 +714,9 @@ static 
int64_t icarus_scanhash(struct thr_info *thr, struct work *work, submit_nonce(thr, work, nonce); - hash_count = (nonce & 0x7fffffff); - if (hash_count++ == 0x7fffffff) - hash_count = 0xffffffff; - else - hash_count <<= 1; + hash_count = (nonce & info->nonce_mask); + hash_count++; + hash_count *= info->fpga_count; if (opt_debug || info->do_icarus_timing) timersub(&tv_finish, &tv_start, &elapsed); @@ -580,7 +727,9 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work, } // ignore possible end condition values - if (info->do_icarus_timing && (nonce & 0x7fffffff) > 0x000fffff && (nonce & 0x7fffffff) < 0x7ff00000) { + if (info->do_icarus_timing + && ((nonce & info->nonce_mask) > END_CONDITION) + && ((nonce & info->nonce_mask) < (info->nonce_mask & ~END_CONDITION))) { gettimeofday(&tv_history_start, NULL); history0 = &(info->history[0]); @@ -590,7 +739,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work, Ti = (double)(elapsed.tv_sec) + ((double)(elapsed.tv_usec))/((double)1000000) - - ICARUS_READ_TIME; + - ((double)ICARUS_READ_TIME(info->baud)); Xi = (double)hash_count; history0->sumXiTi += Xi * Ti; history0->sumXi += Xi; @@ -700,6 +849,9 @@ static struct api_data *icarus_api_stats(struct cgpu_info *cgpu) root = api_add_uint(root, "timing_values", &(info->history[0].values), false); root = api_add_const(root, "timing_mode", timing_mode_str(info->timing_mode), false); root = api_add_bool(root, "is_timing", &(info->do_icarus_timing), false); + root = api_add_int(root, "baud", &(info->baud), false); + root = api_add_int(root, "work_division", &(info->work_division), false); + root = api_add_int(root, "fpga_count", &(info->fpga_count), false); return root; } diff --git a/driver-modminer.c b/driver-modminer.c index ff96ee45..040100db 100644 --- a/driver-modminer.c +++ b/driver-modminer.c @@ -91,7 +91,7 @@ modminer_detect_one(const char *devpath) #undef bailout -static char +static int modminer_detect_auto() { return diff --git a/driver-opencl.c b/driver-opencl.c index 880a4dac..b6dfe12e 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -127,6 +127,83 @@ char *set_worksize(char *arg) return NULL; } +#ifdef USE_SCRYPT +char *set_shaders(char *arg) +{ + int i, val = 0, device = 0; + char *nextptr; + + nextptr = strtok(arg, ","); + if (nextptr == NULL) + return "Invalid parameters for set lookup gap"; + val = atoi(nextptr); + + gpus[device++].shaders = val; + + while ((nextptr = strtok(NULL, ",")) != NULL) { + val = atoi(nextptr); + + gpus[device++].shaders = val; + } + if (device == 1) { + for (i = device; i < MAX_GPUDEVICES; i++) + gpus[i].shaders = gpus[0].shaders; + } + + return NULL; +} + +char *set_lookup_gap(char *arg) +{ + int i, val = 0, device = 0; + char *nextptr; + + nextptr = strtok(arg, ","); + if (nextptr == NULL) + return "Invalid parameters for set lookup gap"; + val = atoi(nextptr); + + gpus[device++].opt_lg = val; + + while ((nextptr = strtok(NULL, ",")) != NULL) { + val = atoi(nextptr); + + gpus[device++].opt_lg = val; + } + if (device == 1) { + for (i = device; i < MAX_GPUDEVICES; i++) + gpus[i].opt_lg = gpus[0].opt_lg; + } + + return NULL; +} + +char *set_thread_concurrency(char *arg) +{ + int i, val = 0, device = 0; + char *nextptr; + + nextptr = strtok(arg, ","); + if (nextptr == NULL) + return "Invalid parameters for set thread concurrency"; + val = atoi(nextptr); + + gpus[device++].opt_tc = val; + + while ((nextptr = strtok(NULL, ",")) != NULL) { + val = atoi(nextptr); + + gpus[device++].opt_tc = val; + } + if (device == 1) { + for 
(i = device; i < MAX_GPUDEVICES; i++) + gpus[i].opt_tc = gpus[0].opt_tc; + } + + return NULL; +} +#endif + static enum cl_kernels select_kernel(char *arg) { if (!strcmp(arg, "diablo")) @@ -137,6 +214,10 @@ static enum cl_kernels select_kernel(char *arg) return KL_POCLBM; if (!strcmp(arg, "phatk")) return KL_PHATK; +#ifdef USE_SCRYPT + if (!strcmp(arg, "scrypt")) + return KL_SCRYPT; +#endif return KL_NONE; } @@ -146,6 +227,8 @@ char *set_kernel(char *arg) int i, device = 0; char *nextptr; + if (opt_scrypt) + return "Cannot use sha256 kernel with scrypt"; nextptr = strtok(arg, ","); if (nextptr == NULL) return "Invalid parameters for set kernel"; @@ -577,9 +660,19 @@ retry: for (gpu = 0; gpu < nDevs; gpu++) { struct cgpu_info *cgpu = &gpus[gpu]; + double displayed_rolling, displayed_total; + bool mhash_base = true; + + displayed_rolling = cgpu->rolling; + displayed_total = cgpu->total_mhashes / total_secs; + if (displayed_rolling < 1) { + displayed_rolling *= 1000; + displayed_total *= 1000; + mhash_base = false; + } - wlog("GPU %d: %.1f / %.1f Mh/s | A:%d R:%d HW:%d U:%.2f/m I:%d\n", - gpu, cgpu->rolling, cgpu->total_mhashes / total_secs, + wlog("GPU %d: %.1f / %.1f %sh/s | A:%d R:%d HW:%d U:%.2f/m I:%d\n", + gpu, displayed_rolling, displayed_total, mhash_base ? "M" : "K", cgpu->accepted, cgpu->rejected, cgpu->hw_errors, cgpu->utility, cgpu->intensity); #ifdef HAVE_ADL @@ -627,7 +720,10 @@ retry: if (thr->cgpu != cgpu) continue; get_datestamp(checkin, &thr->last); - wlog("Thread %d: %.1f Mh/s %s ", i, thr->rolling, cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled"); + displayed_rolling = thr->rolling; + if (!mhash_base) + displayed_rolling *= 1000; + wlog("Thread %d: %.1f %sh/s %s ", i, displayed_rolling, mhash_base ? "M" : "K" , cgpu->deven != DEV_DISABLED ? 
"Enabled" : "Disabled"); switch (cgpu->status) { default: case LIFE_WELL: @@ -986,11 +1082,40 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t return status; } +#ifdef USE_SCRYPT +static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + unsigned char *midstate = blk->work->midstate; + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; + cl_uint le_target; + cl_int status = 0; + + le_target = *(cl_uint *)(blk->work->target + 28); + clState->cldata = blk->work->data; + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->padbuffer8); + CL_SET_VARG(4, &midstate[0]); + CL_SET_VARG(4, &midstate[16]); + CL_SET_ARG(le_target); + + return status; +} +#endif + static void set_threads_hashes(unsigned int vectors, unsigned int *threads, int64_t *hashes, size_t *globalThreads, unsigned int minthreads, int intensity) { - *threads = 1 << (15 + intensity); + if (opt_scrypt) { + if (intensity < 0) + intensity = 0; + *threads = 1 << intensity; + } else + *threads = 1 << (15 + intensity); if (*threads < minthreads) *threads = minthreads; *globalThreads = *threads; @@ -1210,16 +1335,17 @@ static bool opencl_thread_prepare(struct thr_info *thr) applog(LOG_INFO, "Init GPU thread %i GPU %i virtual GPU %i", i, gpu, virtual_gpu); clStates[i] = initCl(virtual_gpu, name, sizeof(name)); if (!clStates[i]) { +#ifdef HAVE_CURSES if (use_curses) enable_curses(); +#endif applog(LOG_ERR, "Failed to init GPU thread %d, disabling device %d", i, gpu); if (!failmessage) { - char *buf; - applog(LOG_ERR, "Restarting the GPU from the menu will not fix this."); applog(LOG_ERR, "Try restarting cgminer."); failmessage = true; #ifdef HAVE_CURSES + char *buf; if (use_curses) { buf = curses_input("Press enter to continue"); if (buf) @@ -1241,19 +1367,25 @@ static bool opencl_thread_prepare(struct thr_info *thr) if (!cgpu->kname) { switch (clStates[i]->chosen_kernel) { - case KL_DIABLO: - cgpu->kname = "diablo"; - break; - case KL_DIAKGCN: - cgpu->kname = "diakgcn"; - break; - case KL_PHATK: - cgpu->kname = "phatk"; - break; - case KL_POCLBM: - cgpu->kname = "poclbm"; - default: - break; + case KL_DIABLO: + cgpu->kname = "diablo"; + break; + case KL_DIAKGCN: + cgpu->kname = "diakgcn"; + break; + case KL_PHATK: + cgpu->kname = "phatk"; + break; +#ifdef USE_SCRYPT + case KL_SCRYPT: + cgpu->kname = "scrypt"; + break; +#endif + case KL_POCLBM: + cgpu->kname = "poclbm"; + break; + default: + break; } } applog(LOG_INFO, "initCl() finished. 
Found %s", name); @@ -1271,7 +1403,7 @@ static bool opencl_thread_init(struct thr_info *thr) struct cgpu_info *gpu = thr->cgpu; struct opencl_thread_data *thrdata; _clState *clState = clStates[thr_id]; - cl_int status; + cl_int status = 0; thrdata = calloc(1, sizeof(*thrdata)); thr->cgpu_data = thrdata; @@ -1290,6 +1422,11 @@ static bool opencl_thread_init(struct thr_info *thr) case KL_DIAKGCN: thrdata->queue_kernel_parameters = &queue_diakgcn_kernel; break; +#ifdef USE_SCRYPT + case KL_SCRYPT: + thrdata->queue_kernel_parameters = &queue_scrypt_kernel; + break; +#endif default: case KL_DIABLO: thrdata->queue_kernel_parameters = &queue_diablo_kernel; @@ -1304,7 +1441,7 @@ static bool opencl_thread_init(struct thr_info *thr) return false; } - status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, + status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, BUFFERSIZE, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); @@ -1333,7 +1470,12 @@ static void opencl_free_work(struct thr_info *thr, struct work *work) static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work) { - precalc_hash(&work->blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64)); +#ifdef USE_SCRYPT + if (opt_scrypt) + work->blk.work = work; + else +#endif + precalc_hash(&work->blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64)); return true; } @@ -1348,6 +1490,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, _clState *clState = clStates[thr_id]; const cl_kernel *kernel = &clState->kernel; const int dynamic_us = opt_dynamic_interval * 1000; + cl_bool blocking; cl_int status; size_t globalThreads[1]; @@ -1355,14 +1498,20 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, unsigned int threads; int64_t hashes; + if (gpu->dynamic) + blocking = CL_TRUE; + else + blocking = CL_FALSE; + /* This finish flushes the readbuffer set with CL_FALSE later */ - clFinish(clState->commandQueue); - gettimeofday(&gpu->tv_gpuend, NULL); + if (!blocking) + clFinish(clState->commandQueue); if (gpu->dynamic) { struct timeval diff; suseconds_t gpu_us; + gettimeofday(&gpu->tv_gpuend, NULL); timersub(&gpu->tv_gpuend, &gpu->tv_gpustart, &diff); gpu_us = diff.tv_sec * 1000000 + diff.tv_usec; if (likely(gpu_us >= 0)) { @@ -1384,6 +1533,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, localThreads[0], gpu->intensity); if (hashes > gpu->max_hashes) gpu->max_hashes = hashes; + status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); @@ -1393,7 +1543,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, /* MAXBUFFERS entry is used as a flag to say nonces exist */ if (thrdata->res[FOUND]) { /* Clear the buffer again */ - status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, + status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0, BUFFERSIZE, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); @@ -1408,7 +1558,8 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, postcalc_hash_async(thr, work, thrdata->res); } memset(thrdata->res, 0, BUFFERSIZE); - clFinish(clState->commandQueue); + if 
(!blocking) + clFinish(clState->commandQueue); } gettimeofday(&gpu->tv_gpustart, NULL); @@ -1423,14 +1574,14 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { - applog(LOG_ERR, "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)"); + applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); return -1; } - status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, + status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0, BUFFERSIZE, thrdata->res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { - applog(LOG_ERR, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)"); + applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status); return -1; } diff --git a/driver-opencl.h b/driver-opencl.h index 600bd854..c1d61822 100644 --- a/driver-opencl.h +++ b/driver-opencl.h @@ -18,6 +18,11 @@ extern char *set_temp_target(char *arg); extern char *set_intensity(char *arg); extern char *set_vector(char *arg); extern char *set_worksize(char *arg); +#ifdef USE_SCRYPT +extern char *set_shaders(char *arg); +extern char *set_lookup_gap(char *arg); +extern char *set_thread_concurrency(char *arg); +#endif extern char *set_kernel(char *arg); void manage_gpu(void); extern void pause_dynamic_threads(int gpu); diff --git a/findnonce.c b/findnonce.c index 98d7f0e7..9980a704 100644 --- a/findnonce.c +++ b/findnonce.c @@ -17,6 +17,7 @@ #include #include "findnonce.h" +#include "scrypt.h" const uint32_t SHA256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, @@ -45,7 +46,8 @@ const uint32_t SHA256_K[64] = { d = d + h; \ h = h + (rotate(a, 30) ^ rotate(a, 19) ^ rotate(a, 10)) + ((a & b) | (c & (a | b))) -void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { +void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) +{ cl_uint A, B, C, D, E, F, G, H; A = state[0]; @@ -172,7 +174,7 @@ struct pc_data { pthread_t pth; }; -static void send_nonce(struct pc_data *pcd, cl_uint nonce) +static void send_sha_nonce(struct pc_data *pcd, cl_uint nonce) { dev_blk_ctx *blk = &pcd->work->blk; struct thr_info *thr = pcd->thr; @@ -219,6 +221,19 @@ static void send_nonce(struct pc_data *pcd, cl_uint nonce) } } +static void send_scrypt_nonce(struct pc_data *pcd, uint32_t nonce) +{ + struct thr_info *thr = pcd->thr; + struct work *work = pcd->work; + + if (scrypt_test(work->data, work->target, nonce)) + submit_nonce(thr, pcd->work, nonce); + else { + applog(LOG_INFO, "Scrypt error, review settings"); + thr->cgpu->hw_errors++; + } +} + static void *postcalc_hash(void *userdata) { struct pc_data *pcd = (struct pc_data *)userdata; @@ -228,9 +243,16 @@ static void *postcalc_hash(void *userdata) pthread_detach(pthread_self()); for (entry = 0; entry < FOUND; entry++) { - if (pcd->res[entry]) - send_nonce(pcd, pcd->res[entry]); - nonces++; + uint32_t nonce = pcd->res[entry]; + + if (nonce) { + applog(LOG_DEBUG, "OCL NONCE %u", nonce); + if (opt_scrypt) + send_scrypt_nonce(pcd, nonce); + else + send_sha_nonce(pcd, nonce); + nonces++; + } } free(pcd); diff --git a/findnonce.h b/findnonce.h index 5b93c15c..ce69569e 100644 --- a/findnonce.h +++ b/findnonce.h @@ -4,10 +4,10 @@ #include "config.h" #define MAXTHREADS (0xFFFFFFFEULL) -#define MAXBUFFERS (0xFF) +#define MAXBUFFERS 
(0xFFF) #define BUFFERSIZE (sizeof(uint32_t) * MAXBUFFERS) -#define FOUND (0x80) -/* #define NFLAG (0x7F) Just for reference */ +#define FOUND (0x800) +/* #define NFLAG (0x7FF) Just for reference */ #ifdef HAVE_OPENCL extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); diff --git a/fpgautils.c b/fpgautils.c index 0ebee7f6..a62b4913 100644 --- a/fpgautils.c +++ b/fpgautils.c @@ -38,7 +38,7 @@ #include "miner.h" #ifdef HAVE_LIBUDEV -char +int serial_autodetect_udev(detectone_func_t detectone, const char*prodname) { struct udev *udev = udev_new(); @@ -69,14 +69,14 @@ serial_autodetect_udev(detectone_func_t detectone, const char*prodname) return found; } #else -char +int serial_autodetect_udev(__maybe_unused detectone_func_t detectone, __maybe_unused const char*prodname) { return 0; } #endif -char +int serial_autodetect_devserial(detectone_func_t detectone, const char*prodname) { #ifndef WIN32 @@ -107,7 +107,7 @@ serial_autodetect_devserial(detectone_func_t detectone, const char*prodname) #endif } -char +int _serial_detect(const char*dname, detectone_func_t detectone, autoscan_func_t autoscan, bool forceauto) { struct string_elist *iter, *tmp; @@ -178,7 +178,8 @@ serial_open(const char*devpath, unsigned long baud, signed short timeout, bool p SetCommConfig(hSerial, &comCfg, sizeof(comCfg)); - const DWORD ctoms = (timeout == -1) ? 30000 : (timeout * 100); + // Code must specify a valid timeout value (0 means don't timeout) + const DWORD ctoms = (timeout * 100); COMMTIMEOUTS cto = {ctoms, 0, ctoms, 0, ctoms}; SetCommTimeouts(hSerial, &cto); @@ -210,6 +211,10 @@ serial_open(const char*devpath, unsigned long baud, signed short timeout, bool p switch (baud) { case 0: break; + case 57600: + cfsetispeed( &my_termios, B57600 ); + cfsetospeed( &my_termios, B57600 ); + break; case 115200: cfsetispeed( &my_termios, B115200 ); cfsetospeed( &my_termios, B115200 ); @@ -230,10 +235,9 @@ serial_open(const char*devpath, unsigned long baud, signed short timeout, bool p my_termios.c_oflag &= ~OPOST; my_termios.c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN); - if (timeout >= 0) { - my_termios.c_cc[VTIME] = (cc_t)timeout; - my_termios.c_cc[VMIN] = 0; - } + // Code must specify a valid timeout value (0 means don't timeout) + my_termios.c_cc[VTIME] = (cc_t)timeout; + my_termios.c_cc[VMIN] = 0; tcsetattr(fdDev, TCSANOW, &my_termios); if (purge) diff --git a/fpgautils.h b/fpgautils.h index c45183b7..5b743bc5 100644 --- a/fpgautils.h +++ b/fpgautils.h @@ -14,17 +14,17 @@ #include typedef bool(*detectone_func_t)(const char*); -typedef char(*autoscan_func_t)(); +typedef int(*autoscan_func_t)(); -extern char _serial_detect(const char*dname, detectone_func_t, autoscan_func_t, bool force_autoscan); +extern int _serial_detect(const char*dname, detectone_func_t, autoscan_func_t, bool force_autoscan); #define serial_detect_fauto(dname, detectone, autoscan) \ _serial_detect(dname, detectone, autoscan, true) #define serial_detect_auto(dname, detectone, autoscan) \ _serial_detect(dname, detectone, autoscan, false) #define serial_detect(dname, detectone) \ _serial_detect(dname, detectone, NULL, false) -extern char serial_autodetect_devserial(detectone_func_t, const char*prodname); -extern char serial_autodetect_udev (detectone_func_t, const char*prodname); +extern int serial_autodetect_devserial(detectone_func_t, const char*prodname); +extern int serial_autodetect_udev (detectone_func_t, const char*prodname); extern int serial_open(const char*devpath, unsigned long baud, signed short timeout, bool 
purge); extern ssize_t _serial_read(int fd, char *buf, size_t buflen, char*eol); diff --git a/logging.c b/logging.c index 31956637..47d1970d 100644 --- a/logging.c +++ b/logging.c @@ -18,12 +18,15 @@ bool opt_log_output = false; /* per default priorities higher than LOG_NOTICE are logged */ int opt_log_level = LOG_NOTICE; -static void my_log_curses(int prio, char *f, va_list ap) +static void my_log_curses(__maybe_unused int prio, char *f, va_list ap) { + if (opt_quiet && prio != LOG_ERR) + return; + #ifdef HAVE_CURSES extern bool use_curses; - if (use_curses) - log_curses(prio, f, ap); + if (use_curses && log_curses_only(prio, f, ap)) + ; else #endif { @@ -31,57 +34,20 @@ static void my_log_curses(int prio, char *f, va_list ap) strcpy(f + len - 1, " \n"); -#ifdef HAVE_CURSES - log_curses(prio, f, ap); -#else + mutex_lock(&console_lock); vprintf(f, ap); -#endif + mutex_unlock(&console_lock); } } +static void log_generic(int prio, const char *fmt, va_list ap); + void vapplog(int prio, const char *fmt, va_list ap) { if (!opt_debug && prio == LOG_DEBUG) return; - -#ifdef HAVE_SYSLOG_H - if (use_syslog) { - vsyslog(prio, fmt, ap); - } -#else - if (0) {} -#endif - else if (opt_log_output || prio <= LOG_NOTICE) { - char *f; - int len; - struct timeval tv = {0, 0}; - struct tm *tm; - - gettimeofday(&tv, NULL); - - tm = localtime(&tv.tv_sec); - - len = 40 + strlen(fmt) + 22; - f = alloca(len); - sprintf(f, " [%d-%02d-%02d %02d:%02d:%02d] %s\n", - tm->tm_year + 1900, - tm->tm_mon + 1, - tm->tm_mday, - tm->tm_hour, - tm->tm_min, - tm->tm_sec, - fmt); - /* Only output to stderr if it's not going to the screen as well */ - if (!isatty(fileno((FILE *)stderr))) { - va_list apc; - - va_copy(apc, ap); - vfprintf(stderr, f, apc); /* atomic write to stderr */ - fflush(stderr); - } - - my_log_curses(prio, f, ap); - } + if (use_syslog || opt_log_output || prio <= LOG_NOTICE) + log_generic(prio, fmt, ap); } void applog(int prio, const char *fmt, ...) @@ -100,7 +66,7 @@ void applog(int prio, const char *fmt, ...) 
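(For readers following the logging changes above: a minimal sketch of the control flow vapplog() ends up with after this refactor. It only restates the added lines in this hunk; opt_debug, use_syslog, opt_log_output and log_generic() are the symbols already shown in this diff, nothing new is introduced.)

	void vapplog(int prio, const char *fmt, va_list ap)
	{
		/* drop debug output unless --debug is enabled */
		if (!opt_debug && prio == LOG_DEBUG)
			return;
		/* one path for every sink: syslog, timestamped stderr, curses */
		if (use_syslog || opt_log_output || prio <= LOG_NOTICE)
			log_generic(prio, fmt, ap);
	}
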
* generic log function used by priority specific ones * equals vapplog() without additional priority checks */ -static void __maybe_unused log_generic(int prio, const char *fmt, va_list ap) +static void log_generic(int prio, const char *fmt, va_list ap) { #ifdef HAVE_SYSLOG_H if (use_syslog) { @@ -121,7 +87,7 @@ static void __maybe_unused log_generic(int prio, const char *fmt, va_list ap) len = 40 + strlen(fmt) + 22; f = alloca(len); - sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n", + sprintf(f, " [%d-%02d-%02d %02d:%02d:%02d] %s\n", tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, diff --git a/miner.h b/miner.h index 120d7633..5afa071b 100644 --- a/miner.h +++ b/miner.h @@ -262,6 +262,7 @@ enum cl_kernels { KL_PHATK, KL_DIAKGCN, KL_DIABLO, + KL_SCRYPT, }; enum dev_reason { @@ -335,7 +336,6 @@ struct cgpu_info { int accepted; int rejected; int hw_errors; - unsigned int low_count; double rolling; double total_mhashes; double utility; @@ -355,10 +355,17 @@ struct cgpu_info { int virtual_adl; int intensity; bool dynamic; + cl_uint vwidth; size_t work_size; enum cl_kernels kernel; + cl_ulong max_alloc; +#ifdef USE_SCRYPT + int opt_lg, lookup_gap; + int opt_tc, thread_concurrency; + int shaders; +#endif struct timeval tv_gpustart;; struct timeval tv_gpuend; double gpu_us_average; @@ -493,6 +500,11 @@ static inline void mutex_unlock(pthread_mutex_t *lock) quit(1, "WTF MUTEX ERROR ON UNLOCK!"); } +static inline int mutex_trylock(pthread_mutex_t *lock) +{ + return pthread_mutex_trylock(lock); +} + static inline void wr_lock(pthread_rwlock_t *lock) { if (unlikely(pthread_rwlock_wrlock(lock))) @@ -550,6 +562,7 @@ extern bool opt_api_listen; extern bool opt_api_network; extern bool opt_delaynet; extern bool opt_restart; +extern char *opt_icarus_options; extern char *opt_icarus_timing; #ifdef USE_BITFORCE extern bool opt_bfl_noncerange; @@ -576,6 +589,8 @@ extern bool fulltest(const unsigned char *hash, const unsigned char *target); extern int opt_scantime; +extern pthread_mutex_t console_lock; + extern pthread_mutex_t restart_lock; extern pthread_cond_t restart_cond; @@ -598,15 +613,20 @@ extern int set_memoryclock(int gpu, int iMemoryClock); extern void api(int thr_id); extern struct pool *current_pool(void); -extern int active_pools(void); +extern int enabled_pools; extern void add_pool_details(bool live, char *url, char *user, char *pass); #define MAX_GPUDEVICES 16 #define MIN_INTENSITY -10 #define _MIN_INTENSITY_STR "-10" +#ifdef USE_SCRYPT +#define MAX_INTENSITY 20 +#define _MAX_INTENSITY_STR "20" +#else #define MAX_INTENSITY 14 #define _MAX_INTENSITY_STR "14" +#endif extern struct list_head scan_devices; extern int nDevs; @@ -614,9 +634,15 @@ extern int opt_n_threads; extern int num_processors; extern int hw_errors; extern bool use_syslog; +extern bool opt_quiet; extern struct thr_info *thr_info; extern struct cgpu_info gpus[MAX_GPUDEVICES]; extern int gpu_threads; +#ifdef USE_SCRYPT +extern bool opt_scrypt; +#else +#define opt_scrypt (0) +#endif extern double total_secs; extern int mining_threads; extern struct cgpu_info *cpus; @@ -664,6 +690,9 @@ typedef struct { cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17; cl_uint zeroA, zeroB; cl_uint oneA, twoA, threeA, fourA, fiveA, sixA, sevenA; +#ifdef USE_SCRYPT + struct work *work; +#endif } dev_blk_ctx; #else typedef struct { @@ -677,9 +706,11 @@ struct curl_ent { struct timeval tv; }; +/* Disabled needs to be the lowest enum as a freshly calloced value will then + * equal disabled */ enum pool_enable { - POOL_ENABLED, POOL_DISABLED, + 
POOL_ENABLED, POOL_REJECTING, }; @@ -689,6 +720,8 @@ struct pool { int accepted, rejected; int seq_rejects; int solved; + int queued; + int staged; bool submit_fail; bool idle; @@ -796,7 +829,7 @@ extern void switch_pools(struct pool *selected); extern void remove_pool(struct pool *pool); extern void write_config(FILE *fcfg); extern void default_save_file(char *filename); -extern void log_curses(int prio, const char *f, va_list ap); +extern bool log_curses_only(int prio, const char *f, va_list ap); extern void clear_logwin(void); extern bool pool_tclear(struct pool *pool, bool *var); extern struct thread_q *tq_new(void); diff --git a/miner.php b/miner.php index 6f750be9..f83a302c 100644 --- a/miner.php +++ b/miner.php @@ -1,8 +1,9 @@ = SND +# Feel free to increase SND if your network is very slow +# or decrease RCV if that happens often to you # Also, on some windows PHP, apparently the $usec is ignored -$socktimeoutsec = 10; +$socksndtimeoutsec = 10; +$sockrcvtimeoutsec = 40; # # List of fields NOT to be displayed # You can use this to hide data you don't want to see or don't want @@ -82,11 +92,13 @@ $mobilepage = array( 'DATE' => null, 'RIGS' => null, 'SUMMARY' => array('Elapsed', 'MHS av', 'Found Blocks=Blks', 'Accepted', 'Rejected=Rej', 'Utility'), - 'DEVS' => array('ID', 'Name', 'GPU', 'Status', 'MHS av', 'Accepted', 'Rejected=Rej', 'Utility'), + 'DEVS+NOTIFY' => array('DEVS.Name=Name', 'DEVS.ID=ID', 'DEVS.Status=Status', 'DEVS.Temperature=Temp', + 'DEVS.MHS av=MHS av', 'DEVS.Accepted=Accept', 'DEVS.Rejected=Rej', + 'DEVS.Utility=Utility', 'NOTIFY.Last Not Well=Not Well'), 'POOL' => array('POOL', 'Status', 'Accepted', 'Rejected=Rej', 'Last Share Time')); $mobilesum = array( 'SUMMARY' => array('MHS av', 'Found Blocks', 'Accepted', 'Rejected', 'Utility'), - 'DEVS' => array('MHS av', 'Accepted', 'Rejected', 'Utility'), + 'DEVS+NOTIFY' => array('DEVS.MHS av', 'DEVS.Accepted', 'DEVS.Rejected', 'DEVS.Utility'), 'POOL' => array('Accepted', 'Rejected')); # # customsummarypages is an array of these Custom Summary Pages @@ -190,7 +202,7 @@ function getdom($domname) function htmlhead($checkapi, $rig, $pg = null) { global $miner_font_family, $miner_font_size; - global $error, $readonly, $here; + global $error, $readonly, $poolinputs, $here; global $ignorerefresh, $autorefresh; $extraparams = ''; @@ -245,6 +257,8 @@ if ($ignorerefresh == false) echo "function prc(a,m){pr('&arg='+a,m)} function prs(a,r){var c=a.substr(3);var z=c.split('|',2);var m=z[0].substr(0,1).toUpperCase()+z[0].substr(1)+' GPU '+z[1];prc(a+'&rig='+r,m)} function prs2(a,n,r){var v=document.getElementById('gi'+n).value;var c=a.substr(3);var z=c.split('|',2);var m='Set GPU '+z[1]+' '+z[0].substr(0,1).toUpperCase()+z[0].substr(1)+' to '+v;prc(a+','+v+'&rig='+r,m)}\n"; + if ($poolinputs === true) + echo "function cbs(s){var t=s.replace(/\\\\/g,'\\\\\\\\'); return t.replace(/,/g, '\\\\,')}\nfunction pla(r){var u=document.getElementById('purl').value;var w=document.getElementById('pwork').value;var p=document.getElementById('ppass').value;pr('&rig='+r+'&arg=addpool|'+cbs(u)+','+cbs(w)+','+cbs(p),'Add Pool '+u)}\nfunction psp(r){var p=document.getElementById('prio').value;pr('&rig='+r+'&arg=poolpriority|'+p,'Set Pool Priorities to '+p)}\n"; } ?> @@ -260,7 +274,7 @@ $error = null; # function getsock($addr, $port) { - global $haderror, $error, $socktimeoutsec; + global $haderror, $error, $socksndtimeoutsec, $sockrcvtimeoutsec; $error = null; $socket = null; @@ -277,7 +291,8 @@ function getsock($addr, $port) // Ignore if this fails since 
the socket connect may work anyway // and nothing is gained by aborting if the option cannot be set // since we don't know in advance if it can connect - socket_set_option($socket, SOL_SOCKET, SO_SNDTIMEO, array('sec' => $socktimeoutsec, 'usec' => 0)); + socket_set_option($socket, SOL_SOCKET, SO_SNDTIMEO, array('sec' => $socksndtimeoutsec, 'usec' => 0)); + socket_set_option($socket, SOL_SOCKET, SO_RCVTIMEO, array('sec' => $sockrcvtimeoutsec, 'usec' => 0)); $res = socket_connect($socket, $addr, $port); if ($res === false) @@ -307,6 +322,46 @@ function readsockline($socket) return $line; } # +function api_convert_escape($str) +{ + $res = ''; + $len = strlen($str); + for ($i = 0; $i < $len; $i++) + { + $ch = substr($str, $i, 1); + if ($ch != '\\' || $i == ($len-1)) + $res .= $ch; + else + { + $i++; + $ch = substr($str, $i, 1); + switch ($ch) + { + case '|': + $res .= "\1"; + break; + case '\\': + $res .= "\2"; + break; + case '=': + $res .= "\3"; + break; + case ',': + $res .= "\4"; + break; + default: + $res .= $ch; + } + } + } + return $res; +} +# +function revert($str) +{ + return str_replace(array("\1", "\2", "\3", "\4"), array("|", "\\", "=", ","), $str); +} +# function api($cmd) { global $haderror, $error; @@ -328,6 +383,8 @@ function api($cmd) # print "$cmd returned '$line'\n"; + $line = api_convert_escape($line); + $data = array(); $objs = explode('|', $line); @@ -365,7 +422,7 @@ function api($cmd) continue; if (count($id) == 2) - $data[$name][$id[0]] = $id[1]; + $data[$name][$id[0]] = revert($id[1]); else $data[$name][$counter] = $id[0]; @@ -440,6 +497,9 @@ function classlastshare($when, $alldata, $warnclass, $errorclass) if (!isset($alldata['MHS av'])) return ''; + if ($alldata['MHS av'] == 0) + return ''; + if (!isset($alldata['Last Share Time'])) return ''; @@ -476,6 +536,10 @@ function fmt($section, $name, $value, $when, $alldata) $ret = $value; $class = ''; + $nams = explode('.', $name); + if (count($nams) > 1) + $name = $nams[count($nams)-1]; + if ($value === null) $ret = $b; else @@ -483,6 +547,7 @@ function fmt($section, $name, $value, $when, $alldata) { case 'GPU.Last Share Time': case 'PGA.Last Share Time': + case 'DEVS.Last Share Time': if ($value == 0 || (isset($alldata['Last Share Pool']) && $alldata['Last Share Pool'] == -1)) { @@ -503,6 +568,7 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.Last Share Pool': case 'PGA.Last Share Pool': + case 'DEVS.Last Share Pool': if ($value == -1) { $ret = 'None'; @@ -565,6 +631,7 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.Utility': case 'PGA.Utility': + case 'DEVS.Utility': case 'SUMMARY.Utility': case 'total.Utility': $ret = $value.'/m'; @@ -585,18 +652,24 @@ function fmt($section, $name, $value, $when, $alldata) } break; case 'PGA.Temperature': - $ret = $value.'°C'; - break; case 'GPU.Temperature': + case 'DEVS.Temperature': $ret = $value.'°C'; + if (!isset($alldata['GPU'])) + break; case 'GPU.GPU Clock': + case 'DEVS.GPU Clock': case 'GPU.Memory Clock': + case 'DEVS.Memory Clock': case 'GPU.GPU Voltage': + case 'DEVS.GPU Voltage': case 'GPU.GPU Activity': + case 'DEVS.GPU Activity': if ($value == 0) $class = $warnclass; break; case 'GPU.Fan Percent': + case 'DEVS.Fan Percent': if ($value == 0) $class = $warnclass; else @@ -609,6 +682,7 @@ function fmt($section, $name, $value, $when, $alldata) } break; case 'GPU.Fan Speed': + case 'DEVS.Fan Speed': if ($value == 0) $class = $warnclass; else @@ -624,6 +698,7 @@ function fmt($section, $name, $value, $when, $alldata) break; 
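# Note: the DEVS.* cases added throughout this switch simply reuse the existing
# GPU.*/PGA.* formatting rules, so values returned by the combined DEVS
# sections (and the new DEVS+NOTIFY joined pages) render identically.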
case 'GPU.MHS av': case 'PGA.MHS av': + case 'DEVS.MHS av': case 'SUMMARY.MHS av': case 'total.MHS av': $parts = explode('.', $value, 2); @@ -650,6 +725,7 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.Total MH': case 'PGA.Total MH': + case 'DEVS.Total MH': case 'SUMMARY.Total MH': case 'total.Total MH': case 'SUMMARY.Getworks': @@ -657,11 +733,13 @@ function fmt($section, $name, $value, $when, $alldata) case 'total.Getworks': case 'GPU.Accepted': case 'PGA.Accepted': + case 'DEVS.Accepted': case 'SUMMARY.Accepted': case 'POOL.Accepted': case 'total.Accepted': case 'GPU.Rejected': case 'PGA.Rejected': + case 'DEVS.Rejected': case 'SUMMARY.Rejected': case 'POOL.Rejected': case 'total.Rejected': @@ -679,12 +757,14 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.Status': case 'PGA.Status': + case 'DEVS.Status': case 'POOL.Status': if ($value != 'Alive') $class = $errorclass; break; case 'GPU.Enabled': case 'PGA.Enabled': + case 'DEVS.Enabled': if ($value != 'Y') $class = $warnclass; break; @@ -710,13 +790,17 @@ function fmt($section, $name, $value, $when, $alldata) if ($class == '' && ($rownum % 2) == 0) $class = $c2class; + if ($ret == '') + $ret = $b; + return array($ret, $class); } # global $poolcmd; $poolcmd = array( 'Switch to' => 'switchpool', 'Enable' => 'enablepool', - 'Disable' => 'disablepool' ); + 'Disable' => 'disablepool', + 'Remove' => 'removepool' ); # function showhead($cmd, $values, $justnames = false) { @@ -927,6 +1011,43 @@ function processgpus($rig) } } # +function showpoolinputs($rig, $ans) +{ + global $readonly, $poolinputs; + + if ($readonly === true || $poolinputs === false) + return; + + newtable(); + newrow(); + + $inps = array('Pool URL' => array('purl', 20), + 'Worker Name' => array('pwork', 10), + 'Worker Password' => array('ppass', 10)); + $b = ' '; + + echo " Add a pool: "; + + foreach ($inps as $text => $name) + echo "$text: "; + + echo ""; + + endrow(); + + if (count($ans) > 1) + { + newrow(); + + echo ' Set pool priorities: '; + echo " Comma list of pool numbers: "; + echo ""; + + endrow(); + } + endtable(); +} +# function process($cmds, $rig) { global $error, $devs; @@ -946,12 +1067,15 @@ function process($cmds, $rig) { details($cmd, $process, $rig); + if ($cmd == 'devs') + $devs = $process; + + if ($cmd == 'pools') + showpoolinputs($rig, $process); + # Not after the last one if (--$count > 0) otherrow('

'); - - if ($cmd == 'devs') - $devs = $process; } } } @@ -1268,8 +1392,177 @@ $sectionmap = array( 'GPU' => 'devs', // You would normally use DEVS 'PGA' => 'devs', // You would normally use DEVS 'NOTIFY' => 'notify', + 'DEVDETAILS' => 'devdetails', + 'STATS' => 'stats', 'CONFIG' => 'config'); # +function joinfields($section1, $section2, $join, $results) +{ + global $sectionmap; + + $name1 = $sectionmap[$section1]; + $name2 = $sectionmap[$section2]; + $newres = array(); + + // foreach rig in section1 + foreach ($results[$name1] as $rig => $result) + { + $status = null; + + // foreach answer section in the rig api call + foreach ($result as $name1b => $fields1b) + { + if ($name1b == 'STATUS') + { + // remember the STATUS from section1 + $status = $result[$name1b]; + continue; + } + + // foreach answer section in the rig api call (for the other api command) + foreach ($results[$name2][$rig] as $name2b => $fields2b) + { + if ($name2b == 'STATUS') + continue; + + // If match the same field values of fields in $join + $match = true; + foreach ($join as $field) + if ($fields1b[$field] != $fields2b[$field]) + { + $match = false; + break; + } + + if ($match === true) + { + if ($status != null) + { + $newres[$rig]['STATUS'] = $status; + $status = null; + } + + $subsection = $section1.'+'.$section2; + $subsection .= preg_replace('/[^0-9]/', '', $name1b.$name2b); + + foreach ($fields1b as $nam => $val) + $newres[$rig][$subsection]["$section1.$nam"] = $val; + foreach ($fields2b as $nam => $val) + $newres[$rig][$subsection]["$section2.$nam"] = $val; + } + } + } + } + return $newres; +} +# +function joinall($section1, $section2, $results) +{ + global $sectionmap; + + $name1 = $sectionmap[$section1]; + $name2 = $sectionmap[$section2]; + $newres = array(); + + // foreach rig in section1 + foreach ($results[$name1] as $rig => $result) + { + // foreach answer section in the rig api call + foreach ($result as $name1b => $fields1b) + { + if ($name1b == 'STATUS') + { + // copy the STATUS from section1 + $newres[$rig][$name1b] = $result[$name1b]; + continue; + } + + // foreach answer section in the rig api call (for the other api command) + foreach ($results[$name2][$rig] as $name2b => $fields2b) + { + if ($name2b == 'STATUS') + continue; + + $subsection = $section1.'+'.$section2; + $subsection .= preg_replace('/[^0-9]/', '', $name1b.$name2b); + + foreach ($fields1b as $nam => $val) + $newres[$rig][$subsection]["$section1.$nam"] = $val; + foreach ($fields2b as $nam => $val) + $newres[$rig][$subsection]["$section2.$nam"] = $val; + } + } + } + return $newres; +} +# +function joinsections($sections, $results, $errors) +{ + global $sectionmap; + +#echo "results['pools']=".print_r($results['pools'],true)."
"; + + // GPU's don't have Name,ID fields - so create them + foreach ($results as $section => $res) + foreach ($res as $rig => $result) + foreach ($result as $name => $fields) + { + $subname = preg_replace('/[0-9]/', '', $name); + if ($subname == 'GPU' and isset($result[$name]['GPU'])) + { + $results[$section][$rig][$name]['Name'] = 'GPU'; + $results[$section][$rig][$name]['ID'] = $result[$name]['GPU']; + } + } + + foreach ($sections as $section => $fields) + if ($section != 'DATE' && !isset($sectionmap[$section])) + { + $both = explode('+', $section, 2); + if (count($both) > 1) + { + switch($both[0]) + { + case 'SUMMARY': + switch($both[1]) + { + case 'POOL': + case 'DEVS': + case 'CONFIG': + $sectionmap[$section] = $section; + $results[$section] = joinall($both[0], $both[1], $results); + break; + default: + $errors[] = "Error: Invalid section '$section'"; + break; + } + break; + case 'DEVS': + $join = array('Name', 'ID'); + switch($both[1]) + { + case 'NOTIFY': + case 'DEVDETAILS': + $sectionmap[$section] = $section; + $results[$section] = joinfields($both[0], $both[1], $join, $results); + break; + default: + $errors[] = "Error: Invalid section '$section'"; + break; + } + break; + default: + $errors[] = "Error: Invalid section '$section'"; + break; + } + } + else + $errors[] = "Error: Invalid section '$section'"; + } + + return array($results, $errors); +} +# function secmatch($section, $field) { if ($section == $field) @@ -1329,7 +1622,14 @@ function customset($showfields, $sum, $section, $rig, $isbutton, $result, $total $value = null; } - list($showvalue, $class) = fmt($secname, $name, $value, $when, $row); + if (strpos($secname, '+') === false) + list($showvalue, $class) = fmt($secname, $name, $value, $when, $row); + else + { + $parts = explode('.', $name, 2); + list($showvalue, $class) = fmt($parts[0], $parts[1], $value, $when, $row); + } + echo "$showvalue"; } endrow(); @@ -1350,15 +1650,19 @@ function processcustompage($pagename, $sections, $sum, $namemap) $errors = array(); foreach ($sections as $section => $fields) { - if (isset($sectionmap[$section])) + $all = explode('+', $section); + foreach ($all as $section) { - $cmd = $sectionmap[$section]; - if (!isset($cmds[$cmd])) - $cmds[$cmd] = 1; + if (isset($sectionmap[$section])) + { + $cmd = $sectionmap[$section]; + if (!isset($cmds[$cmd])) + $cmds[$cmd] = 1; + } + else + if ($section != 'DATE') + $errors[] = "Error: unknown section '$section' in custom summary page '$pagename'"; } - else - if ($section != 'DATE') - $errors[] = "Error: unknown section '$section' in custom summary page '$pagename'"; } $results = array(); @@ -1393,6 +1697,7 @@ function processcustompage($pagename, $sections, $sum, $namemap) $shownsomething = false; if (count($results) > 0) { + list($results, $errors) = joinsections($sections, $results, $errors); $first = true; foreach ($sections as $section => $fields) { @@ -1586,7 +1891,8 @@ function display() $miner = $parts[0]; $port = $parts[1]; - $preprocess = $arg; + if ($readonly !== true) + $preprocess = $arg; } } } diff --git a/mkinstalldirs b/mkinstalldirs index 4191a45d..55d537f8 100755 --- a/mkinstalldirs +++ b/mkinstalldirs @@ -81,9 +81,9 @@ case $dirmode in echo "mkdir -p -- $*" exec mkdir -p -- "$@" else - # On NextStep and OpenStep, the `mkdir' command does not + # On NextStep and OpenStep, the 'mkdir' command does not # recognize any option. It will interpret all options as - # directories to create, and then abort because `.' already + # directories to create, and then abort because '.' 
already # exists. test -d ./-p && rmdir ./-p test -d ./--version && rmdir ./--version diff --git a/ocl.c b/ocl.c index 464cb4e1..9e9ef022 100644 --- a/ocl.c +++ b/ocl.c @@ -33,7 +33,7 @@ #include "findnonce.h" #include "ocl.h" -int opt_platform_id; +int opt_platform_id = -1; char *file_contents(const char *filename, int *length) { @@ -80,7 +80,7 @@ int clDevicesNum(void) { cl_uint numPlatforms; cl_platform_id *platforms; cl_platform_id platform = NULL; - unsigned int most_devices = 0, i; + unsigned int most_devices = 0, i, mdplatform = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); /* If this fails, assume no GPUs. */ @@ -118,11 +118,15 @@ int clDevicesNum(void) { status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); + if (i < numPlatforms - 1) + continue; return -1; } applog(LOG_INFO, "Platform %d devices: %d", i, numDevices); - if (numDevices > most_devices) + if (numDevices > most_devices) { most_devices = numDevices; + mdplatform = i; + } if (numDevices) { unsigned int j; char pbuff[256]; @@ -137,6 +141,9 @@ int clDevicesNum(void) { } } + if (opt_platform_id < 0) + opt_platform_id = mdplatform;; + return most_devices; } @@ -201,6 +208,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) { _clState *clState = calloc(1, sizeof(_clState)); bool patchbfi = false, prog_built = false; + struct cgpu_info *cgpu = &gpus[gpu]; cl_platform_id platform = NULL; char pbuff[256], vbuff[255]; cl_platform_id* platforms; @@ -302,6 +310,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) return NULL; } + ///////////////////////////////////////////////////////////////// + // Create an OpenCL command queue + ///////////////////////////////////////////////////////////////// + clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); + if (status != CL_SUCCESS) /* Try again without OOE enable */ + clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status); + return NULL; + } + /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = malloc(1024); @@ -344,18 +364,30 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) } applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); + status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status); + return NULL; + } + applog(LOG_DEBUG, "Max mem alloc size is %u", cgpu->max_alloc); + /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would * have otherwise created. 
The filename is: * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin + * For scrypt the filename is: + * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin */ char binaryfilename[255]; char filename[255]; char numbuf[10]; - if (gpus[gpu].kernel == KL_NONE) { - /* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */ - if (!strstr(name, "Tahiti") && + if (cgpu->kernel == KL_NONE) { + if (opt_scrypt) { + applog(LOG_INFO, "Selecting scrypt kernel"); + clState->chosen_kernel = KL_SCRYPT; + } else if (!strstr(name, "Tahiti") && + /* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */ (strstr(vbuff, "844.4") || // Linux 64 bit ATI 2.6 SDK strstr(vbuff, "851.4") || // Windows 64 bit "" strstr(vbuff, "831.4") || @@ -372,9 +404,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) applog(LOG_INFO, "Selecting phatk kernel"); clState->chosen_kernel = KL_PHATK; } - gpus[gpu].kernel = clState->chosen_kernel; + cgpu->kernel = clState->chosen_kernel; } else { - clState->chosen_kernel = gpus[gpu].kernel; + clState->chosen_kernel = cgpu->kernel; if (clState->chosen_kernel == KL_PHATK && (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") || strstr(vbuff, "831.4") || strstr(vbuff, "898.1") || @@ -407,6 +439,12 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) strcpy(filename, DIAKGCN_KERNNAME".cl"); strcpy(binaryfilename, DIAKGCN_KERNNAME); break; + case KL_SCRYPT: + strcpy(filename, SCRYPT_KERNNAME".cl"); + strcpy(binaryfilename, SCRYPT_KERNNAME); + /* Scrypt only supports vector 1 */ + cgpu->vwidth = 1; + break; case KL_NONE: /* Shouldn't happen */ case KL_DIABLO: strcpy(filename, DIABLO_KERNNAME".cl"); @@ -414,24 +452,64 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) break; } - if (gpus[gpu].vwidth) - clState->vwidth = gpus[gpu].vwidth; + if (cgpu->vwidth) + clState->vwidth = cgpu->vwidth; else { clState->vwidth = preferred_vwidth; - gpus[gpu].vwidth = preferred_vwidth; + cgpu->vwidth = preferred_vwidth; } - if ((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) && - clState->vwidth == 1 && clState->hasOpenCL11plus) + if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) && + clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt) clState->goffset = true; - if (gpus[gpu].work_size && gpus[gpu].work_size <= clState->max_work_size) - clState->wsize = gpus[gpu].work_size; + if (cgpu->work_size && cgpu->work_size <= clState->max_work_size) + clState->wsize = cgpu->work_size; else if (strstr(name, "Tahiti")) clState->wsize = 64; else clState->wsize = (clState->max_work_size <= 256 ? 
clState->max_work_size : 256) / clState->vwidth; - gpus[gpu].work_size = clState->wsize; + cgpu->work_size = clState->wsize; + +#ifdef USE_SCRYPT + if (opt_scrypt) { + cl_ulong ma = cgpu->max_alloc, mt; + int pow2 = 0; + + if (!cgpu->opt_lg) { + applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); + cgpu->lookup_gap = 2; + } else + cgpu->lookup_gap = cgpu->opt_lg; + + if (!cgpu->opt_tc) { + cgpu->thread_concurrency = ma / 32768 / cgpu->lookup_gap; + if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { + cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; + if (cgpu->thread_concurrency > cgpu->shaders * 5) + cgpu->thread_concurrency = cgpu->shaders * 5; + } + + applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, cgpu->thread_concurrency); + } else + cgpu->thread_concurrency = cgpu->opt_tc; + + /* If we have memory to spare, try to find a power of 2 value + * >= required amount to map nicely to an intensity */ + mt = cgpu->thread_concurrency * 32768 * cgpu->lookup_gap; + if (ma > mt) { + while (ma >>= 1) + pow2++; + ma = 1; + while (--pow2 && ma < mt) + ma <<= 1; + if (ma >= mt) { + cgpu->max_alloc = ma; + applog(LOG_DEBUG, "Max alloc decreased to %lu", cgpu->max_alloc); + } + } + } +#endif FILE *binaryfile; size_t *binary_sizes; @@ -460,14 +538,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) strcat(binaryfilename, name); if (clState->goffset) strcat(binaryfilename, "g"); - strcat(binaryfilename, "v"); - sprintf(numbuf, "%d", clState->vwidth); - strcat(binaryfilename, numbuf); - strcat(binaryfilename, "w"); - sprintf(numbuf, "%d", (int)clState->wsize); + if (opt_scrypt) { +#ifdef USE_SCRYPT + sprintf(numbuf, "lg%dtc%d", cgpu->lookup_gap, cgpu->thread_concurrency); + strcat(binaryfilename, numbuf); +#endif + } else { + sprintf(numbuf, "v%d", clState->vwidth); + strcat(binaryfilename, numbuf); + } + sprintf(numbuf, "w%d", (int)clState->wsize); strcat(binaryfilename, numbuf); - strcat(binaryfilename, "l"); - sprintf(numbuf, "%d", (int)sizeof(long)); + sprintf(numbuf, "l%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); @@ -528,8 +610,16 @@ build: /* create a cl program executable for all the devices specified */ char *CompilerOptions = calloc(1, 256); - sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", - (int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth); +#ifdef USE_SCRYPT + if (opt_scrypt) + sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d", + cgpu->lookup_gap, cgpu->thread_concurrency, (int)clState->wsize); + else +#endif + { + sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", + (int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth); + } applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize); if (clState->vwidth > 1) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth); @@ -708,18 +798,33 @@ built: return NULL; } - ///////////////////////////////////////////////////////////////// - // Create an OpenCL command queue - ///////////////////////////////////////////////////////////////// - clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); - if (status != CL_SUCCESS) /* Try again without OOE enable */ - clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); - if (status != CL_SUCCESS) { - applog(LOG_ERR, "Error %d: Creating 
Command Queue. (clCreateCommandQueue)", status); - return NULL; - } +#ifdef USE_SCRYPT + if (opt_scrypt) { + size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0)); + size_t bufsize = 128 * ipt * cgpu->thread_concurrency; + + /* Use the max alloc value which has been rounded to a power of + * 2 greater >= required amount earlier */ + if (bufsize > cgpu->max_alloc) { + applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u, your scrypt settings come to %u", + gpu, cgpu->max_alloc, bufsize); + } else + bufsize = cgpu->max_alloc; + applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize); + clState->padbufsize = bufsize; + clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease CT or increase LG", status); + return NULL; + } + clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status); + return NULL; + } + } +#endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status); diff --git a/ocl.h b/ocl.h index 2f2f2c24..984e7d62 100644 --- a/ocl.h +++ b/ocl.h @@ -19,6 +19,12 @@ typedef struct { cl_command_queue commandQueue; cl_program program; cl_mem outputBuffer; +#ifdef USE_SCRYPT + cl_mem CLbuffer0; + cl_mem padbuffer8; + size_t padbufsize; + void * cldata; +#endif bool hasBitAlign; bool hasOpenCL11plus; bool goffset; diff --git a/phatk120223.cl b/phatk120724.cl similarity index 99% rename from phatk120223.cl rename to phatk120724.cl index 7d1c3200..0f604436 100644 --- a/phatk120223.cl +++ b/phatk120724.cl @@ -387,8 +387,8 @@ void search( const uint state0, const uint state1, const uint state2, const uint W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) - (-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64))); -#define FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) #ifdef VECTORS4 bool result = W[117].x & W[117].y & W[117].z & W[117].w; diff --git a/poclbm120327.cl b/poclbm120724.cl similarity index 99% rename from poclbm120327.cl rename to poclbm120724.cl index 72491a26..3e8b9943 100644 --- a/poclbm120327.cl +++ b/poclbm120724.cl @@ -1311,8 +1311,8 @@ Vals[1]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); Vals[1]+=K[59]; Vals[1]+=Vals[5]; -#define FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) #if defined(VECTORS2) || defined(VECTORS4) Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); diff --git a/scrypt.c b/scrypt.c new file mode 100644 index 00000000..70c3fd3e --- /dev/null +++ b/scrypt.c @@ -0,0 +1,466 @@ +/*- + * Copyright 2009 Colin Percival, 2011 ArtForz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include "config.h" +#include "miner.h" + +#include +#include +#include + +#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +typedef struct SHA256Context { + uint32_t state[8]; + uint32_t buf[16]; +} SHA256_CTX; + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = byteswap(src[i]); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + if(swap) + for (i = 0; i < 16; i++) + W[i] = byteswap(block[i]); + else + memcpy(W, block, 64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0, 0x428a2f98); + RNDr(S, W, 1, 0x71374491); + RNDr(S, W, 2, 0xb5c0fbcf); + RNDr(S, W, 3, 0xe9b5dba5); + RNDr(S, W, 4, 0x3956c25b); + RNDr(S, W, 5, 0x59f111f1); + RNDr(S, W, 6, 0x923f82a4); + RNDr(S, W, 7, 0xab1c5ed5); + RNDr(S, W, 8, 0xd807aa98); + RNDr(S, W, 9, 0x12835b01); + RNDr(S, W, 10, 0x243185be); + RNDr(S, W, 11, 0x550c7dc3); + RNDr(S, W, 12, 0x72be5d74); + RNDr(S, W, 13, 0x80deb1fe); + RNDr(S, W, 14, 0x9bdc06a7); + RNDr(S, W, 15, 0xc19bf174); + RNDr(S, W, 16, 0xe49b69c1); + RNDr(S, W, 17, 0xefbe4786); + RNDr(S, W, 18, 0x0fc19dc6); + RNDr(S, W, 19, 0x240ca1cc); + RNDr(S, W, 20, 0x2de92c6f); + RNDr(S, W, 21, 0x4a7484aa); + RNDr(S, W, 22, 0x5cb0a9dc); + RNDr(S, W, 23, 0x76f988da); + RNDr(S, W, 24, 0x983e5152); + RNDr(S, W, 25, 0xa831c66d); + RNDr(S, W, 26, 0xb00327c8); + RNDr(S, W, 27, 0xbf597fc7); + RNDr(S, W, 28, 0xc6e00bf3); + RNDr(S, W, 29, 0xd5a79147); + RNDr(S, W, 30, 0x06ca6351); + RNDr(S, W, 31, 0x14292967); + RNDr(S, W, 32, 0x27b70a85); + RNDr(S, W, 33, 0x2e1b2138); + RNDr(S, W, 34, 0x4d2c6dfc); + RNDr(S, W, 35, 0x53380d13); + RNDr(S, W, 36, 0x650a7354); + RNDr(S, W, 37, 0x766a0abb); + RNDr(S, W, 38, 0x81c2c92e); + RNDr(S, W, 39, 0x92722c85); + RNDr(S, W, 40, 0xa2bfe8a1); + RNDr(S, W, 41, 0xa81a664b); + RNDr(S, W, 42, 0xc24b8b70); + RNDr(S, W, 43, 0xc76c51a3); + RNDr(S, W, 44, 0xd192e819); + RNDr(S, W, 45, 0xd6990624); + RNDr(S, W, 46, 0xf40e3585); + RNDr(S, W, 47, 0x106aa070); + RNDr(S, W, 48, 0x19a4c116); + RNDr(S, W, 49, 0x1e376c08); + RNDr(S, W, 50, 0x2748774c); + RNDr(S, W, 51, 0x34b0bcb5); + RNDr(S, W, 52, 0x391c0cb3); + RNDr(S, W, 53, 0x4ed8aa4a); + RNDr(S, W, 54, 0x5b9cca4f); + RNDr(S, W, 55, 0x682e6ff3); + RNDr(S, W, 56, 0x748f82ee); + RNDr(S, W, 57, 0x78a5636f); + RNDr(S, W, 58, 0x84c87814); + RNDr(S, W, 59, 0x8cc70208); + RNDr(S, W, 60, 0x90befffa); + RNDr(S, W, 61, 0xa4506ceb); + RNDr(S, W, 62, 0xbef9a3f7); + RNDr(S, W, 63, 0xc67178f2); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static inline void +SHA256_InitState(uint32_t * state) +{ + /* Magic initialization constants */ + state[0] = 0x6A09E667; + state[1] = 0xBB67AE85; + state[2] = 0x3C6EF372; + state[3] = 0xA54FF53A; + state[4] = 0x510E527F; + state[5] = 0x9B05688C; + state[6] = 0x1F83D9AB; + state[7] = 0x5BE0CD19; +} + +static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000}; +static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300}; + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +static inline void +PBKDF2_SHA256_80_128(const uint32_t * passwd, uint32_t * buf) +{ + SHA256_CTX PShictx, PShoctx; + uint32_t tstate[8]; + uint32_t ihash[8]; + uint32_t i; + uint32_t pad[16]; + + static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000}; + + /* If Klen > 64, the key is really SHA256(K). 
*/ + SHA256_InitState(tstate); + SHA256_Transform(tstate, passwd, 1); + memcpy(pad, passwd+16, 16); + memcpy(pad+4, passwdpad, 48); + SHA256_Transform(tstate, pad, 1); + memcpy(ihash, tstate, 32); + + SHA256_InitState(PShictx.state); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + SHA256_Transform(PShictx.state, pad, 0); + SHA256_Transform(PShictx.state, passwd, 1); + be32enc_vect(PShictx.buf, passwd+16, 4); + be32enc_vect(PShictx.buf+5, innerpad, 11); + + SHA256_InitState(PShoctx.state); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + SHA256_Transform(PShoctx.state, pad, 0); + memcpy(PShoctx.buf+8, outerpad, 32); + + /* Iterate through the blocks. */ + for (i = 0; i < 4; i++) { + uint32_t istate[8]; + uint32_t ostate[8]; + + memcpy(istate, PShictx.state, 32); + PShictx.buf[4] = i + 1; + SHA256_Transform(istate, PShictx.buf, 0); + memcpy(PShoctx.buf, istate, 32); + + memcpy(ostate, PShoctx.state, 32); + SHA256_Transform(ostate, PShoctx.buf, 0); + be32enc_vect(buf+i*8, ostate, 8); + } +} + + +static inline uint32_t +PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt) +{ + uint32_t tstate[8]; + uint32_t ostate[8]; + uint32_t ihash[8]; + uint32_t i; + + /* Compute HMAC state after processing P and S. */ + uint32_t pad[16]; + + static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620}; + + /* If Klen > 64, the key is really SHA256(K). */ + SHA256_InitState(tstate); + SHA256_Transform(tstate, passwd, 1); + memcpy(pad, passwd+16, 16); + memcpy(pad+4, passwdpad, 48); + SHA256_Transform(tstate, pad, 1); + memcpy(ihash, tstate, 32); + + SHA256_InitState(ostate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + SHA256_Transform(ostate, pad, 0); + + SHA256_InitState(tstate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + SHA256_Transform(tstate, pad, 0); + SHA256_Transform(tstate, salt, 1); + SHA256_Transform(tstate, salt+16, 1); + SHA256_Transform(tstate, ihash_finalblk, 0); + memcpy(pad, tstate, 32); + memcpy(pad+8, outerpad, 32); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Transform(ostate, pad, 0); + /* Finish the outer SHA256 operation. */ + return byteswap(ostate[7]); +} + + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ +static inline void +salsa20_8(uint32_t B[16], const uint32_t Bx[16]) +{ + uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; + size_t i; + + x00 = (B[ 0] ^= Bx[ 0]); + x01 = (B[ 1] ^= Bx[ 1]); + x02 = (B[ 2] ^= Bx[ 2]); + x03 = (B[ 3] ^= Bx[ 3]); + x04 = (B[ 4] ^= Bx[ 4]); + x05 = (B[ 5] ^= Bx[ 5]); + x06 = (B[ 6] ^= Bx[ 6]); + x07 = (B[ 7] ^= Bx[ 7]); + x08 = (B[ 8] ^= Bx[ 8]); + x09 = (B[ 9] ^= Bx[ 9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. 
*/ + x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); + x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); + x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); + x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); + + /* Operate on rows. */ + x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); + x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); + x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); + x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); +#undef R + } + B[ 0] += x00; + B[ 1] += x01; + B[ 2] += x02; + B[ 3] += x03; + B[ 4] += x04; + B[ 5] += x05; + B[ 6] += x06; + B[ 7] += x07; + B[ 8] += x08; + B[ 9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +} + +/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output + scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes + */ +static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad) +{ + uint32_t * V; + uint32_t X[32]; + uint32_t i; + uint32_t j; + uint32_t k; + uint64_t *p1, *p2; + + p1 = (uint64_t *)X; + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + PBKDF2_SHA256_80_128(input, X); + + for (i = 0; i < 1024; i += 2) { + memcpy(&V[i * 32], X, 128); + + salsa20_8(&X[0], &X[16]); + salsa20_8(&X[16], &X[0]); + + memcpy(&V[(i + 1) * 32], X, 128); + + salsa20_8(&X[0], &X[16]); + salsa20_8(&X[16], &X[0]); + } + for (i = 0; i < 1024; i += 2) { + j = X[16] & 1023; + p2 = (uint64_t *)(&V[j * 32]); + for(k = 0; k < 16; k++) + p1[k] ^= p2[k]; + + salsa20_8(&X[0], &X[16]); + salsa20_8(&X[16], &X[0]); + + j = X[16] & 1023; + p2 = (uint64_t *)(&V[j * 32]); + for(k = 0; k < 16; k++) + p1[k] ^= p2[k]; + + salsa20_8(&X[0], &X[16]); + salsa20_8(&X[16], &X[0]); + } + + return PBKDF2_SHA256_80_128_32(input, X); +} + +/* Used externally as confirmation of correct OCL code */ +bool scrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = ((const uint32_t *)ptarget)[7]; + char *scratchbuf; + uint32_t data[20]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = byteswap(nonce); + scratchbuf = alloca(131584); + tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); + + return (tmp_hash7 <= Htarg); +} + +bool scanhash_scrypt(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + char *scratchbuf; + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = ((const uint32_t *)ptarget)[7]; + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + scratchbuf = malloc(131583); + if (unlikely(!scratchbuf)) { + applog(LOG_ERR, "Failed to malloc scratchbuf in scanhash_scrypt"); + return ret; + } + + while(1) { + *nonce = ++n; + data[19] = n; + tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = byteswap(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || 
thr->work_restart)) { + *last_nonce = n; + break; + } + } + + free(scratchbuf);; + return ret; +} + diff --git a/scrypt.h b/scrypt.h new file mode 100644 index 00000000..45dd46bf --- /dev/null +++ b/scrypt.h @@ -0,0 +1,13 @@ +#ifndef SCRYPT_H +#define SCRYPT_H + +#ifdef USE_SCRYPT +extern bool scrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +#else /* USE_SCRYPT */ +static inline bool scrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + return false; +} +#endif /* USE_SCRYPT */ + +#endif /* SCRYPT_H */ diff --git a/scrypt120724.cl b/scrypt120724.cl new file mode 100644 index 00000000..d38f6a54 --- /dev/null +++ b/scrypt120724.cl @@ -0,0 +1,757 @@ +#define rotl(x,y) rotate(x,y) +#define Ch(x,y,z) bitselect(z,y,x) +#define Maj(x,y,z) Ch((x^z),y,z) + +#define EndianSwap(n) (rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U)) + +#define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U)) +#define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U)) +#define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U)) +#define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U)) + +#define RND(a, b, c, d, e, f, g, h, k) \ + h += Tr1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += Tr2(a) + Maj(a, b, c); + +void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) +{ + uint4 S0 = *state0; + uint4 S1 = *state1; + +#define A S0.x +#define B S0.y +#define C S0.z +#define D S0.w +#define E S1.x +#define F S1.y +#define G S1.z +#define H S1.w + + uint4 W[4]; + + W[ 0].x = block0.x; + RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U); + W[ 0].y = block0.y; + RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U); + W[ 0].z = block0.z; + RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU); + W[ 0].w = block0.w; + RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U); + + W[ 1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); + W[ 1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); + W[ 1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); + W[ 1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); + + W[ 2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); + W[ 2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); + W[ 2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); + W[ 2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); + + W[ 3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); + W[ 3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); + W[ 3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); + W[ 3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); + + W[ 2].y += Wr1(W[ 
1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 
3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + + *state0 += S0; + *state1 += S1; +} + +void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) +{ +#define A (*state0).x +#define B (*state0).y +#define C (*state0).z +#define D (*state0).w +#define E (*state1).x +#define F (*state1).y +#define G (*state1).z +#define H (*state1).w + + uint4 W[4]; + + W[0].x = block0.x; + D=0x98c7e2a2U+W[0].x; + H=0xfc08884dU+W[0].x; + + W[0].y = block0.y; + C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y; + G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U); + + W[0].z = block0.z; + B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z; + F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U); + + W[0].w = block0.w; + A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; + E=0x95F61999U+A+Tr2(F)+Maj(F,G,H); + + W[1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); + W[1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); + W[1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); + W[1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); + + W[2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); + W[2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); + W[2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); + W[2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); + + W[3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); + W[3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); + W[3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); + W[3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); + + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); + + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); + + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); + + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); + + W[0].y += 
Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); + + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); + + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); + + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); + + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); + + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); + + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); + + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + + *state0 += (uint4)(0x6A09E667U,0xBB67AE85U,0x3C6EF372U,0xA54FF53AU); + *state1 += (uint4)(0x510E527FU,0x9B05688CU,0x1F83D9ABU,0x5BE0CD19U); +} + +__constant uint fixedW[64] = +{ + 0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, + 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794, + 0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f, + 0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c, + 
0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa, + 0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012, + 0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4, + 0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848, +}; + +void SHA256_fixed(uint4*restrict state0,uint4*restrict state1) +{ + uint4 S0 = *state0; + uint4 S1 = *state1; + +#define A S0.x +#define B S0.y +#define C S0.z +#define D S0.w +#define E S1.x +#define F S1.y +#define G S1.z +#define H S1.w + + RND(A,B,C,D,E,F,G,H, fixedW[0]); + RND(H,A,B,C,D,E,F,G, fixedW[1]); + RND(G,H,A,B,C,D,E,F, fixedW[2]); + RND(F,G,H,A,B,C,D,E, fixedW[3]); + RND(E,F,G,H,A,B,C,D, fixedW[4]); + RND(D,E,F,G,H,A,B,C, fixedW[5]); + RND(C,D,E,F,G,H,A,B, fixedW[6]); + RND(B,C,D,E,F,G,H,A, fixedW[7]); + RND(A,B,C,D,E,F,G,H, fixedW[8]); + RND(H,A,B,C,D,E,F,G, fixedW[9]); + RND(G,H,A,B,C,D,E,F, fixedW[10]); + RND(F,G,H,A,B,C,D,E, fixedW[11]); + RND(E,F,G,H,A,B,C,D, fixedW[12]); + RND(D,E,F,G,H,A,B,C, fixedW[13]); + RND(C,D,E,F,G,H,A,B, fixedW[14]); + RND(B,C,D,E,F,G,H,A, fixedW[15]); + RND(A,B,C,D,E,F,G,H, fixedW[16]); + RND(H,A,B,C,D,E,F,G, fixedW[17]); + RND(G,H,A,B,C,D,E,F, fixedW[18]); + RND(F,G,H,A,B,C,D,E, fixedW[19]); + RND(E,F,G,H,A,B,C,D, fixedW[20]); + RND(D,E,F,G,H,A,B,C, fixedW[21]); + RND(C,D,E,F,G,H,A,B, fixedW[22]); + RND(B,C,D,E,F,G,H,A, fixedW[23]); + RND(A,B,C,D,E,F,G,H, fixedW[24]); + RND(H,A,B,C,D,E,F,G, fixedW[25]); + RND(G,H,A,B,C,D,E,F, fixedW[26]); + RND(F,G,H,A,B,C,D,E, fixedW[27]); + RND(E,F,G,H,A,B,C,D, fixedW[28]); + RND(D,E,F,G,H,A,B,C, fixedW[29]); + RND(C,D,E,F,G,H,A,B, fixedW[30]); + RND(B,C,D,E,F,G,H,A, fixedW[31]); + RND(A,B,C,D,E,F,G,H, fixedW[32]); + RND(H,A,B,C,D,E,F,G, fixedW[33]); + RND(G,H,A,B,C,D,E,F, fixedW[34]); + RND(F,G,H,A,B,C,D,E, fixedW[35]); + RND(E,F,G,H,A,B,C,D, fixedW[36]); + RND(D,E,F,G,H,A,B,C, fixedW[37]); + RND(C,D,E,F,G,H,A,B, fixedW[38]); + RND(B,C,D,E,F,G,H,A, fixedW[39]); + RND(A,B,C,D,E,F,G,H, fixedW[40]); + RND(H,A,B,C,D,E,F,G, fixedW[41]); + RND(G,H,A,B,C,D,E,F, fixedW[42]); + RND(F,G,H,A,B,C,D,E, fixedW[43]); + RND(E,F,G,H,A,B,C,D, fixedW[44]); + RND(D,E,F,G,H,A,B,C, fixedW[45]); + RND(C,D,E,F,G,H,A,B, fixedW[46]); + RND(B,C,D,E,F,G,H,A, fixedW[47]); + RND(A,B,C,D,E,F,G,H, fixedW[48]); + RND(H,A,B,C,D,E,F,G, fixedW[49]); + RND(G,H,A,B,C,D,E,F, fixedW[50]); + RND(F,G,H,A,B,C,D,E, fixedW[51]); + RND(E,F,G,H,A,B,C,D, fixedW[52]); + RND(D,E,F,G,H,A,B,C, fixedW[53]); + RND(C,D,E,F,G,H,A,B, fixedW[54]); + RND(B,C,D,E,F,G,H,A, fixedW[55]); + RND(A,B,C,D,E,F,G,H, fixedW[56]); + RND(H,A,B,C,D,E,F,G, fixedW[57]); + RND(G,H,A,B,C,D,E,F, fixedW[58]); + RND(F,G,H,A,B,C,D,E, fixedW[59]); + RND(E,F,G,H,A,B,C,D, fixedW[60]); + RND(D,E,F,G,H,A,B,C, fixedW[61]); + RND(C,D,E,F,G,H,A,B, fixedW[62]); + RND(B,C,D,E,F,G,H,A, fixedW[63]); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + *state0 += S0; + *state1 += S1; +} + +void shittify(uint4 B[8]) +{ + uint4 tmp[4]; + tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w); + tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w); + tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w); + tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i] = EndianSwap(tmp[i]); + + tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w); + tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w); + tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w); + tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w); + +#pragma 
unroll + for(uint i=0; i<4; ++i) + B[i+4] = EndianSwap(tmp[i]); +} + +void unshittify(uint4 B[8]) +{ + uint4 tmp[4]; + tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w); + tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w); + tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w); + tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i] = EndianSwap(tmp[i]); + + tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w); + tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w); + tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w); + tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] = EndianSwap(tmp[i]); +} + +void salsa(uint4 B[8]) +{ + uint4 w[4]; + +#pragma unroll + for(uint i=0; i<4; ++i) + w[i] = (B[i]^=B[i+4]); + +#pragma unroll + for(uint i=0; i<4; ++i) + { + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + } + +#pragma unroll + for(uint i=0; i<4; ++i) + w[i] = (B[i+4]^=(B[i]+=w[i])); + +#pragma unroll + for(uint i=0; i<4; ++i) + { + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + } + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] += w[i]; +} + +#define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE) +#define CO Coord(z,x,y) + +void scrypt_core(uint4 X[8], __global uint4*restrict lookup) +{ + shittify(X); + const uint zSIZE = 8; + const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0)); + const uint xSIZE = CONCURRENT_THREADS; + uint x = get_global_id(0)%xSIZE; + + for(uint y=0; y<1024/LOOKUP_GAP; ++y) + { +#pragma unroll + for(uint z=0; z