diff --git a/Makefile.am b/Makefile.am index d7614172..116abdc5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -17,27 +17,14 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) bin_PROGRAMS = cgminer -bin_SCRIPTS = phatk110817.cl poclbm110817.cl +bin_SCRIPTS = phatk120203.cl poclbm120203.cl -if HAS_CPUMINE cgminer_SOURCES = elist.h miner.h compat.h bench_block.h \ main.c util.c uthash.h \ ocl.c ocl.h findnonce.c findnonce.h \ - sha256_generic.c sha256_4way.c sha256_via.c \ - sha256_cryptopp.c sha256_sse2_amd64.c \ - sha256_sse4_amd64.c sha256_sse2_i386.c \ - sha256_altivec_4way.c \ adl.c adl.h adl_functions.h \ - phatk110817.cl poclbm110817.cl \ + phatk120203.cl poclbm120203.cl \ sha2.c sha2.h api.c -else -cgminer_SOURCES = elist.h miner.h compat.h bench_block.h \ - main.c util.c uthash.h \ - ocl.c ocl.h findnonce.c findnonce.h \ - adl.c adl.h adl_functions.h \ - phatk110817.cl poclbm110817.cl \ - sha2.c sha2.h api.c -endif cgminer_LDFLAGS = $(PTHREAD_FLAGS) cgminer_LDADD = $(DLOPEN_FLAGS) @LIBCURL_LIBS@ @JANSSON_LIBS@ @PTHREAD_LIBS@ \ @@ -46,6 +33,11 @@ cgminer_LDADD = $(DLOPEN_FLAGS) @LIBCURL_LIBS@ @JANSSON_LIBS@ @PTHREAD_LIBS@ \ cgminer_CPPFLAGS = -I$(top_builddir)/lib -I$(top_srcdir)/lib @OPENCL_FLAGS@ if HAS_CPUMINE +cgminer_SOURCES += sha256_generic.c sha256_4way.c sha256_via.c \ + sha256_cryptopp.c sha256_sse2_amd64.c \ + sha256_sse4_amd64.c sha256_sse2_i386.c \ + sha256_altivec_4way.c + if HAVE_x86_64 if HAS_YASM SUBDIRS += x86_64 diff --git a/ocl.c b/ocl.c index 16d1a1e8..d976ef73 100644 --- a/ocl.c +++ b/ocl.c @@ -366,13 +366,13 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) switch (chosen_kernel) { case KL_POCLBM: - strcpy(filename, "poclbm110817.cl"); - strcpy(binaryfilename, "poclbm110817"); + strcpy(filename, "poclbm120203.cl"); + strcpy(binaryfilename, "poclbm120203"); break; case KL_NONE: /* Shouldn't happen */ case KL_PHATK: - strcpy(filename, "phatk110817.cl"); - strcpy(binaryfilename, "phatk110817"); + strcpy(filename, "phatk120203.cl"); + strcpy(binaryfilename, "phatk120203"); break; } diff --git a/phatk110817.cl b/phatk120203.cl similarity index 96% rename from phatk110817.cl rename to phatk120203.cl index 20da40ae..ebb0bd23 100644 --- a/phatk110817.cl +++ b/phatk120203.cl @@ -1,6 +1,6 @@ // This file is taken and modified from the public-domain poclbm project, and // I have therefore decided to keep it public-domain. - +// Modified version copyright 2011-2012 Con Kolivas #ifdef VECTORS4 typedef uint4 u; @@ -51,9 +51,6 @@ __constant uint H[8] = { #ifdef BITALIGN #pragma OPENCL EXTENSION cl_amd_media_ops : enable #define rot(x, y) amd_bitalign(x, x, (uint)(32 - y)) -#else - #define rot(x, y) rotate(x, (uint)y) -#endif // This part is not from the stock poclbm kernel. It's part of an optimization // added in the Phoenix Miner. @@ -63,7 +60,7 @@ __constant uint H[8] = { // detected, use it for Ch. Otherwise, construct Ch out of simpler logical // primitives. -#ifdef BFI_INT + #ifdef BFI_INT // Well, slight problem... It turns out BFI_INT isn't actually exposed to // OpenCL (or CAL IL for that matter) in any way. However, there is // a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via @@ -75,13 +72,22 @@ __constant uint H[8] = { #define Ch(x, y, z) amd_bytealign(x,y,z) // Ma can also be implemented in terms of BFI_INT... #define Ma(z, x, y) amd_bytealign(z^x,y,x) -#else - #define Ch(x, y, z) bitselect(x,y,z) - // Ma can also be implemented in terms of bitselect - #define Ma(z, x, y) bitselect(z^x,y,x) + #else // BFI_INT + // Later SDKs optimise this to BFI INT without patching and GCN + // actually fails if manually patched with BFI_INT + + #define Ch(x, y, z) bitselect((u)z, (u)y, (u)x) + #define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) + #define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y) + #endif +#else // BITALIGN + #define Ch(x, y, z) (z ^ (x & (y ^ z))) + #define Ma(x, y, z) ((x & z) | (y & (x | z))) + #define rotr(x, y) rotate((u)x, (u)(32-y)) #endif + //Various intermediate calculations for each SHA round #define s0(n) (S0(Vals[(0 + 128 - (n)) % 8])) #define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u)) diff --git a/poclbm110817.cl b/poclbm120203.cl similarity index 99% rename from poclbm110817.cl rename to poclbm120203.cl index 0db94584..b145d44f 100644 --- a/poclbm110817.cl +++ b/poclbm120203.cl @@ -1,6 +1,6 @@ // -ck modified kernel taken from Phoenix taken from poclbm, with aspects of // phatk and others. -// Modified version copyright 2011 Con Kolivas +// Modified version copyright 2011-2012 Con Kolivas // This file is taken and modified from the public-domain poclbm project, and // we have therefore decided to keep it public-domain in Phoenix. @@ -33,7 +33,10 @@ __constant uint K[64] = { // detected, use it for ch. Otherwise, construct ch out of simpler logical // primitives. -#ifdef BFI_INT +#ifdef BITALIGN + #pragma OPENCL EXTENSION cl_amd_media_ops : enable + #define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y) + #ifdef BFI_INT // Well, slight problem... It turns out BFI_INT isn't actually exposed to // OpenCL (or CAL IL for that matter) in any way. However, there is // a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via @@ -46,16 +49,17 @@ __constant uint K[64] = { // Ma can also be implemented in terms of BFI_INT... #define Ma(x, y, z) amd_bytealign( (z^x), (y), (x) ) -#else + #else // BFI_INT + // Later SDKs optimise this to BFI INT without patching and GCN + // actually fails if manually patched with BFI_INT + + #define ch(x, y, z) bitselect((u)z, (u)y, (u)x) + #define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) +#endif +#else // BITALIGN #define ch(x, y, z) (z ^ (x & (y ^ z))) #define Ma(x, y, z) ((x & z) | (y & (x | z))) -#endif - -#ifdef BITALIGN - #pragma OPENCL EXTENSION cl_amd_media_ops : enable - #define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y) -#else - #define rotr(x, y) rotate((u)x, (u)(32-y)) + #define rotr(x, y) rotate((u)x, (u)(32 - y)) #endif // AMD's KernelAnalyzer throws errors compiling the kernel if we use @@ -63,7 +67,7 @@ __constant uint K[64] = { // problems. (this is used 4 times, and likely optimized out by the compiler.) #define Ma2(x, y, z) ((y & z) | (x & (y | z))) -__kernel void search( const uint state0, const uint state1, const uint state2, const uint state3, +__kernel void search(const uint state0, const uint state1, const uint state2, const uint state3, const uint state4, const uint state5, const uint state6, const uint state7, const uint b1, const uint c1, const uint d1, const uint f1, const uint g1, const uint h1,