From 2a3b7359eb033bacb5ee3220697eb53be8f361b9 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 5 Mar 2012 16:44:53 +1100 Subject: [PATCH] Clean up use of macros in poclbm and use bitselect everywhere possible. --- poclbm120222.cl | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/poclbm120222.cl b/poclbm120222.cl index 8fb00901..a0a964cc 100644 --- a/poclbm120222.cl +++ b/poclbm120222.cl @@ -36,7 +36,10 @@ __constant uint K[64] = { #ifdef BITALIGN #pragma OPENCL EXTENSION cl_amd_media_ops : enable #define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y) - #ifdef BFI_INT +#else + #define rotr(x, y) rotate((u)x, (u)(32 - y)) +#endif +#ifdef BFI_INT // Well, slight problem... It turns out BFI_INT isn't actually exposed to // OpenCL (or CAL IL for that matter) in any way. However, there is // a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via @@ -49,23 +52,19 @@ __constant uint K[64] = { // Ma can also be implemented in terms of BFI_INT... #define Ma(x, y, z) amd_bytealign( (z^x), (y), (x) ) - #else // BFI_INT - // Later SDKs optimise this to BFI INT without patching and GCN - // actually fails if manually patched with BFI_INT + + // AMD's KernelAnalyzer throws errors compiling the kernel if we use + // amd_bytealign on constants with vectors enabled, so we use this to avoid + // problems. (this is used 4 times, and likely optimized out by the compiler.) + #define Ma2(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) +#else // BFI_INT + //GCN actually fails if manually patched with BFI_INT #define ch(x, y, z) bitselect((u)z, (u)y, (u)x) #define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) -#endif -#else // BITALIGN - #define ch(x, y, z) (z ^ (x & (y ^ z))) - #define Ma(x, y, z) ((x & z) | (y & (x | z))) - #define rotr(x, y) rotate((u)x, (u)(32 - y)) + #define Ma2(x, y, z) Ma(x, y, z) #endif -// AMD's KernelAnalyzer throws errors compiling the kernel if we use -// amd_bytealign on constants with vectors enabled, so we use this to avoid -// problems. (this is used 4 times, and likely optimized out by the compiler.) -#define Ma2(x, y, z) ((y & z) | (x & (y | z))) __kernel __attribute__((vec_type_hint(u)))