diff --git a/ocl.c b/ocl.c
index 2a4a37e6..bf2f6594 100644
--- a/ocl.c
+++ b/ocl.c
@@ -1,3 +1,4 @@
+#define _GNU_SOURCE
 #include
 #include
 #include
@@ -93,6 +94,95 @@ int clDevicesNum() {
 	return numDevices;
 }
 
+/*
+ * Move *area forward to the first occurrence of marker within the next
+ * *remaining bytes, reducing *remaining by the distance skipped.
+ *
+ * On return *area points AT the marker, not past it, so a second call with
+ * the same marker (and no intermediate step) finds the same occurrence.
+ * If the marker is absent, a message goes to stderr and the process exits
+ * with status 1.
+ */
+void advance(char **area, unsigned *remaining, const char *marker)
+{
+	char *find = memmem(*area, *remaining, marker, strlen(marker));
+
+	if (!find) {
+		fprintf(stderr, "Marker \"%s\" not found\n", marker);
+		exit(1);
+	}
+	*remaining -= (unsigned)(find - *area);
+	*area = find;
+}
+
+/*
+ * Values of the 5-bit alu_inst field decoded in patch_opcodes(). They are
+ * unsigned long long so that shifting one into bits 45..49 stays defined
+ * where unsigned long is only 32 bits wide.
+ */
+#define OP3_INST_BFE_UINT	4ULL
+#define OP3_INST_BFE_INT	5ULL
+#define OP3_INST_BFI_INT	6ULL
+#define OP3_INST_BIT_ALIGN_INT	12ULL
+#define OP3_INST_BYTE_ALIGN_INT	13ULL
+
+/*
+ * Walk the `remaining` bytes at w as consecutive 64-bit words. Every word
+ * whose clamp, dest_rel, s2_neg, s2_rel and pred_sel bits are all clear and
+ * whose alu_inst field equals BYTE_ALIGN_INT gets that field rewritten to
+ * BFI_INT in place. A trailing partial word (fewer than 8 bytes) is never
+ * read or written. Counters are printed when opt_debug is set.
+ */
+void patch_opcodes(char *w, unsigned remaining)
+{
+	/* the 5-bit alu_inst field occupies bits 45..49 */
+	const uint64_t alu_inst_mask = 0x1fULL << (32 + 13);
+	int patched = 0;
+	int count_bfe_int = 0;
+	int count_bfe_uint = 0;
+	int count_byte_align = 0;
+
+	/*
+	 * Check the length before touching a word so the scan cannot run
+	 * past the buffer when `remaining` is 0 or not a multiple of 8.
+	 */
+	for (; remaining >= 8; w += 8, remaining -= 8) {
+		uint64_t opcode;
+		unsigned clamp, dest_rel, alu_inst, s2_neg, s2_rel, pred_sel;
+
+		/* w carries no alignment guarantee: copy, do not cast */
+		memcpy(&opcode, w, sizeof(opcode));
+
+		clamp = (opcode >> (32 + 31)) & 0x1;
+		dest_rel = (opcode >> (32 + 28)) & 0x1;
+		alu_inst = (opcode >> (32 + 13)) & 0x1f;
+		s2_neg = (opcode >> (32 + 12)) & 0x1;
+		s2_rel = (opcode >> (32 + 9)) & 0x1;
+		pred_sel = (opcode >> 29) & 0x3;
+		if (clamp || dest_rel || s2_neg || s2_rel || pred_sel)
+			continue;
+
+		if (alu_inst == OP3_INST_BFE_INT) {
+			count_bfe_int++;
+		} else if (alu_inst == OP3_INST_BFE_UINT) {
+			count_bfe_uint++;
+		} else if (alu_inst == OP3_INST_BYTE_ALIGN_INT) {
+			count_byte_align++;
+			/* patch this instruction to BFI_INT */
+			opcode &= ~alu_inst_mask;
+			opcode |= OP3_INST_BFI_INT << (32 + 13);
+			memcpy(w, &opcode, sizeof(opcode));
+			patched++;
+		}
+	}
+	if (opt_debug) {
+		printf("Potential OP3 instructions identified: "
+			"%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN\n",
+			count_bfe_int, count_bfe_uint, count_byte_align);
+		printf("Patched a total of %i BFI_INT instructions\n", patched);
+	}
+}
+
 _clState *initCl(int gpu, char *name, size_t nameSize) {
 	cl_int status = 0;
 
@@ -165,7 +255,7 @@ _clState *initCl(int gpu, char *name, size_t nameSize) {
 	printf("List of 
devices:\n"); - int i; + unsigned int i; for(i=0; iprogram, CL_PROGRAM_NUM_DEVICES, sizeof(nDevices), &nDevices, NULL ); + binary_sizes = (size_t *)malloc( sizeof(size_t)*nDevices ); + err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL ); + + /* copy over all of the generated binaries. */ + binaries = (char **)malloc( sizeof(char *)*nDevices ); + for( i = 0; i < nDevices; i++ ) { + printf("binary size %d : %d\n", i, binary_sizes[i]); + if( binary_sizes[i] != 0 ) + binaries[i] = (char *)malloc( sizeof(char)*binary_sizes[i] ); + else + binaries[i] = NULL; + } + err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL ); + // all the code should be within the first 83000 bytes or so, but scan + // a bit more for headroom + unsigned bytes_to_scan = 93000; + for (i = 0; i < nDevices; i++) { + if (!binaries[i]) + continue; + + unsigned remaining = bytes_to_scan; + char *w = binaries[i]; + int j; + + if (opt_debug) + printf("At %p (%u rem. bytes), searching outer elf marker\n", w, remaining); + advance(&w, &remaining, "ELF"); + if (opt_debug) + printf("At %p (%u rem. bytes), searching inner elf marker\n", w, remaining); + advance(&w, &remaining, "ELF"); + if (opt_debug) + printf("At %p (%u rem. bytes), searching first .text marker\n", w, remaining); + advance(&w, &remaining, ".text"); + if (opt_debug) + printf("At %p (%u rem. bytes), searching second .text marker\n", w, remaining); + advance(&w, &remaining, ".text"); + // now we are pointing to the first opcode + patch_opcodes(w, remaining); + } + + status = clReleaseProgram(clState->program); + if(status != CL_SUCCESS) + { + printf("Error: Releasing program. 
(clReleaseProgram)\n"); + return NULL; + } + + clState->program = clCreateProgramWithBinary(clState->context, numDevices, &devices[gpu], binary_sizes, binaries, &status, NULL); + if(status != CL_SUCCESS) + { + printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n"); + return NULL; + } + + /* create a cl program executable for all the devices specified */ + status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); + if(status != CL_SUCCESS) + { + printf("Error: Building Program (clBuildProgram)\n"); + size_t logSize; + status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); + + char *log = malloc(logSize); + status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); + printf("%s\n", log); + return NULL; + } + /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "oclminer", &status); if(status != CL_SUCCESS) diff --git a/oclminer.cl b/oclminer.cl index b706c921..c6e37660 100644 --- a/oclminer.cl +++ b/oclminer.cl @@ -1,4 +1,16 @@ -#define rotr(x, n) rotate(x, (uint)(32 - n)) +typedef uint z; +#define BITALIGN + +#ifdef BITALIGN +#pragma OPENCL EXTENSION cl_amd_media_ops : enable +#define rotr(a, b) amd_bitalign((z)a, (z)a, (z)b) +#define Ch(a, b, c) amd_bytealign(a, b, c) +#define Ma(a, b, c) amd_bytealign((b), (a | c), (c & a)) +#else +#define rotr(a, b) rotate((z)a, (z)(32 - b)) +#define Ch(a, b, c) (c ^ (a & (b ^ c))) +#define Ma(a, b, c) ((b & c) | (a & (b | c))) +#endif #define WGS __attribute__((reqd_work_group_size(128, 1, 1)))