Browse Source

First BFI_INT patch changes.

nfactor-troky
Con Kolivas 14 years ago
parent
commit
c548dea848
  1. 136
      ocl.c
  2. 14
      oclminer.cl

136
ocl.c

@ -1,3 +1,4 @@ @@ -1,3 +1,4 @@
#define _GNU_SOURCE
#include <signal.h>
#include <stdlib.h>
#include <string.h>
@ -93,6 +94,63 @@ int clDevicesNum() { @@ -93,6 +94,63 @@ int clDevicesNum() {
return numDevices;
}
/*
 * Move *area forward to the first occurrence of `marker` within the next
 * *remaining bytes, and shrink *remaining by the number of bytes skipped.
 *
 * On return *area points AT the first byte of the marker (not past it), so
 * calling this again with the same marker matches the same position.
 *
 * If the marker is not present, a message is printed to stderr and the
 * process exits with status 1; the function does not return in that case.
 */
void advance(char **area, unsigned *remaining, const char *marker)
{
	size_t marker_len = strlen(marker);
	char *hit = memmem(*area, *remaining, marker, marker_len);

	if (hit == NULL) {
		fprintf(stderr, "Marker \"%s\" not found\n", marker);
		exit(1);
	}

	/* Account for the skipped bytes before moving the cursor. */
	*remaining -= hit - *area;
	*area = hit;
}
/*
 * OP3 ALU instruction numbers, as compared against the 5-bit field that
 * patch_opcodes() extracts from bit (32 + 13) of each 64-bit opcode word.
 *
 * The constants are unsigned long long (at least 64 bits) because
 * OP3_INST_BFI_INT is shifted left by 45 bits when an opcode is patched;
 * with a plain UL suffix that shift is undefined on targets where
 * unsigned long is only 32 bits wide (e.g. 64-bit Windows, 32-bit Linux).
 */
#define OP3_INST_BFE_UINT 4ULL
#define OP3_INST_BFE_INT 5ULL
#define OP3_INST_BFI_INT 6ULL
#define OP3_INST_BIT_ALIGN_INT 12ULL
#define OP3_INST_BYTE_ALIGN_INT 13ULL
/*
 * Walk a buffer of 64-bit opcode words and rewrite, in place, every
 * BYTE_ALIGN_INT OP3 instruction into BFI_INT.
 *
 *   w         - start of the opcode stream; it does not have to be 8-byte
 *               aligned (words are copied in and out with memcpy)
 *   remaining - number of valid bytes at w; only whole 8-byte words are
 *               examined, a trailing partial word is left untouched
 *
 * A word is only considered when its clamp, dest_rel, s2_neg, s2_rel and
 * pred_sel bit fields are all zero.  BFE_INT and BFE_UINT matches are
 * merely counted; BYTE_ALIGN_INT matches are counted and patched.
 * When opt_debug is set the counters are printed to stdout.
 */
void patch_opcodes(char *w, unsigned remaining)
{
	/* The 5-bit instruction field sits at bit 13 of the upper 32-bit half. */
	const unsigned inst_shift = 32 + 13;
	const uint64_t inst_mask = (uint64_t)0x1f << inst_shift;

	int patched = 0;
	int count_bfe_int = 0;
	int count_bfe_uint = 0;
	int count_byte_align = 0;

	/*
	 * Check the bound BEFORE touching memory so that a short or
	 * non-multiple-of-8 buffer is never read or written past its end.
	 */
	for (; remaining >= sizeof(uint64_t);
	     w += sizeof(uint64_t), remaining -= sizeof(uint64_t)) {
		uint64_t opcode;

		/* w may be unaligned: never dereference it as a uint64_t *. */
		memcpy(&opcode, w, sizeof opcode);

		int clamp = (opcode >> (32 + 31)) & 0x1;
		int dest_rel = (opcode >> (32 + 28)) & 0x1;
		int alu_inst = (opcode >> inst_shift) & 0x1f;
		int s2_neg = (opcode >> (32 + 12)) & 0x1;
		int s2_rel = (opcode >> (32 + 9)) & 0x1;
		int pred_sel = (opcode >> 29) & 0x3;

		if (clamp || dest_rel || s2_neg || s2_rel || pred_sel)
			continue;

		if (alu_inst == OP3_INST_BFE_INT) {
			count_bfe_int++;
		} else if (alu_inst == OP3_INST_BFE_UINT) {
			count_bfe_uint++;
		} else if (alu_inst == OP3_INST_BYTE_ALIGN_INT) {
			count_byte_align++;
			/* patch this instruction to BFI_INT */
			opcode &= ~inst_mask;
			/* Cast keeps the shift 64-bit wide on every target. */
			opcode |= (uint64_t)OP3_INST_BFI_INT << inst_shift;
			memcpy(w, &opcode, sizeof opcode);
			patched++;
		}
	}

	if (opt_debug) {
		printf("Potential OP3 instructions identified: "
			"%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN\n",
			count_bfe_int, count_bfe_uint, count_byte_align);
		printf("Patched a total of %i BFI_INT instructions\n", patched);
	}
}
_clState *initCl(int gpu, char *name, size_t nameSize) {
cl_int status = 0;
@ -165,7 +223,7 @@ _clState *initCl(int gpu, char *name, size_t nameSize) { @@ -165,7 +223,7 @@ _clState *initCl(int gpu, char *name, size_t nameSize) {
printf("List of devices:\n");
int i;
unsigned int i;
for(i=0; i<numDevices; i++) {
char pbuff[100];
status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
@ -236,6 +294,82 @@ _clState *initCl(int gpu, char *name, size_t nameSize) { @@ -236,6 +294,82 @@ _clState *initCl(int gpu, char *name, size_t nameSize) {
return NULL;
}
size_t nDevices;
size_t * binary_sizes;
char ** binaries;
unsigned int i;
int err;
/* figure out number of devices and the sizes of the binary for each device. */
err = clGetProgramInfo( clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(nDevices), &nDevices, NULL );
binary_sizes = (size_t *)malloc( sizeof(size_t)*nDevices );
err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL );
/* copy over all of the generated binaries. */
binaries = (char **)malloc( sizeof(char *)*nDevices );
for( i = 0; i < nDevices; i++ ) {
printf("binary size %d : %d\n", i, binary_sizes[i]);
if( binary_sizes[i] != 0 )
binaries[i] = (char *)malloc( sizeof(char)*binary_sizes[i] );
else
binaries[i] = NULL;
}
err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL );
// all the code should be within the first 83000 bytes or so, but scan
// a bit more for headroom
unsigned bytes_to_scan = 93000;
for (i = 0; i < nDevices; i++) {
if (!binaries[i])
continue;
unsigned remaining = bytes_to_scan;
char *w = binaries[i];
int j;
if (opt_debug)
printf("At %p (%u rem. bytes), searching outer elf marker\n", w, remaining);
advance(&w, &remaining, "ELF");
if (opt_debug)
printf("At %p (%u rem. bytes), searching inner elf marker\n", w, remaining);
advance(&w, &remaining, "ELF");
if (opt_debug)
printf("At %p (%u rem. bytes), searching first .text marker\n", w, remaining);
advance(&w, &remaining, ".text");
if (opt_debug)
printf("At %p (%u rem. bytes), searching second .text marker\n", w, remaining);
advance(&w, &remaining, ".text");
// now we are pointing to the first opcode
patch_opcodes(w, remaining);
}
status = clReleaseProgram(clState->program);
if(status != CL_SUCCESS)
{
printf("Error: Releasing program. (clReleaseProgram)\n");
return NULL;
}
clState->program = clCreateProgramWithBinary(clState->context, numDevices, &devices[gpu], binary_sizes, binaries, &status, NULL);
if(status != CL_SUCCESS)
{
printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
return NULL;
}
/* create a cl program executable for all the devices specified */
status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
if(status != CL_SUCCESS)
{
printf("Error: Building Program (clBuildProgram)\n");
size_t logSize;
status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
char *log = malloc(logSize);
status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
printf("%s\n", log);
return NULL;
}
/* get a kernel object handle for a kernel with the given name */
clState->kernel = clCreateKernel(clState->program, "oclminer", &status);
if(status != CL_SUCCESS)

14
oclminer.cl

@ -1,4 +1,16 @@ @@ -1,4 +1,16 @@
typedef uint z;

/*
 * BITALIGN selects the cl_amd_media_ops implementations of rotr/Ch/Ma
 * below; without it the portable bitwise forms in the #else branch are
 * used.  rotr is defined exactly once per branch (an earlier unconditional
 * definition would clash with these).
 *
 * NOTE(review): in the BITALIGN branch Ch and Ma are spelled with
 * amd_bytealign.  The host code in ocl.c (patch_opcodes) rewrites
 * BYTE_ALIGN_INT instructions to BFI_INT in the compiled binary, so these
 * two macros presumably only produce the intended results after that
 * patch has been applied - confirm against the AMD ISA documentation.
 */
#define BITALIGN
#ifdef BITALIGN
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define rotr(a, b) amd_bitalign((z)(a), (z)(a), (z)(b))
#define Ch(a, b, c) amd_bytealign((a), (b), (c))
#define Ma(a, b, c) amd_bytealign((b), ((a) | (c)), ((c) & (a)))
#else
#define rotr(a, b) rotate((z)(a), (z)(32 - (b)))
#define Ch(a, b, c) ((c) ^ ((a) & ((b) ^ (c))))
#define Ma(a, b, c) (((b) & (c)) | ((a) & ((b) | (c))))
#endif
#define WGS __attribute__((reqd_work_group_size(128, 1, 1)))

Loading…
Cancel
Save