Browse Source

Add AMD BFI_INT optimizations to SHA-2 and RIPEMD160 hash functions.

The full effect of this optimization will come later.
master
samr7 13 years ago
parent
commit
e025be9648
  1. 45
      calc_addrs.cl
  2. 206
      oclvanitygen.c

45
calc_addrs.cl

@ -726,12 +726,22 @@ sha2_256_init(uint *out) @@ -726,12 +726,22 @@ sha2_256_init(uint *out)
/* The state variable remapping is really contorted */
/*
 * sha2_stvar(vals, i, v): slot of the 8-entry working array that holds
 * logical state variable v during round i.  The index (64+v-i) % 8 moves
 * back by one slot per round, so the per-round "shift" of the working
 * variables is done by renaming slots rather than by copying values.
 * The +64 keeps the left operand of % non-negative for the round
 * indices used with it.
 */
#define sha2_stvar(vals, i, v) vals[(64+v-i) % 8]
/*
 * SHA-256 big-sigma functions.  OpenCL rotate() rotates left, so the
 * right-rotations by (2, 13, 22) and (6, 11, 25) are written here as
 * left-rotations by 32-n.
 */
#define sha2_s0(a) (rotate(a, 30U) ^ rotate(a, 19U) ^ rotate(a, 10U))
#define sha2_s1(a) (rotate(a, 26U) ^ rotate(a, 21U) ^ rotate(a, 7U))
#if defined(AMD_BFI_INT)
/*
 * NOTE(review): amd_bytealign() does not itself compute "choose" or
 * "majority".  Presumably it is a stand-in that the host program rewrites
 * into the BFI_INT instruction in the compiled kernel binary, so this
 * branch is only valid when that patch is applied -- confirm against the
 * host-side patching code before enabling AMD_BFI_INT elsewhere.
 */
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define sha2_ch(a, b, c) amd_bytealign(a, b, c)
#define sha2_ma(a, b, c) amd_bytealign((a^c), b, a)
#else
/* Choose: each result bit comes from b where a is 1, from c where a is 0 */
#define sha2_ch(a, b, c) (c ^ (a & (b ^ c)))
/* Majority: each result bit is 1 when at least two of a, b, c are 1 */
#define sha2_ma(a, b, c) ((a & c) | (b & (a | c)))
#endif
void
sha2_256_block(uint *out, uint *in)
{
int i;
uint state[8], s0, s1, t1, t2;
uint state[8], t1, t2;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
@ -751,18 +761,17 @@ sha2_256_block(uint *out, uint *in) @@ -751,18 +761,17 @@ sha2_256_block(uint *out, uint *in)
}
/* Compute the t1, t2 augmentations */
t1 = sha2_stvar(state, i, 4);
t2 = sha2_stvar(state, i, 0);
s0 = (rotate(t2, 30U) ^ rotate(t2, 19U) ^ rotate(t2, 10U));
s1 = (rotate(t1, 26U) ^ rotate(t1, 21U) ^ rotate(t1, 7U));
t1 = (sha2_stvar(state, i, 7) + s1 + sha2_k[i] + in[i % 16] +
((t1 & sha2_stvar(state, i, 5)) ^
(~t1 & sha2_stvar(state, i, 6))));
t2 = s0 + ((t2 & sha2_stvar(state, i, 1)) ^
(t2 & sha2_stvar(state, i, 2)) ^
(sha2_stvar(state, i, 1) & sha2_stvar(state, i, 2)));
t1 = (sha2_stvar(state, i, 7) +
sha2_s1(sha2_stvar(state, i, 4)) +
sha2_ch(sha2_stvar(state, i, 4),
sha2_stvar(state, i, 5),
sha2_stvar(state, i, 6)) +
sha2_k[i] +
in[i % 16]);
t2 = (sha2_s0(sha2_stvar(state, i, 0)) +
sha2_ma(sha2_stvar(state, i, 0),
sha2_stvar(state, i, 1),
sha2_stvar(state, i, 2)));
sha2_stvar(state, i, 3) += t1;
sha2_stvar(state, i, 7) = t1 + t2;
}
@ -817,11 +826,19 @@ __constant uchar ripemd160_rlp[] = { @@ -817,11 +826,19 @@ __constant uchar ripemd160_rlp[] = {
#define ripemd160_val(v, i, n) (v)[(80+(n)-(i)) % 5]
#define ripemd160_valp(v, i, n) (v)[5 + ((80+(n)-(i)) % 5)]
#if defined(AMD_BFI_INT)
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
#define ripemd160_f1(x, y, z) amd_bytealign(x, y, z)
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
#define ripemd160_f3(x, y, z) amd_bytealign(z, x, y)
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
#else
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
#define ripemd160_f1(x, y, z) ((x & y) | (~x & z))
#define ripemd160_f2(x, y, z) ((x | ~y) ^ z)
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
#define ripemd160_f3(x, y, z) ((x & z) | (y & ~z))
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
#endif
#define ripemd160_round(i, in, vals, f, fp, t) do { \
ripemd160_val(vals, i, 0) = \
rotate(ripemd160_val(vals, i, 0) + \

206
oclvanitygen.c

@ -264,11 +264,11 @@ vg_ocl_device_getulong(cl_device_id did, cl_device_info param) @@ -264,11 +264,11 @@ vg_ocl_device_getulong(cl_device_id did, cl_device_info param)
return val;
}
size_t
cl_uint
vg_ocl_device_getuint(cl_device_id did, cl_device_info param)
{
cl_int ret;
size_t val;
cl_uint val;
size_t size_ret;
ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
if (ret != CL_SUCCESS) {
@ -288,8 +288,9 @@ vg_ocl_dump_info(vg_ocl_context_t *vocp) @@ -288,8 +288,9 @@ vg_ocl_dump_info(vg_ocl_context_t *vocp)
did = vocp->voc_ocldid;
printf("Device: %s\n",
vg_ocl_device_getstr(did, CL_DEVICE_NAME));
printf("Vendor: %s\n",
vg_ocl_device_getstr(did, CL_DEVICE_VENDOR));
printf("Vendor: %s (%04x)\n",
vg_ocl_device_getstr(did, CL_DEVICE_VENDOR),
vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID));
printf("Driver: %s\n",
vg_ocl_device_getstr(did, CL_DRIVER_VERSION));
printf("Profile: %s\n",
@ -385,29 +386,32 @@ enum { @@ -385,29 +386,32 @@ enum {
VG_OCL_UNROLL_LOOPS = (1 << 0),
VG_OCL_EXPENSIVE_BRANCHES = (1 << 1),
VG_OCL_DEEP_VLIW = (1 << 2),
VG_OCL_NV_VERBOSE = (1 << 3),
VG_OCL_BROKEN = (1 << 4),
VG_OCL_NO_BINARIES = (1 << 5),
VG_OCL_AMD_BFI_INT = (1 << 3),
VG_OCL_NV_VERBOSE = (1 << 4),
VG_OCL_BROKEN = (1 << 5),
VG_OCL_NO_BINARIES = (1 << 6),
VG_OCL_OPTIMIZATIONS = (VG_OCL_UNROLL_LOOPS |
VG_OCL_EXPENSIVE_BRANCHES |
VG_OCL_DEEP_VLIW),
VG_OCL_DEEP_VLIW |
VG_OCL_AMD_BFI_INT),
};
int
vg_ocl_get_quirks(vg_ocl_context_t *vocp)
{
const char *vend;
uint32_t vend;
const char *dvn;
unsigned int quirks = 0;
/* Loop unrolling for devices other than CPUs */
if (!(vg_ocl_device_gettype(vocp->voc_ocldid) & CL_DEVICE_TYPE_CPU))
quirks |= VG_OCL_UNROLL_LOOPS;
vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_VENDOR);
if (!strcmp(vend, "NVIDIA Corporation") ||
!strcmp(vend, "NVIDIA")) {
vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID);
switch (vend) {
case 0x10de: /* NVIDIA */
quirks |= VG_OCL_NV_VERBOSE;
#ifdef WIN32
if (strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
@ -420,15 +424,22 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp) @@ -420,15 +424,22 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
quirks |= VG_OCL_BROKEN;
}
#endif
} else if (!strcmp(vend, "Advanced Micro Devices, Inc.") ||
!strcmp(vend, "AMD")) {
break;
case 0x1002: /* AMD/ATI */
quirks |= VG_OCL_EXPENSIVE_BRANCHES;
quirks |= VG_OCL_DEEP_VLIW;
vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
if (!strcmp(vend, "ATI RV710")) {
dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
CL_DEVICE_EXTENSIONS);
if (dvn && strstr(dvn, "cl_amd_media_ops"))
quirks |= VG_OCL_AMD_BFI_INT;
dvn = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
if (!strcmp(dvn, "ATI RV710")) {
quirks &= ~VG_OCL_OPTIMIZATIONS;
quirks |= VG_OCL_NO_BINARIES;
}
break;
default:
break;
}
return quirks;
}
@ -481,19 +492,147 @@ vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts, @@ -481,19 +492,147 @@ vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts,
MD5_Final(hash_out, &ctx);
}
/*
 * ELF32 file header, laid out field-for-field like the on-disk structure
 * so that it can be filled with a plain memcpy() from a program image.
 */
typedef struct {
	unsigned char e_ident[16];	/* magic and identification bytes */
	uint16_t e_type;
	uint16_t e_machine;
	uint32_t e_version;
	uint32_t e_entry;
	uint32_t e_phoff;
	uint32_t e_shoff;		/* file offset of section header table */
	uint32_t e_flags;
	uint16_t e_ehsize;
	uint16_t e_phentsize;
	uint16_t e_phnum;
	uint16_t e_shentsize;		/* size of one section header entry */
	uint16_t e_shnum;		/* number of section header entries */
	uint16_t e_shstrndx;		/* index of the section name table */
} vg_elf32_header_t;

/*
 * ELF32 section header entry, laid out like the on-disk structure.
 */
typedef struct {
	uint32_t sh_name;		/* offset of name in section name table */
	uint32_t sh_type;
	uint32_t sh_flags;
	uint32_t sh_addr;
	uint32_t sh_offset;		/* file offset of section contents */
	uint32_t sh_size;		/* size of section contents in bytes */
	uint32_t sh_link;
	uint32_t sh_info;
	uint32_t sh_addralign;
	uint32_t sh_entsize;
} vg_elf32_shdr_t;

/*
 * Copy len bytes found at byte offset off of the image into out.
 * Returns 1 on success, 0 if the range does not lie inside the image.
 * Going through memcpy() keeps every access independent of alignment.
 */
static int
vg_elf32_peek(const unsigned char *binary, size_t size, uint64_t off,
	      void *out, size_t len)
{
	if ((off > size) || (len > (size - off)))
		return 0;
	memcpy(out, binary + off, len);
	return 1;
}

/*
 * Patch the instruction stream of a single inner kernel ELF image.
 *
 * binary/size describe the bytes available starting at the inner ELF
 * header; the image may begin at any byte offset of its container, so
 * nothing here assumes alignment.
 *
 * The image must carry the identification bytes 7f 'E' 'L' 'F' 01 01 01 64
 * and contain exactly two sections named ".text".  The second one is
 * treated as a sequence of 64-bit instruction slots (two 32-bit words);
 * every slot whose first word has bits 0x02001000 clear and whose second
 * word matches 0x0001a000 under mask 0x9003f000 has that field rewritten
 * to 0x0000c000.
 * NOTE(review): presumably this turns the placeholder instruction emitted
 * for amd_bytealign() into BFI_INT -- confirm against the AMD ISA encoding.
 *
 * Returns the number of instructions rewritten.  Returns 0 without
 * touching the image if it is not recognized or if any header, section
 * table entry or section body would fall outside the size bytes given.
 */
int
vg_ocl_amd_patch_inner(unsigned char *binary, size_t size)
{
	vg_elf32_header_t eh;
	vg_elf32_shdr_t strsh, sh, text2;
	uint64_t off, base;
	uint32_t lo, hi;
	size_t nwords, w;
	int i, ntext, patched;

	if (!vg_elf32_peek(binary, size, 0, &eh, sizeof(eh)) ||
	    memcmp(eh.e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
	    !eh.e_shoff)
		return 0;

	/* Entries smaller than our structure cannot be parsed safely */
	if (eh.e_shentsize < sizeof(sh))
		return 0;

	/* Header of the section that holds the section names */
	off = (uint64_t) eh.e_shoff +
		((uint64_t) eh.e_shstrndx * eh.e_shentsize);
	if (!vg_elf32_peek(binary, size, off, &strsh, sizeof(strsh)))
		return 0;

	/* Count the ".text" sections and remember the second one */
	ntext = 0;
	memset(&text2, 0, sizeof(text2));
	for (i = 0; i < eh.e_shnum; i++) {
		off = (uint64_t) eh.e_shoff +
			((uint64_t) i * eh.e_shentsize);
		if (!vg_elf32_peek(binary, size, off, &sh, sizeof(sh)))
			return 0;

		/* Compare six bytes so that the terminating NUL must match */
		off = (uint64_t) strsh.sh_offset + sh.sh_name;
		if (((off + 6) >= size) ||
		    memcmp(binary + off, ".text", 6))
			continue;

		ntext++;
		if (ntext == 2)
			text2 = sh;
	}
	if (ntext != 2)
		return 0;

	/* The whole section body must be inside the image before we write */
	base = text2.sh_offset;
	if ((base > size) || (text2.sh_size > (size - base)))
		return 0;

	/* Walk complete two-word slots only; a trailing odd word is ignored */
	nwords = text2.sh_size / 4;
	patched = 0;
	for (w = 0; (w + 1) < nwords; w += 2) {
		memcpy(&lo, binary + base + (w * 4), sizeof(lo));
		memcpy(&hi, binary + base + ((w + 1) * 4), sizeof(hi));
		if (((lo & 0x02001000) == 0) &&
		    ((hi & 0x9003f000) == 0x0001a000)) {
			hi ^= (0x0001a000 ^ 0x0000c000);
			memcpy(binary + base + ((w + 1) * 4), &hi,
			       sizeof(hi));
			patched++;
		}
	}
	return patched;
}
/*
 * Locate every inner kernel ELF image embedded in a program binary and
 * run vg_ocl_amd_patch_inner() on each of them.
 *
 * vocp    supplies the verbosity setting through vocp->base.vxc_vc.
 * binary  program binary; it is modified in place.
 * size    number of bytes in binary.
 *
 * The container must start with the identification bytes
 * 7f 'E' 'L' 'F' 01 01 01 00 and have a nonzero e_shoff, otherwise 0 is
 * returned.  Inner images are found by scanning for the byte 0x7f and
 * checking for the identification bytes 7f 'E' 'L' 'F' 01 01 01 64.
 *
 * Returns the sum over all inner images of (instructions patched + 1).
 * NOTE(review): because of the "+ 1", the result is nonzero whenever at
 * least one inner image is found, even if no instruction was rewritten;
 * confirm that callers rely on this before changing it.
 */
int
vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size)
{
	vg_context_t *vcp = vocp->base.vxc_vc;
	vg_elf32_header_t hdr;
	unsigned char *ptr;
	size_t offset;
	int ninner = 0, nrun, npatched = 0;

	/* Validate the outer container header */
	if (size < sizeof(hdr))
		return 0;
	memcpy(&hdr, binary, sizeof(hdr));
	if (memcmp(hdr.e_ident, "\x7f" "ELF\1\1\1\0", 8) ||
	    !hdr.e_shoff)
		return 0;

	/* Start past byte 0 so that the container itself is not matched */
	offset = 1;
	while (offset < (size - 8)) {
		ptr = (unsigned char *) memchr(binary + offset,
					       0x7f,
					       size - offset);
		if (!ptr)
			return npatched;
		offset = ptr - binary;

		if ((size - offset) < sizeof(hdr)) {
			offset += 1;
			continue;
		}

		/*
		 * The candidate may sit at any byte offset, so copy the
		 * header out instead of reading fields through a cast.
		 */
		memcpy(&hdr, ptr, sizeof(hdr));
		if (memcmp(hdr.e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
		    !hdr.e_shoff) {
			offset += 1;
			continue;
		}

		ninner++;
		nrun = vg_ocl_amd_patch_inner(ptr, size - offset);
		npatched += nrun;
		if (vcp->vc_verbose > 1)
			printf("AMD BFI_INT: patched %d instructions "
			       "in kernel %d\n",
			       nrun, ninner);
		npatched++;
		offset += 1;
	}
	return npatched;
}
int
vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
const char *filename, const char *opts)
{
FILE *kfp;
char *buf, *tbuf;
int len, fromsource = 0;
int len, fromsource = 0, patched = 0;
size_t sz, szr;
cl_program prog;
cl_int ret, sts;
unsigned char prog_hash[16];
char bin_name[64];
if (vcp->vc_verbose > 1)
printf("OpenCL compiler flags: %s\n", opts ? opts : "");
sz = 128 * 1024;
buf = (char *) malloc(sz);
if (!buf) {
@ -568,6 +707,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp, @@ -568,6 +707,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
}
}
fclose(kfp);
rebuild:
prog = clCreateProgramWithBinary(vocp->voc_oclctx,
1, &vocp->voc_ocldid,
&szr,
@ -581,24 +721,22 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp, @@ -581,24 +721,22 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
return 0;
}
if (vcp->vc_verbose > 1)
printf("OpenCL compiler flags: %s\n", opts ? opts : "");
if (vcp->vc_verbose > 0) {
if (fromsource) {
if (fromsource && !patched) {
printf("Compiling kernel...");
fflush(stdout);
}
}
ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL);
if (ret != CL_SUCCESS) {
if ((vcp->vc_verbose > 0) && fromsource)
if ((vcp->vc_verbose > 0) && fromsource && !patched)
printf("failure.\n");
vg_ocl_error(NULL, ret, "clBuildProgram");
} else if ((vcp->vc_verbose > 0) && fromsource) {
} else if ((vcp->vc_verbose > 0) && fromsource && !patched) {
printf("done!\n");
}
if ((ret != CL_SUCCESS) || (vcp->vc_verbose > 1)) {
if ((ret != CL_SUCCESS) ||
((vcp->vc_verbose > 1) && fromsource && !patched)) {
vg_ocl_buildlog(vocp, prog);
}
if (ret != CL_SUCCESS) {
@ -641,6 +779,23 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp, @@ -641,6 +779,23 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
goto out;
}
if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) {
patched = vg_ocl_amd_patch(vocp,
(unsigned char *) buf, szr);
if (patched > 0) {
if (vcp->vc_verbose > 1)
printf("AMD BFI_INT patch complete\n");
clReleaseProgram(prog);
goto rebuild;
}
printf("WARNING: AMD BFI_INT patching failed\n");
if (patched < 0) {
/* Program was incompletely modified */
free(buf);
goto out;
}
}
kfp = fopen(bin_name, "wb");
if (!kfp) {
printf("WARNING: could not save CL kernel binary: %s\n",
@ -743,6 +898,9 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did, @@ -743,6 +898,9 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-DDEEP_VLIW ");
if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT)
end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-DAMD_BFI_INT ");
if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-cl-nv-verbose ");

Loading…
Cancel
Save