From dc2d553d5bad301ee553a1276abed4b9635b4d2c Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 14 Feb 2012 21:10:47 +1100 Subject: [PATCH] Hand optimise first variable declaration order in poclbm kernel. --- poclbm120213.cl | 97 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 25 deletions(-) diff --git a/poclbm120213.cl b/poclbm120213.cl index f2455c15..3ab6f636 100644 --- a/poclbm120213.cl +++ b/poclbm120213.cl @@ -88,31 +88,38 @@ __kernel void search(const uint state0, const uint state1, const uint state2, co W[20]=fcty_e; W[20]+=nonce; -W[16]=state0; -W[16]+=W[20]; -W[19]=d1; -W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25)); + +W[16]=W[20]; +W[16]+=state0; + +W[19]=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25)); +W[19]+=d1; W[19]+=ch(W[16],b1,c1); W[19]+=K[4]; -W[23]=h1; W[19]+=0x80000000; -W[23]+=W[19]; + +W[23]=W[19]; +W[23]+=h1; W[20]+=fcty_e2; W[19]+=(rotr(W[20],2)^rotr(W[20],13)^rotr(W[20],22)); + W[18]=c1; W[18]+=(rotr(W[23],6)^rotr(W[23],11)^rotr(W[23],25)); W[18]+=ch(W[23],W[16],b1); W[18]+=K[5]; -W[22]=g1; -W[22]+=W[18]; + +W[22]=W[18]; +W[22]+=g1; W[19]+=Ma2(g1,W[20],f1); W[18]+=(rotr(W[19],2)^rotr(W[19],13)^rotr(W[19],22)); + W[17]=b1; W[17]+=(rotr(W[22],6)^rotr(W[22],11)^rotr(W[22],25)); W[17]+=ch(W[22],W[23],W[16]); W[17]+=K[6]; -W[21]=f1; -W[21]+=W[17]; + +W[21]=W[17]; +W[21]+=f1; W[18]+=Ma2(f1,W[19],W[20]); W[17]+=(rotr(W[18],2)^rotr(W[18],13)^rotr(W[18],22)); W[16]+=(rotr(W[21],6)^rotr(W[21],11)^rotr(W[21],25)); @@ -184,6 +191,7 @@ W[22]+=K[17]; W[22]+=fw1; W[18]+=W[22]; W[22]+=(rotr(W[23],2)^rotr(W[23],13)^rotr(W[23],22)); + W[2]=(rotr(nonce,7)^rotr(nonce,18)^(nonce>>3U)); W[2]+=fw2; W[21]+=(rotr(W[18],6)^rotr(W[18],11)^rotr(W[18],25)); @@ -194,6 +202,7 @@ W[22]+=Ma(W[17],W[23],W[16]); W[17]+=W[21]; W[21]+=(rotr(W[22],2)^rotr(W[22],13)^rotr(W[22],22)); W[21]+=Ma(W[16],W[22],W[23]); + W[3]=nonce; W[3]+=fw3; W[20]+=(rotr(W[17],6)^rotr(W[17],11)^rotr(W[17],25)); @@ -202,6 +211,7 @@ W[20]+=K[19]; W[20]+=W[3]; W[16]+=W[20]; W[20]+=(rotr(W[21],2)^rotr(W[21],13)^rotr(W[21],22)); + W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); W[4]+=0x80000000; W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25)); @@ -215,10 +225,12 @@ W[19]+=Ma(W[22],W[20],W[21]); W[18]+=(rotr(W[23],6)^rotr(W[23],11)^rotr(W[23],25)); W[18]+=ch(W[23],W[16],W[17]); W[18]+=K[21]; + W[5]=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); W[18]+=W[5]; W[22]+=W[18]; W[18]+=(rotr(W[19],2)^rotr(W[19],13)^rotr(W[19],22)); + W[6]=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); W[6]+=0x00000280U; W[17]+=(rotr(W[22],6)^rotr(W[22],11)^rotr(W[22],25)); @@ -229,6 +241,7 @@ W[18]+=Ma(W[21],W[19],W[20]); W[21]+=W[17]; W[17]+=(rotr(W[18],2)^rotr(W[18],13)^rotr(W[18],22)); W[17]+=Ma(W[20],W[18],W[19]); + W[7]=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); W[7]+=fw0; W[16]+=(rotr(W[21],6)^rotr(W[21],11)^rotr(W[21],25)); @@ -237,6 +250,7 @@ W[16]+=K[23]; W[16]+=W[7]; W[20]+=W[16]; W[16]+=(rotr(W[17],2)^rotr(W[17],13)^rotr(W[17],22)); + W[8]=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); W[8]+=fw1; W[23]+=(rotr(W[20],6)^rotr(W[20],11)^rotr(W[20],25)); @@ -247,6 +261,7 @@ W[16]+=Ma(W[19],W[17],W[18]); W[19]+=W[23]; W[23]+=(rotr(W[16],2)^rotr(W[16],13)^rotr(W[16],22)); W[23]+=Ma(W[18],W[16],W[17]); + W[9]=W[2]; W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); W[22]+=(rotr(W[19],6)^rotr(W[19],11)^rotr(W[19],25)); @@ -255,6 +270,7 @@ W[22]+=K[25]; W[22]+=W[9]; W[18]+=W[22]; W[22]+=(rotr(W[23],2)^rotr(W[23],13)^rotr(W[23],22)); + W[10]=W[3]; W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); W[21]+=(rotr(W[18],6)^rotr(W[18],11)^rotr(W[18],25)); @@ -265,6 +281,7 @@ W[22]+=Ma(W[17],W[23],W[16]); W[17]+=W[21]; W[21]+=(rotr(W[22],2)^rotr(W[22],13)^rotr(W[22],22)); W[21]+=Ma(W[16],W[22],W[23]); + W[11]=W[4]; W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); W[20]+=(rotr(W[17],6)^rotr(W[17],11)^rotr(W[17],25)); @@ -273,6 +290,7 @@ W[20]+=K[27]; W[20]+=W[11]; W[16]+=W[20]; W[20]+=(rotr(W[21],2)^rotr(W[21],13)^rotr(W[21],22)); + W[12]=W[5]; W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25)); @@ -283,6 +301,7 @@ W[20]+=Ma(W[23],W[21],W[22]); W[23]+=W[19]; W[19]+=(rotr(W[20],2)^rotr(W[20],13)^rotr(W[20],22)); W[19]+=Ma(W[22],W[20],W[21]); + W[13]=W[6]; W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); W[18]+=(rotr(W[23],6)^rotr(W[23],11)^rotr(W[23],25)); @@ -291,6 +310,7 @@ W[18]+=K[29]; W[18]+=W[13]; W[22]+=W[18]; W[18]+=(rotr(W[19],2)^rotr(W[19],13)^rotr(W[19],22)); + W[14]=0x00a00055U; W[14]+=W[7]; W[17]+=(rotr(W[22],6)^rotr(W[22],11)^rotr(W[22],25)); @@ -302,6 +322,7 @@ W[18]+=Ma(W[21],W[19],W[20]); W[21]+=W[17]; W[17]+=(rotr(W[18],2)^rotr(W[18],13)^rotr(W[18],22)); W[17]+=Ma(W[20],W[18],W[19]); + W[15]=fw15; W[15]+=W[8]; W[16]+=(rotr(W[21],6)^rotr(W[21],11)^rotr(W[21],25)); @@ -311,6 +332,7 @@ W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); W[16]+=W[15]; W[20]+=W[16]; W[16]+=(rotr(W[17],2)^rotr(W[17],13)^rotr(W[17],22)); + W[0]=fw01r; W[0]+=W[9]; W[23]+=(rotr(W[20],6)^rotr(W[20],11)^rotr(W[20],25)); @@ -322,6 +344,7 @@ W[16]+=Ma(W[19],W[17],W[18]); W[19]+=W[23]; W[23]+=(rotr(W[16],2)^rotr(W[16],13)^rotr(W[16],22)); W[23]+=Ma(W[18],W[16],W[17]); + W[1]=fw1; W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); W[1]+=W[10]; @@ -633,52 +656,68 @@ W[16]+=W[15]; W[20]+=W[16]; W[16]+=(rotr(W[17],2)^rotr(W[17],13)^rotr(W[17],22)); W[16]+=Ma(W[19],W[17],W[18]); + W[0]=W[16]; -W[7]=W[23]; -W[7]+=state7; + +W[7]=state7; +W[7]+=W[23]; + W[23]=0xb0edbdd0; W[23]+=K[0]; W[0]+=state0; W[23]+=W[0]; -W[3]=W[19]; -W[3]+=state3; + +W[3]=state3; +W[3]+=W[19]; + W[19]=0xa54ff53a; W[19]+=W[23]; + W[1]=W[17]; W[1]+=state1; -W[6]=W[22]; -W[6]+=state6; + +W[6]=state6; +W[6]+=W[22]; + W[22]=0x1f83d9abU; W[22]+=(rotr(W[19],6)^rotr(W[19],11)^rotr(W[19],25)); W[22]+=(0x9b05688cU^(W[19]&0xca0b3af3U)); W[22]+=K[1]; -W[2]=W[18]; -W[2]+=state2; -W[22]+=W[1]; + +W[2]=state2; +W[2]+=W[18]; + W[18]=0x3c6ef372U; +W[22]+=W[1]; W[18]+=W[22]; W[23]+=0x08909ae5U; W[22]+=(rotr(W[23],2)^rotr(W[23],13)^rotr(W[23],22)); -W[5]=W[21]; -W[5]+=state5; + +W[5]=state5; +W[5]+=W[21]; + W[21]=0x9b05688cU; W[21]+=(rotr(W[18],6)^rotr(W[18],11)^rotr(W[18],25)); W[21]+=ch(W[18],W[19],0x510e527fU); W[21]+=K[2]; W[21]+=W[2]; + W[17]=0xbb67ae85U; W[17]+=W[21]; W[22]+=Ma2(0xbb67ae85U,W[23],0x6a09e667U); W[21]+=(rotr(W[22],2)^rotr(W[22],13)^rotr(W[22],22)); -W[4]=W[20]; -W[4]+=state4; + +W[4]=state4; +W[4]+=W[20]; + W[20]=0x510e527fU; W[20]+=(rotr(W[17],6)^rotr(W[17],11)^rotr(W[17],25)); W[20]+=ch(W[17],W[18],W[19]); W[20]+=K[3]; W[20]+=W[3]; -W[16]=0x6a09e667U; -W[16]+=W[20]; + +W[16]=W[20]; +W[16]+=0x6a09e667U; W[21]+=Ma2(0x6a09e667U,W[22],W[23]); W[20]+=(rotr(W[21],2)^rotr(W[21],13)^rotr(W[21],22)); W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25)); @@ -832,6 +871,7 @@ W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); W[16]+=W[7]; W[20]+=W[16]; W[16]+=(rotr(W[17],2)^rotr(W[17],13)^rotr(W[17],22)); + W[8]=0x80000000; W[8]+=W[1]; W[23]+=(rotr(W[20],6)^rotr(W[20],11)^rotr(W[20],25)); @@ -843,6 +883,7 @@ W[16]+=Ma(W[19],W[17],W[18]); W[19]+=W[23]; W[23]+=(rotr(W[16],2)^rotr(W[16],13)^rotr(W[16],22)); W[23]+=Ma(W[18],W[16],W[17]); + W[9]=W[2]; W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); W[22]+=(rotr(W[19],6)^rotr(W[19],11)^rotr(W[19],25)); @@ -851,6 +892,7 @@ W[22]+=K[25]; W[22]+=W[9]; W[18]+=W[22]; W[22]+=(rotr(W[23],2)^rotr(W[23],13)^rotr(W[23],22)); + W[10]=W[3]; W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); W[21]+=(rotr(W[18],6)^rotr(W[18],11)^rotr(W[18],25)); @@ -861,6 +903,7 @@ W[22]+=Ma(W[17],W[23],W[16]); W[17]+=W[21]; W[21]+=(rotr(W[22],2)^rotr(W[22],13)^rotr(W[22],22)); W[21]+=Ma(W[16],W[22],W[23]); + W[11]=W[4]; W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); W[20]+=(rotr(W[17],6)^rotr(W[17],11)^rotr(W[17],25)); @@ -869,6 +912,7 @@ W[20]+=K[27]; W[20]+=W[11]; W[16]+=W[20]; W[20]+=(rotr(W[21],2)^rotr(W[21],13)^rotr(W[21],22)); + W[12]=W[5]; W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25)); @@ -879,6 +923,7 @@ W[20]+=Ma(W[23],W[21],W[22]); W[23]+=W[19]; W[19]+=(rotr(W[20],2)^rotr(W[20],13)^rotr(W[20],22)); W[19]+=Ma(W[22],W[20],W[21]); + W[13]=W[6]; W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); W[18]+=(rotr(W[23],6)^rotr(W[23],11)^rotr(W[23],25)); @@ -887,6 +932,7 @@ W[18]+=K[29]; W[18]+=W[13]; W[22]+=W[18]; W[18]+=(rotr(W[19],2)^rotr(W[19],13)^rotr(W[19],22)); + W[14]=0x00400022U; W[14]+=W[7]; W[17]+=(rotr(W[22],6)^rotr(W[22],11)^rotr(W[22],25)); @@ -898,6 +944,7 @@ W[18]+=Ma(W[21],W[19],W[20]); W[21]+=W[17]; W[17]+=(rotr(W[18],2)^rotr(W[18],13)^rotr(W[18],22)); W[17]+=Ma(W[20],W[18],W[19]); + W[15]=0x00000100U; W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); W[15]+=W[8];