comparison libpostproc/postprocess_template.c @ 163:32e7f17a04a7 libavcodec

faster mmx2 / 3dnow deblocking filter; brightness_debug (draws luminance histogram & autodetected white/black level)
author michael
date Mon, 19 Nov 2001 22:20:30 +0000
parents d1a4f4ca7178
children dedb3aef2bee
comparison
equal deleted inserted replaced
162:de80712db90b 163:32e7f17a04a7
19 /* 19 /*
20 C MMX MMX2 3DNow 20 C MMX MMX2 3DNow
21 isVertDC Ec Ec 21 isVertDC Ec Ec
22 isVertMinMaxOk Ec Ec 22 isVertMinMaxOk Ec Ec
23 doVertLowPass E e e 23 doVertLowPass E e e
24 doVertDefFilter Ec Ec Ec 24 doVertDefFilter Ec Ec e e
25 isHorizDC Ec Ec 25 isHorizDC Ec Ec
26 isHorizMinMaxOk a E 26 isHorizMinMaxOk a E
27 doHorizLowPass E e e 27 doHorizLowPass E e e
28 doHorizDefFilter Ec Ec Ec 28 doHorizDefFilter Ec Ec e e
29 deRing E e e* 29 deRing E e e*
30 Vertical RKAlgo1 E a a 30 Vertical RKAlgo1 E a a
31 Horizontal RKAlgo1 a a 31 Horizontal RKAlgo1 a a
32 Vertical X1# a E E 32 Vertical X1# a E E
33 Horizontal X1# a E E 33 Horizontal X1# a E E
61 border remover 61 border remover
62 optimize c versions 62 optimize c versions
63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks 63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
64 smart blur 64 smart blur
65 ... 65 ...
66
67 Notes:
68 */ 66 */
69 67
70 //Changelog: use the CVS log 68 //Changelog: use the CVS log
71 69
72 #include "../config.h" 70 #include "../config.h"
78 #include <malloc.h> 76 #include <malloc.h>
79 #endif 77 #endif
80 //#undef HAVE_MMX2 78 //#undef HAVE_MMX2
81 //#define HAVE_3DNOW 79 //#define HAVE_3DNOW
82 //#undef HAVE_MMX 80 //#undef HAVE_MMX
81 //#define DEBUG_BRIGHTNESS
83 #include "postprocess.h" 82 #include "postprocess.h"
84 83
85 #define MIN(a,b) ((a) > (b) ? (b) : (a)) 84 #define MIN(a,b) ((a) > (b) ? (b) : (a))
86 #define MAX(a,b) ((a) < (b) ? (b) : (a)) 85 #define MAX(a,b) ((a) < (b) ? (b) : (a))
87 #define ABS(a) ((a) > 0 ? (a) : (-(a))) 86 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
1065 } 1064 }
1066 1065
1067 1066
1068 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) 1067 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1069 { 1068 {
1070 #ifdef HAVE_MMX 1069 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1070 /*
1071 uint8_t tmp[16];
1072 const int l1= stride;
1073 const int l2= stride + l1;
1074 const int l3= stride + l2;
1075 const int l4= (int)tmp - (int)src - stride*3;
1076 const int l5= (int)tmp - (int)src - stride*3 + 8;
1077 const int l6= stride*3 + l3;
1078 const int l7= stride + l6;
1079 const int l8= stride + l7;
1080
1081 memcpy(tmp, src+stride*7, 8);
1082 memcpy(tmp+8, src+stride*8, 8);
1083 */
1071 src+= stride*4; 1084 src+= stride*4;
1072 //FIXME try pmul for *5 stuff 1085 asm volatile(
1073 // src[0]=0; 1086
1087 #if 0 //slightly more accurate and slightly slower
1088 "pxor %%mm7, %%mm7 \n\t" // 0
1089 "leal (%0, %1), %%eax \n\t"
1090 "leal (%%eax, %1, 4), %%ebx \n\t"
1091 // 0 1 2 3 4 5 6 7
1092 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1093 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1094
1095
1096 "movq (%0, %1, 2), %%mm0 \n\t" // l2
1097 "movq (%0), %%mm1 \n\t" // l0
1098 "movq %%mm0, %%mm2 \n\t" // l2
1099 PAVGB(%%mm7, %%mm0) // ~l2/2
1100 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
1101 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
1102
1103 "movq (%%eax), %%mm1 \n\t" // l1
1104 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
1105 "movq %%mm1, %%mm4 \n\t" // l1
1106 PAVGB(%%mm7, %%mm1) // ~l1/2
1107 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
1108 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
1109
1110 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
1111 "psubusb %%mm1, %%mm0 \n\t"
1112 "psubusb %%mm4, %%mm1 \n\t"
1113 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
1114 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
1115
1116 "movq (%0, %1, 4), %%mm0 \n\t" // l4
1117 "movq %%mm0, %%mm4 \n\t" // l4
1118 PAVGB(%%mm7, %%mm0) // ~l4/2
1119 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
1120 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
1121
1122 "movq (%%ebx), %%mm2 \n\t" // l5
1123 "movq %%mm3, %%mm5 \n\t" // l3
1124 PAVGB(%%mm7, %%mm3) // ~l3/2
1125 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
1126 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
1127
1128 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
1129 "psubusb %%mm3, %%mm0 \n\t"
1130 "psubusb %%mm6, %%mm3 \n\t"
1131 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
1132 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
1133 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
1134
1135 "movq (%%ebx, %1), %%mm6 \n\t" // l6
1136 "movq %%mm6, %%mm5 \n\t" // l6
1137 PAVGB(%%mm7, %%mm6) // ~l6/2
1138 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
1139 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
1140
1141 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7
1142 "movq %%mm2, %%mm4 \n\t" // l5
1143 PAVGB(%%mm7, %%mm2) // ~l5/2
1144 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
1145 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
1146
1147 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
1148 "psubusb %%mm2, %%mm6 \n\t"
1149 "psubusb %%mm4, %%mm2 \n\t"
1150 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
1151 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
1152
1153
1154 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
1155 "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ?
1156 "paddusb b01, %%mm4 \n\t"
1157 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
1158 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
1159 "pand %%mm4, %%mm3 \n\t"
1160
1161 "movq %%mm3, %%mm1 \n\t"
1162 // "psubusb b01, %%mm3 \n\t"
1163 PAVGB(%%mm7, %%mm3)
1164 PAVGB(%%mm7, %%mm3)
1165 "paddusb %%mm1, %%mm3 \n\t"
1166 // "paddusb b01, %%mm3 \n\t"
1167
1168 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
1169 "movq (%0, %1, 4), %%mm5 \n\t" //l4
1170 "movq (%0, %1, 4), %%mm4 \n\t" //l4
1171 "psubusb %%mm6, %%mm5 \n\t"
1172 "psubusb %%mm4, %%mm6 \n\t"
1173 "por %%mm6, %%mm5 \n\t" // |l3-l4|
1174 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
1175 "pxor %%mm6, %%mm0 \n\t"
1176 "pand %%mm0, %%mm3 \n\t"
1177 PMINUB(%%mm5, %%mm3, %%mm0)
1178
1179 "psubusb b01, %%mm3 \n\t"
1180 PAVGB(%%mm7, %%mm3)
1181
1182 "movq (%%eax, %1, 2), %%mm0 \n\t"
1183 "movq (%0, %1, 4), %%mm2 \n\t"
1184 "pxor %%mm6, %%mm0 \n\t"
1185 "pxor %%mm6, %%mm2 \n\t"
1186 "psubb %%mm3, %%mm0 \n\t"
1187 "paddb %%mm3, %%mm2 \n\t"
1188 "pxor %%mm6, %%mm0 \n\t"
1189 "pxor %%mm6, %%mm2 \n\t"
1190 "movq %%mm0, (%%eax, %1, 2) \n\t"
1191 "movq %%mm2, (%0, %1, 4) \n\t"
1192 #endif
1193
1194 "leal (%0, %1), %%eax \n\t"
1195 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
1196 // 0 1 2 3 4 5 6 7
1197 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
1198 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
1199
1200
1201 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
1202 "movq (%0, %1, 4), %%mm0 \n\t" // l4
1203 "pxor %%mm6, %%mm1 \n\t" // -l3-1
1204 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
1205 // mm1=-l3-1, mm0=128-q
1206
1207 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
1208 "movq (%%eax, %1), %%mm3 \n\t" // l2
1209 "pxor %%mm6, %%mm2 \n\t" // -l5-1
1210 "movq %%mm2, %%mm5 \n\t" // -l5-1
1211 "movq b80, %%mm4 \n\t" // 128
1212 "leal (%%eax, %1, 4), %%ebx \n\t"
1213 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
1214 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
1215 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
1216 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
1217 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
1218
1219 "movq (%%eax), %%mm2 \n\t" // l1
1220 "pxor %%mm6, %%mm2 \n\t" // -l1-1
1221 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
1222 PAVGB((%0), %%mm1) // (l0-l3+256)/2
1223 "movq b80, %%mm3 \n\t" // 128
1224 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
1225 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
1226 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
1227 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
1228
1229 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2
1230 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7
1231 "pxor %%mm6, %%mm1 \n\t" // -l7-1
1232 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
1233 "movq b80, %%mm2 \n\t" // 128
1234 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
1235 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
1236 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
1237 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
1238
1239 "movq b00, %%mm1 \n\t" // 0
1240 "movq b00, %%mm5 \n\t" // 0
1241 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
1242 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
1243 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
1244 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
1245 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
1246
1247 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
1248
1249 "movq b00, %%mm7 \n\t" // 0
1250 "movq pQPb, %%mm2 \n\t" // QP
1251 PAVGB(%%mm6, %%mm2) // 128 + QP/2
1252 "psubb %%mm6, %%mm2 \n\t"
1253
1254 "movq %%mm4, %%mm1 \n\t"
1255 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
1256 "pxor %%mm1, %%mm4 \n\t"
1257 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
1258 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
1259 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
1260 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
1261
1262 "movq %%mm4, %%mm3 \n\t" // d
1263 "psubusb b01, %%mm4 \n\t"
1264 PAVGB(%%mm7, %%mm4) // d/32
1265 PAVGB(%%mm7, %%mm4) // (d + 32)/64
1266 "paddb %%mm3, %%mm4 \n\t" // 5d/64
1267 "pand %%mm2, %%mm4 \n\t"
1268
1269 "movq b80, %%mm5 \n\t" // 128
1270 "psubb %%mm0, %%mm5 \n\t" // q
1271 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
1272 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
1273 "pxor %%mm7, %%mm5 \n\t"
1274
1275 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
1276 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
1277
1278 "pand %%mm7, %%mm4 \n\t"
1279 "movq (%%eax, %1, 2), %%mm0 \n\t"
1280 "movq (%0, %1, 4), %%mm2 \n\t"
1281 "pxor %%mm1, %%mm0 \n\t"
1282 "pxor %%mm1, %%mm2 \n\t"
1283 "paddb %%mm4, %%mm0 \n\t"
1284 "psubb %%mm4, %%mm2 \n\t"
1285 "pxor %%mm1, %%mm0 \n\t"
1286 "pxor %%mm1, %%mm2 \n\t"
1287 "movq %%mm0, (%%eax, %1, 2) \n\t"
1288 "movq %%mm2, (%0, %1, 4) \n\t"
1289
1290 :
1291 : "r" (src), "r" (stride)
1292 : "%eax", "%ebx"
1293 );
1294
1295 /*
1296 {
1297 int x;
1298 src-= stride;
1299 for(x=0; x<BLOCK_SIZE; x++)
1300 {
1301 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1302 if(ABS(middleEnergy)< 8*QP)
1303 {
1304 const int q=(src[l4] - src[l5])/2;
1305 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1306 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1307
1308 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1309 d= MAX(d, 0);
1310
1311 d= (5*d + 32) >> 6;
1312 d*= SIGN(-middleEnergy);
1313
1314 if(q>0)
1315 {
1316 d= d<0 ? 0 : d;
1317 d= d>q ? q : d;
1318 }
1319 else
1320 {
1321 d= d>0 ? 0 : d;
1322 d= d<q ? q : d;
1323 }
1324
1325 src[l4]-= d;
1326 src[l5]+= d;
1327 }
1328 src++;
1329 }
1330 src-=8;
1331 for(x=0; x<8; x++)
1332 {
1333 int y;
1334 for(y=4; y<6; y++)
1335 {
1336 int d= src[x+y*stride] - tmp[x+(y-4)*8];
1337 int ad= ABS(d);
1338 static int max=0;
1339 static int sum=0;
1340 static int num=0;
1341 static int bias=0;
1342
1343 if(max<ad) max=ad;
1344 sum+= ad>3 ? 1 : 0;
1345 if(ad>3)
1346 {
1347 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
1348 }
1349 if(y==4) bias+=d;
1350 num++;
1351 if(num%1000000 == 0)
1352 {
1353 printf(" %d %d %d %d\n", num, sum, max, bias);
1354 }
1355 }
1356 }
1357 }
1358 */
1359 #elif defined (HAVE_MMX)
1360 src+= stride*4;
1361
1074 asm volatile( 1362 asm volatile(
1075 "pxor %%mm7, %%mm7 \n\t" 1363 "pxor %%mm7, %%mm7 \n\t"
1076 "leal (%0, %1), %%eax \n\t" 1364 "leal (%0, %1), %%eax \n\t"
1077 "leal (%%eax, %1, 4), %%ebx \n\t" 1365 "leal (%%eax, %1, 4), %%ebx \n\t"
1078 // 0 1 2 3 4 5 6 7 1366 // 0 1 2 3 4 5 6 7
3959 if(y+15 >= height) 4247 if(y+15 >= height)
3960 { 4248 {
3961 uint8_t *dstBlock= &(dst[y*dstStride]); 4249 uint8_t *dstBlock= &(dst[y*dstStride]);
3962 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); 4250 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
3963 } 4251 }
3964 } 4252 /*
4253 for(x=0; x<width; x+=32)
4254 {
4255 int i;
4256 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
4257 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
4258 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]
4259 + dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride]
4260 + dstBlock[x +15*dstStride];
4261 }
4262 */ }
3965 #ifdef HAVE_3DNOW 4263 #ifdef HAVE_3DNOW
3966 asm volatile("femms"); 4264 asm volatile("femms");
3967 #elif defined (HAVE_MMX) 4265 #elif defined (HAVE_MMX)
3968 asm volatile("emms"); 4266 asm volatile("emms");
3969 #endif 4267 #endif
3975 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", 4273 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
3976 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), 4274 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3977 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) 4275 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3978 , black, white); 4276 , black, white);
3979 #endif 4277 #endif
4278 #ifdef DEBUG_BRIGHTNESS
4279 if(!isColor)
4280 {
4281 int max=1;
4282 int i;
4283 for(i=0; i<256; i++)
4284 if(yHistogram[i] > max) max=yHistogram[i];
4285
4286 for(i=1; i<256; i++)
4287 {
4288 int x;
4289 int start=yHistogram[i-1]/(max/256+1);
4290 int end=yHistogram[i]/(max/256+1);
4291 int inc= end > start ? 1 : -1;
4292 for(x=start; x!=end+inc; x+=inc)
4293 dst[ i*dstStride + x]+=128;
4294 }
4295
4296 for(i=0; i<100; i+=2)
4297 {
4298 dst[ (white)*dstStride + i]+=128;
4299 dst[ (black)*dstStride + i]+=128;
4300 }
4301
4302 }
4303 #endif
4304
3980 } 4305 }