Mercurial > mplayer.hg
comparison postproc/postprocess_template.c @ 2246:f7c1485b33be
fixed the height%8!=0 bug
simplified a few things
removed last row variants of the deinterlace filters, they are not needed anymore
added cubic interpolating deinterlacer
author | michael |
---|---|
date | Wed, 17 Oct 2001 20:42:07 +0000 |
parents | 440b15b32181 |
children | 4840e356d0d3 |
comparison
equal
deleted
inserted
replaced
2245:156cb1809d72 | 2246:f7c1485b33be |
---|---|
28 doHorizDefFilter E ac ac | 28 doHorizDefFilter E ac ac |
29 deRing | 29 deRing |
30 Vertical RKAlgo1 E a a | 30 Vertical RKAlgo1 E a a |
31 Vertical X1 a E E | 31 Vertical X1 a E E |
32 Horizontal X1 a E E | 32 Horizontal X1 a E E |
33 LinIpolDeinterlace a E E* | 33 LinIpolDeinterlace e E E* |
34 LinBlendDeinterlace a E E* | 34 CubicIpolDeinterlace a e e* |
35 LinBlendDeinterlace e E E* | |
35 MedianDeinterlace Ec Ec | 36 MedianDeinterlace Ec Ec |
36 | 37 |
37 | 38 |
38 * i dont have a 3dnow CPU -> its untested | 39 * i dont have a 3dnow CPU -> its untested |
39 E = Exact implementation | 40 E = Exact implementation |
40 e = almost exact implementation | 41 e = almost exact implementation (slightly different rounding,...) |
41 a = alternative / approximate impl | 42 a = alternative / approximate impl |
42 c = checked against the other implementations (-vo md5) | 43 c = checked against the other implementations (-vo md5) |
43 */ | 44 */ |
44 | 45 |
45 /* | 46 /* |
60 fix warnings (unused vars, ...) | 61 fix warnings (unused vars, ...) |
61 noise reduction filters | 62 noise reduction filters |
62 ... | 63 ... |
63 | 64 |
64 Notes: | 65 Notes: |
65 | |
66 | 66 |
67 */ | 67 */ |
68 | 68 |
69 //Changelog: use the CVS log | 69 //Changelog: use the CVS log |
70 | 70 |
176 } | 176 } |
177 #endif | 177 #endif |
178 | 178 |
179 //FIXME? |255-0| = 1 (shouldnt be a problem ...) | 179 //FIXME? |255-0| = 1 (shouldnt be a problem ...) |
180 /** | 180 /** |
181 * Check if the middle 8x8 Block in the given 8x10 block is flat | 181 * Check if the middle 8x8 Block in the given 8x16 block is flat |
182 */ | 182 */ |
183 static inline int isVertDC(uint8_t src[], int stride){ | 183 static inline int isVertDC(uint8_t src[], int stride){ |
184 int numEq= 0; | 184 int numEq= 0; |
185 int y; | 185 int y; |
186 src+= stride; // src points to begin of the 8x8 Block | 186 src+= stride*4; // src points to begin of the 8x8 Block |
187 #ifdef HAVE_MMX | 187 #ifdef HAVE_MMX |
188 asm volatile( | 188 asm volatile( |
189 "pushl %1\n\t" | 189 "pushl %1\n\t" |
190 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F | 190 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F |
191 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D | 191 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D |
293 | 293 |
294 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) | 294 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) |
295 { | 295 { |
296 #ifdef HAVE_MMX | 296 #ifdef HAVE_MMX |
297 int isOk; | 297 int isOk; |
298 src+= stride*3; | |
298 asm volatile( | 299 asm volatile( |
299 // "int $3 \n\t" | 300 // "int $3 \n\t" |
300 "movq (%1, %2), %%mm0 \n\t" | 301 "movq (%1, %2), %%mm0 \n\t" |
301 "movq (%1, %2, 8), %%mm1 \n\t" | 302 "movq (%1, %2, 8), %%mm1 \n\t" |
302 "movq %%mm0, %%mm2 \n\t" | 303 "movq %%mm0, %%mm2 \n\t" |
318 return isOk ? 1 : 0; | 319 return isOk ? 1 : 0; |
319 #else | 320 #else |
320 | 321 |
321 int isOk2= 1; | 322 int isOk2= 1; |
322 int x; | 323 int x; |
324 src+= stride*3; | |
323 for(x=0; x<BLOCK_SIZE; x++) | 325 for(x=0; x<BLOCK_SIZE; x++) |
324 { | 326 { |
325 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; | 327 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; |
326 } | 328 } |
327 /* if(isOk && !isOk2 || !isOk && isOk2) | 329 /* if(isOk && !isOk2 || !isOk && isOk2) |
341 #endif | 343 #endif |
342 | 344 |
343 } | 345 } |
344 | 346 |
345 /** | 347 /** |
346 * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle) | 348 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
347 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 | 349 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
348 */ | 350 */ |
349 static inline void doVertLowPass(uint8_t *src, int stride, int QP) | 351 static inline void doVertLowPass(uint8_t *src, int stride, int QP) |
350 { | 352 { |
351 // QP= 64; | |
352 | |
353 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 353 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
354 //#ifdef HAVE_MMX2 | 354 src+= stride*3; |
355 asm volatile( //"movv %0 %1 %2\n\t" | 355 asm volatile( //"movv %0 %1 %2\n\t" |
356 "pushl %0 \n\t" | 356 "pushl %0 \n\t" |
357 "movq pQPb, %%mm0 \n\t" // QP,..., QP | 357 "movq pQPb, %%mm0 \n\t" // QP,..., QP |
358 // "movq bFF , %%mm0 \n\t" // QP,..., QP | |
359 | 358 |
360 "movq (%0), %%mm6 \n\t" | 359 "movq (%0), %%mm6 \n\t" |
361 "movq (%0, %1), %%mm5 \n\t" | 360 "movq (%0, %1), %%mm5 \n\t" |
362 "movq %%mm5, %%mm1 \n\t" | 361 "movq %%mm5, %%mm1 \n\t" |
363 "movq %%mm6, %%mm2 \n\t" | 362 "movq %%mm6, %%mm2 \n\t" |
393 // 1 2 3 4 5 6 7 8 | 392 // 1 2 3 4 5 6 7 8 |
394 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 | 393 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 |
395 // 6 4 2 2 1 1 | 394 // 6 4 2 2 1 1 |
396 // 6 4 4 2 | 395 // 6 4 4 2 |
397 // 6 8 2 | 396 // 6 8 2 |
398 /* | 397 |
399 "movq %%mm6, %%mm2 \n\t" //1 | |
400 "movq %%mm6, %%mm3 \n\t" //1 | |
401 "paddusb b02, %%mm3 \n\t" | |
402 "psrlw $2, %%mm3 \n\t" //1 /4 | |
403 "pand b3F, %%mm3 \n\t" | |
404 "psubb %%mm3, %%mm2 \n\t" | |
405 "movq (%0, %1), %%mm0 \n\t" // 1 | |
406 "movq %%mm0, %%mm1 \n\t" // 1 | |
407 "paddusb b02, %%mm0 \n\t" | |
408 "psrlw $2, %%mm0 \n\t" // 1 /4 | |
409 "pand b3F, %%mm0 \n\t" | |
410 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4 | |
411 */ | |
412 "movq (%0, %1), %%mm0 \n\t" // 1 | 398 "movq (%0, %1), %%mm0 \n\t" // 1 |
413 "movq %%mm0, %%mm1 \n\t" // 1 | 399 "movq %%mm0, %%mm1 \n\t" // 1 |
414 PAVGB(%%mm6, %%mm0) //1 1 /2 | 400 PAVGB(%%mm6, %%mm0) //1 1 /2 |
415 PAVGB(%%mm6, %%mm0) //3 1 /4 | 401 PAVGB(%%mm6, %%mm0) //3 1 /4 |
416 | 402 |
468 PAVGB(%%mm4, %%mm5) // 11 /2 | 454 PAVGB(%%mm4, %%mm5) // 11 /2 |
469 PAVGB(%%mm5, %%mm0) // 11 11 /4 | 455 PAVGB(%%mm5, %%mm0) // 11 11 /4 |
470 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 | 456 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
471 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 | 457 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
472 PAVGB(%%mm0, %%mm1) // 11224222 /16 | 458 PAVGB(%%mm0, %%mm1) // 11224222 /16 |
473 // "pxor %%mm1, %%mm1 \n\t" | |
474 "movq %%mm1, (%%eax, %1, 2) \n\t" // X | 459 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
475 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | 460 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
476 PAVGB((%%ebx), %%mm2) // 112 4 /8 | 461 PAVGB((%%ebx), %%mm2) // 112 4 /8 |
477 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | 462 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
478 PAVGB(%%mm0, %%mm6) // 1 1 /2 | 463 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
479 PAVGB(%%mm7, %%mm6) // 1 12 /4 | 464 PAVGB(%%mm7, %%mm6) // 1 12 /4 |
480 PAVGB(%%mm2, %%mm6) // 1122424 /4 | 465 PAVGB(%%mm2, %%mm6) // 1122424 /4 |
481 // "pxor %%mm6, %%mm6 \n\t" | |
482 "movq %%mm6, (%%ebx) \n\t" // X | 466 "movq %%mm6, (%%ebx) \n\t" // X |
483 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | 467 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
484 PAVGB(%%mm7, %%mm5) // 11 2 /4 | 468 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
485 PAVGB(%%mm7, %%mm5) // 11 6 /8 | 469 PAVGB(%%mm7, %%mm5) // 11 6 /8 |
486 | 470 |
487 PAVGB(%%mm3, %%mm0) // 112 /4 | 471 PAVGB(%%mm3, %%mm0) // 112 /4 |
488 PAVGB(%%mm0, %%mm5) // 112246 /16 | 472 PAVGB(%%mm0, %%mm5) // 112246 /16 |
489 // "pxor %%mm5, %%mm5 \n\t" | |
490 // "movq pQPb, %%mm5 \n\t" | |
491 "movq %%mm5, (%%eax, %1, 4) \n\t" // X | 473 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
492 "popl %0\n\t" | 474 "popl %0\n\t" |
493 | 475 |
494 : | 476 : |
495 : "r" (src), "r" (stride) | 477 : "r" (src), "r" (stride) |
504 const int l6= stride + l5; | 486 const int l6= stride + l5; |
505 const int l7= stride + l6; | 487 const int l7= stride + l6; |
506 const int l8= stride + l7; | 488 const int l8= stride + l7; |
507 const int l9= stride + l8; | 489 const int l9= stride + l8; |
508 int x; | 490 int x; |
491 src+= stride*3; | |
509 for(x=0; x<BLOCK_SIZE; x++) | 492 for(x=0; x<BLOCK_SIZE; x++) |
510 { | 493 { |
511 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; | 494 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; |
512 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; | 495 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; |
513 | 496 |
549 1 12 12 23 | 532 1 12 12 23 |
550 */ | 533 */ |
551 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) | 534 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) |
552 { | 535 { |
553 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 536 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
537 src+= stride*3; | |
554 // FIXME rounding | 538 // FIXME rounding |
555 asm volatile( | 539 asm volatile( |
556 "pxor %%mm7, %%mm7 \n\t" // 0 | 540 "pxor %%mm7, %%mm7 \n\t" // 0 |
557 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | 541 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE |
558 "leal (%0, %1), %%eax \n\t" | 542 "leal (%0, %1), %%eax \n\t" |
620 const int l6= stride + l5; | 604 const int l6= stride + l5; |
621 const int l7= stride + l6; | 605 const int l7= stride + l6; |
622 const int l8= stride + l7; | 606 const int l8= stride + l7; |
623 const int l9= stride + l8; | 607 const int l9= stride + l8; |
624 int x; | 608 int x; |
609 src+= stride*3; | |
625 for(x=0; x<BLOCK_SIZE; x++) | 610 for(x=0; x<BLOCK_SIZE; x++) |
626 { | 611 { |
627 if(ABS(src[l4]-src[l5]) < QP + QP/4) | 612 if(ABS(src[l4]-src[l5]) < QP + QP/4) |
628 { | 613 { |
629 int v = (src[l5] - src[l4]); | 614 int v = (src[l5] - src[l4]); |
648 * MMX2 version does correct clipping C version doesnt | 633 * MMX2 version does correct clipping C version doesnt |
649 */ | 634 */ |
650 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | 635 static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
651 { | 636 { |
652 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 637 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
638 src+= stride*3; | |
639 | |
653 asm volatile( | 640 asm volatile( |
654 "pxor %%mm7, %%mm7 \n\t" // 0 | 641 "pxor %%mm7, %%mm7 \n\t" // 0 |
655 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | 642 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE |
656 "leal (%0, %1), %%eax \n\t" | 643 "leal (%0, %1), %%eax \n\t" |
657 "leal (%%eax, %1, 4), %%ebx \n\t" | 644 "leal (%%eax, %1, 4), %%ebx \n\t" |
742 const int l6= stride + l5; | 729 const int l6= stride + l5; |
743 const int l7= stride + l6; | 730 const int l7= stride + l6; |
744 const int l8= stride + l7; | 731 const int l8= stride + l7; |
745 const int l9= stride + l8; | 732 const int l9= stride + l8; |
746 int x; | 733 int x; |
734 | |
735 src+= stride*3; | |
747 for(x=0; x<BLOCK_SIZE; x++) | 736 for(x=0; x<BLOCK_SIZE; x++) |
748 { | 737 { |
749 int a= src[l3] - src[l4]; | 738 int a= src[l3] - src[l4]; |
750 int b= src[l4] - src[l5]; | 739 int b= src[l4] - src[l5]; |
751 int c= src[l5] - src[l6]; | 740 int c= src[l5] - src[l6]; |
1005 | 994 |
1006 | 995 |
1007 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 996 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
1008 { | 997 { |
1009 #ifdef HAVE_MMX | 998 #ifdef HAVE_MMX |
1010 src+= stride; | 999 src+= stride*4; |
1011 //FIXME try pmul for *5 stuff | 1000 //FIXME try pmul for *5 stuff |
1012 // src[0]=0; | 1001 // src[0]=0; |
1013 asm volatile( | 1002 asm volatile( |
1014 "pxor %%mm7, %%mm7 \n\t" | 1003 "pxor %%mm7, %%mm7 \n\t" |
1015 "leal (%0, %1), %%eax \n\t" | 1004 "leal (%0, %1), %%eax \n\t" |
1152 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | 1141 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
1153 "pxor %%mm7, %%mm5 \n\t" | 1142 "pxor %%mm7, %%mm5 \n\t" |
1154 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | 1143 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
1155 // 100 opcodes | 1144 // 100 opcodes |
1156 "movd %2, %%mm2 \n\t" // QP | 1145 "movd %2, %%mm2 \n\t" // QP |
1157 //"pcmpeqb %%mm2, %%mm2\n\t" | |
1158 "punpcklwd %%mm2, %%mm2 \n\t" | 1146 "punpcklwd %%mm2, %%mm2 \n\t" |
1159 "punpcklwd %%mm2, %%mm2 \n\t" | 1147 "punpcklwd %%mm2, %%mm2 \n\t" |
1160 "psllw $3, %%mm2 \n\t" // 8QP | 1148 "psllw $3, %%mm2 \n\t" // 8QP |
1161 "movq %%mm2, %%mm3 \n\t" // 8QP | 1149 "movq %%mm2, %%mm3 \n\t" // 8QP |
1162 "pcmpgtw %%mm4, %%mm2 \n\t" | 1150 "pcmpgtw %%mm4, %%mm2 \n\t" |
1230 "movq (%%eax, %1, 2), %%mm0 \n\t" | 1218 "movq (%%eax, %1, 2), %%mm0 \n\t" |
1231 "paddb %%mm4, %%mm0 \n\t" | 1219 "paddb %%mm4, %%mm0 \n\t" |
1232 "movq %%mm0, (%%eax, %1, 2) \n\t" | 1220 "movq %%mm0, (%%eax, %1, 2) \n\t" |
1233 "movq (%0, %1, 4), %%mm0 \n\t" | 1221 "movq (%0, %1, 4), %%mm0 \n\t" |
1234 "psubb %%mm4, %%mm0 \n\t" | 1222 "psubb %%mm4, %%mm0 \n\t" |
1235 // "pxor %%mm0, %%mm0 \n\t" | |
1236 "movq %%mm0, (%0, %1, 4) \n\t" | 1223 "movq %%mm0, (%0, %1, 4) \n\t" |
1237 | 1224 |
1238 : | 1225 : |
1239 : "r" (src), "r" (stride), "r" (QP) | 1226 : "r" (src), "r" (stride), "r" (QP) |
1240 : "%eax", "%ebx" | 1227 : "%eax", "%ebx" |
1248 const int l6= stride + l5; | 1235 const int l6= stride + l5; |
1249 const int l7= stride + l6; | 1236 const int l7= stride + l6; |
1250 const int l8= stride + l7; | 1237 const int l8= stride + l7; |
1251 // const int l9= stride + l8; | 1238 // const int l9= stride + l8; |
1252 int x; | 1239 int x; |
1240 src+= stride*3; | |
1253 for(x=0; x<BLOCK_SIZE; x++) | 1241 for(x=0; x<BLOCK_SIZE; x++) |
1254 { | 1242 { |
1255 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | 1243 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
1256 if(ABS(middleEnergy) < 8*QP) | 1244 if(ABS(middleEnergy) < 8*QP) |
1257 { | 1245 { |
1879 #endif | 1867 #endif |
1880 } | 1868 } |
1881 | 1869 |
1882 /** | 1870 /** |
1883 * Deinterlaces the given block | 1871 * Deinterlaces the given block |
1884 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | 1872 * will be called for every 8x8 block, and can read & write into an 8x16 block |
1885 */ | 1873 */ |
1886 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) | 1874 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) |
1887 { | 1875 { |
1888 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1876 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1889 asm volatile( | 1877 asm volatile( |
1892 // 0 1 2 3 4 5 6 7 8 9 | 1880 // 0 1 2 3 4 5 6 7 8 9 |
1893 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1881 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
1894 | 1882 |
1895 "movq (%0), %%mm0 \n\t" | 1883 "movq (%0), %%mm0 \n\t" |
1896 "movq (%%eax, %1), %%mm1 \n\t" | 1884 "movq (%%eax, %1), %%mm1 \n\t" |
1897 PAVGB(%%mm1, %%mm0)\ | 1885 PAVGB(%%mm1, %%mm0) |
1898 "movq %%mm0, (%%eax) \n\t" | 1886 "movq %%mm0, (%%eax) \n\t" |
1899 "movq (%0, %1, 4), %%mm0 \n\t" | 1887 "movq (%0, %1, 4), %%mm0 \n\t" |
1900 PAVGB(%%mm0, %%mm1)\ | 1888 PAVGB(%%mm0, %%mm1) |
1901 "movq %%mm1, (%%eax, %1, 2) \n\t" | 1889 "movq %%mm1, (%%eax, %1, 2) \n\t" |
1902 "movq (%%ebx, %1), %%mm1 \n\t" | 1890 "movq (%%ebx, %1), %%mm1 \n\t" |
1903 PAVGB(%%mm1, %%mm0)\ | 1891 PAVGB(%%mm1, %%mm0) |
1904 "movq %%mm0, (%%ebx) \n\t" | 1892 "movq %%mm0, (%%ebx) \n\t" |
1905 "movq (%0, %1, 8), %%mm0 \n\t" | 1893 "movq (%0, %1, 8), %%mm0 \n\t" |
1906 PAVGB(%%mm0, %%mm1)\ | 1894 PAVGB(%%mm0, %%mm1) |
1907 "movq %%mm1, (%%ebx, %1, 2) \n\t" | 1895 "movq %%mm1, (%%ebx, %1, 2) \n\t" |
1908 | 1896 |
1909 : : "r" (src), "r" (stride) | 1897 : : "r" (src), "r" (stride) |
1910 : "%eax", "%ebx" | 1898 : "%eax", "%ebx" |
1911 ); | 1899 ); |
1922 #endif | 1910 #endif |
1923 } | 1911 } |
1924 | 1912 |
1925 /** | 1913 /** |
1926 * Deinterlaces the given block | 1914 * Deinterlaces the given block |
1927 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | 1915 * will be called for every 8x8 block, and can read & write into an 8x16 block |
1916 * no clipping in C version |
1928 */ | 1917 */ |
1929 static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride) | 1918 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) |
1930 { | 1919 { |
1931 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1920 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1932 asm volatile( | 1921 asm volatile( |
1933 "leal (%0, %1), %%eax \n\t" | 1922 "leal (%0, %1), %%eax \n\t" |
1934 "leal (%%eax, %1, 4), %%ebx \n\t" | 1923 "leal (%%eax, %1, 4), %%ebx \n\t" |
1935 // 0 1 2 3 4 5 6 7 8 9 | 1924 "leal (%%ebx, %1, 4), %%ecx \n\t" |
1936 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1925 "addl %1, %%ecx \n\t" |
1937 | 1926 "pxor %%mm7, %%mm7 \n\t" |
1938 "movq (%0), %%mm0 \n\t" | 1927 // 0 1 2 3 4 5 6 7 8 9 10 |
1939 "movq (%%eax, %1), %%mm1 \n\t" | 1928 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx |
1940 PAVGB(%%mm1, %%mm0)\ | 1929 |
1941 "movq %%mm0, (%%eax) \n\t" | 1930 #define DEINT_CUBIC(a,b,c,d,e)\ |
1942 "movq (%0, %1, 4), %%mm0 \n\t" | 1931 "movq " #a ", %%mm0 \n\t"\ |
1943 PAVGB(%%mm0, %%mm1)\ | 1932 "movq " #b ", %%mm1 \n\t"\ |
1944 "movq %%mm1, (%%eax, %1, 2) \n\t" | 1933 "movq " #d ", %%mm2 \n\t"\ |
1945 "movq (%%ebx, %1), %%mm1 \n\t" | 1934 "movq " #e ", %%mm3 \n\t"\ |
1946 PAVGB(%%mm1, %%mm0)\ | 1935 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ |
1947 "movq %%mm0, (%%ebx) \n\t" | 1936 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ |
1948 "movq %%mm1, (%%ebx, %1, 2) \n\t" | 1937 "movq %%mm0, %%mm2 \n\t"\ |
1949 | 1938 "punpcklbw %%mm7, %%mm0 \n\t"\ |
1939 "punpckhbw %%mm7, %%mm2 \n\t"\ | |
1940 "movq %%mm1, %%mm3 \n\t"\ | |
1941 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1942 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1943 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ | |
1944 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ | |
1945 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ | |
1946 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ | |
1947 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ | |
1948 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ | |
1949 "packuswb %%mm3, %%mm1 \n\t"\ | |
1950 "movq %%mm1, " #c " \n\t" | |
1951 | |
1952 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1)) | |
1953 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8)) | |
1954 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx)) | |
1955 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2)) | |
1950 | 1956 |
1951 : : "r" (src), "r" (stride) | 1957 : : "r" (src), "r" (stride) |
1952 : "%eax", "%ebx" | 1958 : "%eax", "%ebx", "ecx" |
1953 ); | 1959 ); |
1954 #else | 1960 #else |
1955 int x; | 1961 int x; |
1956 for(x=0; x<8; x++) | 1962 for(x=0; x<8; x++) |
1957 { | 1963 { |
1958 src[stride] = (src[0] + src[stride*2])>>1; | 1964 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; |
1959 src[stride*3] = (src[stride*2] + src[stride*4])>>1; | 1965 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; |
1960 src[stride*5] = (src[stride*4] + src[stride*6])>>1; | 1966 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; |
1961 src[stride*7] = src[stride*6]; | 1967 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4; |
1962 src++; | 1968 src++; |
1963 } | 1969 } |
1964 #endif | 1970 #endif |
1965 } | 1971 } |
1966 | 1972 |
1967 /** | 1973 /** |
1968 * Deinterlaces the given block | 1974 * Deinterlaces the given block |
1969 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | 1975 * will be called for every 8x8 block, and can read & write into an 8x16 block |
1970 * will shift the image up by 1 line (FIXME if this is a problem) | 1976 * will shift the image up by 1 line (FIXME if this is a problem) |
1971 */ | 1977 */ |
1972 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) | 1978 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) |
1973 { | 1979 { |
1974 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1980 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2034 #endif | 2040 #endif |
2035 } | 2041 } |
2036 | 2042 |
2037 /** | 2043 /** |
2038 * Deinterlaces the given block | 2044 * Deinterlaces the given block |
2039 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |
2040 * will shift the image up by 1 line (FIXME if this is a problem) | |
2041 */ | |
2042 static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride) | |
2043 { | |
2044 #if defined (HAVE_MMSX2) || defined (HAVE_3DNOW) | |
2045 asm volatile( | |
2046 "leal (%0, %1), %%eax \n\t" | |
2047 "leal (%%eax, %1, 4), %%ebx \n\t" | |
2048 // 0 1 2 3 4 5 6 7 8 9 | |
2049 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
2050 | |
2051 "movq (%0), %%mm0 \n\t" // L0 | |
2052 "movq (%%eax, %1), %%mm1 \n\t" // L2 | |
2053 PAVGB(%%mm1, %%mm0) // L0+L2 | |
2054 "movq (%%eax), %%mm2 \n\t" // L1 | |
2055 PAVGB(%%mm2, %%mm0) | |
2056 "movq %%mm0, (%0) \n\t" | |
2057 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 | |
2058 PAVGB(%%mm0, %%mm2) // L1+L3 | |
2059 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 | |
2060 "movq %%mm2, (%%eax) \n\t" | |
2061 "movq (%0, %1, 4), %%mm2 \n\t" // L4 | |
2062 PAVGB(%%mm2, %%mm1) // L2+L4 | |
2063 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 | |
2064 "movq %%mm1, (%%eax, %1) \n\t" | |
2065 "movq (%%ebx), %%mm1 \n\t" // L5 | |
2066 PAVGB(%%mm1, %%mm0) // L3+L5 | |
2067 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 | |
2068 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
2069 "movq (%%ebx, %1), %%mm0 \n\t" // L6 | |
2070 PAVGB(%%mm0, %%mm2) // L4+L6 | |
2071 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 | |
2072 "movq %%mm2, (%0, %1, 4) \n\t" | |
2073 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 | |
2074 PAVGB(%%mm2, %%mm1) // L5+L7 | |
2075 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 | |
2076 "movq %%mm1, (%%ebx) \n\t" | |
2077 PAVGB(%%mm2, %%mm0) // L7 + L8 | |
2078 "movq %%mm0, (%%ebx, %1) \n\t" | |
2079 "movq %%mm0, (%%ebx, %1, 2) \n\t" | |
2080 | |
2081 : : "r" (src), "r" (stride) | |
2082 : "%eax", "%ebx" | |
2083 ); | |
2084 #else | |
2085 int x; | |
2086 for(x=0; x<8; x++) | |
2087 { | |
2088 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |
2089 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |
2090 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |
2091 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |
2092 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |
2093 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |
2094 src[stride*6] = (src[stride*6] + src[stride*7])>>1; | |
2095 src[stride*7] = src[stride*6]; | |
2096 src++; | |
2097 } | |
2098 #endif | |
2099 } | |
2100 | |
2101 /** | |
2102 * Deinterlaces the given block | |
2103 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | 2045 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block |
2104 */ | 2046 */ |
2105 static inline void deInterlaceMedian(uint8_t src[], int stride) | 2047 static inline void deInterlaceMedian(uint8_t src[], int stride) |
2106 { | 2048 { |
2107 #ifdef HAVE_MMX | 2049 #ifdef HAVE_MMX |
2211 src++; | 2153 src++; |
2212 } | 2154 } |
2213 #endif | 2155 #endif |
2214 } | 2156 } |
2215 | 2157 |
2216 /** | |
2217 * Deinterlaces the given block | |
2218 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |
2219 */ | |
2220 static inline void deInterlaceMedianLastRow(uint8_t src[], int stride) | |
2221 { | |
2222 #ifdef HAVE_MMX | |
2223 #ifdef HAVE_MMX2 | |
2224 asm volatile( | |
2225 "leal (%0, %1), %%eax \n\t" | |
2226 "leal (%%eax, %1, 4), %%ebx \n\t" | |
2227 // 0 1 2 3 4 5 6 7 8 9 | |
2228 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
2229 | |
2230 "movq (%0), %%mm0 \n\t" // | |
2231 "movq (%%eax, %1), %%mm2 \n\t" // | |
2232 "movq (%%eax), %%mm1 \n\t" // | |
2233 "movq %%mm0, %%mm3 \n\t" | |
2234 "pmaxub %%mm1, %%mm0 \n\t" // | |
2235 "pminub %%mm3, %%mm1 \n\t" // | |
2236 "pmaxub %%mm2, %%mm1 \n\t" // | |
2237 "pminub %%mm1, %%mm0 \n\t" | |
2238 "movq %%mm0, (%%eax) \n\t" | |
2239 | |
2240 "movq (%0, %1, 4), %%mm0 \n\t" // | |
2241 "movq (%%eax, %1, 2), %%mm1 \n\t" // | |
2242 "movq %%mm2, %%mm3 \n\t" | |
2243 "pmaxub %%mm1, %%mm2 \n\t" // | |
2244 "pminub %%mm3, %%mm1 \n\t" // | |
2245 "pmaxub %%mm0, %%mm1 \n\t" // | |
2246 "pminub %%mm1, %%mm2 \n\t" | |
2247 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
2248 | |
2249 "movq (%%ebx), %%mm2 \n\t" // | |
2250 "movq (%%ebx, %1), %%mm1 \n\t" // | |
2251 "movq %%mm2, %%mm3 \n\t" | |
2252 "pmaxub %%mm0, %%mm2 \n\t" // | |
2253 "pminub %%mm3, %%mm0 \n\t" // | |
2254 "pmaxub %%mm1, %%mm0 \n\t" // | |
2255 "pminub %%mm0, %%mm2 \n\t" | |
2256 "movq %%mm2, (%%ebx) \n\t" | |
2257 | |
2258 "movq %%mm1, (%%ebx, %1, 2) \n\t" | |
2259 | |
2260 : : "r" (src), "r" (stride) | |
2261 : "%eax", "%ebx" | |
2262 ); | |
2263 #else //MMX & no MMX2 | |
2264 asm volatile( | |
2265 "leal (%0, %1), %%eax \n\t" | |
2266 "leal (%%eax, %1, 4), %%ebx \n\t" | |
2267 // 0 1 2 3 4 5 6 7 8 9 | |
2268 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
2269 "pxor %%mm7, %%mm7 \n\t" | |
2270 | |
2271 MEDIAN((%0), (%%eax), (%%eax, %1)) | |
2272 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) | |
2273 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1)) | |
2274 | |
2275 "movq (%%ebx, %1), %%mm0 \n\t" | |
2276 "movq %%mm0, (%%ebx, %1, 2) \n\t" | |
2277 | |
2278 : : "r" (src), "r" (stride) | |
2279 : "%eax", "%ebx" | |
2280 ); | |
2281 | |
2282 #endif //MMX | |
2283 #else | |
2284 //FIXME | |
2285 int x; | |
2286 for(x=0; x<8; x++) | |
2287 { | |
2288 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |
2289 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |
2290 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |
2291 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |
2292 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |
2293 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |
2294 src[stride*6] = (src[stride*6] + src[stride*7])>>1; | |
2295 src[stride*7] = src[stride*6]; | |
2296 src++; | |
2297 } | |
2298 #endif | |
2299 } | |
2300 | |
2301 #ifdef HAVE_ODIVX_POSTPROCESS | 2158 #ifdef HAVE_ODIVX_POSTPROCESS |
2302 #include "../opendivx/postprocess.h" | 2159 #include "../opendivx/postprocess.h" |
2303 int use_old_pp=0; | 2160 int use_old_pp=0; |
2304 #endif | 2161 #endif |
2305 | 2162 |
2535 /* we need 64bit here otherwise we'll be going to have a problem | 2392 /* we need 64bit here otherwise we'll be going to have a problem |
2536 after watching a black picture for 5 hours*/ | 2393 after watching a black picture for 5 hours*/ |
2537 static uint64_t *yHistogram= NULL; | 2394 static uint64_t *yHistogram= NULL; |
2538 int black=0, white=255; // blackest black and whitest white in the picture | 2395 int black=0, white=255; // blackest black and whitest white in the picture |
2539 | 2396 |
2397 /* Temporary buffers for handling the last row(s) */ | |
2398 static uint8_t *tempDst= NULL; | |
2399 static uint8_t *tempSrc= NULL; | |
2400 | |
2540 #ifdef TIMING | 2401 #ifdef TIMING |
2541 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; | 2402 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; |
2542 sumTime= rdtsc(); | 2403 sumTime= rdtsc(); |
2543 #endif | 2404 #endif |
2405 | |
2406 if(tempDst==NULL) | |
2407 { | |
2408 tempDst= (uint8_t*)memalign(8, 1024*24); | |
2409 tempSrc= (uint8_t*)memalign(8, 1024*24); | |
2410 } | |
2544 | 2411 |
2545 if(!yHistogram) | 2412 if(!yHistogram) |
2546 { | 2413 { |
2547 int i; | 2414 int i; |
2548 yHistogram= (uint64_t*)malloc(8*256); | 2415 yHistogram= (uint64_t*)malloc(8*256); |
2567 // printf("%d ", yHistogram[i]); | 2434 // printf("%d ", yHistogram[i]); |
2568 } | 2435 } |
2569 // printf("\n\n"); | 2436 // printf("\n\n"); |
2570 | 2437 |
2571 /* we allways get a completly black picture first */ | 2438 /* we allways get a completly black picture first */ |
2572 | |
2573 maxClipped= (uint64_t)(sum * maxClippedThreshold); | 2439 maxClipped= (uint64_t)(sum * maxClippedThreshold); |
2574 | 2440 |
2575 clipped= sum; | 2441 clipped= sum; |
2576 for(black=255; black>0; black--) | 2442 for(black=255; black>0; black--) |
2577 { | 2443 { |
2602 { | 2468 { |
2603 packedYScale= 0x0100010001000100LL; | 2469 packedYScale= 0x0100010001000100LL; |
2604 packedYOffset= 0; | 2470 packedYOffset= 0; |
2605 } | 2471 } |
2606 | 2472 |
2473 /* copy first row of 8x8 blocks */ | |
2607 for(x=0; x<width; x+=BLOCK_SIZE) | 2474 for(x=0; x<width; x+=BLOCK_SIZE) |
2608 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); | 2475 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); |
2609 | 2476 |
2610 for(y=0; y<height-7; y+=BLOCK_SIZE) | 2477 for(y=0; y<height; y+=BLOCK_SIZE) |
2611 { | 2478 { |
2612 //1% speedup if these are here instead of the inner loop | 2479 //1% speedup if these are here instead of the inner loop |
2613 uint8_t *srcBlock= &(src[y*srcStride]); | 2480 uint8_t *srcBlock= &(src[y*srcStride]); |
2614 uint8_t *dstBlock= &(dst[y*dstStride]); | 2481 uint8_t *dstBlock= &(dst[y*dstStride]); |
2615 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start | 2482 |
2616 uint8_t *vertBlock= &(dstBlock[dstStride*3]); | 2483 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not |
2484 than use a temporary buffer */ | |
2485 if(y+15 >= height) | |
2486 { | |
2487 /* copy from line 5 to 12 of src, these will e copied with | |
2488 blockcopy to dst later */ | |
2489 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5, | |
2490 srcStride*MAX(height-y-5, 0) ); | |
2491 | |
2492 /* duplicate last line to fill the void upto line 12 */ | |
2493 if(y+12 >= height) | |
2494 { | |
2495 int i; | |
2496 for(i=height-y; i<=12; i++) | |
2497 memcpy(tempSrc + srcStride*i, | |
2498 src + srcStride*(height-1), srcStride); | |
2499 } | |
2500 | |
2501 | |
2502 /* copy up to 5 lines of dst */ | |
2503 memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) ); | |
2504 dstBlock= tempDst; | |
2505 srcBlock= tempSrc; | |
2506 } | |
2617 | 2507 |
2618 // finish 1 block before the next otherwise we might have a problem | 2508 // finish 1 block before the next otherwise we might have a problem |
2619 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing | 2509 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
2620 for(x=0; x<width; x+=BLOCK_SIZE) | 2510 for(x=0; x<width; x+=BLOCK_SIZE) |
2621 { | 2511 { |
2623 int QP= isColor ? | 2513 int QP= isColor ? |
2624 QPs[(y>>3)*QPStride + (x>>3)]: | 2514 QPs[(y>>3)*QPStride + (x>>3)]: |
2625 QPs[(y>>4)*QPStride + (x>>4)]; | 2515 QPs[(y>>4)*QPStride + (x>>4)]; |
2626 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; | 2516 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; |
2627 #ifdef HAVE_MMX | 2517 #ifdef HAVE_MMX |
2628 asm volatile( | 2518 asm volatile( |
2629 "movd %0, %%mm7 \n\t" | 2519 "movd %0, %%mm7 \n\t" |
2630 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | 2520 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
2631 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | 2521 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
2632 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | 2522 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP |
2633 "movq %%mm7, pQPb \n\t" | 2523 "movq %%mm7, pQPb \n\t" |
2634 : : "r" (QP) | 2524 : : "r" (QP) |
2635 ); | 2525 ); |
2636 #endif | 2526 #endif |
2637 | 2527 |
2638 | |
2639 if(y + 12 < height) | |
2640 { | |
2641 #ifdef MORE_TIMING | 2528 #ifdef MORE_TIMING |
2642 T0= rdtsc(); | 2529 T0= rdtsc(); |
2643 #endif | 2530 #endif |
2644 | 2531 |
2645 #ifdef HAVE_MMX2 | 2532 #ifdef HAVE_MMX2 |
2646 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | 2533 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
2647 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | 2534 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
2648 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | 2535 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
2649 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | 2536 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
2650 #elif defined(HAVE_3DNOW) | 2537 #elif defined(HAVE_3DNOW) |
2651 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | 2538 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
2652 /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | 2539 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
2653 prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | 2540 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
2654 prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | 2541 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
2655 prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | 2542 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
2656 */ | 2543 */ |
2657 #endif | 2544 #endif |
2658 if(!isColor) yHistogram[ srcBlock[0] ]++; | 2545 |
2659 | 2546 if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++; |
2660 blockCopy(vertBlock + dstStride*2, dstStride, | 2547 |
2661 vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); | 2548 blockCopy(dstBlock + dstStride*5, dstStride, |
2662 | 2549 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX); |
2663 if(mode & LINEAR_IPOL_DEINT_FILTER) | 2550 |
2664 deInterlaceInterpolateLinear(dstBlock, dstStride); | 2551 if(mode & LINEAR_IPOL_DEINT_FILTER) |
2665 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 2552 deInterlaceInterpolateLinear(dstBlock, dstStride); |
2666 deInterlaceBlendLinear(dstBlock, dstStride); | 2553 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
2667 else if(mode & MEDIAN_DEINT_FILTER) | 2554 deInterlaceBlendLinear(dstBlock, dstStride); |
2668 deInterlaceMedian(dstBlock, dstStride); | 2555 else if(mode & MEDIAN_DEINT_FILTER) |
2669 /* else if(mode & CUBIC_IPOL_DEINT_FILTER) | 2556 deInterlaceMedian(dstBlock, dstStride); |
2670 deInterlaceInterpolateCubic(dstBlock, dstStride); | 2557 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
2671 else if(mode & CUBIC_BLEND_DEINT_FILTER) | 2558 deInterlaceInterpolateCubic(dstBlock, dstStride); |
2672 deInterlaceBlendCubic(dstBlock, dstStride); | 2559 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
2560 deInterlaceBlendCubic(dstBlock, dstStride); | |
2673 */ | 2561 */ |
2674 | 2562 |
2563 /* only deblock if we have 2 blocks */ | |
2564 if(y + 8 < height) | |
2565 { | |
2675 #ifdef MORE_TIMING | 2566 #ifdef MORE_TIMING |
2676 T1= rdtsc(); | 2567 T1= rdtsc(); |
2677 memcpyTime+= T1-T0; | 2568 memcpyTime+= T1-T0; |
2678 T0=T1; | 2569 T0=T1; |
2679 #endif | 2570 #endif |
2680 if(mode & V_DEBLOCK) | 2571 if(mode & V_DEBLOCK) |
2681 { | 2572 { |
2682 if(mode & V_RK1_FILTER) | 2573 if(mode & V_RK1_FILTER) |
2683 vertRK1Filter(vertBlock, stride, QP); | 2574 vertRK1Filter(dstBlock, stride, QP); |
2684 else if(mode & V_X1_FILTER) | 2575 else if(mode & V_X1_FILTER) |
2685 vertX1Filter(vertBlock, stride, QP); | 2576 vertX1Filter(dstBlock, stride, QP); |
2686 else | 2577 else |
2687 { | 2578 { |
2688 if( isVertDC(vertBlock, stride)) | 2579 if( isVertDC(dstBlock, stride)) |
2689 { | 2580 { |
2690 if(isVertMinMaxOk(vertBlock, stride, QP)) | 2581 if(isVertMinMaxOk(dstBlock, stride, QP)) |
2691 doVertLowPass(vertBlock, stride, QP); | 2582 doVertLowPass(dstBlock, stride, QP); |
2692 } | 2583 } |
2693 else | 2584 else |
2694 doVertDefFilter(vertBlock, stride, QP); | 2585 doVertDefFilter(dstBlock, stride, QP); |
2695 } | 2586 } |
2696 } | 2587 } |
2697 #ifdef MORE_TIMING | 2588 #ifdef MORE_TIMING |
2698 T1= rdtsc(); | 2589 T1= rdtsc(); |
2699 vertTime+= T1-T0; | 2590 vertTime+= T1-T0; |
2700 T0=T1; | 2591 T0=T1; |
2701 #endif | 2592 #endif |
2702 } | 2593 } |
2703 else | 2594 |
2704 { | 2595 /* check if we have a previous block to deblock it with dstBlock */ |
2705 blockCopy(vertBlock + dstStride*1, dstStride, | |
2706 vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX); | |
2707 | |
2708 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
2709 deInterlaceInterpolateLinearLastRow(dstBlock, dstStride); | |
2710 else if(mode & LINEAR_BLEND_DEINT_FILTER) | |
2711 deInterlaceBlendLinearLastRow(dstBlock, dstStride); | |
2712 else if(mode & MEDIAN_DEINT_FILTER) | |
2713 deInterlaceMedianLastRow(dstBlock, dstStride); | |
2714 /* else if(mode & CUBIC_IPOL_DEINT_FILTER) | |
2715 deInterlaceInterpolateCubicLastRow(dstBlock, dstStride); | |
2716 else if(mode & CUBIC_BLEND_DEINT_FILTER) | |
2717 deInterlaceBlendCubicLastRow(dstBlock, dstStride); | |
2718 */ | |
2719 } | |
2720 | |
2721 if(x - 8 >= 0 && x<width) | 2596 if(x - 8 >= 0 && x<width) |
2722 { | 2597 { |
2723 #ifdef MORE_TIMING | 2598 #ifdef MORE_TIMING |
2724 T0= rdtsc(); | 2599 T0= rdtsc(); |
2725 #endif | 2600 #endif |
2747 } | 2622 } |
2748 else if(y!=0) | 2623 else if(y!=0) |
2749 dering(dstBlock - stride*9 + width-9, stride, QP); | 2624 dering(dstBlock - stride*9 + width-9, stride, QP); |
2750 //FIXME dering filter will not be applied to last block (bottom right) | 2625 //FIXME dering filter will not be applied to last block (bottom right) |
2751 | 2626 |
2752 | |
2753 dstBlock+=8; | 2627 dstBlock+=8; |
2754 srcBlock+=8; | 2628 srcBlock+=8; |
2755 vertBlock+=8; | 2629 } |
2756 vertSrcBlock+=8; | 2630 |
2631 /* did we use a tmp buffer */ | |
2632 if(y+15 > height) | |
2633 { | |
2634 uint8_t *dstBlock= &(dst[y*dstStride]); | |
2635 memcpy(dstBlock, tempDst, dstStride*(height-y) ); | |
2757 } | 2636 } |
2758 } | 2637 } |
2759 #ifdef HAVE_3DNOW | 2638 #ifdef HAVE_3DNOW |
2760 asm volatile("femms"); | 2639 asm volatile("femms"); |
2761 #elif defined (HAVE_MMX) | 2640 #elif defined (HAVE_MMX) |
2770 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), | 2649 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), |
2771 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) | 2650 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) |
2772 , black, white); | 2651 , black, white); |
2773 #endif | 2652 #endif |
2774 } | 2653 } |
2775 | |
2776 |