comparison postproc/postprocess.c @ 2246:f7c1485b33be

fixed the height%8!=0 bug; simplified a few things; removed the last-row variants of the deinterlace filters, they are not needed anymore; added a cubic interpolating deinterlacer
author michael
date Wed, 17 Oct 2001 20:42:07 +0000
parents 440b15b32181
children 4840e356d0d3
compared revisions: 2245:156cb1809d72 (old) vs 2246:f7c1485b33be (new)
28 doHorizDefFilter E ac ac 28 doHorizDefFilter E ac ac
29 deRing 29 deRing
30 Vertical RKAlgo1 E a a 30 Vertical RKAlgo1 E a a
31 Vertical X1 a E E 31 Vertical X1 a E E
32 Horizontal X1 a E E 32 Horizontal X1 a E E
33 LinIpolDeinterlace a E E* 33 LinIpolDeinterlace e E E*
34 LinBlendDeinterlace a E E* 34 CubicIpolDeinterlace a e e*
35 LinBlendDeinterlace e E E*
35 MedianDeinterlace Ec Ec 36 MedianDeinterlace Ec Ec
36 37
37 38
38 * I don't have a 3DNow CPU -> it's untested 39 * I don't have a 3DNow CPU -> it's untested
39 E = Exact implementation 40 E = Exact implementation
39 e = almost exact implementation 40 e = almost exact implementation (slightly different rounding, ...)
41 a = alternative / approximate impl 42 a = alternative / approximate impl
42 c = checked against the other implementations (-vo md5) 43 c = checked against the other implementations (-vo md5)
43 */ 44 */
44 45
45 /* 46 /*
60 fix warnings (unused vars, ...) 61 fix warnings (unused vars, ...)
61 noise reduction filters 62 noise reduction filters
62 ... 63 ...
63 64
64 Notes: 65 Notes:
65
66 66
67 */ 67 */
68 68
69 //Changelog: use the CVS log 69 //Changelog: use the CVS log
70 70
176 } 176 }
177 #endif 177 #endif
178 178
179 //FIXME? |255-0| = 1 (shouldn't be a problem ...) 179 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
180 /** 180 /**
181 * Check if the middle 8x8 Block in the given 8x10 block is flat 181 * Check if the middle 8x8 Block in the given 8x16 block is flat
182 */ 182 */
183 static inline int isVertDC(uint8_t src[], int stride){ 183 static inline int isVertDC(uint8_t src[], int stride){
184 int numEq= 0; 184 int numEq= 0;
185 int y; 185 int y;
186 src+= stride; // src points to begin of the 8x8 Block 186 src+= stride*4; // src points to begin of the 8x8 Block
187 #ifdef HAVE_MMX 187 #ifdef HAVE_MMX
188 asm volatile( 188 asm volatile(
189 "pushl %1\n\t" 189 "pushl %1\n\t"
190 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F 190 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
191 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D 191 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
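
For reference, the flatness test in plain C has roughly this shape (a sketch; the per-pair threshold and the numEq cutoff are assumptions, the real values live in the non-MMX branch of this function):

static inline int isVertDC_sketch(uint8_t src[], int stride)
{
	/* count vertical neighbour pairs in the middle 8x8 block that
	 * differ by at most 1; the block is "flat" if nearly all do */
	int numEq= 0, x, y;
	src+= stride*4;	// middle 8x8 of the 8x16 block
	for(y=0; y<BLOCK_SIZE-1; y++)
		for(x=0; x<BLOCK_SIZE; x++)
			if(abs(src[x + y*stride] - src[x + (y+1)*stride]) <= 1)
				numEq++;
	return numEq > (BLOCK_SIZE-1)*BLOCK_SIZE - BLOCK_SIZE;	// hypothetical cutoff
}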
293 293
294 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) 294 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
295 { 295 {
296 #ifdef HAVE_MMX 296 #ifdef HAVE_MMX
297 int isOk; 297 int isOk;
298 src+= stride*3;
298 asm volatile( 299 asm volatile(
299 // "int $3 \n\t" 300 // "int $3 \n\t"
300 "movq (%1, %2), %%mm0 \n\t" 301 "movq (%1, %2), %%mm0 \n\t"
301 "movq (%1, %2, 8), %%mm1 \n\t" 302 "movq (%1, %2, 8), %%mm1 \n\t"
302 "movq %%mm0, %%mm2 \n\t" 303 "movq %%mm0, %%mm2 \n\t"
318 return isOk ? 1 : 0; 319 return isOk ? 1 : 0;
319 #else 320 #else
320 321
321 int isOk2= 1; 322 int isOk2= 1;
322 int x; 323 int x;
324 src+= stride*3;
323 for(x=0; x<BLOCK_SIZE; x++) 325 for(x=0; x<BLOCK_SIZE; x++)
324 { 326 {
325 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; 327 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
326 } 328 }
327 /* if(isOk && !isOk2 || !isOk && isOk2) 329 /* if(isOk && !isOk2 || !isOk && isOk2)
341 #endif 343 #endif
342 344
343 } 345 }
344 346
345 /** 347 /**
346 * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle) 348 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
347 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 349 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
348 */ 350 */
349 static inline void doVertLowPass(uint8_t *src, int stride, int QP) 351 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
350 { 352 {
351 // QP= 64;
352
353 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 353 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
354 //#ifdef HAVE_MMX2 354 src+= stride*3;
355 asm volatile( //"movv %0 %1 %2\n\t" 355 asm volatile( //"movv %0 %1 %2\n\t"
356 "pushl %0 \n\t" 356 "pushl %0 \n\t"
357 "movq pQPb, %%mm0 \n\t" // QP,..., QP 357 "movq pQPb, %%mm0 \n\t" // QP,..., QP
358 // "movq bFF , %%mm0 \n\t" // QP,..., QP
359 358
360 "movq (%0), %%mm6 \n\t" 359 "movq (%0), %%mm6 \n\t"
361 "movq (%0, %1), %%mm5 \n\t" 360 "movq (%0, %1), %%mm5 \n\t"
362 "movq %%mm5, %%mm1 \n\t" 361 "movq %%mm5, %%mm1 \n\t"
363 "movq %%mm6, %%mm2 \n\t" 362 "movq %%mm6, %%mm2 \n\t"
393 // 1 2 3 4 5 6 7 8 392 // 1 2 3 4 5 6 7 8
394 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 393 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
395 // 6 4 2 2 1 1 394 // 6 4 2 2 1 1
396 // 6 4 4 2 395 // 6 4 4 2
397 // 6 8 2 396 // 6 8 2
398 /* 397
399 "movq %%mm6, %%mm2 \n\t" //1
400 "movq %%mm6, %%mm3 \n\t" //1
401 "paddusb b02, %%mm3 \n\t"
402 "psrlw $2, %%mm3 \n\t" //1 /4
403 "pand b3F, %%mm3 \n\t"
404 "psubb %%mm3, %%mm2 \n\t"
405 "movq (%0, %1), %%mm0 \n\t" // 1
406 "movq %%mm0, %%mm1 \n\t" // 1
407 "paddusb b02, %%mm0 \n\t"
408 "psrlw $2, %%mm0 \n\t" // 1 /4
409 "pand b3F, %%mm0 \n\t"
410 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4
411 */
412 "movq (%0, %1), %%mm0 \n\t" // 1 398 "movq (%0, %1), %%mm0 \n\t" // 1
413 "movq %%mm0, %%mm1 \n\t" // 1 399 "movq %%mm0, %%mm1 \n\t" // 1
414 PAVGB(%%mm6, %%mm0) //1 1 /2 400 PAVGB(%%mm6, %%mm0) //1 1 /2
415 PAVGB(%%mm6, %%mm0) //3 1 /4 401 PAVGB(%%mm6, %%mm0) //3 1 /4
416 402
468 PAVGB(%%mm4, %%mm5) // 11 /2 454 PAVGB(%%mm4, %%mm5) // 11 /2
469 PAVGB(%%mm5, %%mm0) // 11 11 /4 455 PAVGB(%%mm5, %%mm0) // 11 11 /4
470 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 456 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
471 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 457 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
472 PAVGB(%%mm0, %%mm1) // 11224222 /16 458 PAVGB(%%mm0, %%mm1) // 11224222 /16
473 // "pxor %%mm1, %%mm1 \n\t"
474 "movq %%mm1, (%%eax, %1, 2) \n\t" // X 459 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
475 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 460 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
476 PAVGB((%%ebx), %%mm2) // 112 4 /8 461 PAVGB((%%ebx), %%mm2) // 112 4 /8
477 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 462 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
478 PAVGB(%%mm0, %%mm6) // 1 1 /2 463 PAVGB(%%mm0, %%mm6) // 1 1 /2
479 PAVGB(%%mm7, %%mm6) // 1 12 /4 464 PAVGB(%%mm7, %%mm6) // 1 12 /4
480 PAVGB(%%mm2, %%mm6) // 1122424 /4 465 PAVGB(%%mm2, %%mm6) // 1122424 /4
481 // "pxor %%mm6, %%mm6 \n\t"
482 "movq %%mm6, (%%ebx) \n\t" // X 466 "movq %%mm6, (%%ebx) \n\t" // X
483 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 467 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
484 PAVGB(%%mm7, %%mm5) // 11 2 /4 468 PAVGB(%%mm7, %%mm5) // 11 2 /4
485 PAVGB(%%mm7, %%mm5) // 11 6 /8 469 PAVGB(%%mm7, %%mm5) // 11 6 /8
486 470
487 PAVGB(%%mm3, %%mm0) // 112 /4 471 PAVGB(%%mm3, %%mm0) // 112 /4
488 PAVGB(%%mm0, %%mm5) // 112246 /16 472 PAVGB(%%mm0, %%mm5) // 112246 /16
489 // "pxor %%mm5, %%mm5 \n\t"
490 // "movq pQPb, %%mm5 \n\t"
491 "movq %%mm5, (%%eax, %1, 4) \n\t" // X 473 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
492 "popl %0\n\t" 474 "popl %0\n\t"
493 475
494 : 476 :
495 : "r" (src), "r" (stride) 477 : "r" (src), "r" (stride)
504 const int l6= stride + l5; 486 const int l6= stride + l5;
505 const int l7= stride + l6; 487 const int l7= stride + l6;
506 const int l8= stride + l7; 488 const int l8= stride + l7;
507 const int l9= stride + l8; 489 const int l9= stride + l8;
508 int x; 490 int x;
491 src+= stride*3;
509 for(x=0; x<BLOCK_SIZE; x++) 492 for(x=0; x<BLOCK_SIZE; x++)
510 { 493 {
511 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; 494 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
512 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; 495 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
513 496
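
In scalar terms the PAVGB cascade above builds the same smoothing the C fallback computes per column; a minimal sketch of one column of the 9-tap filter (edge handling by clamping to the first/last substitutes chosen above; the +8 rounding is an assumption):

static void lowPassColumn9(int p[10])
{
	/* p[0..9] are the ten source lines of one column, with p[0]=first and
	 * p[9]=last already substituted; lines p[1..8] are overwritten with the
	 * (1,1,2,2,4,2,2,1,1)/16 weighted sum centered on each line */
	int out[8], n;
	for(n=1; n<=8; n++)
	{
#define TAP(i) p[(i) < 0 ? 0 : ((i) > 9 ? 9 : (i))]	/* clamp taps at the edges */
		out[n-1]= (   TAP(n-4) +   TAP(n-3) + 2*TAP(n-2) + 2*TAP(n-1)
		          + 4*TAP(n)   + 2*TAP(n+1) + 2*TAP(n+2) +   TAP(n+3)
		          +   TAP(n+4) + 8) >> 4;
#undef TAP
	}
	for(n=0; n<8; n++) p[n+1]= out[n];
}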
549 1 12 12 23 532 1 12 12 23
550 */ 533 */
551 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) 534 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
552 { 535 {
553 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 536 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
537 src+= stride*3;
554 // FIXME rounding 538 // FIXME rounding
555 asm volatile( 539 asm volatile(
556 "pxor %%mm7, %%mm7 \n\t" // 0 540 "pxor %%mm7, %%mm7 \n\t" // 0
557 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE 541 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
558 "leal (%0, %1), %%eax \n\t" 542 "leal (%0, %1), %%eax \n\t"
620 const int l6= stride + l5; 604 const int l6= stride + l5;
621 const int l7= stride + l6; 605 const int l7= stride + l6;
622 const int l8= stride + l7; 606 const int l8= stride + l7;
623 const int l9= stride + l8; 607 const int l9= stride + l8;
624 int x; 608 int x;
609 src+= stride*3;
625 for(x=0; x<BLOCK_SIZE; x++) 610 for(x=0; x<BLOCK_SIZE; x++)
626 { 611 {
627 if(ABS(src[l4]-src[l5]) < QP + QP/4) 612 if(ABS(src[l4]-src[l5]) < QP + QP/4)
628 { 613 {
629 int v = (src[l5] - src[l4]); 614 int v = (src[l5] - src[l4]);
648 * MMX2 version does correct clipping, the C version doesn't 633 * MMX2 version does correct clipping, the C version doesn't
649 */ 634 */
650 static inline void vertX1Filter(uint8_t *src, int stride, int QP) 635 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
651 { 636 {
652 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 637 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
638 src+= stride*3;
639
653 asm volatile( 640 asm volatile(
654 "pxor %%mm7, %%mm7 \n\t" // 0 641 "pxor %%mm7, %%mm7 \n\t" // 0
655 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE 642 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
656 "leal (%0, %1), %%eax \n\t" 643 "leal (%0, %1), %%eax \n\t"
657 "leal (%%eax, %1, 4), %%ebx \n\t" 644 "leal (%%eax, %1, 4), %%ebx \n\t"
742 const int l6= stride + l5; 729 const int l6= stride + l5;
743 const int l7= stride + l6; 730 const int l7= stride + l6;
744 const int l8= stride + l7; 731 const int l8= stride + l7;
745 const int l9= stride + l8; 732 const int l9= stride + l8;
746 int x; 733 int x;
734
735 src+= stride*3;
747 for(x=0; x<BLOCK_SIZE; x++) 736 for(x=0; x<BLOCK_SIZE; x++)
748 { 737 {
749 int a= src[l3] - src[l4]; 738 int a= src[l3] - src[l4];
750 int b= src[l4] - src[l5]; 739 int b= src[l4] - src[l5];
751 int c= src[l5] - src[l6]; 740 int c= src[l5] - src[l6];
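
The C path continues past this fragment; the correction such an X1-style filter applies once a, b and c are known has roughly this shape (a sketch; the shifts and the d<QP gate are assumptions, ABS and SIGN are assumed to be the file's helper macros):

/* damp the step b between lines 4 and 5 when it stands out
 * against its neighbour gradients a and c */
int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
if(d > 0 && d < QP)
{
	int v= d * SIGN(-b);
	src[l3]+= v>>3;	// spread the correction over four lines
	src[l4]+= v>>1;
	src[l5]-= v>>1;
	src[l6]-= v>>3;
}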
1005 994
1006 995
1007 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) 996 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1008 { 997 {
1009 #ifdef HAVE_MMX 998 #ifdef HAVE_MMX
1010 src+= stride; 999 src+= stride*4;
1011 //FIXME try pmul for *5 stuff 1000 //FIXME try pmul for *5 stuff
1012 // src[0]=0; 1001 // src[0]=0;
1013 asm volatile( 1002 asm volatile(
1014 "pxor %%mm7, %%mm7 \n\t" 1003 "pxor %%mm7, %%mm7 \n\t"
1015 "leal (%0, %1), %%eax \n\t" 1004 "leal (%0, %1), %%eax \n\t"
1152 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 1141 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1153 "pxor %%mm7, %%mm5 \n\t" 1142 "pxor %%mm7, %%mm5 \n\t"
1154 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 1143 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1155 // 100 opcodes 1144 // 100 opcodes
1156 "movd %2, %%mm2 \n\t" // QP 1145 "movd %2, %%mm2 \n\t" // QP
1157 //"pcmpeqb %%mm2, %%mm2\n\t"
1158 "punpcklwd %%mm2, %%mm2 \n\t" 1146 "punpcklwd %%mm2, %%mm2 \n\t"
1159 "punpcklwd %%mm2, %%mm2 \n\t" 1147 "punpcklwd %%mm2, %%mm2 \n\t"
1160 "psllw $3, %%mm2 \n\t" // 8QP 1148 "psllw $3, %%mm2 \n\t" // 8QP
1161 "movq %%mm2, %%mm3 \n\t" // 8QP 1149 "movq %%mm2, %%mm3 \n\t" // 8QP
1162 "pcmpgtw %%mm4, %%mm2 \n\t" 1150 "pcmpgtw %%mm4, %%mm2 \n\t"
1230 "movq (%%eax, %1, 2), %%mm0 \n\t" 1218 "movq (%%eax, %1, 2), %%mm0 \n\t"
1231 "paddb %%mm4, %%mm0 \n\t" 1219 "paddb %%mm4, %%mm0 \n\t"
1232 "movq %%mm0, (%%eax, %1, 2) \n\t" 1220 "movq %%mm0, (%%eax, %1, 2) \n\t"
1233 "movq (%0, %1, 4), %%mm0 \n\t" 1221 "movq (%0, %1, 4), %%mm0 \n\t"
1234 "psubb %%mm4, %%mm0 \n\t" 1222 "psubb %%mm4, %%mm0 \n\t"
1235 // "pxor %%mm0, %%mm0 \n\t"
1236 "movq %%mm0, (%0, %1, 4) \n\t" 1223 "movq %%mm0, (%0, %1, 4) \n\t"
1237 1224
1238 : 1225 :
1239 : "r" (src), "r" (stride), "r" (QP) 1226 : "r" (src), "r" (stride), "r" (QP)
1240 : "%eax", "%ebx" 1227 : "%eax", "%ebx"
1248 const int l6= stride + l5; 1235 const int l6= stride + l5;
1249 const int l7= stride + l6; 1236 const int l7= stride + l6;
1250 const int l8= stride + l7; 1237 const int l8= stride + l7;
1251 // const int l9= stride + l8; 1238 // const int l9= stride + l8;
1252 int x; 1239 int x;
1240 src+= stride*3;
1253 for(x=0; x<BLOCK_SIZE; x++) 1241 for(x=0; x<BLOCK_SIZE; x++)
1254 { 1242 {
1255 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 1243 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1256 if(ABS(middleEnergy) < 8*QP) 1244 if(ABS(middleEnergy) < 8*QP)
1257 { 1245 {
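
Inside the |middleEnergy| < 8*QP branch the filter moves l4 and l5 towards each other by a clipped fraction of the middle energy; a sketch of that continuation (the 5/64 scaling and the half-gap clipping rule are assumptions, not a copy of the elided code; ABS, MIN and SIGN are assumed to be the file's helper macros):

const int q= (src[l4] - src[l5])/2;
const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
int d= ABS(middleEnergy) - MIN(ABS(leftEnergy), ABS(rightEnergy));
if(d < 0) d= 0;
d= (5*d + 32) >> 6;		// scale, assumed constant
d*= SIGN(-middleEnergy);
/* never overshoot the midpoint between l4 and l5 */
if(q > 0) { if(d < 0) d= 0; if(d > q) d= q; }
else      { if(d > 0) d= 0; if(d < q) d= q; }
src[l4]-= d;
src[l5]+= d;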
1879 #endif 1867 #endif
1880 } 1868 }
1881 1869
1882 /** 1870 /**
1883 * Deinterlaces the given block 1871 * Deinterlaces the given block
1884 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block 1872 * will be called for every 8x8 block, and can read & write into an 8x16 block
1885 */ 1873 */
1886 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) 1874 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
1887 { 1875 {
1888 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1876 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1889 asm volatile( 1877 asm volatile(
1892 // 0 1 2 3 4 5 6 7 8 9 1880 // 0 1 2 3 4 5 6 7 8 9
1893 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1881 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1894 1882
1895 "movq (%0), %%mm0 \n\t" 1883 "movq (%0), %%mm0 \n\t"
1896 "movq (%%eax, %1), %%mm1 \n\t" 1884 "movq (%%eax, %1), %%mm1 \n\t"
1897 PAVGB(%%mm1, %%mm0)\ 1885 PAVGB(%%mm1, %%mm0)
1898 "movq %%mm0, (%%eax) \n\t" 1886 "movq %%mm0, (%%eax) \n\t"
1899 "movq (%0, %1, 4), %%mm0 \n\t" 1887 "movq (%0, %1, 4), %%mm0 \n\t"
1900 PAVGB(%%mm0, %%mm1)\ 1888 PAVGB(%%mm0, %%mm1)
1901 "movq %%mm1, (%%eax, %1, 2) \n\t" 1889 "movq %%mm1, (%%eax, %1, 2) \n\t"
1902 "movq (%%ebx, %1), %%mm1 \n\t" 1890 "movq (%%ebx, %1), %%mm1 \n\t"
1903 PAVGB(%%mm1, %%mm0)\ 1891 PAVGB(%%mm1, %%mm0)
1904 "movq %%mm0, (%%ebx) \n\t" 1892 "movq %%mm0, (%%ebx) \n\t"
1905 "movq (%0, %1, 8), %%mm0 \n\t" 1893 "movq (%0, %1, 8), %%mm0 \n\t"
1906 PAVGB(%%mm0, %%mm1)\ 1894 PAVGB(%%mm0, %%mm1)
1907 "movq %%mm1, (%%ebx, %1, 2) \n\t" 1895 "movq %%mm1, (%%ebx, %1, 2) \n\t"
1908 1896
1909 : : "r" (src), "r" (stride) 1897 : : "r" (src), "r" (stride)
1910 : "%eax", "%ebx" 1898 : "%eax", "%ebx"
1911 ); 1899 );
1922 #endif 1910 #endif
1923 } 1911 }
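
PAVGB rounds up, so the scalar equivalent of the linear interpolating deinterlacer is roughly this (a sketch; reading line 8 is legal because the caller now guarantees a readable 8x16 window, which is why the LastRow variant could be dropped):

int x;
for(x=0; x<8; x++)
{
	/* keep the even lines, rebuild the odd ones as rounded averages */
	src[stride  ]= (src[0       ] + src[stride*2] + 1)>>1;
	src[stride*3]= (src[stride*2] + src[stride*4] + 1)>>1;
	src[stride*5]= (src[stride*4] + src[stride*6] + 1)>>1;
	src[stride*7]= (src[stride*6] + src[stride*8] + 1)>>1;
	src++;
}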
1924 1912
1925 /** 1913 /**
1926 * Deinterlaces the given block 1914 * Deinterlaces the given block
1927 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block 1915 * will be called for every 8x8 block, and can read & write into an 8x16 block
1916 * no clipping in the C version
1928 */ 1917 */
1929 static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride) 1918 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
1930 { 1919 {
1931 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1920 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1932 asm volatile( 1921 asm volatile(
1933 "leal (%0, %1), %%eax \n\t" 1922 "leal (%0, %1), %%eax \n\t"
1934 "leal (%%eax, %1, 4), %%ebx \n\t" 1923 "leal (%%eax, %1, 4), %%ebx \n\t"
1935 // 0 1 2 3 4 5 6 7 8 9 1924 "leal (%%ebx, %1, 4), %%ecx \n\t"
1936 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1925 "addl %1, %%ecx \n\t"
1937 1926 "pxor %%mm7, %%mm7 \n\t"
1938 "movq (%0), %%mm0 \n\t" 1927 // 0 1 2 3 4 5 6 7 8 9 10
1939 "movq (%%eax, %1), %%mm1 \n\t" 1928 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx
1940 PAVGB(%%mm1, %%mm0)\ 1929
1941 "movq %%mm0, (%%eax) \n\t" 1930 #define DEINT_CUBIC(a,b,c,d,e)\
1942 "movq (%0, %1, 4), %%mm0 \n\t" 1931 "movq " #a ", %%mm0 \n\t"\
1943 PAVGB(%%mm0, %%mm1)\ 1932 "movq " #b ", %%mm1 \n\t"\
1944 "movq %%mm1, (%%eax, %1, 2) \n\t" 1933 "movq " #d ", %%mm2 \n\t"\
1945 "movq (%%ebx, %1), %%mm1 \n\t" 1934 "movq " #e ", %%mm3 \n\t"\
1946 PAVGB(%%mm1, %%mm0)\ 1935 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1947 "movq %%mm0, (%%ebx) \n\t" 1936 PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1948 "movq %%mm1, (%%ebx, %1, 2) \n\t" 1937 "movq %%mm0, %%mm2 \n\t"\
1949 1938 "punpcklbw %%mm7, %%mm0 \n\t"\
1939 "punpckhbw %%mm7, %%mm2 \n\t"\
1940 "movq %%mm1, %%mm3 \n\t"\
1941 "punpcklbw %%mm7, %%mm1 \n\t"\
1942 "punpckhbw %%mm7, %%mm3 \n\t"\
1943 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1944 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1945 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1946 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1947 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1948 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1949 "packuswb %%mm3, %%mm1 \n\t"\
1950 "movq %%mm1, " #c " \n\t"
1951
1952 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
1953 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
1954 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
1955 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1950 1956
1951 : : "r" (src), "r" (stride) 1957 : : "r" (src), "r" (stride)
1952 : "%eax", "%ebx" 1958 : "%eax", "%ebx", "%ecx"
1953 ); 1959 );
1954 #else 1960 #else
1955 int x; 1961 int x;
1956 for(x=0; x<8; x++) 1962 for(x=0; x<8; x++)
1957 { 1963 {
1958 src[stride] = (src[0] + src[stride*2])>>1; 1964 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1959 src[stride*3] = (src[stride*2] + src[stride*4])>>1; 1965 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1960 src[stride*5] = (src[stride*4] + src[stride*6])>>1; 1966 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1961 src[stride*7] = src[stride*6]; 1967 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
1962 src++; 1968 src++;
1963 } 1969 }
1964 #endif 1970 #endif
1965 } 1971 }
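
As the comment notes, the C path neither rounds nor clips, while packuswb in the MMX version saturates to 0..255; a clipped, rounded variant of one tap would look like this (sketch):

static inline uint8_t clip255(int v)
{
	return v < 0 ? 0 : (v > 255 ? 255 : v);	// what packuswb does per byte
}
/* e.g. for the first interpolated line:
   src[stride*3]= clip255((-src[0] + 9*src[stride*2]
                          + 9*src[stride*4] - src[stride*6] + 8)>>4); */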
1966 1972
1967 /** 1973 /**
1968 * Deinterlaces the given block 1974 * Deinterlaces the given block
1969 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block 1975 * will be called for every 8x8 block, and can read & write into an 8x16 block
1970 * will shift the image up by 1 line (FIXME if this is a problem) 1976 * will shift the image up by 1 line (FIXME if this is a problem)
1971 */ 1977 */
1972 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) 1978 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
1973 { 1979 {
1974 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1980 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2034 #endif 2040 #endif
2035 } 2041 }
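
The elided body computes a vertical (1,2,1)/4 blur, which also shifts the field up one line (hence the FIXME above); a per-column C sketch under the same 8x16 read/write guarantee (the +2 rounding is an assumption):

int x, n;
for(x=0; x<8; x++)
{
	for(n=0; n<8; n++)	/* line n <- (line n + 2*line n+1 + line n+2)/4 */
		src[stride*n]= ( src[stride*n] + 2*src[stride*(n+1)]
		               + src[stride*(n+2)] + 2 )>>2;
	src++;
}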
2036 2042
2037 /** 2043 /**
2038 * Deinterlaces the given block 2044 * Deinterlaces the given block
2039 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
2040 * will shift the image up by 1 line (FIXME if this is a problem)
2041 */
2042 static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride)
2043 {
2044 #if defined (HAVE_MMSX2) || defined (HAVE_3DNOW)
2045 asm volatile(
2046 "leal (%0, %1), %%eax \n\t"
2047 "leal (%%eax, %1, 4), %%ebx \n\t"
2048 // 0 1 2 3 4 5 6 7 8 9
2049 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2050
2051 "movq (%0), %%mm0 \n\t" // L0
2052 "movq (%%eax, %1), %%mm1 \n\t" // L2
2053 PAVGB(%%mm1, %%mm0) // L0+L2
2054 "movq (%%eax), %%mm2 \n\t" // L1
2055 PAVGB(%%mm2, %%mm0)
2056 "movq %%mm0, (%0) \n\t"
2057 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
2058 PAVGB(%%mm0, %%mm2) // L1+L3
2059 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
2060 "movq %%mm2, (%%eax) \n\t"
2061 "movq (%0, %1, 4), %%mm2 \n\t" // L4
2062 PAVGB(%%mm2, %%mm1) // L2+L4
2063 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
2064 "movq %%mm1, (%%eax, %1) \n\t"
2065 "movq (%%ebx), %%mm1 \n\t" // L5
2066 PAVGB(%%mm1, %%mm0) // L3+L5
2067 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
2068 "movq %%mm0, (%%eax, %1, 2) \n\t"
2069 "movq (%%ebx, %1), %%mm0 \n\t" // L6
2070 PAVGB(%%mm0, %%mm2) // L4+L6
2071 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
2072 "movq %%mm2, (%0, %1, 4) \n\t"
2073 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
2074 PAVGB(%%mm2, %%mm1) // L5+L7
2075 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
2076 "movq %%mm1, (%%ebx) \n\t"
2077 PAVGB(%%mm2, %%mm0) // L7 + L8
2078 "movq %%mm0, (%%ebx, %1) \n\t"
2079 "movq %%mm0, (%%ebx, %1, 2) \n\t"
2080
2081 : : "r" (src), "r" (stride)
2082 : "%eax", "%ebx"
2083 );
2084 #else
2085 int x;
2086 for(x=0; x<8; x++)
2087 {
2088 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2089 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2090 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2091 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2092 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2093 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2094 src[stride*6] = (src[stride*6] + src[stride*7])>>1;
2095 src[stride*7] = src[stride*6];
2096 src++;
2097 }
2098 #endif
2099 }
2100
2101 /**
2102 * Deinterlaces the given block
2103 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block 2045 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
2104 */ 2046 */
2105 static inline void deInterlaceMedian(uint8_t src[], int stride) 2047 static inline void deInterlaceMedian(uint8_t src[], int stride)
2106 { 2048 {
2107 #ifdef HAVE_MMX 2049 #ifdef HAVE_MMX
2211 src++; 2153 src++;
2212 } 2154 }
2213 #endif 2155 #endif
2214 } 2156 }
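
MEDIAN() realises a vertical median of three via pmaxub/pminub; the scalar equivalent is (a sketch):

static inline int median3(int a, int b, int c)
{
	if(a > b) { int t= a; a= b; b= t; }	// sort so that a <= b
	return c < a ? a : (c > b ? b : c);	// clamp c into [a,b]
}
/* per pixel, e.g. line 1 becomes the median of lines 0, 1, 2:
   src[stride]= median3(src[0], src[stride], src[stride*2]); */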
2215 2157
2216 /**
2217 * Deinterlaces the given block
2218 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
2219 */
2220 static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
2221 {
2222 #ifdef HAVE_MMX
2223 #ifdef HAVE_MMX2
2224 asm volatile(
2225 "leal (%0, %1), %%eax \n\t"
2226 "leal (%%eax, %1, 4), %%ebx \n\t"
2227 // 0 1 2 3 4 5 6 7 8 9
2228 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2229
2230 "movq (%0), %%mm0 \n\t" //
2231 "movq (%%eax, %1), %%mm2 \n\t" //
2232 "movq (%%eax), %%mm1 \n\t" //
2233 "movq %%mm0, %%mm3 \n\t"
2234 "pmaxub %%mm1, %%mm0 \n\t" //
2235 "pminub %%mm3, %%mm1 \n\t" //
2236 "pmaxub %%mm2, %%mm1 \n\t" //
2237 "pminub %%mm1, %%mm0 \n\t"
2238 "movq %%mm0, (%%eax) \n\t"
2239
2240 "movq (%0, %1, 4), %%mm0 \n\t" //
2241 "movq (%%eax, %1, 2), %%mm1 \n\t" //
2242 "movq %%mm2, %%mm3 \n\t"
2243 "pmaxub %%mm1, %%mm2 \n\t" //
2244 "pminub %%mm3, %%mm1 \n\t" //
2245 "pmaxub %%mm0, %%mm1 \n\t" //
2246 "pminub %%mm1, %%mm2 \n\t"
2247 "movq %%mm2, (%%eax, %1, 2) \n\t"
2248
2249 "movq (%%ebx), %%mm2 \n\t" //
2250 "movq (%%ebx, %1), %%mm1 \n\t" //
2251 "movq %%mm2, %%mm3 \n\t"
2252 "pmaxub %%mm0, %%mm2 \n\t" //
2253 "pminub %%mm3, %%mm0 \n\t" //
2254 "pmaxub %%mm1, %%mm0 \n\t" //
2255 "pminub %%mm0, %%mm2 \n\t"
2256 "movq %%mm2, (%%ebx) \n\t"
2257
2258 "movq %%mm1, (%%ebx, %1, 2) \n\t"
2259
2260 : : "r" (src), "r" (stride)
2261 : "%eax", "%ebx"
2262 );
2263 #else //MMX & no MMX2
2264 asm volatile(
2265 "leal (%0, %1), %%eax \n\t"
2266 "leal (%%eax, %1, 4), %%ebx \n\t"
2267 // 0 1 2 3 4 5 6 7 8 9
2268 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2269 "pxor %%mm7, %%mm7 \n\t"
2270
2271 MEDIAN((%0), (%%eax), (%%eax, %1))
2272 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
2273 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
2274
2275 "movq (%%ebx, %1), %%mm0 \n\t"
2276 "movq %%mm0, (%%ebx, %1, 2) \n\t"
2277
2278 : : "r" (src), "r" (stride)
2279 : "%eax", "%ebx"
2280 );
2281
2282 #endif //MMX
2283 #else
2284 //FIXME
2285 int x;
2286 for(x=0; x<8; x++)
2287 {
2288 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
2289 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
2290 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2291 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2292 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2293 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2294 src[stride*6] = (src[stride*6] + src[stride*7])>>1;
2295 src[stride*7] = src[stride*6];
2296 src++;
2297 }
2298 #endif
2299 }
2300
2301 #ifdef HAVE_ODIVX_POSTPROCESS 2158 #ifdef HAVE_ODIVX_POSTPROCESS
2302 #include "../opendivx/postprocess.h" 2159 #include "../opendivx/postprocess.h"
2303 int use_old_pp=0; 2160 int use_old_pp=0;
2304 #endif 2161 #endif
2305 2162
2535 /* we need 64 bit here, otherwise we're going to have a problem 2392 /* we need 64 bit here, otherwise we're going to have a problem
2536 after watching a black picture for 5 hours*/ 2393 after watching a black picture for 5 hours*/
2537 static uint64_t *yHistogram= NULL; 2394 static uint64_t *yHistogram= NULL;
2538 int black=0, white=255; // blackest black and whitest white in the picture 2395 int black=0, white=255; // blackest black and whitest white in the picture
2539 2396
2397 /* Temporary buffers for handling the last row(s) */
2398 static uint8_t *tempDst= NULL;
2399 static uint8_t *tempSrc= NULL;
2400
2540 #ifdef TIMING 2401 #ifdef TIMING
2541 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; 2402 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
2542 sumTime= rdtsc(); 2403 sumTime= rdtsc();
2543 #endif 2404 #endif
2405
2406 if(tempDst==NULL)
2407 {
2408 tempDst= (uint8_t*)memalign(8, 1024*24);
2409 tempSrc= (uint8_t*)memalign(8, 1024*24);
2410 }
2544 2411
2545 if(!yHistogram) 2412 if(!yHistogram)
2546 { 2413 {
2547 int i; 2414 int i;
2548 yHistogram= (uint64_t*)malloc(8*256); 2415 yHistogram= (uint64_t*)malloc(8*256);
2567 // printf("%d ", yHistogram[i]); 2434 // printf("%d ", yHistogram[i]);
2568 } 2435 }
2569 // printf("\n\n"); 2436 // printf("\n\n");
2570 2437
2571 /* we always get a completely black picture first */ 2438 /* we always get a completely black picture first */
2572
2573 maxClipped= (uint64_t)(sum * maxClippedThreshold); 2439 maxClipped= (uint64_t)(sum * maxClippedThreshold);
2574 2440
2575 clipped= sum; 2441 clipped= sum;
2576 for(black=255; black>0; black--) 2442 for(black=255; black>0; black--)
2577 { 2443 {
2602 { 2468 {
2603 packedYScale= 0x0100010001000100LL; 2469 packedYScale= 0x0100010001000100LL;
2604 packedYOffset= 0; 2470 packedYOffset= 0;
2605 } 2471 }
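
packedYScale holds the luminance scale as an 8.8 fixed-point value replicated into four 16-bit words, so 0x0100 means 1.0; the same value later rescales QP via (QP * (packedYScale & 0xFFFF)) >> 8. A worked example with illustrative numbers:

/* scale 1.5 -> 0x0180 in 8.8 fixed point; rescaling QP = 8:
 * (8 * 0x0180) >> 8 == (8 * 384) >> 8 == 12 */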
2606 2472
2473 /* copy first row of 8x8 blocks */
2607 for(x=0; x<width; x+=BLOCK_SIZE) 2474 for(x=0; x<width; x+=BLOCK_SIZE)
2608 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); 2475 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2609 2476
2610 for(y=0; y<height-7; y+=BLOCK_SIZE) 2477 for(y=0; y<height; y+=BLOCK_SIZE)
2611 { 2478 {
2612 //1% speedup if these are here instead of the inner loop 2479 //1% speedup if these are here instead of the inner loop
2613 uint8_t *srcBlock= &(src[y*srcStride]); 2480 uint8_t *srcBlock= &(src[y*srcStride]);
2614 uint8_t *dstBlock= &(dst[y*dstStride]); 2481 uint8_t *dstBlock= &(dst[y*dstStride]);
2615 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start 2482
2616 uint8_t *vertBlock= &(dstBlock[dstStride*3]); 2483 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
2484 than use a temporary buffer */
2485 if(y+15 >= height)
2486 {
2487 /* copy lines 5 to 12 of src; these will be copied with
2488 blockCopy to dst later */
2489 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
2490 srcStride*MAX(height-y-5, 0) );
2491
2492 /* duplicate the last line to fill the void up to line 12 */
2493 if(y+12 >= height)
2494 {
2495 int i;
2496 for(i=height-y; i<=12; i++)
2497 memcpy(tempSrc + srcStride*i,
2498 src + srcStride*(height-1), srcStride);
2499 }
2500
2501
2502 /* copy up to 5 lines of dst */
2503 memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
2504 dstBlock= tempDst;
2505 srcBlock= tempSrc;
2506 }
2617 2507
2618 // finish 1 block before the next, otherwise we might have a problem 2508 // finish 1 block before the next, otherwise we might have a problem
2619 // with the L1 cache of the P4 ... or only a few blocks at a time or something 2509 // with the L1 cache of the P4 ... or only a few blocks at a time or something
2620 for(x=0; x<width; x+=BLOCK_SIZE) 2510 for(x=0; x<width; x+=BLOCK_SIZE)
2621 { 2511 {
2623 int QP= isColor ? 2513 int QP= isColor ?
2624 QPs[(y>>3)*QPStride + (x>>3)]: 2514 QPs[(y>>3)*QPStride + (x>>3)]:
2625 QPs[(y>>4)*QPStride + (x>>4)]; 2515 QPs[(y>>4)*QPStride + (x>>4)];
2626 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; 2516 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
2627 #ifdef HAVE_MMX 2517 #ifdef HAVE_MMX
2628 asm volatile( 2518 asm volatile(
2629 "movd %0, %%mm7 \n\t" 2519 "movd %0, %%mm7 \n\t"
2630 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 2520 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2631 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP 2521 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2632 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP 2522 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
2633 "movq %%mm7, pQPb \n\t" 2523 "movq %%mm7, pQPb \n\t"
2634 : : "r" (QP) 2524 : : "r" (QP)
2635 ); 2525 );
2636 #endif 2526 #endif
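
The packuswb chain merely replicates the (saturated) low byte of QP across the quadword pQPb; in C terms (a sketch):

uint64_t q= QP > 255 ? 255 : QP;                   // packuswb saturates, not masks
uint64_t pQPb_equiv= q * 0x0101010101010101ULL;    // QP,..., QP in all 8 bytes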
2637 2527
2638
2639 if(y + 12 < height)
2640 {
2641 #ifdef MORE_TIMING 2528 #ifdef MORE_TIMING
2642 T0= rdtsc(); 2529 T0= rdtsc();
2643 #endif 2530 #endif
2644 2531
2645 #ifdef HAVE_MMX2 2532 #ifdef HAVE_MMX2
2646 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); 2533 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2647 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); 2534 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2648 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); 2535 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2649 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); 2536 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2650 #elif defined(HAVE_3DNOW) 2537 #elif defined(HAVE_3DNOW)
2651 //FIXME check if this is faster on a 3DNow chip, or if it's faster without the prefetch, or ... 2538 //FIXME check if this is faster on a 3DNow chip, or if it's faster without the prefetch, or ...
2652 /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); 2539 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2653 prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); 2540 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2654 prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); 2541 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2655 prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); 2542 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2656 */ 2543 */
2657 #endif 2544 #endif
2658 if(!isColor) yHistogram[ srcBlock[0] ]++; 2545
2659 2546 if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
2660 blockCopy(vertBlock + dstStride*2, dstStride, 2547
2661 vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); 2548 blockCopy(dstBlock + dstStride*5, dstStride,
2662 2549 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
2663 if(mode & LINEAR_IPOL_DEINT_FILTER) 2550
2664 deInterlaceInterpolateLinear(dstBlock, dstStride); 2551 if(mode & LINEAR_IPOL_DEINT_FILTER)
2665 else if(mode & LINEAR_BLEND_DEINT_FILTER) 2552 deInterlaceInterpolateLinear(dstBlock, dstStride);
2666 deInterlaceBlendLinear(dstBlock, dstStride); 2553 else if(mode & LINEAR_BLEND_DEINT_FILTER)
2667 else if(mode & MEDIAN_DEINT_FILTER) 2554 deInterlaceBlendLinear(dstBlock, dstStride);
2668 deInterlaceMedian(dstBlock, dstStride); 2555 else if(mode & MEDIAN_DEINT_FILTER)
2669 /* else if(mode & CUBIC_IPOL_DEINT_FILTER) 2556 deInterlaceMedian(dstBlock, dstStride);
2670 deInterlaceInterpolateCubic(dstBlock, dstStride); 2557 else if(mode & CUBIC_IPOL_DEINT_FILTER)
2671 else if(mode & CUBIC_BLEND_DEINT_FILTER) 2558 deInterlaceInterpolateCubic(dstBlock, dstStride);
2672 deInterlaceBlendCubic(dstBlock, dstStride); 2559 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
2560 deInterlaceBlendCubic(dstBlock, dstStride);
2673 */ 2561 */
2674 2562
2563 /* only deblock if we have 2 blocks */
2564 if(y + 8 < height)
2565 {
2675 #ifdef MORE_TIMING 2566 #ifdef MORE_TIMING
2676 T1= rdtsc(); 2567 T1= rdtsc();
2677 memcpyTime+= T1-T0; 2568 memcpyTime+= T1-T0;
2678 T0=T1; 2569 T0=T1;
2679 #endif 2570 #endif
2680 if(mode & V_DEBLOCK) 2571 if(mode & V_DEBLOCK)
2681 { 2572 {
2682 if(mode & V_RK1_FILTER) 2573 if(mode & V_RK1_FILTER)
2683 vertRK1Filter(vertBlock, stride, QP); 2574 vertRK1Filter(dstBlock, stride, QP);
2684 else if(mode & V_X1_FILTER) 2575 else if(mode & V_X1_FILTER)
2685 vertX1Filter(vertBlock, stride, QP); 2576 vertX1Filter(dstBlock, stride, QP);
2686 else 2577 else
2687 { 2578 {
2688 if( isVertDC(vertBlock, stride)) 2579 if( isVertDC(dstBlock, stride))
2689 { 2580 {
2690 if(isVertMinMaxOk(vertBlock, stride, QP)) 2581 if(isVertMinMaxOk(dstBlock, stride, QP))
2691 doVertLowPass(vertBlock, stride, QP); 2582 doVertLowPass(dstBlock, stride, QP);
2692 } 2583 }
2693 else 2584 else
2694 doVertDefFilter(vertBlock, stride, QP); 2585 doVertDefFilter(dstBlock, stride, QP);
2695 } 2586 }
2696 } 2587 }
2697 #ifdef MORE_TIMING 2588 #ifdef MORE_TIMING
2698 T1= rdtsc(); 2589 T1= rdtsc();
2699 vertTime+= T1-T0; 2590 vertTime+= T1-T0;
2700 T0=T1; 2591 T0=T1;
2701 #endif 2592 #endif
2702 } 2593 }
2703 else 2594
2704 { 2595 /* check if we have a previous block to deblock it with dstBlock */
2705 blockCopy(vertBlock + dstStride*1, dstStride,
2706 vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
2707
2708 if(mode & LINEAR_IPOL_DEINT_FILTER)
2709 deInterlaceInterpolateLinearLastRow(dstBlock, dstStride);
2710 else if(mode & LINEAR_BLEND_DEINT_FILTER)
2711 deInterlaceBlendLinearLastRow(dstBlock, dstStride);
2712 else if(mode & MEDIAN_DEINT_FILTER)
2713 deInterlaceMedianLastRow(dstBlock, dstStride);
2714 /* else if(mode & CUBIC_IPOL_DEINT_FILTER)
2715 deInterlaceInterpolateCubicLastRow(dstBlock, dstStride);
2716 else if(mode & CUBIC_BLEND_DEINT_FILTER)
2717 deInterlaceBlendCubicLastRow(dstBlock, dstStride);
2718 */
2719 }
2720
2721 if(x - 8 >= 0 && x<width) 2596 if(x - 8 >= 0 && x<width)
2722 { 2597 {
2723 #ifdef MORE_TIMING 2598 #ifdef MORE_TIMING
2724 T0= rdtsc(); 2599 T0= rdtsc();
2725 #endif 2600 #endif
2747 } 2622 }
2748 else if(y!=0) 2623 else if(y!=0)
2749 dering(dstBlock - stride*9 + width-9, stride, QP); 2624 dering(dstBlock - stride*9 + width-9, stride, QP);
2750 //FIXME dering filter will not be applied to last block (bottom right) 2625 //FIXME dering filter will not be applied to last block (bottom right)
2751 2626
2752
2753 dstBlock+=8; 2627 dstBlock+=8;
2754 srcBlock+=8; 2628 srcBlock+=8;
2755 vertBlock+=8; 2629 }
2756 vertSrcBlock+=8; 2630
2632 /* did we use a tmp buffer? */
2633 if(y+15 >= height)
2633 {
2634 uint8_t *dstBlock= &(dst[y*dstStride]);
2635 memcpy(dstBlock, tempDst, dstStride*(height-y) );
2757 } 2636 }
2758 } 2637 }
2759 #ifdef HAVE_3DNOW 2638 #ifdef HAVE_3DNOW
2760 asm volatile("femms"); 2639 asm volatile("femms");
2761 #elif defined (HAVE_MMX) 2640 #elif defined (HAVE_MMX)
2770 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), 2649 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
2771 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) 2650 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
2772 , black, white); 2651 , black, white);
2773 #endif 2652 #endif
2774 } 2653 }