comparison postproc/postprocess.c @ 2159:795f3d022657

fixed a bug in the horizontal default filter
3dnow version of the Horizontal & Vertical Lowpass filters
mmx version of the Horizontal Default filter
mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
added mode flags & quality2mode function
author arpi
date Wed, 10 Oct 2001 22:21:19 +0000
parents 508468a75be0
children 21a8f158d19f
comparison
2158:508468a75be0 2159:795f3d022657
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18 18
19 /* 19 /*
20 C MMX MMX2 20 C MMX MMX2 3DNow*
21 isVertDC Ec Ec 21 isVertDC Ec Ec
22 isVertMinMaxOk Ec Ec 22 isVertMinMaxOk Ec Ec
23 doVertLowPass E e 23 doVertLowPass E e e*
24 doVertDefFilter Ec Ec Ec 24 doVertDefFilter Ec Ec Ec
25 isHorizDC Ec Ec 25 isHorizDC Ec Ec
26 isHorizMinMaxOk a 26 isHorizMinMaxOk a
27 doHorizLowPass E a 27 doHorizLowPass E a a*
28 doHorizDefFilter E a 28 doHorizDefFilter E ac ac
29 deRing 29 deRing
30 30
31 * I don't have a 3DNow CPU -> it's untested
31 E = Exact implementation 32 E = Exact implementation
32 e = almost exact implementation 33 e = almost exact implementation
33 a = alternative / approximate impl 34 a = alternative / approximate impl
34 c = checked against the other implementations (-vo md5) 35 c = checked against the other implementations (-vo md5)
35 */ 36 */
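Note on the new 3DNow column: this change routes the shared inline assembly through a PAVGB() macro instead of writing pavgb directly, so the MMX2 build can emit pavgb while a 3DNow build can emit pavgusb from the same code. The macro definition itself is not part of the hunks shown below; a minimal sketch of how such a macro could look with GCC string-pasting inline asm (an assumption, not necessarily the file's exact definition):

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"   /* MMX2 packed byte average  */
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" /* 3DNow packed byte average */
#endif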
37 /* 38 /*
38 TODO: 39 TODO:
39 verify that everything works as it should 40 verify that everything works as it should
40 reduce the time wasted on the mem transfer 41 reduce the time wasted on the mem transfer
41 implement dering 42 implement dering
42 implement everything in C at least 43 implement everything in C at least (done at the moment but ...)
43 figure out the range of QP (assuming <256 for now) 44 figure out the range of QP (assuming <256 for now)
44 unroll stuff if instructions depend too much on the prior one 45 unroll stuff if instructions depend too much on the prior one
45 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? 46 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
46 move YScale thing to the end instead of fixing QP 47 move YScale thing to the end instead of fixing QP
48 write a faster and higher quality deblocking filter :)
47 ... 49 ...
48 50
49 Notes: 51 Notes:
50 52
53 */
54
55 /*
56 Changelog:
57 0.1.2
58 fixed a bug in the horizontal default filter
59 3dnow version of the Horizontal & Vertical Lowpass filters
60 mmx version of the Horizontal Default filter
61 mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
62 added mode flags & quality2mode function
63 0.1.1
51 */ 64 */
52 65
53 66
54 #include <inttypes.h> 67 #include <inttypes.h>
55 #include <stdio.h> 68 #include <stdio.h>
56 #include "../config.h" 69 #include "../config.h"
70 //#undef HAVE_MMX2
71 //#define HAVE_3DNOW
72 //#undef HAVE_MMX
57 #include "postprocess.h" 73 #include "postprocess.h"
58 //#undef HAVE_MMX2
59 //#undef HAVE_MMX
60
61 74
62 75
63 static uint64_t packedYOffset= 0x0000000000000000LL; 76 static uint64_t packedYOffset= 0x0000000000000000LL;
64 static uint64_t packedYScale= 0x0100010001000100LL; 77 static uint64_t packedYScale= 0x0100010001000100LL;
65 static uint64_t w05= 0x0005000500050005LL; 78 static uint64_t w05= 0x0005000500050005LL;
69 static uint64_t bm00010000= 0x000000FF00000000LL; 82 static uint64_t bm00010000= 0x000000FF00000000LL;
70 static uint64_t bm00001000= 0x00000000FF000000LL; 83 static uint64_t bm00001000= 0x00000000FF000000LL;
71 static uint64_t bm10000000= 0xFF00000000000000LL; 84 static uint64_t bm10000000= 0xFF00000000000000LL;
72 static uint64_t bm10000001= 0xFF000000000000FFLL; 85 static uint64_t bm10000001= 0xFF000000000000FFLL;
73 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; 86 static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
87 static uint64_t bm00000011= 0x000000000000FFFFLL;
88 static uint64_t bm11000000= 0xFFFF000000000000LL;
74 static uint64_t bm00011000= 0x000000FFFF000000LL; 89 static uint64_t bm00011000= 0x000000FFFF000000LL;
75 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; 90 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
76 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; 91 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
77 static uint64_t b00= 0x0000000000000000LL; 92 static uint64_t b00= 0x0000000000000000LL;
78 static uint64_t b02= 0x0202020202020202LL; 93 static uint64_t b02= 0x0202020202020202LL;
79 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; 94 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
80 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; 95 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
96 static uint64_t b20= 0x2020202020202020LL;
97 static uint64_t b80= 0x8080808080808080LL;
81 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL; 98 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
82 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL; 99 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
83 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL; 100 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
84 static uint64_t temp0=0; 101 static uint64_t temp0=0;
85 static uint64_t temp1=0; 102 static uint64_t temp1=0;
310 */ 327 */
311 static inline void doVertLowPass(uint8_t *src, int stride, int QP) 328 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
312 { 329 {
313 // QP= 64; 330 // QP= 64;
314 331
315 #ifdef HAVE_MMX2 332 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
333 //#ifdef HAVE_MMX2
316 asm volatile( //"movv %0 %1 %2\n\t" 334 asm volatile( //"movv %0 %1 %2\n\t"
317 "pushl %0 \n\t" 335 "pushl %0 \n\t"
318 "movq pQPb, %%mm0 \n\t" // QP,..., QP 336 "movq pQPb, %%mm0 \n\t" // QP,..., QP
319 // "movq bFF , %%mm0 \n\t" // QP,..., QP 337 // "movq bFF , %%mm0 \n\t" // QP,..., QP
320 338
370 "pand b3F, %%mm0 \n\t" 388 "pand b3F, %%mm0 \n\t"
371 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4 389 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4
372 */ 390 */
373 "movq (%0, %1), %%mm0 \n\t" // 1 391 "movq (%0, %1), %%mm0 \n\t" // 1
374 "movq %%mm0, %%mm1 \n\t" // 1 392 "movq %%mm0, %%mm1 \n\t" // 1
375 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2 393 PAVGB(%%mm6, %%mm0) //1 1 /2
376 "pavgb %%mm6, %%mm0 \n\t" //3 1 /4 394 PAVGB(%%mm6, %%mm0) //3 1 /4
377 395
378 "movq (%0, %1, 4), %%mm2 \n\t" // 1 396 "movq (%0, %1, 4), %%mm2 \n\t" // 1
379 "movq %%mm2, %%mm5 \n\t" // 1 397 "movq %%mm2, %%mm5 \n\t" // 1
380 "pavgb (%%eax), %%mm2 \n\t" // 11 /2 398 PAVGB((%%eax), %%mm2) // 11 /2
381 "pavgb (%0, %1, 2), %%mm2 \n\t" // 211 /4 399 PAVGB((%0, %1, 2), %%mm2) // 211 /4
382 "movq %%mm2, %%mm3 \n\t" // 211 /4 400 "movq %%mm2, %%mm3 \n\t" // 211 /4
383 "movq (%0), %%mm4 \n\t" // 1 401 "movq (%0), %%mm4 \n\t" // 1
384 "pavgb %%mm4, %%mm3 \n\t" // 4 211 /8 402 PAVGB(%%mm4, %%mm3) // 4 211 /8
385 "pavgb %%mm0, %%mm3 \n\t" //642211 /16 403 PAVGB(%%mm0, %%mm3) //642211 /16
386 "movq %%mm3, (%0) \n\t" // X 404 "movq %%mm3, (%0) \n\t" // X
387 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 405 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
388 "movq %%mm1, %%mm0 \n\t" // 1 406 "movq %%mm1, %%mm0 \n\t" // 1
389 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2 407 PAVGB(%%mm6, %%mm0) //1 1 /2
390 "movq %%mm4, %%mm3 \n\t" // 1 408 "movq %%mm4, %%mm3 \n\t" // 1
391 "pavgb (%0,%1,2), %%mm3 \n\t" // 1 1 /2 409 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
392 "pavgb (%%eax,%1,2), %%mm5 \n\t" // 11 /2 410 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
393 "pavgb (%%eax), %%mm5 \n\t" // 211 /4 411 PAVGB((%%eax), %%mm5) // 211 /4
394 "pavgb %%mm5, %%mm3 \n\t" // 2 2211 /8 412 PAVGB(%%mm5, %%mm3) // 2 2211 /8
395 "pavgb %%mm0, %%mm3 \n\t" //4242211 /16 413 PAVGB(%%mm0, %%mm3) //4242211 /16
396 "movq %%mm3, (%0,%1) \n\t" // X 414 "movq %%mm3, (%0,%1) \n\t" // X
397 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 415 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
398 "pavgb %%mm4, %%mm6 \n\t" //11 /2 416 PAVGB(%%mm4, %%mm6) //11 /2
399 "movq (%%ebx), %%mm0 \n\t" // 1 417 "movq (%%ebx), %%mm0 \n\t" // 1
400 "pavgb (%%eax, %1, 2), %%mm0 \n\t" // 11/2 418 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
401 "movq %%mm0, %%mm3 \n\t" // 11/2 419 "movq %%mm0, %%mm3 \n\t" // 11/2
402 "pavgb %%mm1, %%mm0 \n\t" // 2 11/4 420 PAVGB(%%mm1, %%mm0) // 2 11/4
403 "pavgb %%mm6, %%mm0 \n\t" //222 11/8 421 PAVGB(%%mm6, %%mm0) //222 11/8
404 "pavgb %%mm2, %%mm0 \n\t" //22242211/16 422 PAVGB(%%mm2, %%mm0) //22242211/16
405 "movq (%0, %1, 2), %%mm2 \n\t" // 1 423 "movq (%0, %1, 2), %%mm2 \n\t" // 1
406 "movq %%mm0, (%0, %1, 2) \n\t" // X 424 "movq %%mm0, (%0, %1, 2) \n\t" // X
407 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 425 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
408 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 426 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
409 "pavgb (%%ebx), %%mm0 \n\t" // 11 /2 427 PAVGB((%%ebx), %%mm0) // 11 /2
410 "pavgb %%mm0, %%mm6 \n\t" //11 11 /4 428 PAVGB(%%mm0, %%mm6) //11 11 /4
411 "pavgb %%mm1, %%mm4 \n\t" // 11 /2 429 PAVGB(%%mm1, %%mm4) // 11 /2
412 "pavgb %%mm2, %%mm1 \n\t" // 11 /2 430 PAVGB(%%mm2, %%mm1) // 11 /2
413 "pavgb %%mm1, %%mm6 \n\t" //1122 11 /8 431 PAVGB(%%mm1, %%mm6) //1122 11 /8
414 "pavgb %%mm5, %%mm6 \n\t" //112242211 /16 432 PAVGB(%%mm5, %%mm6) //112242211 /16
415 "movq (%%eax), %%mm5 \n\t" // 1 433 "movq (%%eax), %%mm5 \n\t" // 1
416 "movq %%mm6, (%%eax) \n\t" // X 434 "movq %%mm6, (%%eax) \n\t" // X
417 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 435 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
418 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 436 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
419 "pavgb %%mm7, %%mm6 \n\t" // 11 /2 437 PAVGB(%%mm7, %%mm6) // 11 /2
420 "pavgb %%mm4, %%mm6 \n\t" // 11 11 /4 438 PAVGB(%%mm4, %%mm6) // 11 11 /4
421 "pavgb %%mm3, %%mm6 \n\t" // 11 2211 /8 439 PAVGB(%%mm3, %%mm6) // 11 2211 /8
422 "pavgb %%mm5, %%mm2 \n\t" // 11 /2 440 PAVGB(%%mm5, %%mm2) // 11 /2
423 "movq (%0, %1, 4), %%mm4 \n\t" // 1 441 "movq (%0, %1, 4), %%mm4 \n\t" // 1
424 "pavgb %%mm4, %%mm2 \n\t" // 112 /4 442 PAVGB(%%mm4, %%mm2) // 112 /4
425 "pavgb %%mm2, %%mm6 \n\t" // 112242211 /16 443 PAVGB(%%mm2, %%mm6) // 112242211 /16
426 "movq %%mm6, (%0, %1, 4) \n\t" // X 444 "movq %%mm6, (%0, %1, 4) \n\t" // X
427 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 445 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
428 "pavgb %%mm7, %%mm1 \n\t" // 11 2 /4 446 PAVGB(%%mm7, %%mm1) // 11 2 /4
429 "pavgb %%mm4, %%mm5 \n\t" // 11 /2 447 PAVGB(%%mm4, %%mm5) // 11 /2
430 "pavgb %%mm5, %%mm0 \n\t" // 11 11 /4 448 PAVGB(%%mm5, %%mm0) // 11 11 /4
431 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 449 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
432 "pavgb %%mm6, %%mm1 \n\t" // 11 4 2 /8 450 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
433 "pavgb %%mm0, %%mm1 \n\t" // 11224222 /16 451 PAVGB(%%mm0, %%mm1) // 11224222 /16
434 // "pxor %%mm1, %%mm1 \n\t" 452 // "pxor %%mm1, %%mm1 \n\t"
435 "movq %%mm1, (%%eax, %1, 2) \n\t" // X 453 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
436 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 454 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
437 "pavgb (%%ebx), %%mm2 \n\t" // 112 4 /8 455 PAVGB((%%ebx), %%mm2) // 112 4 /8
438 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 456 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
439 "pavgb %%mm0, %%mm6 \n\t" // 1 1 /2 457 PAVGB(%%mm0, %%mm6) // 1 1 /2
440 "pavgb %%mm7, %%mm6 \n\t" // 1 12 /4 458 PAVGB(%%mm7, %%mm6) // 1 12 /4
441 "pavgb %%mm2, %%mm6 \n\t" // 1122424 /4 459 PAVGB(%%mm2, %%mm6) // 1122424 /4
442 // "pxor %%mm6, %%mm6 \n\t" 460 // "pxor %%mm6, %%mm6 \n\t"
443 "movq %%mm6, (%%ebx) \n\t" // X 461 "movq %%mm6, (%%ebx) \n\t" // X
444 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 462 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
445 "pavgb %%mm7, %%mm5 \n\t" // 11 2 /4 463 PAVGB(%%mm7, %%mm5) // 11 2 /4
446 "pavgb %%mm7, %%mm5 \n\t" // 11 6 /8 464 PAVGB(%%mm7, %%mm5) // 11 6 /8
447 465
448 "pavgb %%mm3, %%mm0 \n\t" // 112 /4 466 PAVGB(%%mm3, %%mm0) // 112 /4
449 "pavgb %%mm0, %%mm5 \n\t" // 112246 /16 467 PAVGB(%%mm0, %%mm5) // 112246 /16
450 // "pxor %%mm5, %%mm5 \n\t" 468 // "pxor %%mm5, %%mm5 \n\t"
451 // "movq pQPb, %%mm5 \n\t" 469 // "movq pQPb, %%mm5 \n\t"
452 "movq %%mm5, (%%eax, %1, 4) \n\t" // X 470 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
453 "popl %0\n\t" 471 "popl %0\n\t"
454 472
455 : 473 :
456 : "r" (src), "r" (stride) 474 : "r" (src), "r" (stride)
457 : "%eax", "%ebx" 475 : "%eax", "%ebx"
458 ); 476 );
459
460 #else 477 #else
461 const int l1= stride; 478 const int l1= stride;
462 const int l2= stride + l1; 479 const int l2= stride + l1;
463 const int l3= stride + l2; 480 const int l3= stride + l2;
464 const int l4= stride + l3; 481 const int l4= stride + l3;
496 src++; 513 src++;
497 } 514 }
498 515
499 #endif 516 #endif
500 } 517 }
518
519 /**
520 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
521 * values are correctly clipped (MMX2)
522 * values wrap around (C)
523 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
524 example: input 0 8 16 24
525 x = 8
526 x/2 = 4
527 x/8 = 1
528 output 1 12 12 23
529 */
530 static inline void vertRKFilter(uint8_t *src, int stride, int QP)
531 {
532 #ifdef HAVE_MMX2
533 // FIXME rounding
534 asm volatile(
535 "pxor %%mm7, %%mm7 \n\t" // 0
536 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
537 "leal (%0, %1), %%eax \n\t"
538 "leal (%%eax, %1, 4), %%ebx \n\t"
539 // 0 1 2 3 4 5 6 7 8 9
540 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
541 "movq pQPb, %%mm0 \n\t" // QP,..., QP
542 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
543 "paddusb b02, %%mm0 \n\t"
544 "psrlw $2, %%mm0 \n\t"
545 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4
546 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
547 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
548 "movq (%%ebx), %%mm3 \n\t" // line 5
549 "movq %%mm2, %%mm4 \n\t" // line 4
550 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
551 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
552 "pavgb %%mm3, %%mm5 \n\t"
553 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
554 "psubusb %%mm3, %%mm4 \n\t"
555 "psubusb %%mm2, %%mm3 \n\t"
556 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
557 "psubusb %%mm0, %%mm4 \n\t"
558 "pcmpeqb %%mm7, %%mm4 \n\t"
559 "pand %%mm4, %%mm5 \n\t" // d/2
560
561 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
562 "paddb %%mm5, %%mm2 \n\t"
563 // "psubb %%mm6, %%mm2 \n\t"
564 "movq %%mm2, (%0,%1, 4) \n\t"
565
566 "movq (%%ebx), %%mm2 \n\t"
567 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
568 "psubb %%mm5, %%mm2 \n\t"
569 // "psubb %%mm6, %%mm2 \n\t"
570 "movq %%mm2, (%%ebx) \n\t"
571
572 "paddb %%mm6, %%mm5 \n\t"
573 "psrlw $2, %%mm5 \n\t"
574 "pand b3F, %%mm5 \n\t"
575 "psubb b20, %%mm5 \n\t" // (l5-l4)/8
576
577 "movq (%%eax, %1, 2), %%mm2 \n\t"
578 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
579 "paddsb %%mm5, %%mm2 \n\t"
580 "psubb %%mm6, %%mm2 \n\t"
581 "movq %%mm2, (%%eax, %1, 2) \n\t"
582
583 "movq (%%ebx, %1), %%mm2 \n\t"
584 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
585 "psubsb %%mm5, %%mm2 \n\t"
586 "psubb %%mm6, %%mm2 \n\t"
587 "movq %%mm2, (%%ebx, %1) \n\t"
588
589 :
590 : "r" (src), "r" (stride)
591 : "%eax", "%ebx"
592 );
593 #else
594 const int l1= stride;
595 const int l2= stride + l1;
596 const int l3= stride + l2;
597 const int l4= stride + l3;
598 const int l5= stride + l4;
599 const int l6= stride + l5;
600 const int l7= stride + l6;
601 const int l8= stride + l7;
602 const int l9= stride + l8;
603 for(int x=0; x<BLOCK_SIZE; x++)
604 {
605 if(ABS(src[l4]-src[l5]) < QP + QP/4)
606 {
607 int x = src[l5] - src[l4];
608
609 src[l3] +=x/8;
610 src[l4] +=x/2;
611 src[l5] -=x/2;
612 src[l6] -=x/8;
613 }
614 src++;
615 }
616
617 #endif
618 }
619
620 /**
621 * Experimental Filter 1
622 */
623 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
624 {
625 #ifdef HAVE_MMX2X
626 // FIXME
627 asm volatile(
628
629 :
630 : "r" (src), "r" (stride)
631 : "%eax", "%ebx"
632 );
633 #else
634 const int l1= stride;
635 const int l2= stride + l1;
636 const int l3= stride + l2;
637 const int l4= stride + l3;
638 const int l5= stride + l4;
639 const int l6= stride + l5;
640 const int l7= stride + l6;
641 const int l8= stride + l7;
642 const int l9= stride + l8;
643 for(int x=0; x<BLOCK_SIZE; x++)
644 {
645 int v2= src[l2];
646 int v3= src[l3];
647 int v4= src[l4];
648 int v5= src[l5];
649 int v6= src[l6];
650 int v7= src[l7];
651
652 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
653 {
654 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
655 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
656 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
657 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
658 }
659 src++;
660 }
661
662 #endif
663 }
664
501 665
502 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) 666 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
503 { 667 {
504 #ifdef HAVE_MMX 668 #ifdef HAVE_MMX
505 src+= stride; 669 src+= stride;
913 #endif 1077 #endif
914 } 1078 }
915 1079
916 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) 1080 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
917 { 1081 {
918 #ifdef HAVE_MMX2 1082 #ifdef HAVE_MMX
919 asm volatile( 1083 asm volatile(
920 "pushl %0 \n\t" 1084 "pushl %0 \n\t"
921 "pxor %%mm7, %%mm7 \n\t" 1085 "pxor %%mm7, %%mm7 \n\t"
922 "movq bm00001000, %%mm6 \n\t" 1086 "movq bm00001000, %%mm6 \n\t"
923 "movd %2, %%mm5 \n\t" // QP 1087 "movd %2, %%mm5 \n\t" // QP
928 "pxor %%mm5, %%mm5 \n\t" // 0 1092 "pxor %%mm5, %%mm5 \n\t" // 0
929 "psubb %%mm4, %%mm5 \n\t" // -QP 1093 "psubb %%mm4, %%mm5 \n\t" // -QP
930 "leal tempBlock, %%eax \n\t" 1094 "leal tempBlock, %%eax \n\t"
931 1095
932 //FIXME? "unroll by 2" and mix 1096 //FIXME? "unroll by 2" and mix
933 #define HDF(i) "movq " #i "(%%eax), %%mm0 \n\t"\ 1097 #ifdef HAVE_MMX2
1098 #define HDF(i) \
1099 "movq " #i "(%%eax), %%mm0 \n\t"\
934 "movq %%mm0, %%mm1 \n\t"\ 1100 "movq %%mm0, %%mm1 \n\t"\
935 "movq %%mm0, %%mm2 \n\t"\ 1101 "movq %%mm0, %%mm2 \n\t"\
936 "psrlq $8, %%mm1 \n\t"\ 1102 "psrlq $8, %%mm1 \n\t"\
937 "psubusb %%mm1, %%mm2 \n\t"\ 1103 "psubusb %%mm1, %%mm2 \n\t"\
938 "psubusb %%mm0, %%mm1 \n\t"\ 1104 "psubusb %%mm0, %%mm1 \n\t"\
939 "por %%mm2, %%mm1 \n\t" /* |px - p(x+1)| */\ 1105 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
940 "pcmpeqb %%mm7, %%mm2 \n\t" /* sgn[px - p(x+1)] */\ 1106 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
941 "pshufw $0xAA, %%mm1, %%mm3 \n\t"\ 1107 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\
942 "pminub %%mm1, %%mm3 \n\t"\ 1108 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
943 "psrlq $16, %%mm3 \n\t"\ 1109 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1110 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1111 "paddb %%mm5, %%mm1 \n\t"\
1112 "psubusb %%mm5, %%mm1 \n\t"\
1113 "psrlw $2, %%mm1 \n\t"\
1114 "pxor %%mm2, %%mm1 \n\t"\
1115 "psubb %%mm2, %%mm1 \n\t"\
1116 "pand %%mm6, %%mm1 \n\t"\
1117 "psubb %%mm1, %%mm0 \n\t"\
1118 "psllq $8, %%mm1 \n\t"\
1119 "paddb %%mm1, %%mm0 \n\t"\
1120 "movd %%mm0, (%0) \n\t"\
1121 "psrlq $32, %%mm0 \n\t"\
1122 "movd %%mm0, 4(%0) \n\t"
1123 #else
1124 #define HDF(i)\
1125 "movq " #i "(%%eax), %%mm0 \n\t"\
1126 "movq %%mm0, %%mm1 \n\t"\
1127 "movq %%mm0, %%mm2 \n\t"\
1128 "psrlq $8, %%mm1 \n\t"\
1129 "psubusb %%mm1, %%mm2 \n\t"\
1130 "psubusb %%mm0, %%mm1 \n\t"\
1131 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1132 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1133 "movq %%mm1, %%mm3 \n\t"\
1134 "psllq $32, %%mm3 \n\t"\
1135 "movq %%mm3, %%mm4 \n\t"\
1136 "psubusb %%mm1, %%mm4 \n\t"\
1137 "psubb %%mm4, %%mm3 \n\t"\
1138 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
944 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\ 1139 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\
945 "paddb %%mm5, %%mm1 \n\t"\ 1140 "paddb %%mm5, %%mm1 \n\t"\
946 "psubusb %%mm5, %%mm1 \n\t"\ 1141 "psubusb %%mm5, %%mm1 \n\t"\
947 "psrlw $2, %%mm1 \n\t"\ 1142 "psrlw $2, %%mm1 \n\t"\
948 "pxor %%mm2, %%mm1 \n\t"\ 1143 "pxor %%mm2, %%mm1 \n\t"\
952 "psllq $8, %%mm1 \n\t"\ 1147 "psllq $8, %%mm1 \n\t"\
953 "paddb %%mm1, %%mm0 \n\t"\ 1148 "paddb %%mm1, %%mm0 \n\t"\
954 "movd %%mm0, (%0) \n\t"\ 1149 "movd %%mm0, (%0) \n\t"\
955 "psrlq $32, %%mm0 \n\t"\ 1150 "psrlq $32, %%mm0 \n\t"\
956 "movd %%mm0, 4(%0) \n\t" 1151 "movd %%mm0, 4(%0) \n\t"
957 1152 #endif
958 HDF(0) 1153 HDF(0)
959 "addl %1, %0 \n\t" 1154 "addl %1, %0 \n\t"
960 HDF(8) 1155 HDF(8)
961 "addl %1, %0 \n\t" 1156 "addl %1, %0 \n\t"
962 HDF(16) 1157 HDF(16)
1023 } 1218 }
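For reference, stripped of the MMX packing, the HDF() step above amounts to the following per-row scalar sketch (hedged: this mirrors the asm comments, not the file's actual C fallback; the helper macros are local to the sketch):

#include <stdint.h>

#define SKETCH_ABS(a)   ((a) > 0 ? (a) : -(a))
#define SKETCH_MIN(a,b) ((a) < (b) ? (a) : (b))

/* one 8-pixel row p[0..7] with the block edge between p[3] and p[4] */
static inline void horizDefFilterRowSketch(uint8_t p[8], int QP)
{
	int d= SKETCH_ABS(p[3] - p[4])
	     - SKETCH_MIN(SKETCH_ABS(p[1] - p[2]), SKETCH_ABS(p[5] - p[6]));
	if(d <= 0 || d >= QP) return;	// only soften steps smaller than QP
	d>>= 2;				// apply a quarter of the remaining difference
	if(p[3] > p[4])	{ p[3]-= d; p[4]+= d; }
	else		{ p[3]+= d; p[4]-= d; }
}

The MMX path does this row by row out of tempBlock, masking the update so only the two pixels next to the block edge are rewritten.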
1024 1219
1025 /** 1220 /**
1026 * Do a horizontal low pass filter on the 8x8 block 1221 * Do a horizontal low pass filter on the 8x8 block
1027 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) 1222 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1028 * using approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2 version) 1223 * using approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version)
1029 */ 1224 */
1030 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) 1225 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1031 { 1226 {
1032 //return; 1227 //return;
1033 #ifdef HAVE_MMX2 1228 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1034 asm volatile( //"movv %0 %1 %2\n\t" 1229 asm volatile( //"movv %0 %1 %2\n\t"
1035 "pushl %0\n\t" 1230 "pushl %0\n\t"
1036 "pxor %%mm7, %%mm7 \n\t" 1231 "pxor %%mm7, %%mm7 \n\t"
1037 "leal tempBlock, %%eax \n\t" 1232 "leal tempBlock, %%eax \n\t"
1038 1233 /*
1039 #define HLP1 "movq (%0), %%mm0 \n\t"\ 1234 #define HLP1 "movq (%0), %%mm0 \n\t"\
1040 "movq %%mm0, %%mm1 \n\t"\ 1235 "movq %%mm0, %%mm1 \n\t"\
1041 "psllq $8, %%mm0 \n\t"\ 1236 "psllq $8, %%mm0 \n\t"\
1042 "pavgb %%mm1, %%mm0 \n\t"\ 1237 PAVGB(%%mm1, %%mm0)\
1043 "psrlw $8, %%mm0 \n\t"\ 1238 "psrlw $8, %%mm0 \n\t"\
1044 "pxor %%mm1, %%mm1 \n\t"\ 1239 "pxor %%mm1, %%mm1 \n\t"\
1045 "packuswb %%mm1, %%mm0 \n\t"\ 1240 "packuswb %%mm1, %%mm0 \n\t"\
1046 "movq %%mm0, %%mm1 \n\t"\ 1241 "movq %%mm0, %%mm1 \n\t"\
1047 "movq %%mm0, %%mm2 \n\t"\ 1242 "movq %%mm0, %%mm2 \n\t"\
1048 "psllq $32, %%mm0 \n\t"\ 1243 "psllq $32, %%mm0 \n\t"\
1049 "paddb %%mm0, %%mm1 \n\t"\ 1244 "paddb %%mm0, %%mm1 \n\t"\
1050 "psllq $16, %%mm2 \n\t"\ 1245 "psllq $16, %%mm2 \n\t"\
1051 "pavgb %%mm2, %%mm0 \n\t"\ 1246 PAVGB(%%mm2, %%mm0)\
1052 "movq %%mm0, %%mm3 \n\t"\ 1247 "movq %%mm0, %%mm3 \n\t"\
1053 "pand bm11001100, %%mm0 \n\t"\ 1248 "pand bm11001100, %%mm0 \n\t"\
1054 "paddusb %%mm0, %%mm3 \n\t"\ 1249 "paddusb %%mm0, %%mm3 \n\t"\
1055 "psrlq $8, %%mm3 \n\t"\ 1250 "psrlq $8, %%mm3 \n\t"\
1056 "pavgb %%mm1, %%mm4 \n\t"\ 1251 PAVGB(%%mm1, %%mm4)\
1057 "pavgb %%mm3, %%mm2 \n\t"\ 1252 PAVGB(%%mm3, %%mm2)\
1058 "psrlq $16, %%mm2 \n\t"\ 1253 "psrlq $16, %%mm2 \n\t"\
1059 "punpcklbw %%mm2, %%mm2 \n\t"\ 1254 "punpcklbw %%mm2, %%mm2 \n\t"\
1060 "movq %%mm2, (%0) \n\t"\ 1255 "movq %%mm2, (%0) \n\t"\
1061 1256
1062 #define HLP2 "movq (%0), %%mm0 \n\t"\ 1257 #define HLP2 "movq (%0), %%mm0 \n\t"\
1063 "movq %%mm0, %%mm1 \n\t"\ 1258 "movq %%mm0, %%mm1 \n\t"\
1064 "psllq $8, %%mm0 \n\t"\ 1259 "psllq $8, %%mm0 \n\t"\
1065 "pavgb %%mm1, %%mm0 \n\t"\ 1260 PAVGB(%%mm1, %%mm0)\
1066 "psrlw $8, %%mm0 \n\t"\ 1261 "psrlw $8, %%mm0 \n\t"\
1067 "pxor %%mm1, %%mm1 \n\t"\ 1262 "pxor %%mm1, %%mm1 \n\t"\
1068 "packuswb %%mm1, %%mm0 \n\t"\ 1263 "packuswb %%mm1, %%mm0 \n\t"\
1069 "movq %%mm0, %%mm2 \n\t"\ 1264 "movq %%mm0, %%mm2 \n\t"\
1070 "psllq $32, %%mm0 \n\t"\ 1265 "psllq $32, %%mm0 \n\t"\
1071 "psllq $16, %%mm2 \n\t"\ 1266 "psllq $16, %%mm2 \n\t"\
1072 "pavgb %%mm2, %%mm0 \n\t"\ 1267 PAVGB(%%mm2, %%mm0)\
1073 "movq %%mm0, %%mm3 \n\t"\ 1268 "movq %%mm0, %%mm3 \n\t"\
1074 "pand bm11001100, %%mm0 \n\t"\ 1269 "pand bm11001100, %%mm0 \n\t"\
1075 "paddusb %%mm0, %%mm3 \n\t"\ 1270 "paddusb %%mm0, %%mm3 \n\t"\
1076 "psrlq $8, %%mm3 \n\t"\ 1271 "psrlq $8, %%mm3 \n\t"\
1077 "pavgb %%mm3, %%mm2 \n\t"\ 1272 PAVGB(%%mm3, %%mm2)\
1078 "psrlq $16, %%mm2 \n\t"\ 1273 "psrlq $16, %%mm2 \n\t"\
1079 "punpcklbw %%mm2, %%mm2 \n\t"\ 1274 "punpcklbw %%mm2, %%mm2 \n\t"\
1080 "movq %%mm2, (%0) \n\t"\ 1275 "movq %%mm2, (%0) \n\t"\
1081 1276 */
1082 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 1277 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1083 /* 1278 /*
1084 31 1279 31
1085 121 1280 121
1086 121 1281 121
1098 123433 = 1293 123433 =
1099 12463 12346 1294 12463 12346
1100 1249 123A 1295 1249 123A
1101 1296
1102 */ 1297 */
1298 #ifdef HAVE_MMX2
1103 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ 1299 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1104 "movq %%mm0, %%mm1 \n\t"\ 1300 "movq %%mm0, %%mm1 \n\t"\
1105 "movq %%mm0, %%mm2 \n\t"\ 1301 "movq %%mm0, %%mm2 \n\t"\
1106 "movq %%mm0, %%mm3 \n\t"\ 1302 "movq %%mm0, %%mm3 \n\t"\
1107 "movq %%mm0, %%mm4 \n\t"\ 1303 "movq %%mm0, %%mm4 \n\t"\
1109 "psrlq $8, %%mm2 \n\t"\ 1305 "psrlq $8, %%mm2 \n\t"\
1110 "pand bm00000001, %%mm3 \n\t"\ 1306 "pand bm00000001, %%mm3 \n\t"\
1111 "pand bm10000000, %%mm4 \n\t"\ 1307 "pand bm10000000, %%mm4 \n\t"\
1112 "por %%mm3, %%mm1 \n\t"\ 1308 "por %%mm3, %%mm1 \n\t"\
1113 "por %%mm4, %%mm2 \n\t"\ 1309 "por %%mm4, %%mm2 \n\t"\
1114 "pavgb %%mm2, %%mm1 \n\t"\ 1310 PAVGB(%%mm2, %%mm1)\
1115 "pavgb %%mm1, %%mm0 \n\t"\ 1311 PAVGB(%%mm1, %%mm0)\
1116 \ 1312 \
1117 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\ 1313 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1118 "pshufw $0x90, %%mm0, %%mm4 \n\t"\ 1314 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1119 "pavgb %%mm3, %%mm4 \n\t"\ 1315 PAVGB(%%mm3, %%mm4)\
1120 "pavgb %%mm4, %%mm0 \n\t"\ 1316 PAVGB(%%mm4, %%mm0)\
1121 "movd %%mm0, (%0) \n\t"\ 1317 "movd %%mm0, (%0) \n\t"\
1122 "psrlq $32, %%mm0 \n\t"\ 1318 "psrlq $32, %%mm0 \n\t"\
1123 "movd %%mm0, 4(%0) \n\t"\ 1319 "movd %%mm0, 4(%0) \n\t"
1320 #else
1321 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1322 "movq %%mm0, %%mm1 \n\t"\
1323 "movq %%mm0, %%mm2 \n\t"\
1324 "movq %%mm0, %%mm3 \n\t"\
1325 "movq %%mm0, %%mm4 \n\t"\
1326 "psllq $8, %%mm1 \n\t"\
1327 "psrlq $8, %%mm2 \n\t"\
1328 "pand bm00000001, %%mm3 \n\t"\
1329 "pand bm10000000, %%mm4 \n\t"\
1330 "por %%mm3, %%mm1 \n\t"\
1331 "por %%mm4, %%mm2 \n\t"\
1332 PAVGB(%%mm2, %%mm1)\
1333 PAVGB(%%mm1, %%mm0)\
1334 \
1335 "movq %%mm0, %%mm3 \n\t"\
1336 "movq %%mm0, %%mm4 \n\t"\
1337 "movq %%mm0, %%mm5 \n\t"\
1338 "psrlq $16, %%mm3 \n\t"\
1339 "psllq $16, %%mm4 \n\t"\
1340 "pand bm11000000, %%mm5 \n\t"\
1341 "por %%mm5, %%mm3 \n\t"\
1342 "movq %%mm0, %%mm5 \n\t"\
1343 "pand bm00000011, %%mm5 \n\t"\
1344 "por %%mm5, %%mm4 \n\t"\
1345 PAVGB(%%mm3, %%mm4)\
1346 PAVGB(%%mm4, %%mm0)\
1347 "movd %%mm0, (%0) \n\t"\
1348 "psrlq $32, %%mm0 \n\t"\
1349 "movd %%mm0, 4(%0) \n\t"
1350 #endif
1124 1351
1125 #define HLP(i) HLP3(i) 1352 #define HLP(i) HLP3(i)
1126 1353
1127 HLP(0) 1354 HLP(0)
1128 "addl %1, %0 \n\t" 1355 "addl %1, %0 \n\t"
1227 "psrlq $16, %%mm7 \n\t" 1454 "psrlq $16, %%mm7 \n\t"
1228 "pmaxub %%mm4, %%mm7 \n\t" 1455 "pmaxub %%mm4, %%mm7 \n\t"
1229 "movq %%mm7, %%mm4 \n\t" 1456 "movq %%mm7, %%mm4 \n\t"
1230 "psrlq $8, %%mm7 \n\t" 1457 "psrlq $8, %%mm7 \n\t"
1231 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels 1458 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1232 "pavgb %%mm6, %%mm7 \n\t" // (max + min)/2 1459 PAVGB(%%mm6, %%mm7) // (max + min)/2
1233 1460
1234 1461
1235 : : "r" (src), "r" (stride), "r" (QP) 1462 : : "r" (src), "r" (stride), "r" (QP)
1236 : "%eax", "%ebx" 1463 : "%eax", "%ebx"
1237 ); 1464 );
1239 1466
1240 //FIXME 1467 //FIXME
1241 #endif 1468 #endif
1242 } 1469 }
1243 1470
1471
1472
1473
1244 /** 1474 /**
1245 * ... 1475 * ...
1476 * the mode value is interpreted as a quality value if it is negative; its range is then (-1 ... -63)
1477 * -63 is the best quality, -1 the worst
1246 */ 1478 */
1247 extern "C"{ 1479 extern "C"{
1248 void postprocess(unsigned char * src[], int src_stride, 1480 void postprocess(unsigned char * src[], int src_stride,
1249 unsigned char * dst[], int dst_stride, 1481 unsigned char * dst[], int dst_stride,
1250 int horizontal_size, int vertical_size, 1482 int horizontal_size, int vertical_size,
1251 QP_STORE_T *QP_store, int QP_stride, 1483 QP_STORE_T *QP_store, int QP_stride,
1252 int mode) 1484 int mode)
1253 { 1485 {
1486
1487 if(mode<0) mode= getModeForQuality(-mode);
1488
1254 /* 1489 /*
1255 long long T= rdtsc(); 1490 long long T= rdtsc();
1256 for(int y=vertical_size-1; y>=0 ; y--) 1491 for(int y=vertical_size-1; y>=0 ; y--)
1257 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride); 1492 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
1258 // memcpy(dst[0], src[0],src_stride*vertical_size); 1493 // memcpy(dst[0], src[0],src_stride*vertical_size);
1264 long long T= rdtsc(); 1499 long long T= rdtsc();
1265 while( (rdtsc() - T)/1000 < 4000); 1500 while( (rdtsc() - T)/1000 < 4000);
1266 1501
1267 return; 1502 return;
1268 */ 1503 */
1269 postProcess(src[0], src_stride, 1504 postProcess(src[0], src_stride, dst[0], dst_stride,
1270 dst[0], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, false); 1505 horizontal_size, vertical_size, QP_store, QP_stride, false, mode);
1271 1506
1272 horizontal_size >>= 1; 1507 horizontal_size >>= 1;
1273 vertical_size >>= 1; 1508 vertical_size >>= 1;
1274 src_stride >>= 1; 1509 src_stride >>= 1;
1275 dst_stride >>= 1; 1510 dst_stride >>= 1;
1276 1511
1277 if(1) 1512 if(1)
1278 { 1513 {
1279 postProcess(src[1], src_stride, 1514 postProcess(src[1], src_stride, dst[1], dst_stride,
1280 dst[1], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true); 1515 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
1281 postProcess(src[2], src_stride, 1516 postProcess(src[2], src_stride, dst[2], dst_stride,
1282 dst[2], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true); 1517 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4);
1283 } 1518 }
1284 else 1519 else
1285 { 1520 {
1286 memcpy(dst[1], src[1], src_stride*horizontal_size); 1521 memcpy(dst[1], src[1], src_stride*horizontal_size);
1287 memcpy(dst[2], src[2], src_stride*horizontal_size); 1522 memcpy(dst[2], src[2], src_stride*horizontal_size);
1288 } 1523 }
1289 } 1524 }
1290 } 1525 /**
1526 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
1527 * 0 <= quality < 64
1528 */
1529 int getModeForQuality(int quality){
1530 int modes[6]= {
1531 LUM_V_DEBLOCK,
1532 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
1533 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
1534 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
1535 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
1536 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
1537 };
1538
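	// e.g. quality=40: (40*6)>>6 = 3 -> luma + chroma deblocking, no deringing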
1539 return modes[ (quality*6) >>6 ];
1540 }
1541
1542 } // extern "C"
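With the new negative-mode handling a caller can select filters by quality instead of assembling the LUM_*/CHROM_* flags by hand. A hypothetical caller sketch (plane pointers, strides and the per-macroblock QP array are assumed to come from the decoder; only postprocess() itself is from this file):

#include "postprocess.h"

/* hypothetical driver; mode = -40 is the same as getModeForQuality(40),
 * i.e. luma + chroma deblocking without deringing */
static void filterFrame(unsigned char *src[3], unsigned char *dst[3],
			int stride, int width, int height,
			QP_STORE_T *qpArray, int qpStride)
{
	postprocess(src, stride, dst, stride,
	            width, height, qpArray, qpStride, -40);
}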
1291 1543
1292 /** 1544 /**
1293 * Copies a block from src to dst and fixes the blacklevel 1545 * Copies a block from src to dst and fixes the blacklevel
1294 */ 1546 */
1295 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) 1547 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride)
1365 1617
1366 /** 1618 /**
1367 * Filters array of bytes (Y or U or V values) 1619 * Filters array of bytes (Y or U or V values)
1368 */ 1620 */
1369 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 1621 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
1370 QP_STORE_T QPs[], int QPStride, bool isColor) 1622 QP_STORE_T QPs[], int QPStride, bool isColor, int mode)
1371 { 1623 {
1372 1624
1373 #ifdef TIMEING 1625 #ifdef TIMEING
1374 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; 1626 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
1375 sumTime= rdtsc(); 1627 sumTime= rdtsc();
1411 packedYOffset= MAX(black - minAllowedY, 0); 1663 packedYOffset= MAX(black - minAllowedY, 0);
1412 packedYOffset|= packedYOffset<<32; 1664 packedYOffset|= packedYOffset<<32;
1413 packedYOffset|= packedYOffset<<16; 1665 packedYOffset|= packedYOffset<<16;
1414 packedYOffset|= packedYOffset<<8; 1666 packedYOffset|= packedYOffset<<8;
1415 1667
1416 // uint64_t scale= (int)(256.0*256.0/(white-black) + 0.5);
1417 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); 1668 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
1418 1669
1419 packedYScale= uint16_t(scale*256.0 + 0.5); 1670 packedYScale= uint16_t(scale*256.0 + 0.5);
1420 packedYScale|= packedYScale<<32; 1671 packedYScale|= packedYScale<<32;
1421 packedYScale|= packedYScale<<16; 1672 packedYScale|= packedYScale<<16;
1460 if(y + 12 < height) 1711 if(y + 12 < height)
1461 { 1712 {
1462 #ifdef MORE_TIMEING 1713 #ifdef MORE_TIMEING
1463 T0= rdtsc(); 1714 T0= rdtsc();
1464 #endif 1715 #endif
1716
1465 #ifdef HAVE_MMX2 1717 #ifdef HAVE_MMX2
1466
1467 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); 1718 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
1468 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); 1719 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
1469 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); 1720 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
1470 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); 1721 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
1722 #elif defined(HAVE_3DNOW)
1723 //FIXME check if this is faster on a 3DNow chip, or if it's faster without the prefetch, or ...
1724 /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
1725 prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
1726 prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
1727 prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
1728 */
1471 #endif 1729 #endif
1472 if(!isColor) yHistogram[ srcBlock[0] ]++; 1730 if(!isColor) yHistogram[ srcBlock[0] ]++;
1473 1731
1474 blockCopy(vertBlock + dstStride*2, dstStride, 1732 blockCopy(vertBlock + dstStride*2, dstStride,
1475 vertSrcBlock + srcStride*2, srcStride); 1733 vertSrcBlock + srcStride*2, srcStride);
1478 #ifdef MORE_TIMEING 1736 #ifdef MORE_TIMEING
1479 T1= rdtsc(); 1737 T1= rdtsc();
1480 memcpyTime+= T1-T0; 1738 memcpyTime+= T1-T0;
1481 T0=T1; 1739 T0=T1;
1482 #endif 1740 #endif
1483 1741 if(mode & V_DEBLOCK)
1484 if( isVertDC(vertBlock, stride))
1485 { 1742 {
1486 if(isVertMinMaxOk(vertBlock, stride, QP)) 1743 if(mode & RK_FILTER)
1487 doVertLowPass(vertBlock, stride, QP); 1744 vertRKFilter(vertBlock, stride, QP);
1745 else if(0)
1746 vertX1Filter(vertBlock, stride, QP);
1747 else
1748 {
1749 if( isVertDC(vertBlock, stride))
1750 {
1751 if(isVertMinMaxOk(vertBlock, stride, QP))
1752 doVertLowPass(vertBlock, stride, QP);
1753 }
1754 else
1755 doVertDefFilter(vertBlock, stride, QP);
1756 }
1488 } 1757 }
1489 else if(x<width)
1490 doVertDefFilter(vertBlock, stride, QP);
1491
1492 #ifdef MORE_TIMEING 1758 #ifdef MORE_TIMEING
1493 T1= rdtsc(); 1759 T1= rdtsc();
1494 vertTime+= T1-T0; 1760 vertTime+= T1-T0;
1495 T0=T1; 1761 T0=T1;
1496 #endif 1762 #endif
1506 if(x - 8 >= 0 && x<width) 1772 if(x - 8 >= 0 && x<width)
1507 { 1773 {
1508 #ifdef MORE_TIMEING 1774 #ifdef MORE_TIMEING
1509 T0= rdtsc(); 1775 T0= rdtsc();
1510 #endif 1776 #endif
1511 1777 if(mode & H_DEBLOCK)
1512 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
1513 { 1778 {
1514 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) 1779 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
1515 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); 1780 {
1781 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
1782 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
1783 }
1784 else
1785 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1516 } 1786 }
1517 else
1518 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1519
1520 #ifdef MORE_TIMEING 1787 #ifdef MORE_TIMEING
1521 T1= rdtsc(); 1788 T1= rdtsc();
1522 horizTime+= T1-T0; 1789 horizTime+= T1-T0;
1523 T0=T1; 1790 T0=T1;
1524 #endif 1791 #endif
1533 srcBlock+=8; 1800 srcBlock+=8;
1534 vertBlock+=8; 1801 vertBlock+=8;
1535 vertSrcBlock+=8; 1802 vertSrcBlock+=8;
1536 } 1803 }
1537 } 1804 }
1538 #ifdef HAVE_MMX 1805 #ifdef HAVE_3DNOW
1806 asm volatile("femms");
1807 #elif defined (HAVE_MMX)
1539 asm volatile("emms"); 1808 asm volatile("emms");
1540 #endif 1809 #endif
1541 1810
1542 #ifdef TIMEING 1811 #ifdef TIMEING
1543 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) 1812 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
1547 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), 1816 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000),
1548 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) 1817 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000)
1549 , black, white); 1818 , black, white);
1550 #endif 1819 #endif
1551 } 1820 }
1821
1822