Mercurial > mplayer.hg
comparison postproc/postprocess_template.c @ 2159:795f3d022657
fixed a bug in the horizontal default filter
3dnow version of the Horizontal & Vertical Lowpass filters
mmx version of the Horizontal Default filter
mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
added mode flags & quality2mode function
author | arpi |
---|---|
date | Wed, 10 Oct 2001 22:21:19 +0000 |
parents | 508468a75be0 |
children | 21a8f158d19f |
comparison
equal
deleted
inserted
replaced
2158:508468a75be0 | 2159:795f3d022657 |
---|---|
15 along with this program; if not, write to the Free Software | 15 along with this program; if not, write to the Free Software |
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 */ | 17 */ |
18 | 18 |
19 /* | 19 /* |
20 C MMX MMX2 | 20 C MMX MMX2 3DNow* |
21 isVertDC Ec Ec | 21 isVertDC Ec Ec |
22 isVertMinMaxOk Ec Ec | 22 isVertMinMaxOk Ec Ec |
23 doVertLowPass E e | 23 doVertLowPass E e e* |
24 doVertDefFilter Ec Ec Ec | 24 doVertDefFilter Ec Ec Ec |
25 isHorizDC Ec Ec | 25 isHorizDC Ec Ec |
26 isHorizMinMaxOk a | 26 isHorizMinMaxOk a |
27 doHorizLowPass E a | 27 doHorizLowPass E a a* |
28 doHorizDefFilter E a | 28 doHorizDefFilter E ac ac |
29 deRing | 29 deRing |
30 | 30 |
31 * I don't have a 3DNow CPU -> it's untested | |
31 E = Exact implementation | 32 E = Exact implementation |
32 e = almost exact implementation | 33 e = almost exact implementation |
33 a = alternative / approximate impl | 34 a = alternative / approximate impl |
34 c = checked against the other implementations (-vo md5) | 35 c = checked against the other implementations (-vo md5) |
35 */ | 36 */ |
37 /* | 38 /* |
38 TODO: | 39 TODO: |
39 verify that everything works as it should | 40 verify that everything works as it should |
40 reduce the time wasted on the mem transfer | 41 reduce the time wasted on the mem transfer |
41 implement dering | 42 implement dering |
42 implement everything in C at least | 43 implement everything in C at least (done at the moment but ...) |
43 figure range of QP out (assuming <256 for now) | 44 figure range of QP out (assuming <256 for now) |
44 unroll stuff if instructions depend too much on the prior one | 45 unroll stuff if instructions depend too much on the prior one |
45 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | 46 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? |
46 move YScale thing to the end instead of fixing QP | 47 move YScale thing to the end instead of fixing QP |
48 write a faster and higher quality deblocking filter :) | |
47 ... | 49 ... |
48 | 50 |
49 Notes: | 51 Notes: |
50 | 52 |
53 */ | |
54 | |
55 /* | |
56 Changelog: | |
57 0.1.2 | |
58 fixed a bug in the horizontal default filter | |
59 3dnow version of the Horizontal & Vertical Lowpass filters | |
60 mmx version of the Horizontal Default filter | |
61 mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar | |
62 added mode flags & quality2mode function | |
63 0.1.1 | |
51 */ | 64 */ |
52 | 65 |
53 | 66 |
54 #include <inttypes.h> | 67 #include <inttypes.h> |
55 #include <stdio.h> | 68 #include <stdio.h> |
56 #include "../config.h" | 69 #include "../config.h" |
70 //#undef HAVE_MMX2 | |
71 //#define HAVE_3DNOW | |
72 //#undef HAVE_MMX | |
57 #include "postprocess.h" | 73 #include "postprocess.h" |
58 //#undef HAVE_MMX2 | |
59 //#undef HAVE_MMX | |
60 | |
61 | 74 |
62 | 75 |
63 static uint64_t packedYOffset= 0x0000000000000000LL; | 76 static uint64_t packedYOffset= 0x0000000000000000LL; |
64 static uint64_t packedYScale= 0x0100010001000100LL; | 77 static uint64_t packedYScale= 0x0100010001000100LL; |
65 static uint64_t w05= 0x0005000500050005LL; | 78 static uint64_t w05= 0x0005000500050005LL; |
69 static uint64_t bm00010000= 0x000000FF00000000LL; | 82 static uint64_t bm00010000= 0x000000FF00000000LL; |
70 static uint64_t bm00001000= 0x00000000FF000000LL; | 83 static uint64_t bm00001000= 0x00000000FF000000LL; |
71 static uint64_t bm10000000= 0xFF00000000000000LL; | 84 static uint64_t bm10000000= 0xFF00000000000000LL; |
72 static uint64_t bm10000001= 0xFF000000000000FFLL; | 85 static uint64_t bm10000001= 0xFF000000000000FFLL; |
73 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; | 86 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; |
87 static uint64_t bm00000011= 0x000000000000FFFFLL; | |
88 static uint64_t bm11000000= 0xFFFF000000000000LL; | |
74 static uint64_t bm00011000= 0x000000FFFF000000LL; | 89 static uint64_t bm00011000= 0x000000FFFF000000LL; |
75 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; | 90 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; |
76 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; | 91 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; |
77 static uint64_t b00= 0x0000000000000000LL; | 92 static uint64_t b00= 0x0000000000000000LL; |
78 static uint64_t b02= 0x0202020202020202LL; | 93 static uint64_t b02= 0x0202020202020202LL; |
79 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; | 94 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; |
80 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; | 95 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; |
96 static uint64_t b20= 0x2020202020202020LL; | |
97 static uint64_t b80= 0x8080808080808080LL; | |
81 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL; | 98 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL; |
82 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL; | 99 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL; |
83 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL; | 100 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL; |
84 static uint64_t temp0=0; | 101 static uint64_t temp0=0; |
85 static uint64_t temp1=0; | 102 static uint64_t temp1=0; |
310 */ | 327 */ |
311 static inline void doVertLowPass(uint8_t *src, int stride, int QP) | 328 static inline void doVertLowPass(uint8_t *src, int stride, int QP) |
312 { | 329 { |
313 // QP= 64; | 330 // QP= 64; |
314 | 331 |
315 #ifdef HAVE_MMX2 | 332 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
333 //#ifdef HAVE_MMX2 | |
316 asm volatile( //"movv %0 %1 %2\n\t" | 334 asm volatile( //"movv %0 %1 %2\n\t" |
317 "pushl %0 \n\t" | 335 "pushl %0 \n\t" |
318 "movq pQPb, %%mm0 \n\t" // QP,..., QP | 336 "movq pQPb, %%mm0 \n\t" // QP,..., QP |
319 // "movq bFF , %%mm0 \n\t" // QP,..., QP | 337 // "movq bFF , %%mm0 \n\t" // QP,..., QP |
320 | 338 |
370 "pand b3F, %%mm0 \n\t" | 388 "pand b3F, %%mm0 \n\t" |
371 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4 | 389 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4 |
372 */ | 390 */ |
373 "movq (%0, %1), %%mm0 \n\t" // 1 | 391 "movq (%0, %1), %%mm0 \n\t" // 1 |
374 "movq %%mm0, %%mm1 \n\t" // 1 | 392 "movq %%mm0, %%mm1 \n\t" // 1 |
375 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2 | 393 PAVGB(%%mm6, %%mm0) //1 1 /2 |
376 "pavgb %%mm6, %%mm0 \n\t" //3 1 /4 | 394 PAVGB(%%mm6, %%mm0) //3 1 /4 |
377 | 395 |
378 "movq (%0, %1, 4), %%mm2 \n\t" // 1 | 396 "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
379 "movq %%mm2, %%mm5 \n\t" // 1 | 397 "movq %%mm2, %%mm5 \n\t" // 1 |
380 "pavgb (%%eax), %%mm2 \n\t" // 11 /2 | 398 PAVGB((%%eax), %%mm2) // 11 /2 |
381 "pavgb (%0, %1, 2), %%mm2 \n\t" // 211 /4 | 399 PAVGB((%0, %1, 2), %%mm2) // 211 /4 |
382 "movq %%mm2, %%mm3 \n\t" // 211 /4 | 400 "movq %%mm2, %%mm3 \n\t" // 211 /4 |
383 "movq (%0), %%mm4 \n\t" // 1 | 401 "movq (%0), %%mm4 \n\t" // 1 |
384 "pavgb %%mm4, %%mm3 \n\t" // 4 211 /8 | 402 PAVGB(%%mm4, %%mm3) // 4 211 /8 |
385 "pavgb %%mm0, %%mm3 \n\t" //642211 /16 | 403 PAVGB(%%mm0, %%mm3) //642211 /16 |
386 "movq %%mm3, (%0) \n\t" // X | 404 "movq %%mm3, (%0) \n\t" // X |
387 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 | 405 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
388 "movq %%mm1, %%mm0 \n\t" // 1 | 406 "movq %%mm1, %%mm0 \n\t" // 1 |
389 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2 | 407 PAVGB(%%mm6, %%mm0) //1 1 /2 |
390 "movq %%mm4, %%mm3 \n\t" // 1 | 408 "movq %%mm4, %%mm3 \n\t" // 1 |
391 "pavgb (%0,%1,2), %%mm3 \n\t" // 1 1 /2 | 409 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
392 "pavgb (%%eax,%1,2), %%mm5 \n\t" // 11 /2 | 410 PAVGB((%%eax,%1,2), %%mm5) // 11 /2 |
393 "pavgb (%%eax), %%mm5 \n\t" // 211 /4 | 411 PAVGB((%%eax), %%mm5) // 211 /4 |
394 "pavgb %%mm5, %%mm3 \n\t" // 2 2211 /8 | 412 PAVGB(%%mm5, %%mm3) // 2 2211 /8 |
395 "pavgb %%mm0, %%mm3 \n\t" //4242211 /16 | 413 PAVGB(%%mm0, %%mm3) //4242211 /16 |
396 "movq %%mm3, (%0,%1) \n\t" // X | 414 "movq %%mm3, (%0,%1) \n\t" // X |
397 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 | 415 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
398 "pavgb %%mm4, %%mm6 \n\t" //11 /2 | 416 PAVGB(%%mm4, %%mm6) //11 /2 |
399 "movq (%%ebx), %%mm0 \n\t" // 1 | 417 "movq (%%ebx), %%mm0 \n\t" // 1 |
400 "pavgb (%%eax, %1, 2), %%mm0 \n\t" // 11/2 | 418 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
401 "movq %%mm0, %%mm3 \n\t" // 11/2 | 419 "movq %%mm0, %%mm3 \n\t" // 11/2 |
402 "pavgb %%mm1, %%mm0 \n\t" // 2 11/4 | 420 PAVGB(%%mm1, %%mm0) // 2 11/4 |
403 "pavgb %%mm6, %%mm0 \n\t" //222 11/8 | 421 PAVGB(%%mm6, %%mm0) //222 11/8 |
404 "pavgb %%mm2, %%mm0 \n\t" //22242211/16 | 422 PAVGB(%%mm2, %%mm0) //22242211/16 |
405 "movq (%0, %1, 2), %%mm2 \n\t" // 1 | 423 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
406 "movq %%mm0, (%0, %1, 2) \n\t" // X | 424 "movq %%mm0, (%0, %1, 2) \n\t" // X |
407 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 | 425 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
408 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | 426 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
409 "pavgb (%%ebx), %%mm0 \n\t" // 11 /2 | 427 PAVGB((%%ebx), %%mm0) // 11 /2 |
410 "pavgb %%mm0, %%mm6 \n\t" //11 11 /4 | 428 PAVGB(%%mm0, %%mm6) //11 11 /4 |
411 "pavgb %%mm1, %%mm4 \n\t" // 11 /2 | 429 PAVGB(%%mm1, %%mm4) // 11 /2 |
412 "pavgb %%mm2, %%mm1 \n\t" // 11 /2 | 430 PAVGB(%%mm2, %%mm1) // 11 /2 |
413 "pavgb %%mm1, %%mm6 \n\t" //1122 11 /8 | 431 PAVGB(%%mm1, %%mm6) //1122 11 /8 |
414 "pavgb %%mm5, %%mm6 \n\t" //112242211 /16 | 432 PAVGB(%%mm5, %%mm6) //112242211 /16 |
415 "movq (%%eax), %%mm5 \n\t" // 1 | 433 "movq (%%eax), %%mm5 \n\t" // 1 |
416 "movq %%mm6, (%%eax) \n\t" // X | 434 "movq %%mm6, (%%eax) \n\t" // X |
417 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 | 435 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
418 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 | 436 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 |
419 "pavgb %%mm7, %%mm6 \n\t" // 11 /2 | 437 PAVGB(%%mm7, %%mm6) // 11 /2 |
420 "pavgb %%mm4, %%mm6 \n\t" // 11 11 /4 | 438 PAVGB(%%mm4, %%mm6) // 11 11 /4 |
421 "pavgb %%mm3, %%mm6 \n\t" // 11 2211 /8 | 439 PAVGB(%%mm3, %%mm6) // 11 2211 /8 |
422 "pavgb %%mm5, %%mm2 \n\t" // 11 /2 | 440 PAVGB(%%mm5, %%mm2) // 11 /2 |
423 "movq (%0, %1, 4), %%mm4 \n\t" // 1 | 441 "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
424 "pavgb %%mm4, %%mm2 \n\t" // 112 /4 | 442 PAVGB(%%mm4, %%mm2) // 112 /4 |
425 "pavgb %%mm2, %%mm6 \n\t" // 112242211 /16 | 443 PAVGB(%%mm2, %%mm6) // 112242211 /16 |
426 "movq %%mm6, (%0, %1, 4) \n\t" // X | 444 "movq %%mm6, (%0, %1, 4) \n\t" // X |
427 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 | 445 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
428 "pavgb %%mm7, %%mm1 \n\t" // 11 2 /4 | 446 PAVGB(%%mm7, %%mm1) // 11 2 /4 |
429 "pavgb %%mm4, %%mm5 \n\t" // 11 /2 | 447 PAVGB(%%mm4, %%mm5) // 11 /2 |
430 "pavgb %%mm5, %%mm0 \n\t" // 11 11 /4 | 448 PAVGB(%%mm5, %%mm0) // 11 11 /4 |
431 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 | 449 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
432 "pavgb %%mm6, %%mm1 \n\t" // 11 4 2 /8 | 450 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
433 "pavgb %%mm0, %%mm1 \n\t" // 11224222 /16 | 451 PAVGB(%%mm0, %%mm1) // 11224222 /16 |
434 // "pxor %%mm1, %%mm1 \n\t" | 452 // "pxor %%mm1, %%mm1 \n\t" |
435 "movq %%mm1, (%%eax, %1, 2) \n\t" // X | 453 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
436 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | 454 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
437 "pavgb (%%ebx), %%mm2 \n\t" // 112 4 /8 | 455 PAVGB((%%ebx), %%mm2) // 112 4 /8 |
438 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | 456 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
439 "pavgb %%mm0, %%mm6 \n\t" // 1 1 /2 | 457 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
440 "pavgb %%mm7, %%mm6 \n\t" // 1 12 /4 | 458 PAVGB(%%mm7, %%mm6) // 1 12 /4 |
441 "pavgb %%mm2, %%mm6 \n\t" // 1122424 /4 | 459 PAVGB(%%mm2, %%mm6) // 1122424 /4 |
442 // "pxor %%mm6, %%mm6 \n\t" | 460 // "pxor %%mm6, %%mm6 \n\t" |
443 "movq %%mm6, (%%ebx) \n\t" // X | 461 "movq %%mm6, (%%ebx) \n\t" // X |
444 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | 462 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
445 "pavgb %%mm7, %%mm5 \n\t" // 11 2 /4 | 463 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
446 "pavgb %%mm7, %%mm5 \n\t" // 11 6 /8 | 464 PAVGB(%%mm7, %%mm5) // 11 6 /8 |
447 | 465 |
448 "pavgb %%mm3, %%mm0 \n\t" // 112 /4 | 466 PAVGB(%%mm3, %%mm0) // 112 /4 |
449 "pavgb %%mm0, %%mm5 \n\t" // 112246 /16 | 467 PAVGB(%%mm0, %%mm5) // 112246 /16 |
450 // "pxor %%mm5, %%mm5 \n\t" | 468 // "pxor %%mm5, %%mm5 \n\t" |
451 // "movq pQPb, %%mm5 \n\t" | 469 // "movq pQPb, %%mm5 \n\t" |
452 "movq %%mm5, (%%eax, %1, 4) \n\t" // X | 470 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
453 "popl %0\n\t" | 471 "popl %0\n\t" |
454 | 472 |
455 : | 473 : |
456 : "r" (src), "r" (stride) | 474 : "r" (src), "r" (stride) |
457 : "%eax", "%ebx" | 475 : "%eax", "%ebx" |
458 ); | 476 ); |
459 | |
460 #else | 477 #else |
461 const int l1= stride; | 478 const int l1= stride; |
462 const int l2= stride + l1; | 479 const int l2= stride + l1; |
463 const int l3= stride + l2; | 480 const int l3= stride + l2; |
464 const int l4= stride + l3; | 481 const int l4= stride + l3; |
496 src++; | 513 src++; |
497 } | 514 } |
498 | 515 |
499 #endif | 516 #endif |
500 } | 517 } |
518 | |
519 /** | |
520 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
521 * values are correctly clipped (MMX2) | |
522 * values are wraparound (C) | |
523 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient | |
524 0 8 16 24 | |
525 x = 8 | |
526 x/2 = 4 | |
527 x/8 = 1 | |
528 1 12 12 23 | |
529 */ | |
530 static inline void vertRKFilter(uint8_t *src, int stride, int QP) | |
531 { | |
532 #ifdef HAVE_MMX2 | |
533 // FIXME rounding | |
534 asm volatile( | |
535 "pxor %%mm7, %%mm7 \n\t" // 0 | |
536 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
537 "leal (%0, %1), %%eax \n\t" | |
538 "leal (%%eax, %1, 4), %%ebx \n\t" | |
539 // 0 1 2 3 4 5 6 7 8 9 | |
540 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
541 "movq pQPb, %%mm0 \n\t" // QP,..., QP | |
542 "movq %%mm0, %%mm1 \n\t" // QP,..., QP | |
543 "paddusb b02, %%mm0 \n\t" | |
544 "psrlw $2, %%mm0 \n\t" | |
545 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4 | |
546 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... | |
547 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
548 "movq (%%ebx), %%mm3 \n\t" // line 5 | |
549 "movq %%mm2, %%mm4 \n\t" // line 4 | |
550 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
551 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
552 "pavgb %%mm3, %%mm5 \n\t" | |
553 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 | |
554 "psubusb %%mm3, %%mm4 \n\t" | |
555 "psubusb %%mm2, %%mm3 \n\t" | |
556 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
557 "psubusb %%mm0, %%mm4 \n\t" | |
558 "pcmpeqb %%mm7, %%mm4 \n\t" | |
559 "pand %%mm4, %%mm5 \n\t" // d/2 | |
560 | |
561 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
562 "paddb %%mm5, %%mm2 \n\t" | |
563 // "psubb %%mm6, %%mm2 \n\t" | |
564 "movq %%mm2, (%0,%1, 4) \n\t" | |
565 | |
566 "movq (%%ebx), %%mm2 \n\t" | |
567 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 | |
568 "psubb %%mm5, %%mm2 \n\t" | |
569 // "psubb %%mm6, %%mm2 \n\t" | |
570 "movq %%mm2, (%%ebx) \n\t" | |
571 | |
572 "paddb %%mm6, %%mm5 \n\t" | |
573 "psrlw $2, %%mm5 \n\t" | |
574 "pand b3F, %%mm5 \n\t" | |
575 "psubb b20, %%mm5 \n\t" // (l5-l4)/8 | |
576 | |
577 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
578 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
579 "paddsb %%mm5, %%mm2 \n\t" | |
580 "psubb %%mm6, %%mm2 \n\t" | |
581 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
582 | |
583 "movq (%%ebx, %1), %%mm2 \n\t" | |
584 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 | |
585 "psubsb %%mm5, %%mm2 \n\t" | |
586 "psubb %%mm6, %%mm2 \n\t" | |
587 "movq %%mm2, (%%ebx, %1) \n\t" | |
588 | |
589 : | |
590 : "r" (src), "r" (stride) | |
591 : "%eax", "%ebx" | |
592 ); | |
593 #else | |
594 const int l1= stride; | |
595 const int l2= stride + l1; | |
596 const int l3= stride + l2; | |
597 const int l4= stride + l3; | |
598 const int l5= stride + l4; | |
599 const int l6= stride + l5; | |
600 const int l7= stride + l6; | |
601 const int l8= stride + l7; | |
602 const int l9= stride + l8; | |
603 for(int x=0; x<BLOCK_SIZE; x++) | |
604 { | |
605 if(ABS(src[l4]-src[l5]) < QP + QP/4) | |
606 { | |
607 int x = src[l5] - src[l4]; | |
608 | |
609 src[l3] +=x/8; | |
610 src[l4] +=x/2; | |
611 src[l5] -=x/2; | |
612 src[l6] -=x/8; | |
613 } | |
614 src++; | |
615 } | |
616 | |
617 #endif | |
618 } | |
619 | |
620 /** | |
621 * Experimental Filter 1 | |
622 */ | |
623 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | |
624 { | |
625 #ifdef HAVE_MMX2X | |
626 // FIXME | |
627 asm volatile( | |
628 | |
629 : | |
630 : "r" (src), "r" (stride) | |
631 : "%eax", "%ebx" | |
632 ); | |
633 #else | |
634 const int l1= stride; | |
635 const int l2= stride + l1; | |
636 const int l3= stride + l2; | |
637 const int l4= stride + l3; | |
638 const int l5= stride + l4; | |
639 const int l6= stride + l5; | |
640 const int l7= stride + l6; | |
641 const int l8= stride + l7; | |
642 const int l9= stride + l8; | |
643 for(int x=0; x<BLOCK_SIZE; x++) | |
644 { | |
645 int v2= src[l2]; | |
646 int v3= src[l3]; | |
647 int v4= src[l4]; | |
648 int v5= src[l5]; | |
649 int v6= src[l6]; | |
650 int v7= src[l7]; | |
651 | |
652 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 ) | |
653 { | |
654 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16; | |
655 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16; | |
656 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; | |
657 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; | |
658 } | |
659 src++; | |
660 } | |
661 | |
662 #endif | |
663 } | |
664 | |
501 | 665 |
502 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 666 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
503 { | 667 { |
504 #ifdef HAVE_MMX | 668 #ifdef HAVE_MMX |
505 src+= stride; | 669 src+= stride; |
913 #endif | 1077 #endif |
914 } | 1078 } |
915 | 1079 |
916 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) | 1080 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) |
917 { | 1081 { |
918 #ifdef HAVE_MMX2 | 1082 #ifdef HAVE_MMX |
919 asm volatile( | 1083 asm volatile( |
920 "pushl %0 \n\t" | 1084 "pushl %0 \n\t" |
921 "pxor %%mm7, %%mm7 \n\t" | 1085 "pxor %%mm7, %%mm7 \n\t" |
922 "movq bm00001000, %%mm6 \n\t" | 1086 "movq bm00001000, %%mm6 \n\t" |
923 "movd %2, %%mm5 \n\t" // QP | 1087 "movd %2, %%mm5 \n\t" // QP |
928 "pxor %%mm5, %%mm5 \n\t" // 0 | 1092 "pxor %%mm5, %%mm5 \n\t" // 0 |
929 "psubb %%mm4, %%mm5 \n\t" // -QP | 1093 "psubb %%mm4, %%mm5 \n\t" // -QP |
930 "leal tempBlock, %%eax \n\t" | 1094 "leal tempBlock, %%eax \n\t" |
931 | 1095 |
932 //FIXME? "unroll by 2" and mix | 1096 //FIXME? "unroll by 2" and mix |
933 #define HDF(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | 1097 #ifdef HAVE_MMX2 |
1098 #define HDF(i) \ | |
1099 "movq " #i "(%%eax), %%mm0 \n\t"\ | |
934 "movq %%mm0, %%mm1 \n\t"\ | 1100 "movq %%mm0, %%mm1 \n\t"\ |
935 "movq %%mm0, %%mm2 \n\t"\ | 1101 "movq %%mm0, %%mm2 \n\t"\ |
936 "psrlq $8, %%mm1 \n\t"\ | 1102 "psrlq $8, %%mm1 \n\t"\ |
937 "psubusb %%mm1, %%mm2 \n\t"\ | 1103 "psubusb %%mm1, %%mm2 \n\t"\ |
938 "psubusb %%mm0, %%mm1 \n\t"\ | 1104 "psubusb %%mm0, %%mm1 \n\t"\ |
939 "por %%mm2, %%mm1 \n\t" /* |px - p(x+1)| */\ | 1105 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ |
940 "pcmpeqb %%mm7, %%mm2 \n\t" /* sgn[px - p(x+1)] */\ | 1106 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ |
941 "pshufw $0xAA, %%mm1, %%mm3 \n\t"\ | 1107 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ |
942 "pminub %%mm1, %%mm3 \n\t"\ | 1108 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\ |
943 "psrlq $16, %%mm3 \n\t"\ | 1109 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ |
1110 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ | |
1111 "paddb %%mm5, %%mm1 \n\t"\ | |
1112 "psubusb %%mm5, %%mm1 \n\t"\ | |
1113 "psrlw $2, %%mm1 \n\t"\ | |
1114 "pxor %%mm2, %%mm1 \n\t"\ | |
1115 "psubb %%mm2, %%mm1 \n\t"\ | |
1116 "pand %%mm6, %%mm1 \n\t"\ | |
1117 "psubb %%mm1, %%mm0 \n\t"\ | |
1118 "psllq $8, %%mm1 \n\t"\ | |
1119 "paddb %%mm1, %%mm0 \n\t"\ | |
1120 "movd %%mm0, (%0) \n\t"\ | |
1121 "psrlq $32, %%mm0 \n\t"\ | |
1122 "movd %%mm0, 4(%0) \n\t" | |
1123 #else | |
1124 #define HDF(i)\ | |
1125 "movq " #i "(%%eax), %%mm0 \n\t"\ | |
1126 "movq %%mm0, %%mm1 \n\t"\ | |
1127 "movq %%mm0, %%mm2 \n\t"\ | |
1128 "psrlq $8, %%mm1 \n\t"\ | |
1129 "psubusb %%mm1, %%mm2 \n\t"\ | |
1130 "psubusb %%mm0, %%mm1 \n\t"\ | |
1131 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
1132 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
1133 "movq %%mm1, %%mm3 \n\t"\ | |
1134 "psllq $32, %%mm3 \n\t"\ | |
1135 "movq %%mm3, %%mm4 \n\t"\ | |
1136 "psubusb %%mm1, %%mm4 \n\t"\ | |
1137 "psubb %%mm4, %%mm3 \n\t"\ | |
1138 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ | |
944 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ | 1139 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ |
945 "paddb %%mm5, %%mm1 \n\t"\ | 1140 "paddb %%mm5, %%mm1 \n\t"\ |
946 "psubusb %%mm5, %%mm1 \n\t"\ | 1141 "psubusb %%mm5, %%mm1 \n\t"\ |
947 "psrlw $2, %%mm1 \n\t"\ | 1142 "psrlw $2, %%mm1 \n\t"\ |
948 "pxor %%mm2, %%mm1 \n\t"\ | 1143 "pxor %%mm2, %%mm1 \n\t"\ |
952 "psllq $8, %%mm1 \n\t"\ | 1147 "psllq $8, %%mm1 \n\t"\ |
953 "paddb %%mm1, %%mm0 \n\t"\ | 1148 "paddb %%mm1, %%mm0 \n\t"\ |
954 "movd %%mm0, (%0) \n\t"\ | 1149 "movd %%mm0, (%0) \n\t"\ |
955 "psrlq $32, %%mm0 \n\t"\ | 1150 "psrlq $32, %%mm0 \n\t"\ |
956 "movd %%mm0, 4(%0) \n\t" | 1151 "movd %%mm0, 4(%0) \n\t" |
957 | 1152 #endif |
958 HDF(0) | 1153 HDF(0) |
959 "addl %1, %0 \n\t" | 1154 "addl %1, %0 \n\t" |
960 HDF(8) | 1155 HDF(8) |
961 "addl %1, %0 \n\t" | 1156 "addl %1, %0 \n\t" |
962 HDF(16) | 1157 HDF(16) |
1023 } | 1218 } |
1024 | 1219 |
1025 /** | 1220 /** |
1026 * Do a horizontal low pass filter on the 8x8 block | 1221 * Do a horizontal low pass filter on the 8x8 block |
1027 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | 1222 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) |
1028 * using approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2 version) | 1223 * using approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version) |
1029 */ | 1224 */ |
1030 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | 1225 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) |
1031 { | 1226 { |
1032 //return; | 1227 //return; |
1033 #ifdef HAVE_MMX2 | 1228 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1034 asm volatile( //"movv %0 %1 %2\n\t" | 1229 asm volatile( //"movv %0 %1 %2\n\t" |
1035 "pushl %0\n\t" | 1230 "pushl %0\n\t" |
1036 "pxor %%mm7, %%mm7 \n\t" | 1231 "pxor %%mm7, %%mm7 \n\t" |
1037 "leal tempBlock, %%eax \n\t" | 1232 "leal tempBlock, %%eax \n\t" |
1038 | 1233 /* |
1039 #define HLP1 "movq (%0), %%mm0 \n\t"\ | 1234 #define HLP1 "movq (%0), %%mm0 \n\t"\ |
1040 "movq %%mm0, %%mm1 \n\t"\ | 1235 "movq %%mm0, %%mm1 \n\t"\ |
1041 "psllq $8, %%mm0 \n\t"\ | 1236 "psllq $8, %%mm0 \n\t"\ |
1042 "pavgb %%mm1, %%mm0 \n\t"\ | 1237 PAVGB(%%mm1, %%mm0)\ |
1043 "psrlw $8, %%mm0 \n\t"\ | 1238 "psrlw $8, %%mm0 \n\t"\ |
1044 "pxor %%mm1, %%mm1 \n\t"\ | 1239 "pxor %%mm1, %%mm1 \n\t"\ |
1045 "packuswb %%mm1, %%mm0 \n\t"\ | 1240 "packuswb %%mm1, %%mm0 \n\t"\ |
1046 "movq %%mm0, %%mm1 \n\t"\ | 1241 "movq %%mm0, %%mm1 \n\t"\ |
1047 "movq %%mm0, %%mm2 \n\t"\ | 1242 "movq %%mm0, %%mm2 \n\t"\ |
1048 "psllq $32, %%mm0 \n\t"\ | 1243 "psllq $32, %%mm0 \n\t"\ |
1049 "paddb %%mm0, %%mm1 \n\t"\ | 1244 "paddb %%mm0, %%mm1 \n\t"\ |
1050 "psllq $16, %%mm2 \n\t"\ | 1245 "psllq $16, %%mm2 \n\t"\ |
1051 "pavgb %%mm2, %%mm0 \n\t"\ | 1246 PAVGB(%%mm2, %%mm0)\ |
1052 "movq %%mm0, %%mm3 \n\t"\ | 1247 "movq %%mm0, %%mm3 \n\t"\ |
1053 "pand bm11001100, %%mm0 \n\t"\ | 1248 "pand bm11001100, %%mm0 \n\t"\ |
1054 "paddusb %%mm0, %%mm3 \n\t"\ | 1249 "paddusb %%mm0, %%mm3 \n\t"\ |
1055 "psrlq $8, %%mm3 \n\t"\ | 1250 "psrlq $8, %%mm3 \n\t"\ |
1056 "pavgb %%mm1, %%mm4 \n\t"\ | 1251 PAVGB(%%mm1, %%mm4)\ |
1057 "pavgb %%mm3, %%mm2 \n\t"\ | 1252 PAVGB(%%mm3, %%mm2)\ |
1058 "psrlq $16, %%mm2 \n\t"\ | 1253 "psrlq $16, %%mm2 \n\t"\ |
1059 "punpcklbw %%mm2, %%mm2 \n\t"\ | 1254 "punpcklbw %%mm2, %%mm2 \n\t"\ |
1060 "movq %%mm2, (%0) \n\t"\ | 1255 "movq %%mm2, (%0) \n\t"\ |
1061 | 1256 |
1062 #define HLP2 "movq (%0), %%mm0 \n\t"\ | 1257 #define HLP2 "movq (%0), %%mm0 \n\t"\ |
1063 "movq %%mm0, %%mm1 \n\t"\ | 1258 "movq %%mm0, %%mm1 \n\t"\ |
1064 "psllq $8, %%mm0 \n\t"\ | 1259 "psllq $8, %%mm0 \n\t"\ |
1065 "pavgb %%mm1, %%mm0 \n\t"\ | 1260 PAVGB(%%mm1, %%mm0)\ |
1066 "psrlw $8, %%mm0 \n\t"\ | 1261 "psrlw $8, %%mm0 \n\t"\ |
1067 "pxor %%mm1, %%mm1 \n\t"\ | 1262 "pxor %%mm1, %%mm1 \n\t"\ |
1068 "packuswb %%mm1, %%mm0 \n\t"\ | 1263 "packuswb %%mm1, %%mm0 \n\t"\ |
1069 "movq %%mm0, %%mm2 \n\t"\ | 1264 "movq %%mm0, %%mm2 \n\t"\ |
1070 "psllq $32, %%mm0 \n\t"\ | 1265 "psllq $32, %%mm0 \n\t"\ |
1071 "psllq $16, %%mm2 \n\t"\ | 1266 "psllq $16, %%mm2 \n\t"\ |
1072 "pavgb %%mm2, %%mm0 \n\t"\ | 1267 PAVGB(%%mm2, %%mm0)\ |
1073 "movq %%mm0, %%mm3 \n\t"\ | 1268 "movq %%mm0, %%mm3 \n\t"\ |
1074 "pand bm11001100, %%mm0 \n\t"\ | 1269 "pand bm11001100, %%mm0 \n\t"\ |
1075 "paddusb %%mm0, %%mm3 \n\t"\ | 1270 "paddusb %%mm0, %%mm3 \n\t"\ |
1076 "psrlq $8, %%mm3 \n\t"\ | 1271 "psrlq $8, %%mm3 \n\t"\ |
1077 "pavgb %%mm3, %%mm2 \n\t"\ | 1272 PAVGB(%%mm3, %%mm2)\ |
1078 "psrlq $16, %%mm2 \n\t"\ | 1273 "psrlq $16, %%mm2 \n\t"\ |
1079 "punpcklbw %%mm2, %%mm2 \n\t"\ | 1274 "punpcklbw %%mm2, %%mm2 \n\t"\ |
1080 "movq %%mm2, (%0) \n\t"\ | 1275 "movq %%mm2, (%0) \n\t"\ |
1081 | 1276 */ |
1082 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 | 1277 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 |
1083 /* | 1278 /* |
1084 31 | 1279 31 |
1085 121 | 1280 121 |
1086 121 | 1281 121 |
1098 123433 = | 1293 123433 = |
1099 12463 12346 | 1294 12463 12346 |
1100 1249 123A | 1295 1249 123A |
1101 | 1296 |
1102 */ | 1297 */ |
1298 #ifdef HAVE_MMX2 | |
1103 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | 1299 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ |
1104 "movq %%mm0, %%mm1 \n\t"\ | 1300 "movq %%mm0, %%mm1 \n\t"\ |
1105 "movq %%mm0, %%mm2 \n\t"\ | 1301 "movq %%mm0, %%mm2 \n\t"\ |
1106 "movq %%mm0, %%mm3 \n\t"\ | 1302 "movq %%mm0, %%mm3 \n\t"\ |
1107 "movq %%mm0, %%mm4 \n\t"\ | 1303 "movq %%mm0, %%mm4 \n\t"\ |
1109 "psrlq $8, %%mm2 \n\t"\ | 1305 "psrlq $8, %%mm2 \n\t"\ |
1110 "pand bm00000001, %%mm3 \n\t"\ | 1306 "pand bm00000001, %%mm3 \n\t"\ |
1111 "pand bm10000000, %%mm4 \n\t"\ | 1307 "pand bm10000000, %%mm4 \n\t"\ |
1112 "por %%mm3, %%mm1 \n\t"\ | 1308 "por %%mm3, %%mm1 \n\t"\ |
1113 "por %%mm4, %%mm2 \n\t"\ | 1309 "por %%mm4, %%mm2 \n\t"\ |
1114 "pavgb %%mm2, %%mm1 \n\t"\ | 1310 PAVGB(%%mm2, %%mm1)\ |
1115 "pavgb %%mm1, %%mm0 \n\t"\ | 1311 PAVGB(%%mm1, %%mm0)\ |
1116 \ | 1312 \ |
1117 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\ | 1313 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\ |
1118 "pshufw $0x90, %%mm0, %%mm4 \n\t"\ | 1314 "pshufw $0x90, %%mm0, %%mm4 \n\t"\ |
1119 "pavgb %%mm3, %%mm4 \n\t"\ | 1315 PAVGB(%%mm3, %%mm4)\ |
1120 "pavgb %%mm4, %%mm0 \n\t"\ | 1316 PAVGB(%%mm4, %%mm0)\ |
1121 "movd %%mm0, (%0) \n\t"\ | 1317 "movd %%mm0, (%0) \n\t"\ |
1122 "psrlq $32, %%mm0 \n\t"\ | 1318 "psrlq $32, %%mm0 \n\t"\ |
1123 "movd %%mm0, 4(%0) \n\t"\ | 1319 "movd %%mm0, 4(%0) \n\t" |
1320 #else | |
1321 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |
1322 "movq %%mm0, %%mm1 \n\t"\ | |
1323 "movq %%mm0, %%mm2 \n\t"\ | |
1324 "movq %%mm0, %%mm3 \n\t"\ | |
1325 "movq %%mm0, %%mm4 \n\t"\ | |
1326 "psllq $8, %%mm1 \n\t"\ | |
1327 "psrlq $8, %%mm2 \n\t"\ | |
1328 "pand bm00000001, %%mm3 \n\t"\ | |
1329 "pand bm10000000, %%mm4 \n\t"\ | |
1330 "por %%mm3, %%mm1 \n\t"\ | |
1331 "por %%mm4, %%mm2 \n\t"\ | |
1332 PAVGB(%%mm2, %%mm1)\ | |
1333 PAVGB(%%mm1, %%mm0)\ | |
1334 \ | |
1335 "movq %%mm0, %%mm3 \n\t"\ | |
1336 "movq %%mm0, %%mm4 \n\t"\ | |
1337 "movq %%mm0, %%mm5 \n\t"\ | |
1338 "psrlq $16, %%mm3 \n\t"\ | |
1339 "psllq $16, %%mm4 \n\t"\ | |
1340 "pand bm11000000, %%mm5 \n\t"\ | |
1341 "por %%mm5, %%mm3 \n\t"\ | |
1342 "movq %%mm0, %%mm5 \n\t"\ | |
1343 "pand bm00000011, %%mm5 \n\t"\ | |
1344 "por %%mm5, %%mm4 \n\t"\ | |
1345 PAVGB(%%mm3, %%mm4)\ | |
1346 PAVGB(%%mm4, %%mm0)\ | |
1347 "movd %%mm0, (%0) \n\t"\ | |
1348 "psrlq $32, %%mm0 \n\t"\ | |
1349 "movd %%mm0, 4(%0) \n\t" | |
1350 #endif | |
1124 | 1351 |
1125 #define HLP(i) HLP3(i) | 1352 #define HLP(i) HLP3(i) |
1126 | 1353 |
1127 HLP(0) | 1354 HLP(0) |
1128 "addl %1, %0 \n\t" | 1355 "addl %1, %0 \n\t" |
1227 "psrlq $16, %%mm7 \n\t" | 1454 "psrlq $16, %%mm7 \n\t" |
1228 "pmaxub %%mm4, %%mm7 \n\t" | 1455 "pmaxub %%mm4, %%mm7 \n\t" |
1229 "movq %%mm7, %%mm4 \n\t" | 1456 "movq %%mm7, %%mm4 \n\t" |
1230 "psrlq $8, %%mm7 \n\t" | 1457 "psrlq $8, %%mm7 \n\t" |
1231 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels | 1458 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels |
1232 "pavgb %%mm6, %%mm7 \n\t" // (max + min)/2 | 1459 PAVGB(%%mm6, %%mm7) // (max + min)/2 |
1233 | 1460 |
1234 | 1461 |
1235 : : "r" (src), "r" (stride), "r" (QP) | 1462 : : "r" (src), "r" (stride), "r" (QP) |
1236 : "%eax", "%ebx" | 1463 : "%eax", "%ebx" |
1237 ); | 1464 ); |
1239 | 1466 |
1240 //FIXME | 1467 //FIXME |
1241 #endif | 1468 #endif |
1242 } | 1469 } |
1243 | 1470 |
1471 | |
1472 | |
1473 | |
1244 /** | 1474 /** |
1245 * ... | 1475 * ... |
1476 * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63) | |
1477 * -63 is best quality -1 is worst | |
1246 */ | 1478 */ |
1247 extern "C"{ | 1479 extern "C"{ |
1248 void postprocess(unsigned char * src[], int src_stride, | 1480 void postprocess(unsigned char * src[], int src_stride, |
1249 unsigned char * dst[], int dst_stride, | 1481 unsigned char * dst[], int dst_stride, |
1250 int horizontal_size, int vertical_size, | 1482 int horizontal_size, int vertical_size, |
1251 QP_STORE_T *QP_store, int QP_stride, | 1483 QP_STORE_T *QP_store, int QP_stride, |
1252 int mode) | 1484 int mode) |
1253 { | 1485 { |
1486 | |
1487 if(mode<0) mode= getModeForQuality(-mode); | |
1488 | |
1254 /* | 1489 /* |
1255 long long T= rdtsc(); | 1490 long long T= rdtsc(); |
1256 for(int y=vertical_size-1; y>=0 ; y--) | 1491 for(int y=vertical_size-1; y>=0 ; y--) |
1257 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride); | 1492 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride); |
1258 // memcpy(dst[0], src[0],src_stride*vertical_size); | 1493 // memcpy(dst[0], src[0],src_stride*vertical_size); |
1264 long long T= rdtsc(); | 1499 long long T= rdtsc(); |
1265 while( (rdtsc() - T)/1000 < 4000); | 1500 while( (rdtsc() - T)/1000 < 4000); |
1266 | 1501 |
1267 return; | 1502 return; |
1268 */ | 1503 */ |
1269 postProcess(src[0], src_stride, | 1504 postProcess(src[0], src_stride, dst[0], dst_stride, |
1270 dst[0], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, false); | 1505 horizontal_size, vertical_size, QP_store, QP_stride, false, mode); |
1271 | 1506 |
1272 horizontal_size >>= 1; | 1507 horizontal_size >>= 1; |
1273 vertical_size >>= 1; | 1508 vertical_size >>= 1; |
1274 src_stride >>= 1; | 1509 src_stride >>= 1; |
1275 dst_stride >>= 1; | 1510 dst_stride >>= 1; |
1276 | 1511 |
1277 if(1) | 1512 if(1) |
1278 { | 1513 { |
1279 postProcess(src[1], src_stride, | 1514 postProcess(src[1], src_stride, dst[1], dst_stride, |
1280 dst[1], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true); | 1515 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); |
1281 postProcess(src[2], src_stride, | 1516 postProcess(src[2], src_stride, dst[2], dst_stride, |
1282 dst[2], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true); | 1517 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); |
1283 } | 1518 } |
1284 else | 1519 else |
1285 { | 1520 { |
1286 memcpy(dst[1], src[1], src_stride*horizontal_size); | 1521 memcpy(dst[1], src[1], src_stride*horizontal_size); |
1287 memcpy(dst[2], src[2], src_stride*horizontal_size); | 1522 memcpy(dst[2], src[2], src_stride*horizontal_size); |
1288 } | 1523 } |
1289 } | 1524 } |
1290 } | 1525 /** |
1526 * gets the mode flags for a given quality (larger values mean slower but better postprocessing) | |
1527 * 0 <= quality < 64 | |
1528 */ | |
1529 int getModeForQuality(int quality){ | |
1530 int modes[6]= { | |
1531 LUM_V_DEBLOCK, | |
1532 LUM_V_DEBLOCK | LUM_H_DEBLOCK, | |
1533 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK, | |
1534 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK, | |
1535 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING, | |
1536 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING | |
1537 }; | |
1538 | |
1539 return modes[ (quality*6) >>6 ]; | |
1540 } | |
1541 | |
1542 } // extern "C" | |
1291 | 1543 |
1292 /** | 1544 /** |
1293 * Copies a block from src to dst and fixes the blacklevel | 1545 * Copies a block from src to dst and fixes the blacklevel |
1294 */ | 1546 */ |
1295 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) | 1547 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) |
1365 | 1617 |
1366 /** | 1618 /** |
1367 * Filters array of bytes (Y or U or V values) | 1619 * Filters array of bytes (Y or U or V values) |
1368 */ | 1620 */ |
1369 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | 1621 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
1370 QP_STORE_T QPs[], int QPStride, bool isColor) | 1622 QP_STORE_T QPs[], int QPStride, bool isColor, int mode) |
1371 { | 1623 { |
1372 | 1624 |
1373 #ifdef TIMEING | 1625 #ifdef TIMEING |
1374 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; | 1626 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; |
1375 sumTime= rdtsc(); | 1627 sumTime= rdtsc(); |
1411 packedYOffset= MAX(black - minAllowedY, 0); | 1663 packedYOffset= MAX(black - minAllowedY, 0); |
1412 packedYOffset|= packedYOffset<<32; | 1664 packedYOffset|= packedYOffset<<32; |
1413 packedYOffset|= packedYOffset<<16; | 1665 packedYOffset|= packedYOffset<<16; |
1414 packedYOffset|= packedYOffset<<8; | 1666 packedYOffset|= packedYOffset<<8; |
1415 | 1667 |
1416 // uint64_t scale= (int)(256.0*256.0/(white-black) + 0.5); | |
1417 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); | 1668 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); |
1418 | 1669 |
1419 packedYScale= uint16_t(scale*256.0 + 0.5); | 1670 packedYScale= uint16_t(scale*256.0 + 0.5); |
1420 packedYScale|= packedYScale<<32; | 1671 packedYScale|= packedYScale<<32; |
1421 packedYScale|= packedYScale<<16; | 1672 packedYScale|= packedYScale<<16; |
1460 if(y + 12 < height) | 1711 if(y + 12 < height) |
1461 { | 1712 { |
1462 #ifdef MORE_TIMEING | 1713 #ifdef MORE_TIMEING |
1463 T0= rdtsc(); | 1714 T0= rdtsc(); |
1464 #endif | 1715 #endif |
1716 | |
1465 #ifdef HAVE_MMX2 | 1717 #ifdef HAVE_MMX2 |
1466 | |
1467 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | 1718 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); |
1468 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | 1719 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); |
1469 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | 1720 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); |
1470 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | 1721 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); |
1722 #elif defined(HAVE_3DNOW) | |
1723 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
1724 /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | |
1725 prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | |
1726 prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | |
1727 prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | |
1728 */ | |
1471 #endif | 1729 #endif |
1472 if(!isColor) yHistogram[ srcBlock[0] ]++; | 1730 if(!isColor) yHistogram[ srcBlock[0] ]++; |
1473 | 1731 |
1474 blockCopy(vertBlock + dstStride*2, dstStride, | 1732 blockCopy(vertBlock + dstStride*2, dstStride, |
1475 vertSrcBlock + srcStride*2, srcStride); | 1733 vertSrcBlock + srcStride*2, srcStride); |
1478 #ifdef MORE_TIMEING | 1736 #ifdef MORE_TIMEING |
1479 T1= rdtsc(); | 1737 T1= rdtsc(); |
1480 memcpyTime+= T1-T0; | 1738 memcpyTime+= T1-T0; |
1481 T0=T1; | 1739 T0=T1; |
1482 #endif | 1740 #endif |
1483 | 1741 if(mode & V_DEBLOCK) |
1484 if( isVertDC(vertBlock, stride)) | |
1485 { | 1742 { |
1486 if(isVertMinMaxOk(vertBlock, stride, QP)) | 1743 if(mode & RK_FILTER) |
1487 doVertLowPass(vertBlock, stride, QP); | 1744 vertRKFilter(vertBlock, stride, QP); |
1745 else if(0) | |
1746 vertX1Filter(vertBlock, stride, QP); | |
1747 else | |
1748 { | |
1749 if( isVertDC(vertBlock, stride)) | |
1750 { | |
1751 if(isVertMinMaxOk(vertBlock, stride, QP)) | |
1752 doVertLowPass(vertBlock, stride, QP); | |
1753 } | |
1754 else | |
1755 doVertDefFilter(vertBlock, stride, QP); | |
1756 } | |
1488 } | 1757 } |
1489 else if(x<width) | |
1490 doVertDefFilter(vertBlock, stride, QP); | |
1491 | |
1492 #ifdef MORE_TIMEING | 1758 #ifdef MORE_TIMEING |
1493 T1= rdtsc(); | 1759 T1= rdtsc(); |
1494 vertTime+= T1-T0; | 1760 vertTime+= T1-T0; |
1495 T0=T1; | 1761 T0=T1; |
1496 #endif | 1762 #endif |
1506 if(x - 8 >= 0 && x<width) | 1772 if(x - 8 >= 0 && x<width) |
1507 { | 1773 { |
1508 #ifdef MORE_TIMEING | 1774 #ifdef MORE_TIMEING |
1509 T0= rdtsc(); | 1775 T0= rdtsc(); |
1510 #endif | 1776 #endif |
1511 | 1777 if(mode & H_DEBLOCK) |
1512 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) | |
1513 { | 1778 { |
1514 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | 1779 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) |
1515 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | 1780 { |
1781 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | |
1782 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | |
1783 } | |
1784 else | |
1785 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
1516 } | 1786 } |
1517 else | |
1518 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
1519 | |
1520 #ifdef MORE_TIMEING | 1787 #ifdef MORE_TIMEING |
1521 T1= rdtsc(); | 1788 T1= rdtsc(); |
1522 horizTime+= T1-T0; | 1789 horizTime+= T1-T0; |
1523 T0=T1; | 1790 T0=T1; |
1524 #endif | 1791 #endif |
1533 srcBlock+=8; | 1800 srcBlock+=8; |
1534 vertBlock+=8; | 1801 vertBlock+=8; |
1535 vertSrcBlock+=8; | 1802 vertSrcBlock+=8; |
1536 } | 1803 } |
1537 } | 1804 } |
1538 #ifdef HAVE_MMX | 1805 #ifdef HAVE_3DNOW |
1806 asm volatile("femms"); | |
1807 #elif defined (HAVE_MMX) | |
1539 asm volatile("emms"); | 1808 asm volatile("emms"); |
1540 #endif | 1809 #endif |
1541 | 1810 |
1542 #ifdef TIMEING | 1811 #ifdef TIMEING |
1543 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) | 1812 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) |
1547 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), | 1816 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), |
1548 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) | 1817 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) |
1549 , black, white); | 1818 , black, white); |
1550 #endif | 1819 #endif |
1551 } | 1820 } |
1821 | |
1822 |