Mercurial > libpostproc.hg
comparison postprocess_template.c @ 121:dd89aa84269b libpostproc
HAVE_3DNOW --> HAVE_AMD3DNOW to sync with latest configure changes.
author | diego |
---|---|
date | Sun, 25 Jan 2009 19:57:52 +0000 |
parents | 4a1602d552aa |
children | 1500ae6cf66c |
comparison
equal
deleted
inserted
replaced
120:e86e6ea21776 | 121:dd89aa84269b |
---|---|
31 #undef PMINUB | 31 #undef PMINUB |
32 #undef PMAXUB | 32 #undef PMAXUB |
33 | 33 |
34 #if HAVE_MMX2 | 34 #if HAVE_MMX2 |
35 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | 35 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
36 #elif HAVE_3DNOW | 36 #elif HAVE_AMD3DNOW |
37 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | 37 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
38 #endif | 38 #endif |
39 #define PAVGB(a,b) REAL_PAVGB(a,b) | 39 #define PAVGB(a,b) REAL_PAVGB(a,b) |
40 | 40 |
41 #if HAVE_MMX2 | 41 #if HAVE_MMX2 |
177 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 | 177 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
178 */ | 178 */ |
179 #if !HAVE_ALTIVEC | 179 #if !HAVE_ALTIVEC |
180 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) | 180 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
181 { | 181 { |
182 #if HAVE_MMX2 || HAVE_3DNOW | 182 #if HAVE_MMX2 || HAVE_AMD3DNOW |
183 src+= stride*3; | 183 src+= stride*3; |
184 __asm__ volatile( //"movv %0 %1 %2\n\t" | 184 __asm__ volatile( //"movv %0 %1 %2\n\t" |
185 "movq %2, %%mm0 \n\t" // QP,..., QP | 185 "movq %2, %%mm0 \n\t" // QP,..., QP |
186 "pxor %%mm4, %%mm4 \n\t" | 186 "pxor %%mm4, %%mm4 \n\t" |
187 | 187 |
304 | 304 |
305 : | 305 : |
306 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) | 306 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) |
307 : "%"REG_a, "%"REG_c | 307 : "%"REG_a, "%"REG_c |
308 ); | 308 ); |
309 #else //HAVE_MMX2 || HAVE_3DNOW | 309 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
310 const int l1= stride; | 310 const int l1= stride; |
311 const int l2= stride + l1; | 311 const int l2= stride + l1; |
312 const int l3= stride + l2; | 312 const int l3= stride + l2; |
313 const int l4= stride + l3; | 313 const int l4= stride + l3; |
314 const int l5= stride + l4; | 314 const int l5= stride + l4; |
343 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; | 343 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; |
344 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; | 344 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; |
345 | 345 |
346 src++; | 346 src++; |
347 } | 347 } |
348 #endif //HAVE_MMX2 || HAVE_3DNOW | 348 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
349 } | 349 } |
350 #endif //HAVE_ALTIVEC | 350 #endif //HAVE_ALTIVEC |
351 | 351 |
352 #if 0 | 352 #if 0 |
353 /** | 353 /** |
362 x/8 = 1 | 362 x/8 = 1 |
363 1 12 12 23 | 363 1 12 12 23 |
364 */ | 364 */ |
365 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) | 365 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
366 { | 366 { |
367 #if HAVE_MMX2 || HAVE_3DNOW | 367 #if HAVE_MMX2 || HAVE_AMD3DNOW |
368 src+= stride*3; | 368 src+= stride*3; |
369 // FIXME rounding | 369 // FIXME rounding |
370 __asm__ volatile( | 370 __asm__ volatile( |
371 "pxor %%mm7, %%mm7 \n\t" // 0 | 371 "pxor %%mm7, %%mm7 \n\t" // 0 |
372 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE | 372 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
424 | 424 |
425 : | 425 : |
426 : "r" (src), "r" ((x86_reg)stride) | 426 : "r" (src), "r" ((x86_reg)stride) |
427 : "%"REG_a, "%"REG_c | 427 : "%"REG_a, "%"REG_c |
428 ); | 428 ); |
429 #else //HAVE_MMX2 || HAVE_3DNOW | 429 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
430 const int l1= stride; | 430 const int l1= stride; |
431 const int l2= stride + l1; | 431 const int l2= stride + l1; |
432 const int l3= stride + l2; | 432 const int l3= stride + l2; |
433 const int l4= stride + l3; | 433 const int l4= stride + l3; |
434 const int l5= stride + l4; | 434 const int l5= stride + l4; |
447 src[x+l5] -=v>>1; | 447 src[x+l5] -=v>>1; |
448 src[x+l6] -=v>>3; | 448 src[x+l6] -=v>>3; |
449 } | 449 } |
450 } | 450 } |
451 | 451 |
452 #endif //HAVE_MMX2 || HAVE_3DNOW | 452 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
453 } | 453 } |
454 #endif //0 | 454 #endif //0 |
455 | 455 |
456 /** | 456 /** |
457 * Experimental Filter 1 | 457 * Experimental Filter 1 |
460 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) | 460 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) |
461 * MMX2 version does correct clipping; C version does not | 461 * MMX2 version does correct clipping; C version does not |
462 */ | 462 */ |
463 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) | 463 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
464 { | 464 { |
465 #if HAVE_MMX2 || HAVE_3DNOW | 465 #if HAVE_MMX2 || HAVE_AMD3DNOW |
466 src+= stride*3; | 466 src+= stride*3; |
467 | 467 |
468 __asm__ volatile( | 468 __asm__ volatile( |
469 "pxor %%mm7, %%mm7 \n\t" // 0 | 469 "pxor %%mm7, %%mm7 \n\t" // 0 |
470 "lea (%0, %1), %%"REG_a" \n\t" | 470 "lea (%0, %1), %%"REG_a" \n\t" |
546 | 546 |
547 : | 547 : |
548 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) | 548 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) |
549 : "%"REG_a, "%"REG_c | 549 : "%"REG_a, "%"REG_c |
550 ); | 550 ); |
551 #else //HAVE_MMX2 || HAVE_3DNOW | 551 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
552 | 552 |
553 const int l1= stride; | 553 const int l1= stride; |
554 const int l2= stride + l1; | 554 const int l2= stride + l1; |
555 const int l3= stride + l2; | 555 const int l3= stride + l2; |
556 const int l4= stride + l3; | 556 const int l4= stride + l3; |
580 src[l6] -=v>>2; | 580 src[l6] -=v>>2; |
581 src[l7] -=v>>3; | 581 src[l7] -=v>>3; |
582 } | 582 } |
583 src++; | 583 src++; |
584 } | 584 } |
585 #endif //HAVE_MMX2 || HAVE_3DNOW | 585 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
586 } | 586 } |
587 | 587 |
588 #if !HAVE_ALTIVEC | 588 #if !HAVE_ALTIVEC |
589 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) | 589 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
590 { | 590 { |
591 #if HAVE_MMX2 || HAVE_3DNOW | 591 #if HAVE_MMX2 || HAVE_AMD3DNOW |
592 /* | 592 /* |
593 uint8_t tmp[16]; | 593 uint8_t tmp[16]; |
594 const int l1= stride; | 594 const int l1= stride; |
595 const int l2= stride + l1; | 595 const int l2= stride + l1; |
596 const int l3= stride + l2; | 596 const int l3= stride + l2; |
1099 | 1099 |
1100 : "+r" (src) | 1100 : "+r" (src) |
1101 : "r" ((x86_reg)stride), "m" (c->pQPb) | 1101 : "r" ((x86_reg)stride), "m" (c->pQPb) |
1102 : "%"REG_a, "%"REG_c | 1102 : "%"REG_a, "%"REG_c |
1103 ); | 1103 ); |
1104 #else //HAVE_MMX2 || HAVE_3DNOW | 1104 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
1105 const int l1= stride; | 1105 const int l1= stride; |
1106 const int l2= stride + l1; | 1106 const int l2= stride + l1; |
1107 const int l3= stride + l2; | 1107 const int l3= stride + l2; |
1108 const int l4= stride + l3; | 1108 const int l4= stride + l3; |
1109 const int l5= stride + l4; | 1109 const int l5= stride + l4; |
1137 src[l4]-= d; | 1137 src[l4]-= d; |
1138 src[l5]+= d; | 1138 src[l5]+= d; |
1139 } | 1139 } |
1140 src++; | 1140 src++; |
1141 } | 1141 } |
1142 #endif //HAVE_MMX2 || HAVE_3DNOW | 1142 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
1143 } | 1143 } |
1144 #endif //HAVE_ALTIVEC | 1144 #endif //HAVE_ALTIVEC |
1145 | 1145 |
1146 #if !HAVE_ALTIVEC | 1146 #if !HAVE_ALTIVEC |
1147 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) | 1147 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
1148 { | 1148 { |
1149 #if HAVE_MMX2 || HAVE_3DNOW | 1149 #if HAVE_MMX2 || HAVE_AMD3DNOW |
1150 __asm__ volatile( | 1150 __asm__ volatile( |
1151 "pxor %%mm6, %%mm6 \n\t" | 1151 "pxor %%mm6, %%mm6 \n\t" |
1152 "pcmpeqb %%mm7, %%mm7 \n\t" | 1152 "pcmpeqb %%mm7, %%mm7 \n\t" |
1153 "movq %2, %%mm0 \n\t" | 1153 "movq %2, %%mm0 \n\t" |
1154 "punpcklbw %%mm6, %%mm0 \n\t" | 1154 "punpcklbw %%mm6, %%mm0 \n\t" |
1368 | 1368 |
1369 "1: \n\t" | 1369 "1: \n\t" |
1370 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2) | 1370 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2) |
1371 : "%"REG_a, "%"REG_d, "%"REG_c | 1371 : "%"REG_a, "%"REG_d, "%"REG_c |
1372 ); | 1372 ); |
1373 #else //HAVE_MMX2 || HAVE_3DNOW | 1373 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
1374 int y; | 1374 int y; |
1375 int min=255; | 1375 int min=255; |
1376 int max=0; | 1376 int max=0; |
1377 int avg; | 1377 int avg; |
1378 uint8_t *p; | 1378 uint8_t *p; |
1485 } | 1485 } |
1486 } | 1486 } |
1487 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | 1487 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; |
1488 } | 1488 } |
1489 #endif | 1489 #endif |
1490 #endif //HAVE_MMX2 || HAVE_3DNOW | 1490 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
1491 } | 1491 } |
1492 #endif //HAVE_ALTIVEC | 1492 #endif //HAVE_ALTIVEC |
1493 | 1493 |
1494 /** | 1494 /** |
1495 * Deinterlaces the given block by linearly interpolating every second line. | 1495 * Deinterlaces the given block by linearly interpolating every second line. |
1497 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. | 1497 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
1498 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1498 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
1499 */ | 1499 */ |
1500 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) | 1500 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
1501 { | 1501 { |
1502 #if HAVE_MMX2 || HAVE_3DNOW | 1502 #if HAVE_MMX2 || HAVE_AMD3DNOW |
1503 src+= 4*stride; | 1503 src+= 4*stride; |
1504 __asm__ volatile( | 1504 __asm__ volatile( |
1505 "lea (%0, %1), %%"REG_a" \n\t" | 1505 "lea (%0, %1), %%"REG_a" \n\t" |
1506 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" | 1506 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
1507 // 0 1 2 3 4 5 6 7 8 9 | 1507 // 0 1 2 3 4 5 6 7 8 9 |
1550 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1550 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
1551 * this filter will read lines 3-15 and write 7-13 | 1551 * this filter will read lines 3-15 and write 7-13 |
1552 */ | 1552 */ |
1553 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) | 1553 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
1554 { | 1554 { |
1555 #if HAVE_MMX2 || HAVE_3DNOW | 1555 #if HAVE_MMX2 || HAVE_AMD3DNOW |
1556 src+= stride*3; | 1556 src+= stride*3; |
1557 __asm__ volatile( | 1557 __asm__ volatile( |
1558 "lea (%0, %1), %%"REG_a" \n\t" | 1558 "lea (%0, %1), %%"REG_a" \n\t" |
1559 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1559 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
1560 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" | 1560 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" |
1592 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) | 1592 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) |
1593 | 1593 |
1594 : : "r" (src), "r" ((x86_reg)stride) | 1594 : : "r" (src), "r" ((x86_reg)stride) |
1595 : "%"REG_a, "%"REG_d, "%"REG_c | 1595 : "%"REG_a, "%"REG_d, "%"REG_c |
1596 ); | 1596 ); |
1597 #else //HAVE_MMX2 || HAVE_3DNOW | 1597 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
1598 int x; | 1598 int x; |
1599 src+= stride*3; | 1599 src+= stride*3; |
1600 for(x=0; x<8; x++){ | 1600 for(x=0; x<8; x++){ |
1601 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); | 1601 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); |
1602 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); | 1602 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); |
1603 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); | 1603 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); |
1604 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); | 1604 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); |
1605 src++; | 1605 src++; |
1606 } | 1606 } |
1607 #endif //HAVE_MMX2 || HAVE_3DNOW | 1607 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
1608 } | 1608 } |
1609 | 1609 |
1610 /** | 1610 /** |
1611 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. | 1611 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. |
1612 * will be called for every 8x8 block and can read & write from line 4-15 | 1612 * will be called for every 8x8 block and can read & write from line 4-15 |
1614 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1614 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
1615 * this filter will read lines 4-13 and write 5-11 | 1615 * this filter will read lines 4-13 and write 5-11 |
1616 */ | 1616 */ |
1617 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | 1617 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) |
1618 { | 1618 { |
1619 #if HAVE_MMX2 || HAVE_3DNOW | 1619 #if HAVE_MMX2 || HAVE_AMD3DNOW |
1620 src+= stride*4; | 1620 src+= stride*4; |
1621 __asm__ volatile( | 1621 __asm__ volatile( |
1622 "lea (%0, %1), %%"REG_a" \n\t" | 1622 "lea (%0, %1), %%"REG_a" \n\t" |
1623 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1623 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
1624 "pxor %%mm7, %%mm7 \n\t" | 1624 "pxor %%mm7, %%mm7 \n\t" |
1663 | 1663 |
1664 "movq %%mm0, (%2) \n\t" | 1664 "movq %%mm0, (%2) \n\t" |
1665 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) | 1665 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) |
1666 : "%"REG_a, "%"REG_d | 1666 : "%"REG_a, "%"REG_d |
1667 ); | 1667 ); |
1668 #else //HAVE_MMX2 || HAVE_3DNOW | 1668 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
1669 int x; | 1669 int x; |
1670 src+= stride*4; | 1670 src+= stride*4; |
1671 for(x=0; x<8; x++){ | 1671 for(x=0; x<8; x++){ |
1672 int t1= tmp[x]; | 1672 int t1= tmp[x]; |
1673 int t2= src[stride*1]; | 1673 int t2= src[stride*1]; |
1681 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); | 1681 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); |
1682 tmp[x]= t1; | 1682 tmp[x]= t1; |
1683 | 1683 |
1684 src++; | 1684 src++; |
1685 } | 1685 } |
1686 #endif //HAVE_MMX2 || HAVE_3DNOW | 1686 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
1687 } | 1687 } |
1688 | 1688 |
1689 /** | 1689 /** |
1690 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. | 1690 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. |
1691 * will be called for every 8x8 block and can read & write from line 4-15 | 1691 * will be called for every 8x8 block and can read & write from line 4-15 |
1693 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1693 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
1694 * this filter will read lines 4-13 and write 4-11 | 1694 * this filter will read lines 4-13 and write 4-11 |
1695 */ | 1695 */ |
1696 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) | 1696 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) |
1697 { | 1697 { |
1698 #if HAVE_MMX2 || HAVE_3DNOW | 1698 #if HAVE_MMX2 || HAVE_AMD3DNOW |
1699 src+= stride*4; | 1699 src+= stride*4; |
1700 __asm__ volatile( | 1700 __asm__ volatile( |
1701 "lea (%0, %1), %%"REG_a" \n\t" | 1701 "lea (%0, %1), %%"REG_a" \n\t" |
1702 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1702 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
1703 "pxor %%mm7, %%mm7 \n\t" | 1703 "pxor %%mm7, %%mm7 \n\t" |
1753 "movq %%mm0, (%2) \n\t" | 1753 "movq %%mm0, (%2) \n\t" |
1754 "movq %%mm1, (%3) \n\t" | 1754 "movq %%mm1, (%3) \n\t" |
1755 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) | 1755 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) |
1756 : "%"REG_a, "%"REG_d | 1756 : "%"REG_a, "%"REG_d |
1757 ); | 1757 ); |
1758 #else //HAVE_MMX2 || HAVE_3DNOW | 1758 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
1759 int x; | 1759 int x; |
1760 src+= stride*4; | 1760 src+= stride*4; |
1761 for(x=0; x<8; x++){ | 1761 for(x=0; x<8; x++){ |
1762 int t1= tmp[x]; | 1762 int t1= tmp[x]; |
1763 int t2= tmp2[x]; | 1763 int t2= tmp2[x]; |
1782 tmp[x]= t3; | 1782 tmp[x]= t3; |
1783 tmp2[x]= t1; | 1783 tmp2[x]= t1; |
1784 | 1784 |
1785 src++; | 1785 src++; |
1786 } | 1786 } |
1787 #endif //HAVE_MMX2 || HAVE_3DNOW | 1787 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
1788 } | 1788 } |
1789 | 1789 |
1790 /** | 1790 /** |
1791 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. | 1791 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. |
1792 * will be called for every 8x8 block and can read & write from line 4-15 | 1792 * will be called for every 8x8 block and can read & write from line 4-15 |
1794 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1794 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
1795 * this filter will read lines 4-13 and write 4-11 | 1795 * this filter will read lines 4-13 and write 4-11 |
1796 */ | 1796 */ |
1797 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) | 1797 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) |
1798 { | 1798 { |
1799 #if HAVE_MMX2 || HAVE_3DNOW | 1799 #if HAVE_MMX2 || HAVE_AMD3DNOW |
1800 src+= 4*stride; | 1800 src+= 4*stride; |
1801 __asm__ volatile( | 1801 __asm__ volatile( |
1802 "lea (%0, %1), %%"REG_a" \n\t" | 1802 "lea (%0, %1), %%"REG_a" \n\t" |
1803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
1804 // 0 1 2 3 4 5 6 7 8 9 | 1804 // 0 1 2 3 4 5 6 7 8 9 |
1841 "movq %%mm1, (%2) \n\t" | 1841 "movq %%mm1, (%2) \n\t" |
1842 | 1842 |
1843 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) | 1843 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) |
1844 : "%"REG_a, "%"REG_d | 1844 : "%"REG_a, "%"REG_d |
1845 ); | 1845 ); |
1846 #else //HAVE_MMX2 || HAVE_3DNOW | 1846 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
1847 int a, b, c, x; | 1847 int a, b, c, x; |
1848 src+= 4*stride; | 1848 src+= 4*stride; |
1849 | 1849 |
1850 for(x=0; x<2; x++){ | 1850 for(x=0; x<2; x++){ |
1851 a= *(uint32_t*)&tmp[stride*0]; | 1851 a= *(uint32_t*)&tmp[stride*0]; |
1884 | 1884 |
1885 *(uint32_t*)&tmp[stride*0]= c; | 1885 *(uint32_t*)&tmp[stride*0]= c; |
1886 src += 4; | 1886 src += 4; |
1887 tmp += 4; | 1887 tmp += 4; |
1888 } | 1888 } |
1889 #endif //HAVE_MMX2 || HAVE_3DNOW | 1889 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
1890 } | 1890 } |
1891 | 1891 |
1892 /** | 1892 /** |
1893 * Deinterlaces the given block by applying a median filter to every second line. | 1893 * Deinterlaces the given block by applying a median filter to every second line. |
1894 * will be called for every 8x8 block and can read & write from line 4-15, | 1894 * will be called for every 8x8 block and can read & write from line 4-15, |
2189 tempBlurredPast[128]= maxNoise[1]; | 2189 tempBlurredPast[128]= maxNoise[1]; |
2190 tempBlurredPast[129]= maxNoise[2]; | 2190 tempBlurredPast[129]= maxNoise[2]; |
2191 | 2191 |
2192 #define FAST_L2_DIFF | 2192 #define FAST_L2_DIFF |
2193 //#define L1_DIFF //you should change the thresholds too if you try that one | 2193 //#define L1_DIFF //you should change the thresholds too if you try that one |
2194 #if HAVE_MMX2 || HAVE_3DNOW | 2194 #if HAVE_MMX2 || HAVE_AMD3DNOW |
2195 __asm__ volatile( | 2195 __asm__ volatile( |
2196 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride | 2196 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride |
2197 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride | 2197 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride |
2198 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride | 2198 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
2199 // 0 1 2 3 4 5 6 7 8 9 | 2199 // 0 1 2 3 4 5 6 7 8 9 |
2477 "4: \n\t" | 2477 "4: \n\t" |
2478 | 2478 |
2479 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast) | 2479 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast) |
2480 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" | 2480 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" |
2481 ); | 2481 ); |
2482 #else //HAVE_MMX2 || HAVE_3DNOW | 2482 #else //HAVE_MMX2 || HAVE_AMD3DNOW |
2483 { | 2483 { |
2484 int y; | 2484 int y; |
2485 int d=0; | 2485 int d=0; |
2486 // int sysd=0; | 2486 // int sysd=0; |
2487 int i; | 2487 int i; |
2560 } | 2560 } |
2561 } | 2561 } |
2562 } | 2562 } |
2563 } | 2563 } |
2564 } | 2564 } |
2565 #endif //HAVE_MMX2 || HAVE_3DNOW | 2565 #endif //HAVE_MMX2 || HAVE_AMD3DNOW |
2566 } | 2566 } |
2567 #endif //HAVE_ALTIVEC | 2567 #endif //HAVE_ALTIVEC |
2568 | 2568 |
2569 #if HAVE_MMX | 2569 #if HAVE_MMX |
2570 /** | 2570 /** |
3409 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), | 3409 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), |
3410 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) | 3410 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) |
3411 : "%"REG_a, "%"REG_d | 3411 : "%"REG_a, "%"REG_d |
3412 ); | 3412 ); |
3413 | 3413 |
3414 #elif HAVE_3DNOW | 3414 #elif HAVE_AMD3DNOW |
3415 //FIXME check if this is faster on a 3dnow chip or if it is faster without the prefetch or ... | 3415 //FIXME check if this is faster on a 3dnow chip or if it is faster without the prefetch or ... |
3416 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | 3416 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
3417 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | 3417 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
3418 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 3418 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
3419 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 3419 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
3545 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), | 3545 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), |
3546 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) | 3546 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) |
3547 : "%"REG_a, "%"REG_d | 3547 : "%"REG_a, "%"REG_d |
3548 ); | 3548 ); |
3549 | 3549 |
3550 #elif HAVE_3DNOW | 3550 #elif HAVE_AMD3DNOW |
3551 //FIXME check if this is faster on a 3dnow chip or if it is faster without the prefetch or ... | 3551 //FIXME check if this is faster on a 3dnow chip or if it is faster without the prefetch or ... |
3552 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | 3552 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
3553 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | 3553 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
3554 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 3554 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
3555 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 3555 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
3697 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; | 3697 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
3698 + dstBlock[x +13*dstStride] | 3698 + dstBlock[x +13*dstStride] |
3699 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | 3699 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; |
3700 }*/ | 3700 }*/ |
3701 } | 3701 } |
3702 #if HAVE_3DNOW | 3702 #if HAVE_AMD3DNOW |
3703 __asm__ volatile("femms"); | 3703 __asm__ volatile("femms"); |
3704 #elif HAVE_MMX | 3704 #elif HAVE_MMX |
3705 __asm__ volatile("emms"); | 3705 __asm__ volatile("emms"); |
3706 #endif | 3706 #endif |
3707 | 3707 |