libpostproc: comparison of postprocess_template.c @ 121:dd89aa84269b

HAVE_3DNOW --> HAVE_AMD3DNOW to sync with latest configure changes.
author diego
date Sun, 25 Jan 2009 19:57:52 +0000
parents 4a1602d552aa
children 1500ae6cf66c
comparing 120:e86e6ea21776 with 121:dd89aa84269b
31  #undef PMINUB
32  #undef PMAXUB
33
34  #if HAVE_MMX2
35  #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
- 36  #elif HAVE_3DNOW
+ 36  #elif HAVE_AMD3DNOW
37  #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
38  #endif
39  #define PAVGB(a,b) REAL_PAVGB(a,b)
40
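Both branches of the macro wrap the same primitive: a packed unsigned-byte average with upward rounding. pavgb (MMX2) and pavgusb (3DNow!) both compute (a + b + 1) >> 1 per byte lane, which is why one PAVGB() name can cover either instruction set. A scalar sketch of one lane:

#include <stdint.h>

/* Scalar equivalent of one PAVGB lane: average two unsigned bytes,
 * rounding up, exactly what pavgb (MMX2) and pavgusb (3DNow!) do
 * for each of the eight packed bytes. */
static inline uint8_t pavgb_scalar(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);
}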
41  #if HAVE_MMX2
177  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
178  */
179  #if !HAVE_ALTIVEC
180  static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
181  {
- 182  #if HAVE_MMX2 || HAVE_3DNOW
+ 182  #if HAVE_MMX2 || HAVE_AMD3DNOW
183  src+= stride*3;
184  __asm__ volatile( //"movv %0 %1 %2\n\t"
185  "movq %2, %%mm0 \n\t" // QP,..., QP
186  "pxor %%mm4, %%mm4 \n\t"
187
304
305  :
306  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
307  : "%"REG_a, "%"REG_c
308  );
- 309  #else //HAVE_MMX2 || HAVE_3DNOW
+ 309  #else //HAVE_MMX2 || HAVE_AMD3DNOW
310  const int l1= stride;
311  const int l2= stride + l1;
312  const int l3= stride + l2;
313  const int l4= stride + l3;
314  const int l5= stride + l4;
343  src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
344  src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
345
346  src++;
347  }
- 348  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 348  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
349  }
350  #endif //HAVE_ALTIVEC
351
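The C fallback of doVertLowPass (mostly elided above) realizes the documented 9-tap kernel (1,1,2,2,4,2,2,1,1)/16 per column; the surviving lines 343-344 show its running-sums form, (sums[i] + sums[i+2] + 2*src[l]) >> 4. A direct per-column sketch of the same kernel follows; the edge clamping and the +8 rounding here are assumptions (the real filter conditions edge samples on QP):

#include <stdint.h>

/* Hypothetical per-column sketch of the 9-tap (1,1,2,2,4,2,2,1,1)/16
 * vertical lowpass. Edge samples are clamped to the column ends and
 * +8 rounds the >>4; both are assumptions. Assumes height <= 16. */
static void vert_lowpass_column(uint8_t *col, int stride, int height)
{
    static const int tap[9] = {1, 1, 2, 2, 4, 2, 2, 1, 1};
    uint8_t in[16];
    int y, k;
    for (y = 0; y < height && y < 16; y++)
        in[y] = col[y*stride];          /* snapshot so in-place writes don't feed back */
    for (y = 0; y < height; y++) {
        int sum = 8;                    /* rounding term, assumed */
        for (k = 0; k < 9; k++) {
            int yy = y + k - 4;
            if (yy < 0) yy = 0;
            if (yy >= height) yy = height - 1;
            sum += tap[k] * in[yy];
        }
        col[y*stride] = (uint8_t)(sum >> 4);
    }
}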
352  #if 0
353  /**
362  x/8 = 1
363  1    12   12   23
364  */
365  static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
366  {
- 367  #if HAVE_MMX2 || HAVE_3DNOW
+ 367  #if HAVE_MMX2 || HAVE_AMD3DNOW
368  src+= stride*3;
369  // FIXME rounding
370  __asm__ volatile(
371  "pxor %%mm7, %%mm7 \n\t" // 0
372  "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
424
425  :
426  : "r" (src), "r" ((x86_reg)stride)
427  : "%"REG_a, "%"REG_c
428  );
- 429  #else //HAVE_MMX2 || HAVE_3DNOW
+ 429  #else //HAVE_MMX2 || HAVE_AMD3DNOW
430  const int l1= stride;
431  const int l2= stride + l1;
432  const int l3= stride + l2;
433  const int l4= stride + l3;
434  const int l5= stride + l4;
447  src[x+l5] -=v>>1;
448  src[x+l6] -=v>>3;
449  }
450  }
451
- 452  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 452  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
453  }
454  #endif //0
455
456  /**
457  * Experimental Filter 1
460  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
461  * MMX2 version does correct clipping C version does not
462  */
463  static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
464  {
- 465  #if HAVE_MMX2 || HAVE_3DNOW
+ 465  #if HAVE_MMX2 || HAVE_AMD3DNOW
466  src+= stride*3;
467
468  __asm__ volatile(
469  "pxor %%mm7, %%mm7 \n\t" // 0
470  "lea (%0, %1), %%"REG_a" \n\t"
546
547  :
548  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
549  : "%"REG_a, "%"REG_c
550  );
- 551  #else //HAVE_MMX2 || HAVE_3DNOW
+ 551  #else //HAVE_MMX2 || HAVE_AMD3DNOW
552
553  const int l1= stride;
554  const int l2= stride + l1;
555  const int l3= stride + l2;
556  const int l4= stride + l3;
580  src[l6] -=v>>2;
581  src[l7] -=v>>3;
582  }
583  src++;
584  }
- 585  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 585  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
586  }
587
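Everything this changeset touches follows one dispatch skeleton: an inline-asm path guarded by HAVE_MMX2 || HAVE_AMD3DNOW (the macro being renamed) and a portable C fallback in the #else branch, compiled once per RENAME() instantiation. Schematically, with someFilter as a placeholder name, not a function in this file:

static inline void RENAME(someFilter)(uint8_t *src, int stride, PPContext *c)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    __asm__ volatile(
        "" /* MMX2/3DNow! body; PAVGB() expands to pavgb or pavgusb */
        : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
        : "%"REG_a);
#else /* HAVE_MMX2 || HAVE_AMD3DNOW */
    /* plain C fallback with the same observable behaviour */
#endif
}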
588  #if !HAVE_ALTIVEC
589  static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
590  {
- 591  #if HAVE_MMX2 || HAVE_3DNOW
+ 591  #if HAVE_MMX2 || HAVE_AMD3DNOW
592  /*
593  uint8_t tmp[16];
594  const int l1= stride;
595  const int l2= stride + l1;
596  const int l3= stride + l2;
1099
1100  : "+r" (src)
1101  : "r" ((x86_reg)stride), "m" (c->pQPb)
1102  : "%"REG_a, "%"REG_c
1103  );
- 1104  #else //HAVE_MMX2 || HAVE_3DNOW
+ 1104  #else //HAVE_MMX2 || HAVE_AMD3DNOW
1105  const int l1= stride;
1106  const int l2= stride + l1;
1107  const int l3= stride + l2;
1108  const int l4= stride + l3;
1109  const int l5= stride + l4;
1137  src[l4]-= d;
1138  src[l5]+= d;
1139  }
1140  src++;
1141  }
- 1142  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 1142  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
1143  }
1144  #endif //HAVE_ALTIVEC
1145
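The surviving tail of the doVertDefFilter fallback (src[l4] -= d; src[l5] += d;) is the classic symmetric deblock step: one correction d pulls the two pixels straddling the 8x8 boundary toward each other. A hypothetical helper isolating that step; deriving and clipping d against QP happens in the elided code:

#include <stdint.h>

/* Hypothetical sketch of the final deblock step visible above. d is
 * assumed to be the already-clipped correction; lines 4 and 5 straddle
 * the horizontal 8x8 block boundary. */
static void adjust_boundary(uint8_t *src, int stride, int d)
{
    src[4*stride] -= d;  /* last line of the upper block  */
    src[5*stride] += d;  /* first line of the lower block */
}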
1146  #if !HAVE_ALTIVEC
1147  static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1148  {
- 1149  #if HAVE_MMX2 || HAVE_3DNOW
+ 1149  #if HAVE_MMX2 || HAVE_AMD3DNOW
1150  __asm__ volatile(
1151  "pxor %%mm6, %%mm6 \n\t"
1152  "pcmpeqb %%mm7, %%mm7 \n\t"
1153  "movq %2, %%mm0 \n\t"
1154  "punpcklbw %%mm6, %%mm0 \n\t"
1368
1369  "1: \n\t"
1370  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2)
1371  : "%"REG_a, "%"REG_d, "%"REG_c
1372  );
- 1373  #else //HAVE_MMX2 || HAVE_3DNOW
+ 1373  #else //HAVE_MMX2 || HAVE_AMD3DNOW
1374  int y;
1375  int min=255;
1376  int max=0;
1377  int avg;
1378  uint8_t *p;
1485  }
1486  }
1487  // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1488  }
1489  #endif
- 1490  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 1490  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
1491  }
1492  #endif //HAVE_ALTIVEC
1493
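The declarations that survive from the C dering path (min=255, max=0, avg, a row pointer p) point at its first stage: scan the block for its extremes and derive a midpoint used to classify pixels as ringing artifacts. A hypothetical sketch of that opening scan; the classification and the QP tests against c->pQPb/c->pQPb2 are elided:

#include <stdint.h>

/* Hypothetical sketch of the dering opening: find the block's min and
 * max, then a midpoint for classifying pixels. Rounding is assumed. */
static int block_midpoint(const uint8_t *src, int stride)
{
    int min = 255, max = 0;
    for (int y = 0; y < 8; y++) {
        const uint8_t *p = src + y*stride;
        for (int x = 0; x < 8; x++) {
            if (p[x] < min) min = p[x];
            if (p[x] > max) max = p[x];
        }
    }
    return (min + max + 1) >> 1;
}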
1494  /**
1495  * Deinterlaces the given block by linearly interpolating every second line.
1497  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1498  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1499  */
1500  static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1501  {
- 1502  #if HAVE_MMX2 || HAVE_3DNOW
+ 1502  #if HAVE_MMX2 || HAVE_AMD3DNOW
1503  src+= 4*stride;
1504  __asm__ volatile(
1505  "lea (%0, %1), %%"REG_a" \n\t"
1506  "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
1507  // 0 1 2 3 4 5 6 7 8 9
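The C fallback of this function falls outside the excerpt, but the doc comment pins down the operation: every second line becomes the average of its two neighbours. A minimal sketch; the round-up (+1, matching PAVGB) is an assumption:

#include <stdint.h>

/* Minimal sketch of the linear deinterlacer on an 8-pixel-wide block:
 * odd lines are rebuilt as the average of the even lines around them.
 * Rounding up matches PAVGB; the elided C fallback may differ. */
static void deinterlace_linear_sketch(uint8_t *src, int stride)
{
    for (int x = 0; x < 8; x++) {
        for (int y = 1; y < 8; y += 2)
            src[y*stride] = (uint8_t)((src[(y-1)*stride] + src[(y+1)*stride] + 1) >> 1);
        src++;
    }
}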
1550  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1551  * this filter will read lines 3-15 and write 7-13
1552  */
1553  static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1554  {
- 1555  #if HAVE_MMX2 || HAVE_3DNOW
+ 1555  #if HAVE_MMX2 || HAVE_AMD3DNOW
1556  src+= stride*3;
1557  __asm__ volatile(
1558  "lea (%0, %1), %%"REG_a" \n\t"
1559  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1560  "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
1592  DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))
1593
1594  : : "r" (src), "r" ((x86_reg)stride)
1595  : "%"REG_a, "%"REG_d, "%"REG_c
1596  );
- 1597  #else //HAVE_MMX2 || HAVE_3DNOW
+ 1597  #else //HAVE_MMX2 || HAVE_AMD3DNOW
1598  int x;
1599  src+= stride*3;
1600  for(x=0; x<8; x++){
1601  src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1602  src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1603  src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1604  src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1605  src++;
1606  }
- 1607  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 1607  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
1608  }
1609
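The (-1, 9, 9, -1)/16 taps in the fallback are the standard 4-point cubic half-pel interpolator. The weights sum to 16, so after the >>4 a flat area passes through unchanged, and CLIP only matters for overshoot near sharp edges. A quick check of the flat-input property:

#include <assert.h>

/* With four equal inputs p: (-p + 9p + 9p - p) >> 4 == 16p >> 4 == p. */
static void check_cubic_dc(void)
{
    for (int p = 0; p < 256; p++)
        assert(((-p + 9*p + 9*p - p) >> 4) == p);
}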
1610  /**
1611  * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1612  * will be called for every 8x8 block and can read & write from line 4-15
1614  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1615  * this filter will read lines 4-13 and write 5-11
1616  */
1617  static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1618  {
- 1619  #if HAVE_MMX2 || HAVE_3DNOW
+ 1619  #if HAVE_MMX2 || HAVE_AMD3DNOW
1620  src+= stride*4;
1621  __asm__ volatile(
1622  "lea (%0, %1), %%"REG_a" \n\t"
1623  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1624  "pxor %%mm7, %%mm7 \n\t"
1663
1664  "movq %%mm0, (%2) \n\t"
1665  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1666  : "%"REG_a, "%"REG_d
1667  );
- 1668  #else //HAVE_MMX2 || HAVE_3DNOW
+ 1668  #else //HAVE_MMX2 || HAVE_AMD3DNOW
1669  int x;
1670  src+= stride*4;
1671  for(x=0; x<8; x++){
1672  int t1= tmp[x];
1673  int t2= src[stride*1];
1681  src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1682  tmp[x]= t1;
1683
1684  src++;
1685  }
- 1686  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 1686  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
1687  }
1688
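Line 1681 shows the (-1 4 2 4 -1) tap order concretely: the sample being rebuilt contributes weight 2, its field neighbours weight 4, the outer samples are subtracted, and +4 rounds before the >>3. The taps sum to 8, so flat input survives; tmp carries the last processed line into the next block row so the filter chains across 8x8 blocks. Checking the normalization:

#include <assert.h>

/* (-1 4 2 4 -1)/8 with +4 rounding: constant input p maps to p,
 * since (8p + 4) >> 3 == p. */
static void check_ff_dc(void)
{
    for (int p = 0; p < 256; p++)
        assert(((-p + 4*p + 2*p + 4*p - p + 4) >> 3) == p);
}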
1689  /**
1690  * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1691  * will be called for every 8x8 block and can read & write from line 4-15
1693  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1694  * this filter will read lines 4-13 and write 4-11
1695  */
1696  static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1697  {
- 1698  #if HAVE_MMX2 || HAVE_3DNOW
+ 1698  #if HAVE_MMX2 || HAVE_AMD3DNOW
1699  src+= stride*4;
1700  __asm__ volatile(
1701  "lea (%0, %1), %%"REG_a" \n\t"
1702  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1703  "pxor %%mm7, %%mm7 \n\t"
1753  "movq %%mm0, (%2) \n\t"
1754  "movq %%mm1, (%3) \n\t"
1755  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1756  : "%"REG_a, "%"REG_d
1757  );
- 1758  #else //HAVE_MMX2 || HAVE_3DNOW
+ 1758  #else //HAVE_MMX2 || HAVE_AMD3DNOW
1759  int x;
1760  src+= stride*4;
1761  for(x=0; x<8; x++){
1762  int t1= tmp[x];
1763  int t2= tmp2[x];
1782  tmp[x]= t3;
1783  tmp2[x]= t1;
1784
1785  src++;
1786  }
- 1787  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 1787  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
1788  }
1789
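deInterlaceL5 needs two lines of history, hence the pair tmp/tmp2: the 5-tap (-1 2 6 2 -1) kernel reaches two rows above the current line, and those rows have already been overwritten in place (t1/t2 are restored at the top of the loop and saved again at the bottom). The taps again sum to 8; assuming the same +4 rounding as the FF filter:

#include <assert.h>

/* (-1 2 6 2 -1)/8: constant input is preserved (+4 rounding assumed,
 * analogous to the FF filter; the exact term is elided above). */
static void check_l5_dc(void)
{
    for (int p = 0; p < 256; p++)
        assert(((-p + 2*p + 6*p + 2*p - p + 4) >> 3) == p);
}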
1790  /**
1791  * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
1792  * will be called for every 8x8 block and can read & write from line 4-15
1794  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1795  * this filter will read lines 4-13 and write 4-11
1796  */
1797  static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1798  {
- 1799  #if HAVE_MMX2 || HAVE_3DNOW
+ 1799  #if HAVE_MMX2 || HAVE_AMD3DNOW
1800  src+= 4*stride;
1801  __asm__ volatile(
1802  "lea (%0, %1), %%"REG_a" \n\t"
1803  "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1804  // 0 1 2 3 4 5 6 7 8 9
1841  "movq %%mm1, (%2) \n\t"
1842
1843  : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1844  : "%"REG_a, "%"REG_d
1845  );
- 1846  #else //HAVE_MMX2 || HAVE_3DNOW
+ 1846  #else //HAVE_MMX2 || HAVE_AMD3DNOW
1847  int a, b, c, x;
1848  src+= 4*stride;
1849
1850  for(x=0; x<2; x++){
1851  a= *(uint32_t*)&tmp[stride*0];
1884
1885  *(uint32_t*)&tmp[stride*0]= c;
1886  src += 4;
1887  tmp += 4;
1888  }
- 1889  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 1889  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
1890  }
1891
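The fallback packs four pixels into a uint32_t and filters them in parallel with carry-masking tricks, which is why the outer loop runs x<2 for an 8-pixel-wide block; tmp again carries the line above across block rows. A scalar sketch of the (1 2 1)/4 vertical blend it implements; the +2 rounding is an assumption:

#include <stdint.h>

/* Scalar sketch of the (1 2 1)/4 vertical blend on one column.
 * 'above' stands in for the history carried by tmp; rounding (+2)
 * is assumed. The caller must keep col[lines*stride] readable. */
static void blend121_column(uint8_t *col, int stride, int above, int lines)
{
    int prev = above;
    for (int y = 0; y < lines; y++) {
        int cur  = col[y*stride];
        int next = col[(y+1)*stride];
        col[y*stride] = (uint8_t)((prev + 2*cur + next + 2) >> 2);
        prev = cur;  /* next line must see the unfiltered value */
    }
}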
1892  /**
1893  * Deinterlaces the given block by applying a median filter to every second line.
1894  * will be called for every 8x8 block and can read & write from line 4-15,
2189  tempBlurredPast[128]= maxNoise[1];
2190  tempBlurredPast[129]= maxNoise[2];
2191
2192  #define FAST_L2_DIFF
2193  //#define L1_DIFF //u should change the thresholds too if u try that one
- 2194  #if HAVE_MMX2 || HAVE_3DNOW
+ 2194  #if HAVE_MMX2 || HAVE_AMD3DNOW
2195  __asm__ volatile(
2196  "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride
2197  "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride
2198  "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2199  // 0 1 2 3 4 5 6 7 8 9
2477  "4: \n\t"
2478
2479  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2480  : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2481  );
- 2482  #else //HAVE_MMX2 || HAVE_3DNOW
+ 2482  #else //HAVE_MMX2 || HAVE_AMD3DNOW
2483  {
2484  int y;
2485  int d=0;
2486  // int sysd=0;
2487  int i;
2560  }
2561  }
2562  }
2563  }
2564  }
- 2565  #endif //HAVE_MMX2 || HAVE_3DNOW
+ 2565  #endif //HAVE_MMX2 || HAVE_AMD3DNOW
2566  }
2567  #endif //HAVE_ALTIVEC
2568
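The temporal noise reducer measures how far the current block drifted from its blurred history tempBlurred: FAST_L2_DIFF selects a squared-difference metric (L1_DIFF an absolute one, needing different thresholds, per the comment above), and the accumulated d is compared against the maxNoise values cached in tempBlurredPast to decide how strongly to blend the two blocks. A hypothetical sketch of the metric accumulation alone, assuming both buffers share one stride; the threshold and blend logic is elided:

#include <stdint.h>

/* Hypothetical sketch: L2 distance between the current 8x8 block and
 * its temporally blurred history. The comparison against the cached
 * maxNoise thresholds and the actual blending are elided. */
static int block_l2_diff(const uint8_t *src, const uint8_t *blurred, int stride)
{
    int d = 0;
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++) {
            const int diff = src[y*stride + x] - blurred[y*stride + x];
            d += diff*diff;
        }
    return d;
}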
2569  #if HAVE_MMX
2570  /**
3409  :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
3410  "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
3411  : "%"REG_a, "%"REG_d
3412  );
3413
- 3414  #elif HAVE_3DNOW
+ 3414  #elif HAVE_AMD3DNOW
3415  //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3416  /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3417  prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3418  prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3419  prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3545  :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
3546  "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
3547  : "%"REG_a, "%"REG_d
3548  );
3549
- 3550  #elif HAVE_3DNOW
+ 3550  #elif HAVE_AMD3DNOW
3551  //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3552  /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3553  prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3554  prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3555  prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3697  + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3698  + dstBlock[x +13*dstStride]
3699  + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3700  }*/
3701  }
- 3702  #if HAVE_3DNOW
+ 3702  #if HAVE_AMD3DNOW
3703  __asm__ volatile("femms");
3704  #elif HAVE_MMX
3705  __asm__ volatile("emms");
3706  #endif
3707