Mercurial > mplayer.hg
comparison postproc/postprocess.c @ 3093:fb4cee33d3c6
faster dering
author | michael |
---|---|
date | Sat, 24 Nov 2001 01:38:30 +0000 |
parents | 3fc9a8b9f178 |
children | 4150aff2ac17 |
comparison
equal
deleted
inserted
replaced
3092:c442c6565922 | 3093:fb4cee33d3c6 |
---|---|
45 c = checked against the other implementations (-vo md5) | 45 c = checked against the other implementations (-vo md5) |
46 */ | 46 */ |
47 | 47 |
48 /* | 48 /* |
49 TODO: | 49 TODO: |
50 verify that everything works as it should (how?) | |
51 reduce the time wasted on the mem transfer | 50 reduce the time wasted on the mem transfer |
52 implement everything in C at least (done at the moment but ...) | 51 implement everything in C at least (done at the moment but ...) |
53 unroll stuff if instructions depend too much on the prior one | 52 unroll stuff if instructions depend too much on the prior one |
54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | 53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? |
55 move YScale thing to the end instead of fixing QP | 54 move YScale thing to the end instead of fixing QP |
60 split this huge file | 59 split this huge file |
61 border remover | 60 border remover |
62 optimize c versions | 61 optimize c versions |
63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks | 62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks |
64 smart blur | 63 smart blur |
65 commandline option for the deblock thresholds | 64 commandline option for the deblock / dering thresholds |
65 memcpy chrominance if no chroma filtering is done | |
66 ... | 66 ... |
67 */ | 67 */ |
68 | 68 |
69 //Changelog: use the CVS log | 69 //Changelog: use the CVS log |
70 | 70 |
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code | 160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code |
161 #endif | 161 #endif |
162 | 162 |
163 int hFlatnessThreshold= 56 - 16; | 163 int hFlatnessThreshold= 56 - 16; |
164 int vFlatnessThreshold= 56 - 16; | 164 int vFlatnessThreshold= 56 - 16; |
165 int deringThreshold= 20; | |
165 | 166 |
166 //amount of "black" you are willing to lose to get a brightness corrected picture | 167 //amount of "black" you are willing to lose to get a brightness corrected picture |
167 double maxClippedThreshold= 0.01; | 168 double maxClippedThreshold= 0.01; |
168 | 169 |
169 int maxAllowedY=234; | 170 int maxAllowedY=234; |
308 "paddb %%mm7, %%mm2 \n\t" | 309 "paddb %%mm7, %%mm2 \n\t" |
309 "pcmpgtb %%mm6, %%mm2 \n\t" | 310 "pcmpgtb %%mm6, %%mm2 \n\t" |
310 "paddb %%mm2, %%mm0 \n\t" | 311 "paddb %%mm2, %%mm0 \n\t" |
311 | 312 |
312 " \n\t" | 313 " \n\t" |
314 #ifdef HAVE_MMX2 | |
315 "pxor %%mm7, %%mm7 \n\t" | |
316 "psadbw %%mm7, %%mm0 \n\t" | |
317 #else | |
313 "movq %%mm0, %%mm1 \n\t" | 318 "movq %%mm0, %%mm1 \n\t" |
314 "psrlw $8, %%mm0 \n\t" | 319 "psrlw $8, %%mm0 \n\t" |
315 "paddb %%mm1, %%mm0 \n\t" | 320 "paddb %%mm1, %%mm0 \n\t" |
316 #ifdef HAVE_MMX2 | |
317 "pshufw $0xF9, %%mm0, %%mm1 \n\t" | |
318 "paddb %%mm1, %%mm0 \n\t" | |
319 "pshufw $0xFE, %%mm0, %%mm1 \n\t" | |
320 #else | |
321 "movq %%mm0, %%mm1 \n\t" | 321 "movq %%mm0, %%mm1 \n\t" |
322 "psrlq $16, %%mm0 \n\t" | 322 "psrlq $16, %%mm0 \n\t" |
323 "paddb %%mm1, %%mm0 \n\t" | 323 "paddb %%mm1, %%mm0 \n\t" |
324 "movq %%mm0, %%mm1 \n\t" | 324 "movq %%mm0, %%mm1 \n\t" |
325 "psrlq $32, %%mm0 \n\t" | 325 "psrlq $32, %%mm0 \n\t" |
326 #endif | |
327 "paddb %%mm1, %%mm0 \n\t" | 326 "paddb %%mm1, %%mm0 \n\t" |
327 #endif | |
328 "movd %%mm0, %0 \n\t" | 328 "movd %%mm0, %0 \n\t" |
329 : "=r" (numEq) | 329 : "=r" (numEq) |
330 : "r" (src), "r" (stride) | 330 : "r" (src), "r" (stride) |
331 : "%eax", "%ebx" | 331 : "%ebx" |
332 ); | 332 ); |
333 | 333 numEq= (-numEq) &0xFF; |
334 numEq= (256 - numEq) &0xFF; | |
335 | 334 |
336 #else | 335 #else |
337 for(y=0; y<BLOCK_SIZE-1; y++) | 336 for(y=0; y<BLOCK_SIZE-1; y++) |
338 { | 337 { |
339 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; | 338 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; |
1589 "leal (%0, %1), %%eax \n\t" | 1588 "leal (%0, %1), %%eax \n\t" |
1590 "leal (%%eax, %1, 4), %%ebx \n\t" | 1589 "leal (%%eax, %1, 4), %%ebx \n\t" |
1591 // 0 1 2 3 4 5 6 7 8 9 | 1590 // 0 1 2 3 4 5 6 7 8 9 |
1592 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1591 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
1593 | 1592 |
1594 "pcmpeqb %%mm6, %%mm6 \n\t" | 1593 "pcmpeqb %%mm7, %%mm7 \n\t" |
1595 "pxor %%mm7, %%mm7 \n\t" | 1594 "pxor %%mm6, %%mm6 \n\t" |
1596 #ifdef HAVE_MMX2 | 1595 #ifdef HAVE_MMX2 |
1597 #define FIND_MIN_MAX(addr)\ | 1596 #define FIND_MIN_MAX(addr)\ |
1598 "movq " #addr ", %%mm0 \n\t"\ | 1597 "movq " #addr ", %%mm0 \n\t"\ |
1599 "pminub %%mm0, %%mm6 \n\t"\ | 1598 "pminub %%mm0, %%mm7 \n\t"\ |
1600 "pmaxub %%mm0, %%mm7 \n\t" | 1599 "pmaxub %%mm0, %%mm6 \n\t" |
1601 #else | 1600 #else |
1602 #define FIND_MIN_MAX(addr)\ | 1601 #define FIND_MIN_MAX(addr)\ |
1603 "movq " #addr ", %%mm0 \n\t"\ | 1602 "movq " #addr ", %%mm0 \n\t"\ |
1604 "movq %%mm6, %%mm1 \n\t"\ | 1603 "movq %%mm7, %%mm1 \n\t"\ |
1605 "psubusb %%mm0, %%mm7 \n\t"\ | 1604 "psubusb %%mm0, %%mm6 \n\t"\ |
1606 "paddb %%mm0, %%mm7 \n\t"\ | 1605 "paddb %%mm0, %%mm6 \n\t"\ |
1607 "psubusb %%mm0, %%mm1 \n\t"\ | 1606 "psubusb %%mm0, %%mm1 \n\t"\ |
1608 "psubb %%mm1, %%mm6 \n\t" | 1607 "psubb %%mm1, %%mm7 \n\t" |
1609 #endif | 1608 #endif |
1610 | 1609 |
1611 FIND_MIN_MAX((%%eax)) | 1610 FIND_MIN_MAX((%%eax)) |
1612 FIND_MIN_MAX((%%eax, %1)) | 1611 FIND_MIN_MAX((%%eax, %1)) |
1613 FIND_MIN_MAX((%%eax, %1, 2)) | 1612 FIND_MIN_MAX((%%eax, %1, 2)) |
1615 FIND_MIN_MAX((%%ebx)) | 1614 FIND_MIN_MAX((%%ebx)) |
1616 FIND_MIN_MAX((%%ebx, %1)) | 1615 FIND_MIN_MAX((%%ebx, %1)) |
1617 FIND_MIN_MAX((%%ebx, %1, 2)) | 1616 FIND_MIN_MAX((%%ebx, %1, 2)) |
1618 FIND_MIN_MAX((%0, %1, 8)) | 1617 FIND_MIN_MAX((%0, %1, 8)) |
1619 | 1618 |
1619 "movq %%mm7, %%mm4 \n\t" | |
1620 "psrlq $8, %%mm7 \n\t" | |
1621 #ifdef HAVE_MMX2 | |
1622 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
1623 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
1624 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
1625 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
1626 "pminub %%mm4, %%mm7 \n\t" | |
1627 #else | |
1628 "movq %%mm7, %%mm1 \n\t" | |
1629 "psubusb %%mm4, %%mm1 \n\t" | |
1630 "psubb %%mm1, %%mm7 \n\t" | |
1631 "movq %%mm7, %%mm4 \n\t" | |
1632 "psrlq $16, %%mm7 \n\t" | |
1633 "movq %%mm7, %%mm1 \n\t" | |
1634 "psubusb %%mm4, %%mm1 \n\t" | |
1635 "psubb %%mm1, %%mm7 \n\t" | |
1636 "movq %%mm7, %%mm4 \n\t" | |
1637 "psrlq $32, %%mm7 \n\t" | |
1638 "movq %%mm7, %%mm1 \n\t" | |
1639 "psubusb %%mm4, %%mm1 \n\t" | |
1640 "psubb %%mm1, %%mm7 \n\t" | |
1641 #endif | |
1642 | |
1643 | |
1620 "movq %%mm6, %%mm4 \n\t" | 1644 "movq %%mm6, %%mm4 \n\t" |
1621 "psrlq $8, %%mm6 \n\t" | 1645 "psrlq $8, %%mm6 \n\t" |
1622 #ifdef HAVE_MMX2 | 1646 #ifdef HAVE_MMX2 |
1623 "pminub %%mm4, %%mm6 \n\t" // min of pixels | 1647 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
1624 "pshufw $0xF9, %%mm6, %%mm4 \n\t" | 1648 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
1625 "pminub %%mm4, %%mm6 \n\t" // min of pixels | 1649 "pmaxub %%mm4, %%mm6 \n\t" |
1626 "pshufw $0xFE, %%mm6, %%mm4 \n\t" | 1650 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
1627 "pminub %%mm4, %%mm6 \n\t" | 1651 "pmaxub %%mm4, %%mm6 \n\t" |
1628 #else | 1652 #else |
1629 "movq %%mm6, %%mm1 \n\t" | 1653 "psubusb %%mm4, %%mm6 \n\t" |
1630 "psubusb %%mm4, %%mm1 \n\t" | 1654 "paddb %%mm4, %%mm6 \n\t" |
1631 "psubb %%mm1, %%mm6 \n\t" | |
1632 "movq %%mm6, %%mm4 \n\t" | 1655 "movq %%mm6, %%mm4 \n\t" |
1633 "psrlq $16, %%mm6 \n\t" | 1656 "psrlq $16, %%mm6 \n\t" |
1634 "movq %%mm6, %%mm1 \n\t" | 1657 "psubusb %%mm4, %%mm6 \n\t" |
1635 "psubusb %%mm4, %%mm1 \n\t" | 1658 "paddb %%mm4, %%mm6 \n\t" |
1636 "psubb %%mm1, %%mm6 \n\t" | |
1637 "movq %%mm6, %%mm4 \n\t" | 1659 "movq %%mm6, %%mm4 \n\t" |
1638 "psrlq $32, %%mm6 \n\t" | 1660 "psrlq $32, %%mm6 \n\t" |
1639 "movq %%mm6, %%mm1 \n\t" | 1661 "psubusb %%mm4, %%mm6 \n\t" |
1640 "psubusb %%mm4, %%mm1 \n\t" | 1662 "paddb %%mm4, %%mm6 \n\t" |
1641 "psubb %%mm1, %%mm6 \n\t" | 1663 #endif |
1642 #endif | 1664 "movq %%mm6, %%mm0 \n\t" // max |
1643 | 1665 "psubb %%mm7, %%mm6 \n\t" // max - min |
1644 | 1666 "movd %%mm6, %%ecx \n\t" |
1645 "movq %%mm7, %%mm4 \n\t" | 1667 "cmpb deringThreshold, %%cl \n\t" |
1646 "psrlq $8, %%mm7 \n\t" | 1668 " jb 1f \n\t" |
1647 #ifdef HAVE_MMX2 | 1669 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
1648 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels | |
1649 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
1650 "pmaxub %%mm4, %%mm7 \n\t" | |
1651 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
1652 "pmaxub %%mm4, %%mm7 \n\t" | |
1653 #else | |
1654 "psubusb %%mm4, %%mm7 \n\t" | |
1655 "paddb %%mm4, %%mm7 \n\t" | |
1656 "movq %%mm7, %%mm4 \n\t" | |
1657 "psrlq $16, %%mm7 \n\t" | |
1658 "psubusb %%mm4, %%mm7 \n\t" | |
1659 "paddb %%mm4, %%mm7 \n\t" | |
1660 "movq %%mm7, %%mm4 \n\t" | |
1661 "psrlq $32, %%mm7 \n\t" | |
1662 "psubusb %%mm4, %%mm7 \n\t" | |
1663 "paddb %%mm4, %%mm7 \n\t" | |
1664 #endif | |
1665 PAVGB(%%mm6, %%mm7) // a=(max + min)/2 | |
1666 "punpcklbw %%mm7, %%mm7 \n\t" | 1670 "punpcklbw %%mm7, %%mm7 \n\t" |
1667 "punpcklbw %%mm7, %%mm7 \n\t" | 1671 "punpcklbw %%mm7, %%mm7 \n\t" |
1668 "punpcklbw %%mm7, %%mm7 \n\t" | 1672 "punpcklbw %%mm7, %%mm7 \n\t" |
1669 "movq %%mm7, temp0 \n\t" | 1673 "movq %%mm7, temp0 \n\t" |
1670 | 1674 |
1783 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | 1787 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
1784 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | 1788 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
1785 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | 1789 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
1786 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | 1790 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
1787 | 1791 |
1788 | 1792 "1: \n\t" |
1789 : : "r" (src), "r" (stride), "r" (QP) | 1793 : : "r" (src), "r" (stride), "r" (QP) |
1790 : "%eax", "%ebx" | 1794 : "%eax", "%ebx", "%ecx" |
1791 ); | 1795 ); |
1792 #else | 1796 #else |
1793 int y; | 1797 int y; |
1794 int min=255; | 1798 int min=255; |
1795 int max=0; | 1799 int max=0; |
1807 if(*p > max) max= *p; | 1811 if(*p > max) max= *p; |
1808 if(*p < min) min= *p; | 1812 if(*p < min) min= *p; |
1809 } | 1813 } |
1810 } | 1814 } |
1811 avg= (min + max + 1)/2; | 1815 avg= (min + max + 1)/2; |
1816 | |
1817 if(max - min <deringThreshold) return; | |
1812 | 1818 |
1813 for(y=0; y<10; y++) | 1819 for(y=0; y<10; y++) |
1814 { | 1820 { |
1815 int x; | 1821 int x; |
1816 int t = 0; | 1822 int t = 0; |
1840 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | 1846 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) |
1841 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | 1847 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) |
1842 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | 1848 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); |
1843 f= (f + 8)>>4; | 1849 f= (f + 8)>>4; |
1844 | 1850 |
1851 #ifdef DEBUG_DERING_THRESHOLD | |
1852 asm volatile("emms\n\t":); | |
1853 { | |
1854 static long long numPixels=0; | |
1855 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
1856 // if((max-min)<20 || (max-min)*QP<200) | |
1857 // if((max-min)*QP < 500) | |
1858 // if(max-min<QP/2) | |
1859 if(max-min < 20) | |
1860 { | |
1861 static int numSkiped=0; | |
1862 static int errorSum=0; | |
1863 static int worstQP=0; | |
1864 static int worstRange=0; | |
1865 static int worstDiff=0; | |
1866 int diff= (f - *p); | |
1867 int absDiff= ABS(diff); | |
1868 int error= diff*diff; | |
1869 | |
1870 if(x==1 || x==8 || y==1 || y==8) continue; | |
1871 | |
1872 numSkiped++; | |
1873 if(absDiff > worstDiff) | |
1874 { | |
1875 worstDiff= absDiff; | |
1876 worstQP= QP; | |
1877 worstRange= max-min; | |
1878 } | |
1879 errorSum+= error; | |
1880 | |
1881 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
1882 { | |
1883 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
1884 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
1885 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
1886 worstDiff, (float)numSkiped/numPixels); | |
1887 } | |
1888 } | |
1889 } | |
1890 #endif | |
1845 if (*p + 2*QP < f) *p= *p + 2*QP; | 1891 if (*p + 2*QP < f) *p= *p + 2*QP; |
1846 else if(*p - 2*QP > f) *p= *p - 2*QP; | 1892 else if(*p - 2*QP > f) *p= *p - 2*QP; |
1847 else *p=f; | 1893 else *p=f; |
1848 } | 1894 } |
1849 } | 1895 } |
1850 } | 1896 } |
1851 | 1897 #ifdef DEBUG_DERING_THRESHOLD |
1898 if(max-min < 20) | |
1899 { | |
1900 for(y=1; y<9; y++) | |
1901 { | |
1902 int x; | |
1903 int t = 0; | |
1904 p= src + stride*y; | |
1905 for(x=1; x<9; x++) | |
1906 { | |
1907 p++; | |
1908 *p = MIN(*p + 20, 255); | |
1909 } | |
1910 } | |
1911 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
1912 } | |
1913 #endif | |
1852 #endif | 1914 #endif |
1853 } | 1915 } |
1854 | 1916 |
1855 /** | 1917 /** |
1856 * Deinterlaces the given block | 1918 * Deinterlaces the given block |