comparison postproc/postprocess.c @ 3093:fb4cee33d3c6

faster dering
author michael
date Sat, 24 Nov 2001 01:38:30 +0000
parents 3fc9a8b9f178
children 4150aff2ac17
comparison
equal deleted inserted replaced
3092:c442c6565922 3093:fb4cee33d3c6
45 c = checked against the other implementations (-vo md5) 45 c = checked against the other implementations (-vo md5)
46 */ 46 */
47 47
48 /* 48 /*
49 TODO: 49 TODO:
50 verify that everything works as it should (how?)
51 reduce the time wasted on the mem transfer 50 reduce the time wasted on the mem transfer
52 implement everything in C at least (done at the moment but ...) 51 implement everything in C at least (done at the moment but ...)
53 unroll stuff if instructions depend too much on the prior one 52 unroll stuff if instructions depend too much on the prior one
54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? 53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
55 move YScale thing to the end instead of fixing QP 54 move YScale thing to the end instead of fixing QP
60 split this huge file 59 split this huge file
61 border remover 60 border remover
62 optimize c versions 61 optimize c versions
63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks 62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
64 smart blur 63 smart blur
65 commandline option for the deblock thresholds 64 commandline option for the deblock / dering thresholds
65 memcpy chrominance if no chroma filtering is done
66 ... 66 ...
67 */ 67 */
68 68
69 //Changelog: use the CVS log 69 //Changelog: use the CVS log
70 70
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code 160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
161 #endif 161 #endif
162 162
163 int hFlatnessThreshold= 56 - 16; 163 int hFlatnessThreshold= 56 - 16;
164 int vFlatnessThreshold= 56 - 16; 164 int vFlatnessThreshold= 56 - 16;
165 int deringThreshold= 20;
165 166
166 //amount of "black" you are willing to lose to get a brightness corrected picture 167 //amount of "black" you are willing to lose to get a brightness corrected picture
167 double maxClippedThreshold= 0.01; 168 double maxClippedThreshold= 0.01;
168 169
169 int maxAllowedY=234; 170 int maxAllowedY=234;
308 "paddb %%mm7, %%mm2 \n\t" 309 "paddb %%mm7, %%mm2 \n\t"
309 "pcmpgtb %%mm6, %%mm2 \n\t" 310 "pcmpgtb %%mm6, %%mm2 \n\t"
310 "paddb %%mm2, %%mm0 \n\t" 311 "paddb %%mm2, %%mm0 \n\t"
311 312
312 " \n\t" 313 " \n\t"
314 #ifdef HAVE_MMX2
315 "pxor %%mm7, %%mm7 \n\t"
316 "psadbw %%mm7, %%mm0 \n\t"
317 #else
313 "movq %%mm0, %%mm1 \n\t" 318 "movq %%mm0, %%mm1 \n\t"
314 "psrlw $8, %%mm0 \n\t" 319 "psrlw $8, %%mm0 \n\t"
315 "paddb %%mm1, %%mm0 \n\t" 320 "paddb %%mm1, %%mm0 \n\t"
316 #ifdef HAVE_MMX2
317 "pshufw $0xF9, %%mm0, %%mm1 \n\t"
318 "paddb %%mm1, %%mm0 \n\t"
319 "pshufw $0xFE, %%mm0, %%mm1 \n\t"
320 #else
321 "movq %%mm0, %%mm1 \n\t" 321 "movq %%mm0, %%mm1 \n\t"
322 "psrlq $16, %%mm0 \n\t" 322 "psrlq $16, %%mm0 \n\t"
323 "paddb %%mm1, %%mm0 \n\t" 323 "paddb %%mm1, %%mm0 \n\t"
324 "movq %%mm0, %%mm1 \n\t" 324 "movq %%mm0, %%mm1 \n\t"
325 "psrlq $32, %%mm0 \n\t" 325 "psrlq $32, %%mm0 \n\t"
326 #endif
327 "paddb %%mm1, %%mm0 \n\t" 326 "paddb %%mm1, %%mm0 \n\t"
327 #endif
328 "movd %%mm0, %0 \n\t" 328 "movd %%mm0, %0 \n\t"
329 : "=r" (numEq) 329 : "=r" (numEq)
330 : "r" (src), "r" (stride) 330 : "r" (src), "r" (stride)
331 : "%eax", "%ebx" 331 : "%ebx"
332 ); 332 );
333 333 numEq= (-numEq) &0xFF;
334 numEq= (256 - numEq) &0xFF;
335 334
336 #else 335 #else
337 for(y=0; y<BLOCK_SIZE-1; y++) 336 for(y=0; y<BLOCK_SIZE-1; y++)
338 { 337 {
339 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; 338 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
1589 "leal (%0, %1), %%eax \n\t" 1588 "leal (%0, %1), %%eax \n\t"
1590 "leal (%%eax, %1, 4), %%ebx \n\t" 1589 "leal (%%eax, %1, 4), %%ebx \n\t"
1591 // 0 1 2 3 4 5 6 7 8 9 1590 // 0 1 2 3 4 5 6 7 8 9
1592 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1591 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1593 1592
1594 "pcmpeqb %%mm6, %%mm6 \n\t" 1593 "pcmpeqb %%mm7, %%mm7 \n\t"
1595 "pxor %%mm7, %%mm7 \n\t" 1594 "pxor %%mm6, %%mm6 \n\t"
1596 #ifdef HAVE_MMX2 1595 #ifdef HAVE_MMX2
1597 #define FIND_MIN_MAX(addr)\ 1596 #define FIND_MIN_MAX(addr)\
1598 "movq " #addr ", %%mm0 \n\t"\ 1597 "movq " #addr ", %%mm0 \n\t"\
1599 "pminub %%mm0, %%mm6 \n\t"\ 1598 "pminub %%mm0, %%mm7 \n\t"\
1600 "pmaxub %%mm0, %%mm7 \n\t" 1599 "pmaxub %%mm0, %%mm6 \n\t"
1601 #else 1600 #else
1602 #define FIND_MIN_MAX(addr)\ 1601 #define FIND_MIN_MAX(addr)\
1603 "movq " #addr ", %%mm0 \n\t"\ 1602 "movq " #addr ", %%mm0 \n\t"\
1604 "movq %%mm6, %%mm1 \n\t"\ 1603 "movq %%mm7, %%mm1 \n\t"\
1605 "psubusb %%mm0, %%mm7 \n\t"\ 1604 "psubusb %%mm0, %%mm6 \n\t"\
1606 "paddb %%mm0, %%mm7 \n\t"\ 1605 "paddb %%mm0, %%mm6 \n\t"\
1607 "psubusb %%mm0, %%mm1 \n\t"\ 1606 "psubusb %%mm0, %%mm1 \n\t"\
1608 "psubb %%mm1, %%mm6 \n\t" 1607 "psubb %%mm1, %%mm7 \n\t"
1609 #endif 1608 #endif
1610 1609
1611 FIND_MIN_MAX((%%eax)) 1610 FIND_MIN_MAX((%%eax))
1612 FIND_MIN_MAX((%%eax, %1)) 1611 FIND_MIN_MAX((%%eax, %1))
1613 FIND_MIN_MAX((%%eax, %1, 2)) 1612 FIND_MIN_MAX((%%eax, %1, 2))
1615 FIND_MIN_MAX((%%ebx)) 1614 FIND_MIN_MAX((%%ebx))
1616 FIND_MIN_MAX((%%ebx, %1)) 1615 FIND_MIN_MAX((%%ebx, %1))
1617 FIND_MIN_MAX((%%ebx, %1, 2)) 1616 FIND_MIN_MAX((%%ebx, %1, 2))
1618 FIND_MIN_MAX((%0, %1, 8)) 1617 FIND_MIN_MAX((%0, %1, 8))
1619 1618
1619 "movq %%mm7, %%mm4 \n\t"
1620 "psrlq $8, %%mm7 \n\t"
1621 #ifdef HAVE_MMX2
1622 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1623 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1624 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1625 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1626 "pminub %%mm4, %%mm7 \n\t"
1627 #else
1628 "movq %%mm7, %%mm1 \n\t"
1629 "psubusb %%mm4, %%mm1 \n\t"
1630 "psubb %%mm1, %%mm7 \n\t"
1631 "movq %%mm7, %%mm4 \n\t"
1632 "psrlq $16, %%mm7 \n\t"
1633 "movq %%mm7, %%mm1 \n\t"
1634 "psubusb %%mm4, %%mm1 \n\t"
1635 "psubb %%mm1, %%mm7 \n\t"
1636 "movq %%mm7, %%mm4 \n\t"
1637 "psrlq $32, %%mm7 \n\t"
1638 "movq %%mm7, %%mm1 \n\t"
1639 "psubusb %%mm4, %%mm1 \n\t"
1640 "psubb %%mm1, %%mm7 \n\t"
1641 #endif
1642
1643
1620 "movq %%mm6, %%mm4 \n\t" 1644 "movq %%mm6, %%mm4 \n\t"
1621 "psrlq $8, %%mm6 \n\t" 1645 "psrlq $8, %%mm6 \n\t"
1622 #ifdef HAVE_MMX2 1646 #ifdef HAVE_MMX2
1623 "pminub %%mm4, %%mm6 \n\t" // min of pixels 1647 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1624 "pshufw $0xF9, %%mm6, %%mm4 \n\t" 1648 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1625 "pminub %%mm4, %%mm6 \n\t" // min of pixels 1649 "pmaxub %%mm4, %%mm6 \n\t"
1626 "pshufw $0xFE, %%mm6, %%mm4 \n\t" 1650 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1627 "pminub %%mm4, %%mm6 \n\t" 1651 "pmaxub %%mm4, %%mm6 \n\t"
1628 #else 1652 #else
1629 "movq %%mm6, %%mm1 \n\t" 1653 "psubusb %%mm4, %%mm6 \n\t"
1630 "psubusb %%mm4, %%mm1 \n\t" 1654 "paddb %%mm4, %%mm6 \n\t"
1631 "psubb %%mm1, %%mm6 \n\t"
1632 "movq %%mm6, %%mm4 \n\t" 1655 "movq %%mm6, %%mm4 \n\t"
1633 "psrlq $16, %%mm6 \n\t" 1656 "psrlq $16, %%mm6 \n\t"
1634 "movq %%mm6, %%mm1 \n\t" 1657 "psubusb %%mm4, %%mm6 \n\t"
1635 "psubusb %%mm4, %%mm1 \n\t" 1658 "paddb %%mm4, %%mm6 \n\t"
1636 "psubb %%mm1, %%mm6 \n\t"
1637 "movq %%mm6, %%mm4 \n\t" 1659 "movq %%mm6, %%mm4 \n\t"
1638 "psrlq $32, %%mm6 \n\t" 1660 "psrlq $32, %%mm6 \n\t"
1639 "movq %%mm6, %%mm1 \n\t" 1661 "psubusb %%mm4, %%mm6 \n\t"
1640 "psubusb %%mm4, %%mm1 \n\t" 1662 "paddb %%mm4, %%mm6 \n\t"
1641 "psubb %%mm1, %%mm6 \n\t" 1663 #endif
1642 #endif 1664 "movq %%mm6, %%mm0 \n\t" // max
1643 1665 "psubb %%mm7, %%mm6 \n\t" // max - min
1644 1666 "movd %%mm6, %%ecx \n\t"
1645 "movq %%mm7, %%mm4 \n\t" 1667 "cmpb deringThreshold, %%cl \n\t"
1646 "psrlq $8, %%mm7 \n\t" 1668 " jb 1f \n\t"
1647 #ifdef HAVE_MMX2 1669 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
1648 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1649 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1650 "pmaxub %%mm4, %%mm7 \n\t"
1651 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1652 "pmaxub %%mm4, %%mm7 \n\t"
1653 #else
1654 "psubusb %%mm4, %%mm7 \n\t"
1655 "paddb %%mm4, %%mm7 \n\t"
1656 "movq %%mm7, %%mm4 \n\t"
1657 "psrlq $16, %%mm7 \n\t"
1658 "psubusb %%mm4, %%mm7 \n\t"
1659 "paddb %%mm4, %%mm7 \n\t"
1660 "movq %%mm7, %%mm4 \n\t"
1661 "psrlq $32, %%mm7 \n\t"
1662 "psubusb %%mm4, %%mm7 \n\t"
1663 "paddb %%mm4, %%mm7 \n\t"
1664 #endif
1665 PAVGB(%%mm6, %%mm7) // a=(max + min)/2
1666 "punpcklbw %%mm7, %%mm7 \n\t" 1670 "punpcklbw %%mm7, %%mm7 \n\t"
1667 "punpcklbw %%mm7, %%mm7 \n\t" 1671 "punpcklbw %%mm7, %%mm7 \n\t"
1668 "punpcklbw %%mm7, %%mm7 \n\t" 1672 "punpcklbw %%mm7, %%mm7 \n\t"
1669 "movq %%mm7, temp0 \n\t" 1673 "movq %%mm7, temp0 \n\t"
1670 1674
1783 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1787 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1784 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 1788 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1785 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1789 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1786 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1790 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1787 1791
1788 1792 "1: \n\t"
1789 : : "r" (src), "r" (stride), "r" (QP) 1793 : : "r" (src), "r" (stride), "r" (QP)
1790 : "%eax", "%ebx" 1794 : "%eax", "%ebx", "%ecx"
1791 ); 1795 );
1792 #else 1796 #else
1793 int y; 1797 int y;
1794 int min=255; 1798 int min=255;
1795 int max=0; 1799 int max=0;
1807 if(*p > max) max= *p; 1811 if(*p > max) max= *p;
1808 if(*p < min) min= *p; 1812 if(*p < min) min= *p;
1809 } 1813 }
1810 } 1814 }
1811 avg= (min + max + 1)/2; 1815 avg= (min + max + 1)/2;
1816
1817 if(max - min <deringThreshold) return;
1812 1818
1813 for(y=0; y<10; y++) 1819 for(y=0; y<10; y++)
1814 { 1820 {
1815 int x; 1821 int x;
1816 int t = 0; 1822 int t = 0;
1840 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) 1846 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1841 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) 1847 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1842 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); 1848 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1843 f= (f + 8)>>4; 1849 f= (f + 8)>>4;
1844 1850
1851 #ifdef DEBUG_DERING_THRESHOLD
1852 asm volatile("emms\n\t":);
1853 {
1854 static long long numPixels=0;
1855 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1856 // if((max-min)<20 || (max-min)*QP<200)
1857 // if((max-min)*QP < 500)
1858 // if(max-min<QP/2)
1859 if(max-min < 20)
1860 {
1861 static int numSkiped=0;
1862 static int errorSum=0;
1863 static int worstQP=0;
1864 static int worstRange=0;
1865 static int worstDiff=0;
1866 int diff= (f - *p);
1867 int absDiff= ABS(diff);
1868 int error= diff*diff;
1869
1870 if(x==1 || x==8 || y==1 || y==8) continue;
1871
1872 numSkiped++;
1873 if(absDiff > worstDiff)
1874 {
1875 worstDiff= absDiff;
1876 worstQP= QP;
1877 worstRange= max-min;
1878 }
1879 errorSum+= error;
1880
1881 if(1024LL*1024LL*1024LL % numSkiped == 0)
1882 {
1883 printf( "sum:%1.3f, skip:%d, wQP:%d, "
1884 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1885 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1886 worstDiff, (float)numSkiped/numPixels);
1887 }
1888 }
1889 }
1890 #endif
1845 if (*p + 2*QP < f) *p= *p + 2*QP; 1891 if (*p + 2*QP < f) *p= *p + 2*QP;
1846 else if(*p - 2*QP > f) *p= *p - 2*QP; 1892 else if(*p - 2*QP > f) *p= *p - 2*QP;
1847 else *p=f; 1893 else *p=f;
1848 } 1894 }
1849 } 1895 }
1850 } 1896 }
1851 1897 #ifdef DEBUG_DERING_THRESHOLD
1898 if(max-min < 20)
1899 {
1900 for(y=1; y<9; y++)
1901 {
1902 int x;
1903 int t = 0;
1904 p= src + stride*y;
1905 for(x=1; x<9; x++)
1906 {
1907 p++;
1908 *p = MIN(*p + 20, 255);
1909 }
1910 }
1911 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1912 }
1913 #endif
1852 #endif 1914 #endif
1853 } 1915 }
1854 1916
1855 /** 1917 /**
1856 * Deinterlaces the given block 1918 * Deinterlaces the given block