comparison postproc/postprocess_template.c @ 3099:897d46457708

runtime cpu detection
author michael
date Sat, 24 Nov 2001 22:16:29 +0000
parents 4150aff2ac17
children b2e24fec97bc
comparison
equal deleted inserted replaced
3098:6b21035859c9 3099:897d46457708
14 You should have received a copy of the GNU General Public License 14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18 18
19 /* 19 #undef PAVGB
20 C MMX MMX2 3DNow 20 #undef PMINUB
21 isVertDC Ec Ec 21 #undef PMAXUB
22 isVertMinMaxOk Ec Ec
23 doVertLowPass E e e
24 doVertDefFilter Ec Ec e e
25 isHorizDC Ec Ec
26 isHorizMinMaxOk a E
27 doHorizLowPass E e e
28 doHorizDefFilter Ec Ec e e
29 deRing E e e*
30 Vertical RKAlgo1 E a a
31 Horizontal RKAlgo1 a a
32 Vertical X1# a E E
33 Horizontal X1# a E E
34 LinIpolDeinterlace e E E*
35 CubicIpolDeinterlace a e e*
36 LinBlendDeinterlace e E E*
37 MedianDeinterlace# Ec Ec
38 TempDeNoiser# E e e
39
40 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work
41 # more or less self-invented filters, so the exactness isn't too meaningful
42 E = Exact implementation
43 e = almost exact implementation (slightly different rounding,...)
44 a = alternative / approximate impl
45 c = checked against the other implementations (-vo md5)
46 */
47
48 /*
49 TODO:
50 reduce the time wasted on the mem transfer
51 implement everything in C at least (done at the moment but ...)
52 unroll stuff if instructions depend too much on the prior one
53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
54 move YScale thing to the end instead of fixing QP
55 write a faster and higher quality deblocking filter :)
56 make the mainloop more flexible (variable number of blocks at once
57 (the if/else stuff per block is slowing things down)
58 compare the quality & speed of all filters
59 split this huge file
60 border remover
61 optimize c versions
62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
63 smart blur
64 commandline option for the deblock / dering thresholds
65 ...
66 */
67
68 //Changelog: use the CVS log
69
70 #include "../config.h"
71 #include <inttypes.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75 #ifdef HAVE_MALLOC_H
76 #include <malloc.h>
77 #endif
78 //#undef HAVE_MMX2
79 //#define HAVE_3DNOW
80 //#undef HAVE_MMX
81 //#define DEBUG_BRIGHTNESS
82 #include "../libvo/fastmemcpy.h"
83 #include "postprocess.h"
84
85 #define MIN(a,b) ((a) > (b) ? (b) : (a))
86 #define MAX(a,b) ((a) < (b) ? (b) : (a))
87 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
88 #define SIGN(a) ((a) > 0 ? 1 : -1)
89 22
90 #ifdef HAVE_MMX2 23 #ifdef HAVE_MMX2
91 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" 24 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
92 #elif defined (HAVE_3DNOW) 25 #elif defined (HAVE_3DNOW)
93 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" 26 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
109 "psubusb " #a ", " #b " \n\t"\ 42 "psubusb " #a ", " #b " \n\t"\
110 "paddb " #a ", " #b " \n\t" 43 "paddb " #a ", " #b " \n\t"
111 #endif 44 #endif
112 45
113 46
114 #define GET_MODE_BUFFER_SIZE 500
115 #define OPTIONS_ARRAY_SIZE 10
116
117 #ifdef HAVE_MMX
118 static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL;
119 static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL;
120 static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL;
121 static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL;
122 static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL;
123 static uint64_t __attribute__((aligned(8))) bm00000001= 0x00000000000000FFLL;
124 static uint64_t __attribute__((aligned(8))) bm00010000= 0x000000FF00000000LL;
125 static uint64_t __attribute__((aligned(8))) bm00001000= 0x00000000FF000000LL;
126 static uint64_t __attribute__((aligned(8))) bm10000000= 0xFF00000000000000LL;
127 static uint64_t __attribute__((aligned(8))) bm10000001= 0xFF000000000000FFLL;
128 static uint64_t __attribute__((aligned(8))) bm11000011= 0xFFFF00000000FFFFLL;
129 static uint64_t __attribute__((aligned(8))) bm00000011= 0x000000000000FFFFLL;
130 static uint64_t __attribute__((aligned(8))) bm11111110= 0xFFFFFFFFFFFFFF00LL;
131 static uint64_t __attribute__((aligned(8))) bm11000000= 0xFFFF000000000000LL;
132 static uint64_t __attribute__((aligned(8))) bm00011000= 0x000000FFFF000000LL;
133 static uint64_t __attribute__((aligned(8))) bm00110011= 0x0000FFFF0000FFFFLL;
134 static uint64_t __attribute__((aligned(8))) bm11001100= 0xFFFF0000FFFF0000LL;
135 static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL;
136 static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL;
137 static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL;
138 static uint64_t __attribute__((aligned(8))) b0F= 0x0F0F0F0F0F0F0F0FLL;
139 static uint64_t __attribute__((aligned(8))) b04= 0x0404040404040404LL;
140 static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL;
141 static uint64_t __attribute__((aligned(8))) bFF= 0xFFFFFFFFFFFFFFFFLL;
142 static uint64_t __attribute__((aligned(8))) b20= 0x2020202020202020LL;
143 static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL;
144 static uint64_t __attribute__((aligned(8))) b7E= 0x7E7E7E7E7E7E7E7ELL;
145 static uint64_t __attribute__((aligned(8))) b7C= 0x7C7C7C7C7C7C7C7CLL;
146 static uint64_t __attribute__((aligned(8))) b3F= 0x3F3F3F3F3F3F3F3FLL;
147 static uint64_t __attribute__((aligned(8))) temp0=0;
148 static uint64_t __attribute__((aligned(8))) temp1=0;
149 static uint64_t __attribute__((aligned(8))) temp2=0;
150 static uint64_t __attribute__((aligned(8))) temp3=0;
151 static uint64_t __attribute__((aligned(8))) temp4=0;
152 static uint64_t __attribute__((aligned(8))) temp5=0;
153 static uint64_t __attribute__((aligned(8))) pQPb=0;
154 static uint64_t __attribute__((aligned(8))) pQPb2=0;
155 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
156 static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
157 #else
158 static uint64_t packedYOffset= 0x0000000000000000LL;
159 static uint64_t packedYScale= 0x0100010001000100LL;
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
161 #endif
162
163 int hFlatnessThreshold= 56 - 16;
164 int vFlatnessThreshold= 56 - 16;
165 int deringThreshold= 20;
166
167 //amount of "black" you are willing to lose to get a brightness corrected picture
168 double maxClippedThreshold= 0.01;
169
170 int maxAllowedY=234;
171 int minAllowedY=16;
172
173 static struct PPFilter filters[]=
174 {
175 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
176 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
177 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER},
178 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
179 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
180 {"dr", "dering", 1, 5, 6, DERING},
181 {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
182 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER},
183 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER},
184 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER},
185 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER},
186 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
187 {NULL, NULL,0,0,0,0} //End Marker
188 };
189
190 static char *replaceTable[]=
191 {
192 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
193 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
194 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
195 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
196 NULL //End Marker
197 };
198
199 #ifdef HAVE_MMX
200 static inline void unusedVariableWarningFixer()
201 {
202 if(
203 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000
204 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110
205 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F
206 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4
207 + temp5 + pQPb== 0) b00=0;
208 }
209 #endif
210
211 #ifdef TIMING
/**
 * Reads the CPU time stamp counter (x86 RDTSC instruction).
 *
 * RDTSC returns the low 32 bits in eax and the high 32 bits in edx.
 * The two halves are fetched through separate "=a"/"=d" outputs instead of
 * the "=A" constraint: "=A" only denotes the edx:eax pair on 32-bit x86,
 * on x86-64 it selects a single 64-bit register and yields a wrong value.
 *
 * @return the 64-bit counter value
 */
static inline long long rdtsc()
{
	uint32_t lo, hi;
	__asm__ __volatile__( "rdtsc\n\t"
		: "=a" (lo), "=d" (hi)
	);
	return (long long)(((uint64_t)hi << 32) | lo);
}
221 #endif
222
223 #ifdef HAVE_MMX2
224 static inline void prefetchnta(void *p)
225 {
226 asm volatile( "prefetchnta (%0)\n\t"
227 : : "r" (p)
228 );
229 }
230
231 static inline void prefetcht0(void *p)
232 {
233 asm volatile( "prefetcht0 (%0)\n\t"
234 : : "r" (p)
235 );
236 }
237
238 static inline void prefetcht1(void *p)
239 {
240 asm volatile( "prefetcht1 (%0)\n\t"
241 : : "r" (p)
242 );
243 }
244
245 static inline void prefetcht2(void *p)
246 {
247 asm volatile( "prefetcht2 (%0)\n\t"
248 : : "r" (p)
249 );
250 }
251 #endif
252
253 //FIXME? |255-0| = 1 (shouldnt be a problem ...) 47 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
254 /** 48 /**
255 * Check if the middle 8x8 Block in the given 8x16 block is flat 49 * Check if the middle 8x8 Block in the given 8x16 block is flat
256 */ 50 */
257 static inline int isVertDC(uint8_t src[], int stride){ 51 static inline int RENAME(isVertDC)(uint8_t src[], int stride){
258 int numEq= 0; 52 int numEq= 0;
259 #ifndef HAVE_MMX 53 #ifndef HAVE_MMX
260 int y; 54 int y;
261 #endif 55 #endif
262 src+= stride*4; // src points to begin of the 8x8 Block 56 src+= stride*4; // src points to begin of the 8x8 Block
361 */ 155 */
362 // for(int i=0; i<numEq/8; i++) src[i]=255; 156 // for(int i=0; i<numEq/8; i++) src[i]=255;
363 return (numEq > vFlatnessThreshold) ? 1 : 0; 157 return (numEq > vFlatnessThreshold) ? 1 : 0;
364 } 158 }
365 159
366 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) 160 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP)
367 { 161 {
368 #ifdef HAVE_MMX 162 #ifdef HAVE_MMX
369 int isOk; 163 int isOk;
370 src+= stride*3; 164 src+= stride*3;
371 asm volatile( 165 asm volatile(
418 212
419 /** 213 /**
420 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) 214 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
421 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 215 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
422 */ 216 */
423 static inline void doVertLowPass(uint8_t *src, int stride, int QP) 217 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP)
424 { 218 {
425 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 219 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
426 src+= stride*3; 220 src+= stride*3;
427 asm volatile( //"movv %0 %1 %2\n\t" 221 asm volatile( //"movv %0 %1 %2\n\t"
428 "movq pQPb, %%mm0 \n\t" // QP,..., QP 222 "movq pQPb, %%mm0 \n\t" // QP,..., QP
600 x = 8 394 x = 8
601 x/2 = 4 395 x/2 = 4
602 x/8 = 1 396 x/8 = 1
603 1 12 12 23 397 1 12 12 23
604 */ 398 */
605 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) 399 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
606 { 400 {
607 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 401 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
608 src+= stride*3; 402 src+= stride*3;
609 // FIXME rounding 403 // FIXME rounding
610 asm volatile( 404 asm volatile(
700 * will not damage linear gradients 494 * will not damage linear gradients
701 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter 495 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
702 * can only smooth blocks at the expected locations (it cant smooth them if they did move) 496 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
703 * MMX2 version does correct clipping C version doesnt 497 * MMX2 version does correct clipping C version doesnt
704 */ 498 */
705 static inline void vertX1Filter(uint8_t *src, int stride, int QP) 499 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP)
706 { 500 {
707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 501 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
708 src+= stride*3; 502 src+= stride*3;
709 503
710 asm volatile( 504 asm volatile(
856 } 650 }
857 */ 651 */
858 #endif 652 #endif
859 } 653 }
860 654
861 /** 655 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP)
862 * Experimental Filter 1 (Horizontal)
863 * will not damage linear gradients
864 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
865 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
866 * MMX2 version does correct clipping C version doesnt
867 * not identical with the vertical one
868 */
869 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
870 {
871 int y;
872 //FIXME (has little in common with the mmx2 version)
873 for(y=0; y<BLOCK_SIZE; y++)
874 {
875 int a= src[1] - src[2];
876 int b= src[3] - src[4];
877 int c= src[5] - src[6];
878
879 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
880
881 if(d < QP)
882 {
883 int v = d * SIGN(-b);
884
885 src[1] +=v/8;
886 src[2] +=v/4;
887 src[3] +=3*v/8;
888 src[4] -=3*v/8;
889 src[5] -=v/4;
890 src[6] -=v/8;
891
892 }
893 src+=stride;
894 }
895 }
896
897
898 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
899 { 656 {
900 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 657 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
901 /* 658 /*
902 uint8_t tmp[16]; 659 uint8_t tmp[16];
903 const int l1= stride; 660 const int l1= stride;
1472 src++; 1229 src++;
1473 } 1230 }
1474 #endif 1231 #endif
1475 } 1232 }
1476 1233
1477 /** 1234 static inline void RENAME(dering)(uint8_t src[], int stride, int QP)
1478 * Check if the given 8x8 Block is mostly "flat"
1479 */
1480 static inline int isHorizDC(uint8_t src[], int stride)
1481 {
1482 int numEq= 0;
1483 int y;
1484 for(y=0; y<BLOCK_SIZE; y++)
1485 {
1486 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1487 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1488 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1489 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1490 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1491 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1492 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1493 src+= stride;
1494 }
1495 return numEq > hFlatnessThreshold;
1496 }
1497
/**
 * Checks whether the first and the last pixel of the row differ by no more
 * than 2*QP.
 *
 * @param src    row of at least 8 pixels
 * @param stride unused
 * @param QP     quantizer the limit is derived from
 * @return 1 if |src[0] - src[7]| <= 2*QP, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	const int endToEnd= abs(src[0] - src[7]);

	return endToEnd <= 2*QP;
}
1504
1505 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1506 {
1507 int y;
1508 for(y=0; y<BLOCK_SIZE; y++)
1509 {
1510 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1511
1512 if(ABS(middleEnergy) < 8*QP)
1513 {
1514 const int q=(dst[3] - dst[4])/2;
1515 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1516 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1517
1518 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1519 d= MAX(d, 0);
1520
1521 d= (5*d + 32) >> 6;
1522 d*= SIGN(-middleEnergy);
1523
1524 if(q>0)
1525 {
1526 d= d<0 ? 0 : d;
1527 d= d>q ? q : d;
1528 }
1529 else
1530 {
1531 d= d>0 ? 0 : d;
1532 d= d<q ? q : d;
1533 }
1534
1535 dst[3]-= d;
1536 dst[4]+= d;
1537 }
1538 dst+= stride;
1539 }
1540 }
1541
1542 /**
1543 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1544 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1545 */
1546 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1547 {
1548
1549 int y;
1550 for(y=0; y<BLOCK_SIZE; y++)
1551 {
1552 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1553 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1554
1555 int sums[9];
1556 sums[0] = first + dst[0];
1557 sums[1] = dst[0] + dst[1];
1558 sums[2] = dst[1] + dst[2];
1559 sums[3] = dst[2] + dst[3];
1560 sums[4] = dst[3] + dst[4];
1561 sums[5] = dst[4] + dst[5];
1562 sums[6] = dst[5] + dst[6];
1563 sums[7] = dst[6] + dst[7];
1564 sums[8] = dst[7] + last;
1565
1566 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1567 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1568 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1569 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1570 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1571 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1572 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1573 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1574
1575 dst+= stride;
1576 }
1577 }
1578
1579
1580 static inline void dering(uint8_t src[], int stride, int QP)
1581 { 1235 {
1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1236 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1583 asm volatile( 1237 asm volatile(
1584 "movq pQPb, %%mm0 \n\t" 1238 "movq pQPb, %%mm0 \n\t"
1585 "paddusb %%mm0, %%mm0 \n\t" 1239 "paddusb %%mm0, %%mm0 \n\t"
1590 // 0 1 2 3 4 5 6 7 8 9 1244 // 0 1 2 3 4 5 6 7 8 9
1591 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1245 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1592 1246
1593 "pcmpeqb %%mm7, %%mm7 \n\t" 1247 "pcmpeqb %%mm7, %%mm7 \n\t"
1594 "pxor %%mm6, %%mm6 \n\t" 1248 "pxor %%mm6, %%mm6 \n\t"
1249 #undef FIND_MIN_MAX
1595 #ifdef HAVE_MMX2 1250 #ifdef HAVE_MMX2
1596 #define FIND_MIN_MAX(addr)\ 1251 #define FIND_MIN_MAX(addr)\
1597 "movq " #addr ", %%mm0 \n\t"\ 1252 "movq " #addr ", %%mm0 \n\t"\
1598 "pminub %%mm0, %%mm7 \n\t"\ 1253 "pminub %%mm0, %%mm7 \n\t"\
1599 "pmaxub %%mm0, %%mm6 \n\t" 1254 "pmaxub %%mm0, %%mm6 \n\t"
1918 * Deinterlaces the given block 1573 * Deinterlaces the given block
1919 * will be called for every 8x8 block and can read & write from line 4-15 1574 * will be called for every 8x8 block and can read & write from line 4-15
1920 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too 1575 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1921 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1576 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1922 */ 1577 */
1923 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) 1578 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1924 { 1579 {
1925 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1580 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1926 src+= 4*stride; 1581 src+= 4*stride;
1927 asm volatile( 1582 asm volatile(
1928 "leal (%0, %1), %%eax \n\t" 1583 "leal (%0, %1), %%eax \n\t"
1967 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too 1622 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1968 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1623 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1969 * this filter will read lines 3-15 and write 7-13 1624 * this filter will read lines 3-15 and write 7-13
1970 * no cliping in C version 1625 * no cliping in C version
1971 */ 1626 */
1972 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) 1627 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1973 { 1628 {
1974 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1629 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1975 src+= stride*3; 1630 src+= stride*3;
1976 asm volatile( 1631 asm volatile(
1977 "leal (%0, %1), %%eax \n\t" 1632 "leal (%0, %1), %%eax \n\t"
2032 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too 1687 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
2033 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1688 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2034 * will shift the image up by 1 line (FIXME if this is a problem) 1689 * will shift the image up by 1 line (FIXME if this is a problem)
2035 * this filter will read lines 4-13 and write 4-11 1690 * this filter will read lines 4-13 and write 4-11
2036 */ 1691 */
2037 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) 1692 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
2038 { 1693 {
2039 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1694 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2040 src+= 4*stride; 1695 src+= 4*stride;
2041 asm volatile( 1696 asm volatile(
2042 "leal (%0, %1), %%eax \n\t" 1697 "leal (%0, %1), %%eax \n\t"
2105 * Deinterlaces the given block 1760 * Deinterlaces the given block
2106 * will be called for every 8x8 block and can read & write from line 4-15, 1761 * will be called for every 8x8 block and can read & write from line 4-15,
2107 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too 1762 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
2108 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1763 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2109 */ 1764 */
2110 static inline void deInterlaceMedian(uint8_t src[], int stride) 1765 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
2111 { 1766 {
2112 #ifdef HAVE_MMX 1767 #ifdef HAVE_MMX
2113 src+= 4*stride; 1768 src+= 4*stride;
2114 #ifdef HAVE_MMX2 1769 #ifdef HAVE_MMX2
2115 asm volatile( 1770 asm volatile(
2222 1877
2223 #ifdef HAVE_MMX 1878 #ifdef HAVE_MMX
2224 /** 1879 /**
2225 * transposes and shift the given 8x8 Block into dst1 and dst2 1880 * transposes and shift the given 8x8 Block into dst1 and dst2
2226 */ 1881 */
2227 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) 1882 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2228 { 1883 {
2229 asm( 1884 asm(
2230 "leal (%0, %1), %%eax \n\t" 1885 "leal (%0, %1), %%eax \n\t"
2231 "leal (%%eax, %1, 4), %%ebx \n\t" 1886 "leal (%%eax, %1, 4), %%ebx \n\t"
2232 // 0 1 2 3 4 5 6 7 8 9 1887 // 0 1 2 3 4 5 6 7 8 9
2306 } 1961 }
2307 1962
2308 /** 1963 /**
2309 * transposes the given 8x8 block 1964 * transposes the given 8x8 block
2310 */ 1965 */
2311 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src) 1966 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2312 { 1967 {
2313 asm( 1968 asm(
2314 "leal (%0, %1), %%eax \n\t" 1969 "leal (%0, %1), %%eax \n\t"
2315 "leal (%%eax, %1, 4), %%ebx \n\t" 1970 "leal (%%eax, %1, 4), %%ebx \n\t"
2316 // 0 1 2 3 4 5 6 7 8 9 1971 // 0 1 2 3 4 5 6 7 8 9
2385 ); 2040 );
2386 } 2041 }
2387 #endif 2042 #endif
2388 //static int test=0; 2043 //static int test=0;
2389 2044
2390 static void inline tempNoiseReducer(uint8_t *src, int stride, 2045 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2391 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) 2046 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2392 { 2047 {
2393 #define FAST_L2_DIFF 2048 #define FAST_L2_DIFF
2394 //#define L1_DIFF //u should change the thresholds too if u try that one 2049 //#define L1_DIFF //u should change the thresholds too if u try that one
2395 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2050 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2784 } 2439 }
2785 } 2440 }
2786 #endif 2441 #endif
2787 } 2442 }
2788 2443
2789 #ifdef HAVE_ODIVX_POSTPROCESS 2444 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2790 #include "../opendivx/postprocess.h"
2791 int use_old_pp=0;
2792 #endif
2793
2794 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2795 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); 2445 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
2796
2797 /* -pp Command line Help
2798 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
2799
2800 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
2801
2802 long form example:
2803 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock
2804 short form example:
2805 -pp vb:a,hb:a,lb -pp de,-vb
2806 more examples:
2807 -pp tn:64:128:256
2808
2809 Filters Options
2810 short long name short long option Description
2811 * * a autoq cpu power dependent enabler
2812 c chrom chrominance filtering enabled
2813 y nochrom chrominance filtering disabled
2814 hb hdeblock horizontal deblocking filter
2815 vb vdeblock vertical deblocking filter
2816 vr rkvdeblock
2817 h1 x1hdeblock Experimental horizontal deblock filter 1
2818 v1 x1vdeblock Experimental vertical deblock filter 1
2819 dr dering not implemented yet
2820 al autolevels automatic brightness / contrast fixer
2821 f fullyrange stretch luminance range to (0..255)
2822 lb linblenddeint linear blend deinterlacer
2823 li linipoldeint linear interpolating deinterlacer
2824 ci cubicipoldeint cubic interpolating deinterlacer
2825 md mediandeint median deinterlacer
2826 de default hdeblock:a,vdeblock:a,dering:a,autolevels
2827 fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
2828 tn tmpnoise (3 Thresholds) Temporal Noise Reducer
2829 */
2830
2831 /**
2832 * returns a PPMode struct which will have a non 0 error variable if an error occurred
2833 * name is the string after "-pp" on the command line
2834 * quality is a number from 0 to GET_PP_QUALITY_MAX
2835 */
2836 struct PPMode getPPModeByNameAndQuality(char *name, int quality)
2837 {
2838 char temp[GET_MODE_BUFFER_SIZE];
2839 char *p= temp;
2840 char *filterDelimiters= ",";
2841 char *optionDelimiters= ":";
2842 struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
2843 char *filterToken;
2844
2845 strncpy(temp, name, GET_MODE_BUFFER_SIZE);
2846
2847 printf("%s\n", name);
2848
2849 for(;;){
2850 char *filterName;
2851 int q= 1000000; //GET_PP_QUALITY_MAX;
2852 int chrom=-1;
2853 char *option;
2854 char *options[OPTIONS_ARRAY_SIZE];
2855 int i;
2856 int filterNameOk=0;
2857 int numOfUnknownOptions=0;
2858 int enable=1; //does the user want us to enabled or disabled the filter
2859
2860 filterToken= strtok(p, filterDelimiters);
2861 if(filterToken == NULL) break;
2862 p+= strlen(filterToken) + 1; // p points to next filterToken
2863 filterName= strtok(filterToken, optionDelimiters);
2864 printf("%s::%s\n", filterToken, filterName);
2865
2866 if(*filterName == '-')
2867 {
2868 enable=0;
2869 filterName++;
2870 }
2871
2872 for(;;){ //for all options
2873 option= strtok(NULL, optionDelimiters);
2874 if(option == NULL) break;
2875
2876 printf("%s\n", option);
2877 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
2878 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
2879 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
2880 else
2881 {
2882 options[numOfUnknownOptions] = option;
2883 numOfUnknownOptions++;
2884 }
2885 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
2886 }
2887 options[numOfUnknownOptions] = NULL;
2888
2889 /* replace stuff from the replace Table */
2890 for(i=0; replaceTable[2*i]!=NULL; i++)
2891 {
2892 if(!strcmp(replaceTable[2*i], filterName))
2893 {
2894 int newlen= strlen(replaceTable[2*i + 1]);
2895 int plen;
2896 int spaceLeft;
2897
2898 if(p==NULL) p= temp, *p=0; //last filter
2899 else p--, *p=','; //not last filter
2900
2901 plen= strlen(p);
2902 spaceLeft= (int)p - (int)temp + plen;
2903 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE)
2904 {
2905 ppMode.error++;
2906 break;
2907 }
2908 memmove(p + newlen, p, plen+1);
2909 memcpy(p, replaceTable[2*i + 1], newlen);
2910 filterNameOk=1;
2911 }
2912 }
2913
2914 for(i=0; filters[i].shortName!=NULL; i++)
2915 {
2916 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
2917 if( !strcmp(filters[i].longName, filterName)
2918 || !strcmp(filters[i].shortName, filterName))
2919 {
2920 ppMode.lumMode &= ~filters[i].mask;
2921 ppMode.chromMode &= ~filters[i].mask;
2922
2923 filterNameOk=1;
2924 if(!enable) break; // user wants to disable it
2925
2926 if(q >= filters[i].minLumQuality)
2927 ppMode.lumMode|= filters[i].mask;
2928 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
2929 if(q >= filters[i].minChromQuality)
2930 ppMode.chromMode|= filters[i].mask;
2931
2932 if(filters[i].mask == LEVEL_FIX)
2933 {
2934 int o;
2935 ppMode.minAllowedY= 16;
2936 ppMode.maxAllowedY= 234;
2937 for(o=0; options[o]!=NULL; o++)
2938 if( !strcmp(options[o],"fullyrange")
2939 ||!strcmp(options[o],"f"))
2940 {
2941 ppMode.minAllowedY= 0;
2942 ppMode.maxAllowedY= 255;
2943 numOfUnknownOptions--;
2944 }
2945 }
2946 else if(filters[i].mask == TEMP_NOISE_FILTER)
2947 {
2948 int o;
2949 int numOfNoises=0;
2950 ppMode.maxTmpNoise[0]= 150;
2951 ppMode.maxTmpNoise[1]= 200;
2952 ppMode.maxTmpNoise[2]= 400;
2953
2954 for(o=0; options[o]!=NULL; o++)
2955 {
2956 char *tail;
2957 ppMode.maxTmpNoise[numOfNoises]=
2958 strtol(options[o], &tail, 0);
2959 if(tail!=options[o])
2960 {
2961 numOfNoises++;
2962 numOfUnknownOptions--;
2963 if(numOfNoises >= 3) break;
2964 }
2965 }
2966 }
2967 }
2968 }
2969 if(!filterNameOk) ppMode.error++;
2970 ppMode.error += numOfUnknownOptions;
2971 }
2972
2973 #ifdef HAVE_ODIVX_POSTPROCESS
2974 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
2975 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
2976 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
2977 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
2978 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
2979 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
2980 #endif
2981
2982 return ppMode;
2983 }
2984
2985 /**
2986 * Obsolete, don't use it, use postprocess2() instead
2987 */
2988 void postprocess(unsigned char * src[], int src_stride,
2989 unsigned char * dst[], int dst_stride,
2990 int horizontal_size, int vertical_size,
2991 QP_STORE_T *QP_store, int QP_stride,
2992 int mode)
2993 {
2994 struct PPMode ppMode;
2995 static QP_STORE_T zeroArray[2048/8];
2996 /*
2997 static int qual=0;
2998
2999 ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
3000 printf("OK\n");
3001 qual++;
3002 qual%=7;
3003 printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
3004 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
3005 postprocess2(src, src_stride, dst, dst_stride,
3006 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
3007
3008 return;
3009 */
3010 if(QP_store==NULL)
3011 {
3012 QP_store= zeroArray;
3013 QP_stride= 0;
3014 }
3015
3016 ppMode.lumMode= mode;
3017 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
3018 ppMode.chromMode= mode;
3019 ppMode.maxTmpNoise[0]= 700;
3020 ppMode.maxTmpNoise[1]= 1500;
3021 ppMode.maxTmpNoise[2]= 3000;
3022
3023 #ifdef HAVE_ODIVX_POSTPROCESS
3024 // Note: I could make this shit outside of this file, but it would mean one
3025 // more function call...
3026 if(use_old_pp){
3027 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
3028 return;
3029 }
3030 #endif
3031
3032 postProcess(src[0], src_stride, dst[0], dst_stride,
3033 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
3034
3035 horizontal_size >>= 1;
3036 vertical_size >>= 1;
3037 src_stride >>= 1;
3038 dst_stride >>= 1;
3039
3040 if(ppMode.chromMode)
3041 {
3042 postProcess(src[1], src_stride, dst[1], dst_stride,
3043 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
3044 postProcess(src[2], src_stride, dst[2], dst_stride,
3045 horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
3046 }
3047 else if(src_stride == dst_stride)
3048 {
3049 memcpy(dst[1], src[1], src_stride*vertical_size);
3050 memcpy(dst[2], src[2], src_stride*vertical_size);
3051 }
3052 else
3053 {
3054 int y;
3055 for(y=0; y<vertical_size; y++)
3056 {
3057 memcpy(&(dst[1][y*dst_stride]), &(src[1][y*src_stride]), horizontal_size);
3058 memcpy(&(dst[2][y*dst_stride]), &(src[2][y*src_stride]), horizontal_size);
3059 }
3060 }
3061
3062 #if 0
3063 memset(dst[1], 128, dst_stride*vertical_size);
3064 memset(dst[2], 128, dst_stride*vertical_size);
3065 #endif
3066 }
3067
3068 void postprocess2(unsigned char * src[], int src_stride,
3069 unsigned char * dst[], int dst_stride,
3070 int horizontal_size, int vertical_size,
3071 QP_STORE_T *QP_store, int QP_stride,
3072 struct PPMode *mode)
3073 {
3074
3075 static QP_STORE_T zeroArray[2048/8];
3076 if(QP_store==NULL)
3077 {
3078 QP_store= zeroArray;
3079 QP_stride= 0;
3080 }
3081
3082 #ifdef HAVE_ODIVX_POSTPROCESS
3083 // Note: I could make this shit outside of this file, but it would mean one
3084 // more function call...
3085 if(use_old_pp){
3086 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
3087 mode->oldMode);
3088 return;
3089 }
3090 #endif
3091
3092 postProcess(src[0], src_stride, dst[0], dst_stride,
3093 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
3094
3095 horizontal_size >>= 1;
3096 vertical_size >>= 1;
3097 src_stride >>= 1;
3098 dst_stride >>= 1;
3099
3100 if(mode->chromMode)
3101 {
3102 postProcess(src[1], src_stride, dst[1], dst_stride,
3103 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
3104 postProcess(src[2], src_stride, dst[2], dst_stride,
3105 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
3106 }
3107 else if(src_stride == dst_stride)
3108 {
3109 memcpy(dst[1], src[1], src_stride*vertical_size);
3110 memcpy(dst[2], src[2], src_stride*vertical_size);
3111 }
3112 else
3113 {
3114 int y;
3115 for(y=0; y<vertical_size; y++)
3116 {
3117 memcpy(&(dst[1][y*dst_stride]), &(src[1][y*src_stride]), horizontal_size);
3118 memcpy(&(dst[2][y*dst_stride]), &(src[2][y*src_stride]), horizontal_size);
3119 }
3120 }
3121 }
3122
3123
3124 /**
3125 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
3126 * 0 <= quality <= 6
3127 */
3128 int getPpModeForQuality(int quality){
3129 int modes[1+GET_PP_QUALITY_MAX]= {
3130 0,
3131 #if 1
3132 // horizontal filters first
3133 LUM_H_DEBLOCK,
3134 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
3135 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
3136 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
3137 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
3138 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
3139 #else
3140 // vertical filters first
3141 LUM_V_DEBLOCK,
3142 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
3143 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
3144 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
3145 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
3146 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
3147 #endif
3148 };
3149
3150 #ifdef HAVE_ODIVX_POSTPROCESS
3151 int odivx_modes[1+GET_PP_QUALITY_MAX]= {
3152 0,
3153 PP_DEBLOCK_Y_H,
3154 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
3155 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
3156 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
3157 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
3158 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
3159 };
3160 if(use_old_pp) return odivx_modes[quality];
3161 #endif
3162 return modes[quality];
3163 }
3164 2446
3165 /** 2447 /**
3166 * Copies a block from src to dst and fixes the blacklevel 2448 * Copies a block from src to dst and fixes the blacklevel
3167 * numLines must be a multiple of 4 2449 * numLines must be a multiple of 4
3168 * levelFix == 0 -> dont touch the brighness & contrast 2450 * levelFix == 0 -> dont touch the brighness & contrast
3169 */ 2451 */
3170 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, 2452 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
3171 int levelFix) 2453 int levelFix)
3172 { 2454 {
3173 #ifndef HAVE_MMX 2455 #ifndef HAVE_MMX
3174 int i; 2456 int i;
3175 #endif 2457 #endif
3265 2547
3266 2548
3267 /** 2549 /**
3268 * Filters array of bytes (Y or U or V values) 2550 * Filters array of bytes (Y or U or V values)
3269 */ 2551 */
3270 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 2552 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3271 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode) 2553 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
3272 { 2554 {
3273 int x,y; 2555 int x,y;
3274 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode; 2556 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
3275 2557
3461 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 2743 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3462 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 2744 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3463 */ 2745 */
3464 #endif 2746 #endif
3465 2747
3466 blockCopy(dstBlock + dstStride*copyAhead, dstStride, 2748 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3467 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); 2749 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3468 2750
3469 if(mode & LINEAR_IPOL_DEINT_FILTER) 2751 if(mode & LINEAR_IPOL_DEINT_FILTER)
3470 deInterlaceInterpolateLinear(dstBlock, dstStride); 2752 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3471 else if(mode & LINEAR_BLEND_DEINT_FILTER) 2753 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3472 deInterlaceBlendLinear(dstBlock, dstStride); 2754 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
3473 else if(mode & MEDIAN_DEINT_FILTER) 2755 else if(mode & MEDIAN_DEINT_FILTER)
3474 deInterlaceMedian(dstBlock, dstStride); 2756 RENAME(deInterlaceMedian)(dstBlock, dstStride);
3475 else if(mode & CUBIC_IPOL_DEINT_FILTER) 2757 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3476 deInterlaceInterpolateCubic(dstBlock, dstStride); 2758 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3477 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 2759 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3478 deInterlaceBlendCubic(dstBlock, dstStride); 2760 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3479 */ 2761 */
3480 dstBlock+=8; 2762 dstBlock+=8;
3481 srcBlock+=8; 2763 srcBlock+=8;
3482 } 2764 }
3483 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride ); 2765 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
3486 for(y=0; y<height; y+=BLOCK_SIZE) 2768 for(y=0; y<height; y+=BLOCK_SIZE)
3487 { 2769 {
3488 //1% speedup if these are here instead of the inner loop 2770 //1% speedup if these are here instead of the inner loop
3489 uint8_t *srcBlock= &(src[y*srcStride]); 2771 uint8_t *srcBlock= &(src[y*srcStride]);
3490 uint8_t *dstBlock= &(dst[y*dstStride]); 2772 uint8_t *dstBlock= &(dst[y*dstStride]);
2773 #ifdef HAVE_MMX
2774 uint8_t *tempBlock1= tempBlocks;
2775 uint8_t *tempBlock2= tempBlocks + 8;
2776 #endif
3491 #ifdef ARCH_X86 2777 #ifdef ARCH_X86
3492 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; 2778 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
3493 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); 2779 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
3494 int QPFrac= QPDelta; 2780 int QPFrac= QPDelta;
3495 uint8_t *tempBlock1= tempBlocks;
3496 uint8_t *tempBlock2= tempBlocks + 8;
3497 #endif 2781 #endif
3498 int QP=0; 2782 int QP=0;
3499 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards 2783 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3500 if not than use a temporary buffer */ 2784 if not than use a temporary buffer */
3501 if(y+15 >= height) 2785 if(y+15 >= height)
3525 // finish 1 block before the next otherwise weŽll might have a problem 2809 // finish 1 block before the next otherwise weŽll might have a problem
3526 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing 2810 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3527 for(x=0; x<width; x+=BLOCK_SIZE) 2811 for(x=0; x<width; x+=BLOCK_SIZE)
3528 { 2812 {
3529 const int stride= dstStride; 2813 const int stride= dstStride;
2814 #ifdef HAVE_MMX
3530 uint8_t *tmpXchg; 2815 uint8_t *tmpXchg;
2816 #endif
3531 #ifdef ARCH_X86 2817 #ifdef ARCH_X86
3532 QP= *QPptr; 2818 QP= *QPptr;
3533 asm volatile( 2819 asm volatile(
3534 "addl %2, %1 \n\t" 2820 "addl %2, %1 \n\t"
3535 "sbbl %%eax, %%eax \n\t" 2821 "sbbl %%eax, %%eax \n\t"
3617 dstBlock= tempDstBlock; 2903 dstBlock= tempDstBlock;
3618 srcBlock= tempSrcBlock; 2904 srcBlock= tempSrcBlock;
3619 } 2905 }
3620 #endif 2906 #endif
3621 2907
3622 blockCopy(dstBlock + dstStride*copyAhead, dstStride, 2908 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3623 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); 2909 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
3624 2910
3625 if(mode & LINEAR_IPOL_DEINT_FILTER) 2911 if(mode & LINEAR_IPOL_DEINT_FILTER)
3626 deInterlaceInterpolateLinear(dstBlock, dstStride); 2912 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3627 else if(mode & LINEAR_BLEND_DEINT_FILTER) 2913 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3628 deInterlaceBlendLinear(dstBlock, dstStride); 2914 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
3629 else if(mode & MEDIAN_DEINT_FILTER) 2915 else if(mode & MEDIAN_DEINT_FILTER)
3630 deInterlaceMedian(dstBlock, dstStride); 2916 RENAME(deInterlaceMedian)(dstBlock, dstStride);
3631 else if(mode & CUBIC_IPOL_DEINT_FILTER) 2917 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3632 deInterlaceInterpolateCubic(dstBlock, dstStride); 2918 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3633 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 2919 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3634 deInterlaceBlendCubic(dstBlock, dstStride); 2920 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3635 */ 2921 */
3636 2922
3637 /* only deblock if we have 2 blocks */ 2923 /* only deblock if we have 2 blocks */
3638 if(y + 8 < height) 2924 if(y + 8 < height)
3639 { 2925 {
3641 T1= rdtsc(); 2927 T1= rdtsc();
3642 memcpyTime+= T1-T0; 2928 memcpyTime+= T1-T0;
3643 T0=T1; 2929 T0=T1;
3644 #endif 2930 #endif
3645 if(mode & V_RK1_FILTER) 2931 if(mode & V_RK1_FILTER)
3646 vertRK1Filter(dstBlock, stride, QP); 2932 RENAME(vertRK1Filter)(dstBlock, stride, QP);
3647 else if(mode & V_X1_FILTER) 2933 else if(mode & V_X1_FILTER)
3648 vertX1Filter(dstBlock, stride, QP); 2934 RENAME(vertX1Filter)(dstBlock, stride, QP);
3649 else if(mode & V_DEBLOCK) 2935 else if(mode & V_DEBLOCK)
3650 { 2936 {
3651 if( isVertDC(dstBlock, stride)) 2937 if( RENAME(isVertDC)(dstBlock, stride))
3652 { 2938 {
3653 if(isVertMinMaxOk(dstBlock, stride, QP)) 2939 if(RENAME(isVertMinMaxOk)(dstBlock, stride, QP))
3654 doVertLowPass(dstBlock, stride, QP); 2940 RENAME(doVertLowPass)(dstBlock, stride, QP);
3655 } 2941 }
3656 else 2942 else
3657 doVertDefFilter(dstBlock, stride, QP); 2943 RENAME(doVertDefFilter)(dstBlock, stride, QP);
3658 } 2944 }
3659 #ifdef MORE_TIMING 2945 #ifdef MORE_TIMING
3660 T1= rdtsc(); 2946 T1= rdtsc();
3661 vertTime+= T1-T0; 2947 vertTime+= T1-T0;
3662 T0=T1; 2948 T0=T1;
3663 #endif 2949 #endif
3664 } 2950 }
3665 2951
3666 #ifdef HAVE_MMX 2952 #ifdef HAVE_MMX
3667 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride); 2953 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3668 #endif 2954 #endif
3669 /* check if we have a previous block to deblock it with dstBlock */ 2955 /* check if we have a previous block to deblock it with dstBlock */
3670 if(x - 8 >= 0) 2956 if(x - 8 >= 0)
3671 { 2957 {
3672 #ifdef MORE_TIMING 2958 #ifdef MORE_TIMING
3673 T0= rdtsc(); 2959 T0= rdtsc();
3674 #endif 2960 #endif
3675 #ifdef HAVE_MMX 2961 #ifdef HAVE_MMX
3676 if(mode & H_RK1_FILTER) 2962 if(mode & H_RK1_FILTER)
3677 vertRK1Filter(tempBlock1, 16, QP); 2963 RENAME(vertRK1Filter)(tempBlock1, 16, QP);
3678 else if(mode & H_X1_FILTER) 2964 else if(mode & H_X1_FILTER)
3679 vertX1Filter(tempBlock1, 16, QP); 2965 RENAME(vertX1Filter)(tempBlock1, 16, QP);
3680 else if(mode & H_DEBLOCK) 2966 else if(mode & H_DEBLOCK)
3681 { 2967 {
3682 if( isVertDC(tempBlock1, 16) ) 2968 if( RENAME(isVertDC)(tempBlock1, 16) )
3683 { 2969 {
3684 if(isVertMinMaxOk(tempBlock1, 16, QP)) 2970 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, QP))
3685 doVertLowPass(tempBlock1, 16, QP); 2971 RENAME(doVertLowPass)(tempBlock1, 16, QP);
3686 } 2972 }
3687 else 2973 else
3688 doVertDefFilter(tempBlock1, 16, QP); 2974 RENAME(doVertDefFilter)(tempBlock1, 16, QP);
3689 } 2975 }
3690 2976
3691 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16); 2977 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3692 2978
3693 #else 2979 #else
3694 if(mode & H_X1_FILTER) 2980 if(mode & H_X1_FILTER)
3695 horizX1Filter(dstBlock-4, stride, QP); 2981 horizX1Filter(dstBlock-4, stride, QP);
3696 else if(mode & H_DEBLOCK) 2982 else if(mode & H_DEBLOCK)
3710 T0=T1; 2996 T0=T1;
3711 #endif 2997 #endif
3712 if(mode & DERING) 2998 if(mode & DERING)
3713 { 2999 {
3714 //FIXME filter first line 3000 //FIXME filter first line
3715 if(y>0) dering(dstBlock - stride - 8, stride, QP); 3001 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, QP);
3716 } 3002 }
3717 3003
3718 if(mode & TEMP_NOISE_FILTER) 3004 if(mode & TEMP_NOISE_FILTER)
3719 { 3005 {
3720 tempNoiseReducer(dstBlock-8, stride, 3006 RENAME(tempNoiseReducer)(dstBlock-8, stride,
3721 tempBlured[isColor] + y*dstStride + x, 3007 tempBlured[isColor] + y*dstStride + x,
3722 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), 3008 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3723 ppMode->maxTmpNoise); 3009 ppMode->maxTmpNoise);
3724 } 3010 }
3725 } 3011 }
3749 #endif 3035 #endif
3750 } 3036 }
3751 3037
3752 if(mode & DERING) 3038 if(mode & DERING)
3753 { 3039 {
3754 if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP); 3040 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, QP);
3755 } 3041 }
3756 3042
3757 if((mode & TEMP_NOISE_FILTER)) 3043 if((mode & TEMP_NOISE_FILTER))
3758 { 3044 {
3759 tempNoiseReducer(dstBlock-8, dstStride, 3045 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3760 tempBlured[isColor] + y*dstStride + x, 3046 tempBlured[isColor] + y*dstStride + x,
3761 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), 3047 tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3762 ppMode->maxTmpNoise); 3048 ppMode->maxTmpNoise);
3763 } 3049 }
3764 3050