Mercurial > mplayer.hg
comparison postproc/postprocess_template.c @ 3099:897d46457708
runtime cpu detection
author | michael |
---|---|
date | Sat, 24 Nov 2001 22:16:29 +0000 |
parents | 4150aff2ac17 |
children | b2e24fec97bc |
comparison
equal
deleted
inserted
replaced
3098:6b21035859c9 | 3099:897d46457708 |
---|---|
14 You should have received a copy of the GNU General Public License | 14 You should have received a copy of the GNU General Public License |
15 along with this program; if not, write to the Free Software | 15 along with this program; if not, write to the Free Software |
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 */ | 17 */ |
18 | 18 |
19 /* | 19 #undef PAVGB |
20 C MMX MMX2 3DNow | 20 #undef PMINUB |
21 isVertDC Ec Ec | 21 #undef PMAXUB |
22 isVertMinMaxOk Ec Ec | |
23 doVertLowPass E e e | |
24 doVertDefFilter Ec Ec e e | |
25 isHorizDC Ec Ec | |
26 isHorizMinMaxOk a E | |
27 doHorizLowPass E e e | |
28 doHorizDefFilter Ec Ec e e | |
29 deRing E e e* | |
30 Vertical RKAlgo1 E a a | |
31 Horizontal RKAlgo1 a a | |
32 Vertical X1# a E E | |
33 Horizontal X1# a E E | |
34 LinIpolDeinterlace e E E* | |
35 CubicIpolDeinterlace a e e* | |
36 LinBlendDeinterlace e E E* | |
37 MedianDeinterlace# Ec Ec | |
38 TempDeNoiser# E e e | |
39 | |
40 * I don't have a 3DNow! CPU -> it's untested, but no one said it doesn't work, so it seems to work | |
41 # more or less self-invented filters, so the exactness isn't too meaningful | |
42 E = Exact implementation | |
43 e = almost exact implementation (slightly different rounding,...) | |
44 a = alternative / approximate impl | |
45 c = checked against the other implementations (-vo md5) | |
46 */ | |
47 | |
48 /* | |
49 TODO: | |
50 reduce the time wasted on the mem transfer | |
51 implement everything in C at least (done at the moment but ...) | |
52 unroll stuff if instructions depend too much on the prior one | |
53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | |
54 move YScale thing to the end instead of fixing QP | |
55 write a faster and higher quality deblocking filter :) | |
56 make the mainloop more flexible (variable number of blocks at once | |
57 (the if/else stuff per block is slowing things down) | |
58 compare the quality & speed of all filters | |
59 split this huge file | |
60 border remover | |
61 optimize c versions | |
62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks | |
63 smart blur | |
64 commandline option for the deblock / dering thresholds | |
65 ... | |
66 */ | |
67 | |
68 //Changelog: use the CVS log | |
69 | |
70 #include "../config.h" | |
71 #include <inttypes.h> | |
72 #include <stdio.h> | |
73 #include <stdlib.h> | |
74 #include <string.h> | |
75 #ifdef HAVE_MALLOC_H | |
76 #include <malloc.h> | |
77 #endif | |
78 //#undef HAVE_MMX2 | |
79 //#define HAVE_3DNOW | |
80 //#undef HAVE_MMX | |
81 //#define DEBUG_BRIGHTNESS | |
82 #include "../libvo/fastmemcpy.h" | |
83 #include "postprocess.h" | |
84 | |
85 #define MIN(a,b) ((a) > (b) ? (b) : (a)) | |
86 #define MAX(a,b) ((a) < (b) ? (b) : (a)) | |
87 #define ABS(a) ((a) > 0 ? (a) : (-(a))) | |
88 #define SIGN(a) ((a) > 0 ? 1 : -1) | |
89 | 22 |
90 #ifdef HAVE_MMX2 | 23 #ifdef HAVE_MMX2 |
91 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | 24 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
92 #elif defined (HAVE_3DNOW) | 25 #elif defined (HAVE_3DNOW) |
93 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | 26 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
109 "psubusb " #a ", " #b " \n\t"\ | 42 "psubusb " #a ", " #b " \n\t"\ |
110 "paddb " #a ", " #b " \n\t" | 43 "paddb " #a ", " #b " \n\t" |
111 #endif | 44 #endif |
112 | 45 |
113 | 46 |
114 #define GET_MODE_BUFFER_SIZE 500 | |
115 #define OPTIONS_ARRAY_SIZE 10 | |
116 | |
117 #ifdef HAVE_MMX | |
118 static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL; | |
119 static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL; | |
120 static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL; | |
121 static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL; | |
122 static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL; | |
123 static uint64_t __attribute__((aligned(8))) bm00000001= 0x00000000000000FFLL; | |
124 static uint64_t __attribute__((aligned(8))) bm00010000= 0x000000FF00000000LL; | |
125 static uint64_t __attribute__((aligned(8))) bm00001000= 0x00000000FF000000LL; | |
126 static uint64_t __attribute__((aligned(8))) bm10000000= 0xFF00000000000000LL; | |
127 static uint64_t __attribute__((aligned(8))) bm10000001= 0xFF000000000000FFLL; | |
128 static uint64_t __attribute__((aligned(8))) bm11000011= 0xFFFF00000000FFFFLL; | |
129 static uint64_t __attribute__((aligned(8))) bm00000011= 0x000000000000FFFFLL; | |
130 static uint64_t __attribute__((aligned(8))) bm11111110= 0xFFFFFFFFFFFFFF00LL; | |
131 static uint64_t __attribute__((aligned(8))) bm11000000= 0xFFFF000000000000LL; | |
132 static uint64_t __attribute__((aligned(8))) bm00011000= 0x000000FFFF000000LL; | |
133 static uint64_t __attribute__((aligned(8))) bm00110011= 0x0000FFFF0000FFFFLL; | |
134 static uint64_t __attribute__((aligned(8))) bm11001100= 0xFFFF0000FFFF0000LL; | |
135 static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL; | |
136 static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL; | |
137 static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL; | |
138 static uint64_t __attribute__((aligned(8))) b0F= 0x0F0F0F0F0F0F0F0FLL; | |
139 static uint64_t __attribute__((aligned(8))) b04= 0x0404040404040404LL; | |
140 static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL; | |
141 static uint64_t __attribute__((aligned(8))) bFF= 0xFFFFFFFFFFFFFFFFLL; | |
142 static uint64_t __attribute__((aligned(8))) b20= 0x2020202020202020LL; | |
143 static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL; | |
144 static uint64_t __attribute__((aligned(8))) b7E= 0x7E7E7E7E7E7E7E7ELL; | |
145 static uint64_t __attribute__((aligned(8))) b7C= 0x7C7C7C7C7C7C7C7CLL; | |
146 static uint64_t __attribute__((aligned(8))) b3F= 0x3F3F3F3F3F3F3F3FLL; | |
147 static uint64_t __attribute__((aligned(8))) temp0=0; | |
148 static uint64_t __attribute__((aligned(8))) temp1=0; | |
149 static uint64_t __attribute__((aligned(8))) temp2=0; | |
150 static uint64_t __attribute__((aligned(8))) temp3=0; | |
151 static uint64_t __attribute__((aligned(8))) temp4=0; | |
152 static uint64_t __attribute__((aligned(8))) temp5=0; | |
153 static uint64_t __attribute__((aligned(8))) pQPb=0; | |
154 static uint64_t __attribute__((aligned(8))) pQPb2=0; | |
155 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code | |
156 static uint32_t __attribute__((aligned(4))) maxTmpNoise[4]; | |
157 #else | |
158 static uint64_t packedYOffset= 0x0000000000000000LL; | |
159 static uint64_t packedYScale= 0x0100010001000100LL; | |
160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code | |
161 #endif | |
162 | |
163 int hFlatnessThreshold= 56 - 16; | |
164 int vFlatnessThreshold= 56 - 16; | |
165 int deringThreshold= 20; | |
166 | |
167 //amount of "black" you are willing to lose to get a brightness-corrected picture | |
168 double maxClippedThreshold= 0.01; | |
169 | |
170 int maxAllowedY=234; | |
171 int minAllowedY=16; | |
172 | |
173 static struct PPFilter filters[]= | |
174 { | |
175 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK}, | |
176 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK}, | |
177 {"vr", "rkvdeblock", 1, 2, 4, H_RK1_FILTER}, | |
178 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER}, | |
179 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER}, | |
180 {"dr", "dering", 1, 5, 6, DERING}, | |
181 {"al", "autolevels", 0, 1, 2, LEVEL_FIX}, | |
182 {"lb", "linblenddeint", 0, 1, 6, LINEAR_BLEND_DEINT_FILTER}, | |
183 {"li", "linipoldeint", 0, 1, 6, LINEAR_IPOL_DEINT_FILTER}, | |
184 {"ci", "cubicipoldeint", 0, 1, 6, CUBIC_IPOL_DEINT_FILTER}, | |
185 {"md", "mediandeint", 0, 1, 6, MEDIAN_DEINT_FILTER}, | |
186 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER}, | |
187 {NULL, NULL,0,0,0,0} //End Marker | |
188 }; | |
189 | |
190 static char *replaceTable[]= | |
191 { | |
192 "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", | |
193 "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", | |
194 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", | |
195 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", | |
196 NULL //End Marker | |
197 }; | |
198 | |
199 #ifdef HAVE_MMX | |
200 static inline void unusedVariableWarningFixer() | |
201 { | |
202 if( | |
203 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000 | |
204 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110 | |
205 + bm11000000 + bm00011000 + bm00110011 + bm11001100 + b00 + b01 + b02 + b0F | |
206 + bFF + b20 + b04+ b08 + pQPb2 + b80 + b7E + b7C + b3F + temp0 + temp1 + temp2 + temp3 + temp4 | |
207 + temp5 + pQPb== 0) b00=0; | |
208 } | |
209 #endif | |
210 | |
211 #ifdef TIMING | |
/**
 * Reads the x86 time stamp counter (used for TIMING builds only).
 *
 * The two 32 bit halves are fetched from eax / edx explicitly: the "=A"
 * constraint only denotes the edx:eax pair on 32 bit x86, on x86-64 it lets
 * the compiler pick a single 64 bit register and the upper half is lost.
 *
 * @return the 64 bit counter value
 */
static inline long long rdtsc(void)
{
	uint32_t lo;
	uint32_t hi;

	__asm__ __volatile__( "rdtsc\n\t"
		: "=a" (lo), "=d" (hi)
	);
//	printf("%d\n", int(l/1000));
	return (long long)(((uint64_t)hi << 32) | lo);
}
221 #endif | |
222 | |
223 #ifdef HAVE_MMX2 | |
/*
 * Thin wrappers around the four "prefetch*" instructions; each one issues the
 * instruction of the same name for the address p and does nothing else.
 */
static inline void prefetchnta(void *p)
{
	__asm__ __volatile__(
		"prefetchnta (%[addr])\n\t"
		:
		: [addr] "r" (p)
	);
}

static inline void prefetcht0(void *p)
{
	__asm__ __volatile__(
		"prefetcht0 (%[addr])\n\t"
		:
		: [addr] "r" (p)
	);
}

static inline void prefetcht1(void *p)
{
	__asm__ __volatile__(
		"prefetcht1 (%[addr])\n\t"
		:
		: [addr] "r" (p)
	);
}

static inline void prefetcht2(void *p)
{
	__asm__ __volatile__(
		"prefetcht2 (%[addr])\n\t"
		:
		: [addr] "r" (p)
	);
}
251 #endif | |
252 | |
253 //FIXME? |255-0| = 1 (shouldnt be a problem ...) | 47 //FIXME? |255-0| = 1 (shouldnt be a problem ...) |
254 /** | 48 /** |
255 * Check if the middle 8x8 Block in the given 8x16 block is flat | 49 * Check if the middle 8x8 Block in the given 8x16 block is flat |
256 */ | 50 */ |
257 static inline int isVertDC(uint8_t src[], int stride){ | 51 static inline int RENAME(isVertDC)(uint8_t src[], int stride){ |
258 int numEq= 0; | 52 int numEq= 0; |
259 #ifndef HAVE_MMX | 53 #ifndef HAVE_MMX |
260 int y; | 54 int y; |
261 #endif | 55 #endif |
262 src+= stride*4; // src points to begin of the 8x8 Block | 56 src+= stride*4; // src points to begin of the 8x8 Block |
361 */ | 155 */ |
362 // for(int i=0; i<numEq/8; i++) src[i]=255; | 156 // for(int i=0; i<numEq/8; i++) src[i]=255; |
363 return (numEq > vFlatnessThreshold) ? 1 : 0; | 157 return (numEq > vFlatnessThreshold) ? 1 : 0; |
364 } | 158 } |
365 | 159 |
366 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) | 160 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP) |
367 { | 161 { |
368 #ifdef HAVE_MMX | 162 #ifdef HAVE_MMX |
369 int isOk; | 163 int isOk; |
370 src+= stride*3; | 164 src+= stride*3; |
371 asm volatile( | 165 asm volatile( |
418 | 212 |
419 /** | 213 /** |
420 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) | 214 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
421 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 | 215 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
422 */ | 216 */ |
423 static inline void doVertLowPass(uint8_t *src, int stride, int QP) | 217 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) |
424 { | 218 { |
425 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 219 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
426 src+= stride*3; | 220 src+= stride*3; |
427 asm volatile( //"movv %0 %1 %2\n\t" | 221 asm volatile( //"movv %0 %1 %2\n\t" |
428 "movq pQPb, %%mm0 \n\t" // QP,..., QP | 222 "movq pQPb, %%mm0 \n\t" // QP,..., QP |
600 x = 8 | 394 x = 8 |
601 x/2 = 4 | 395 x/2 = 4 |
602 x/8 = 1 | 396 x/8 = 1 |
603 1 12 12 23 | 397 1 12 12 23 |
604 */ | 398 */ |
605 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) | 399 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
606 { | 400 { |
607 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 401 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
608 src+= stride*3; | 402 src+= stride*3; |
609 // FIXME rounding | 403 // FIXME rounding |
610 asm volatile( | 404 asm volatile( |
700 * will not damage linear gradients | 494 * will not damage linear gradients |
701 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | 495 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
702 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | 496 * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
703 * MMX2 version does correct clipping C version doesnt | 497 * MMX2 version does correct clipping C version doesnt |
704 */ | 498 */ |
705 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | 499 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) |
706 { | 500 { |
707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 501 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
708 src+= stride*3; | 502 src+= stride*3; |
709 | 503 |
710 asm volatile( | 504 asm volatile( |
856 } | 650 } |
857 */ | 651 */ |
858 #endif | 652 #endif |
859 } | 653 } |
860 | 654 |
861 /** | 655 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) |
862 * Experimental Filter 1 (Horizontal) | |
863 * will not damage linear gradients | |
864 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
865 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | |
866 * MMX2 version does correct clipping C version doesnt | |
867 * not identical with the vertical one | |
868 */ | |
869 static inline void horizX1Filter(uint8_t *src, int stride, int QP) | |
870 { | |
871 int y; | |
872 //FIXME (has little in common with the mmx2 version) | |
873 for(y=0; y<BLOCK_SIZE; y++) | |
874 { | |
875 int a= src[1] - src[2]; | |
876 int b= src[3] - src[4]; | |
877 int c= src[5] - src[6]; | |
878 | |
879 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | |
880 | |
881 if(d < QP) | |
882 { | |
883 int v = d * SIGN(-b); | |
884 | |
885 src[1] +=v/8; | |
886 src[2] +=v/4; | |
887 src[3] +=3*v/8; | |
888 src[4] -=3*v/8; | |
889 src[5] -=v/4; | |
890 src[6] -=v/8; | |
891 | |
892 } | |
893 src+=stride; | |
894 } | |
895 } | |
896 | |
897 | |
898 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | |
899 { | 656 { |
900 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 657 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
901 /* | 658 /* |
902 uint8_t tmp[16]; | 659 uint8_t tmp[16]; |
903 const int l1= stride; | 660 const int l1= stride; |
1472 src++; | 1229 src++; |
1473 } | 1230 } |
1474 #endif | 1231 #endif |
1475 } | 1232 } |
1476 | 1233 |
1477 /** | 1234 static inline void RENAME(dering)(uint8_t src[], int stride, int QP) |
1478 * Check if the given 8x8 Block is mostly "flat" | |
1479 */ | |
1480 static inline int isHorizDC(uint8_t src[], int stride) | |
1481 { | |
1482 int numEq= 0; | |
1483 int y; | |
1484 for(y=0; y<BLOCK_SIZE; y++) | |
1485 { | |
1486 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; | |
1487 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; | |
1488 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; | |
1489 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; | |
1490 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | |
1491 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | |
1492 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | |
1493 src+= stride; | |
1494 } | |
1495 return numEq > hFlatnessThreshold; | |
1496 } | |
1497 | |
/**
 * Compares the first and the last pixel of one 8 pixel row.
 *
 * @param src    the row, src[0] and src[7] are read
 * @param stride unused
 * @param QP     the allowed difference is 2*QP
 * @return 1 if |src[0] - src[7]| <= 2*QP, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	const int edgeDelta= abs(src[0] - src[7]);

	(void)stride;
	return edgeDelta <= 2*QP;
}
1504 | |
1505 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) | |
1506 { | |
1507 int y; | |
1508 for(y=0; y<BLOCK_SIZE; y++) | |
1509 { | |
1510 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); | |
1511 | |
1512 if(ABS(middleEnergy) < 8*QP) | |
1513 { | |
1514 const int q=(dst[3] - dst[4])/2; | |
1515 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); | |
1516 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); | |
1517 | |
1518 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
1519 d= MAX(d, 0); | |
1520 | |
1521 d= (5*d + 32) >> 6; | |
1522 d*= SIGN(-middleEnergy); | |
1523 | |
1524 if(q>0) | |
1525 { | |
1526 d= d<0 ? 0 : d; | |
1527 d= d>q ? q : d; | |
1528 } | |
1529 else | |
1530 { | |
1531 d= d>0 ? 0 : d; | |
1532 d= d<q ? q : d; | |
1533 } | |
1534 | |
1535 dst[3]-= d; | |
1536 dst[4]+= d; | |
1537 } | |
1538 dst+= stride; | |
1539 } | |
1540 } | |
1541 | |
1542 /** | |
1543 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) | |
1544 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | |
1545 */ | |
1546 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP) | |
1547 { | |
1548 | |
1549 int y; | |
1550 for(y=0; y<BLOCK_SIZE; y++) | |
1551 { | |
1552 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; | |
1553 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | |
1554 | |
1555 int sums[9]; | |
1556 sums[0] = first + dst[0]; | |
1557 sums[1] = dst[0] + dst[1]; | |
1558 sums[2] = dst[1] + dst[2]; | |
1559 sums[3] = dst[2] + dst[3]; | |
1560 sums[4] = dst[3] + dst[4]; | |
1561 sums[5] = dst[4] + dst[5]; | |
1562 sums[6] = dst[5] + dst[6]; | |
1563 sums[7] = dst[6] + dst[7]; | |
1564 sums[8] = dst[7] + last; | |
1565 | |
1566 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | |
1567 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; | |
1568 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; | |
1569 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; | |
1570 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; | |
1571 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; | |
1572 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4; | |
1573 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; | |
1574 | |
1575 dst+= stride; | |
1576 } | |
1577 } | |
1578 | |
1579 | |
1580 static inline void dering(uint8_t src[], int stride, int QP) | |
1581 { | 1235 { |
1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1236 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1583 asm volatile( | 1237 asm volatile( |
1584 "movq pQPb, %%mm0 \n\t" | 1238 "movq pQPb, %%mm0 \n\t" |
1585 "paddusb %%mm0, %%mm0 \n\t" | 1239 "paddusb %%mm0, %%mm0 \n\t" |
1590 // 0 1 2 3 4 5 6 7 8 9 | 1244 // 0 1 2 3 4 5 6 7 8 9 |
1591 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1245 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
1592 | 1246 |
1593 "pcmpeqb %%mm7, %%mm7 \n\t" | 1247 "pcmpeqb %%mm7, %%mm7 \n\t" |
1594 "pxor %%mm6, %%mm6 \n\t" | 1248 "pxor %%mm6, %%mm6 \n\t" |
1249 #undef FIND_MIN_MAX | |
1595 #ifdef HAVE_MMX2 | 1250 #ifdef HAVE_MMX2 |
1596 #define FIND_MIN_MAX(addr)\ | 1251 #define FIND_MIN_MAX(addr)\ |
1597 "movq " #addr ", %%mm0 \n\t"\ | 1252 "movq " #addr ", %%mm0 \n\t"\ |
1598 "pminub %%mm0, %%mm7 \n\t"\ | 1253 "pminub %%mm0, %%mm7 \n\t"\ |
1599 "pmaxub %%mm0, %%mm6 \n\t" | 1254 "pmaxub %%mm0, %%mm6 \n\t" |
1918 * Deinterlaces the given block | 1573 * Deinterlaces the given block |
1919 * will be called for every 8x8 block and can read & write from line 4-15 | 1574 * will be called for every 8x8 block and can read & write from line 4-15 |
1920 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | 1575 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
1921 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1576 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
1922 */ | 1577 */ |
1923 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) | 1578 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
1924 { | 1579 { |
1925 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1580 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1926 src+= 4*stride; | 1581 src+= 4*stride; |
1927 asm volatile( | 1582 asm volatile( |
1928 "leal (%0, %1), %%eax \n\t" | 1583 "leal (%0, %1), %%eax \n\t" |
1967 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | 1622 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
1968 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1623 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
1969 * this filter will read lines 3-15 and write 7-13 | 1624 * this filter will read lines 3-15 and write 7-13 |
1970 * no cliping in C version | 1625 * no cliping in C version |
1971 */ | 1626 */ |
1972 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) | 1627 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
1973 { | 1628 { |
1974 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1629 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1975 src+= stride*3; | 1630 src+= stride*3; |
1976 asm volatile( | 1631 asm volatile( |
1977 "leal (%0, %1), %%eax \n\t" | 1632 "leal (%0, %1), %%eax \n\t" |
2032 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | 1687 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
2033 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1688 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
2034 * will shift the image up by 1 line (FIXME if this is a problem) | 1689 * will shift the image up by 1 line (FIXME if this is a problem) |
2035 * this filter will read lines 4-13 and write 4-11 | 1690 * this filter will read lines 4-13 and write 4-11 |
2036 */ | 1691 */ |
2037 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) | 1692 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride) |
2038 { | 1693 { |
2039 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1694 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2040 src+= 4*stride; | 1695 src+= 4*stride; |
2041 asm volatile( | 1696 asm volatile( |
2042 "leal (%0, %1), %%eax \n\t" | 1697 "leal (%0, %1), %%eax \n\t" |
2105 * Deinterlaces the given block | 1760 * Deinterlaces the given block |
2106 * will be called for every 8x8 block and can read & write from line 4-15, | 1761 * will be called for every 8x8 block and can read & write from line 4-15, |
2107 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | 1762 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
2108 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1763 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
2109 */ | 1764 */ |
2110 static inline void deInterlaceMedian(uint8_t src[], int stride) | 1765 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
2111 { | 1766 { |
2112 #ifdef HAVE_MMX | 1767 #ifdef HAVE_MMX |
2113 src+= 4*stride; | 1768 src+= 4*stride; |
2114 #ifdef HAVE_MMX2 | 1769 #ifdef HAVE_MMX2 |
2115 asm volatile( | 1770 asm volatile( |
2222 | 1877 |
2223 #ifdef HAVE_MMX | 1878 #ifdef HAVE_MMX |
2224 /** | 1879 /** |
2225 * transposes and shift the given 8x8 Block into dst1 and dst2 | 1880 * transposes and shift the given 8x8 Block into dst1 and dst2 |
2226 */ | 1881 */ |
2227 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) | 1882 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
2228 { | 1883 { |
2229 asm( | 1884 asm( |
2230 "leal (%0, %1), %%eax \n\t" | 1885 "leal (%0, %1), %%eax \n\t" |
2231 "leal (%%eax, %1, 4), %%ebx \n\t" | 1886 "leal (%%eax, %1, 4), %%ebx \n\t" |
2232 // 0 1 2 3 4 5 6 7 8 9 | 1887 // 0 1 2 3 4 5 6 7 8 9 |
2306 } | 1961 } |
2307 | 1962 |
2308 /** | 1963 /** |
2309 * transposes the given 8x8 block | 1964 * transposes the given 8x8 block |
2310 */ | 1965 */ |
2311 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src) | 1966 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
2312 { | 1967 { |
2313 asm( | 1968 asm( |
2314 "leal (%0, %1), %%eax \n\t" | 1969 "leal (%0, %1), %%eax \n\t" |
2315 "leal (%%eax, %1, 4), %%ebx \n\t" | 1970 "leal (%%eax, %1, 4), %%ebx \n\t" |
2316 // 0 1 2 3 4 5 6 7 8 9 | 1971 // 0 1 2 3 4 5 6 7 8 9 |
2385 ); | 2040 ); |
2386 } | 2041 } |
2387 #endif | 2042 #endif |
2388 //static int test=0; | 2043 //static int test=0; |
2389 | 2044 |
2390 static void inline tempNoiseReducer(uint8_t *src, int stride, | 2045 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
2391 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) | 2046 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
2392 { | 2047 { |
2393 #define FAST_L2_DIFF | 2048 #define FAST_L2_DIFF |
2394 //#define L1_DIFF //u should change the thresholds too if u try that one | 2049 //#define L1_DIFF //u should change the thresholds too if u try that one |
2395 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2050 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2784 } | 2439 } |
2785 } | 2440 } |
2786 #endif | 2441 #endif |
2787 } | 2442 } |
2788 | 2443 |
2789 #ifdef HAVE_ODIVX_POSTPROCESS | 2444 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
2790 #include "../opendivx/postprocess.h" | |
2791 int use_old_pp=0; | |
2792 #endif | |
2793 | |
2794 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | |
2795 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); | 2445 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); |
2796 | |
2797 /* -pp Command line Help | |
2798 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)? | |
2799 | |
2800 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]... | |
2801 | |
2802 long form example: | |
2803 -pp vdeblock:autoq,hdeblock:autoq,linblenddeint -pp default,-vdeblock | |
2804 short form example: | |
2805 -pp vb:a,hb:a,lb -pp de,-vb | |
2806 more examples: | |
2807 -pp tn:64:128:256 | |
2808 | |
2809 Filters Options | |
2810 short long name short long option Description | |
2811 * * a autoq cpu power dependent enabler | |
2812 c chrom chrominance filtering enabled | |
2813 y nochrom chrominance filtering disabled | |
2814 hb hdeblock horizontal deblocking filter | |
2815 vb vdeblock vertical deblocking filter | |
2816 vr rkvdeblock | |
2817 h1 x1hdeblock Experimental horizontal deblock filter 1 | |
2818 v1 x1vdeblock Experimental vertical deblock filter 1 | |
2819 dr dering not implemented yet | |
2820 al autolevels automatic brightness / contrast fixer | |
2821 f fullyrange stretch luminance range to (0..255) | |
2822 lb linblenddeint linear blend deinterlacer | |
2823 li linipoldeint linear interpolating deinterlacer | |
2824 ci cubicipoldeint cubic interpolating deinterlacer | |
2825 md mediandeint median deinterlacer | |
2826 de default hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400 | |
2827 fa fast x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400 | |
2828 tn tmpnoise (3 Thresholds) Temporal Noise Reducer | |
2829 */ | |
2830 | |
2831 /** | |
2832 * returns a PPMode struct which will have a non 0 error variable if an error occured | |
2833 * name is the string after "-pp" on the command line | |
2834 * quality is a number from 0 to GET_PP_QUALITY_MAX | |
2835 */ | |
2836 struct PPMode getPPModeByNameAndQuality(char *name, int quality) | |
2837 { | |
2838 char temp[GET_MODE_BUFFER_SIZE]; | |
2839 char *p= temp; | |
2840 char *filterDelimiters= ","; | |
2841 char *optionDelimiters= ":"; | |
2842 struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}}; | |
2843 char *filterToken; | |
2844 | |
2845 strncpy(temp, name, GET_MODE_BUFFER_SIZE); | |
2846 | |
2847 printf("%s\n", name); | |
2848 | |
2849 for(;;){ | |
2850 char *filterName; | |
2851 int q= 1000000; //GET_PP_QUALITY_MAX; | |
2852 int chrom=-1; | |
2853 char *option; | |
2854 char *options[OPTIONS_ARRAY_SIZE]; | |
2855 int i; | |
2856 int filterNameOk=0; | |
2857 int numOfUnknownOptions=0; | |
2858 int enable=1; //does the user want us to enabled or disabled the filter | |
2859 | |
2860 filterToken= strtok(p, filterDelimiters); | |
2861 if(filterToken == NULL) break; | |
2862 p+= strlen(filterToken) + 1; // p points to next filterToken | |
2863 filterName= strtok(filterToken, optionDelimiters); | |
2864 printf("%s::%s\n", filterToken, filterName); | |
2865 | |
2866 if(*filterName == '-') | |
2867 { | |
2868 enable=0; | |
2869 filterName++; | |
2870 } | |
2871 | |
2872 for(;;){ //for all options | |
2873 option= strtok(NULL, optionDelimiters); | |
2874 if(option == NULL) break; | |
2875 | |
2876 printf("%s\n", option); | |
2877 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality; | |
2878 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0; | |
2879 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1; | |
2880 else | |
2881 { | |
2882 options[numOfUnknownOptions] = option; | |
2883 numOfUnknownOptions++; | |
2884 } | |
2885 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; | |
2886 } | |
2887 options[numOfUnknownOptions] = NULL; | |
2888 | |
2889 /* replace stuff from the replace Table */ | |
2890 for(i=0; replaceTable[2*i]!=NULL; i++) | |
2891 { | |
2892 if(!strcmp(replaceTable[2*i], filterName)) | |
2893 { | |
2894 int newlen= strlen(replaceTable[2*i + 1]); | |
2895 int plen; | |
2896 int spaceLeft; | |
2897 | |
2898 if(p==NULL) p= temp, *p=0; //last filter | |
2899 else p--, *p=','; //not last filter | |
2900 | |
2901 plen= strlen(p); | |
2902 spaceLeft= (int)p - (int)temp + plen; | |
2903 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE) | |
2904 { | |
2905 ppMode.error++; | |
2906 break; | |
2907 } | |
2908 memmove(p + newlen, p, plen+1); | |
2909 memcpy(p, replaceTable[2*i + 1], newlen); | |
2910 filterNameOk=1; | |
2911 } | |
2912 } | |
2913 | |
2914 for(i=0; filters[i].shortName!=NULL; i++) | |
2915 { | |
2916 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName); | |
2917 if( !strcmp(filters[i].longName, filterName) | |
2918 || !strcmp(filters[i].shortName, filterName)) | |
2919 { | |
2920 ppMode.lumMode &= ~filters[i].mask; | |
2921 ppMode.chromMode &= ~filters[i].mask; | |
2922 | |
2923 filterNameOk=1; | |
2924 if(!enable) break; // user wants to disable it | |
2925 | |
2926 if(q >= filters[i].minLumQuality) | |
2927 ppMode.lumMode|= filters[i].mask; | |
2928 if(chrom==1 || (chrom==-1 && filters[i].chromDefault)) | |
2929 if(q >= filters[i].minChromQuality) | |
2930 ppMode.chromMode|= filters[i].mask; | |
2931 | |
2932 if(filters[i].mask == LEVEL_FIX) | |
2933 { | |
2934 int o; | |
2935 ppMode.minAllowedY= 16; | |
2936 ppMode.maxAllowedY= 234; | |
2937 for(o=0; options[o]!=NULL; o++) | |
2938 if( !strcmp(options[o],"fullyrange") | |
2939 ||!strcmp(options[o],"f")) | |
2940 { | |
2941 ppMode.minAllowedY= 0; | |
2942 ppMode.maxAllowedY= 255; | |
2943 numOfUnknownOptions--; | |
2944 } | |
2945 } | |
2946 else if(filters[i].mask == TEMP_NOISE_FILTER) | |
2947 { | |
2948 int o; | |
2949 int numOfNoises=0; | |
2950 ppMode.maxTmpNoise[0]= 150; | |
2951 ppMode.maxTmpNoise[1]= 200; | |
2952 ppMode.maxTmpNoise[2]= 400; | |
2953 | |
2954 for(o=0; options[o]!=NULL; o++) | |
2955 { | |
2956 char *tail; | |
2957 ppMode.maxTmpNoise[numOfNoises]= | |
2958 strtol(options[o], &tail, 0); | |
2959 if(tail!=options[o]) | |
2960 { | |
2961 numOfNoises++; | |
2962 numOfUnknownOptions--; | |
2963 if(numOfNoises >= 3) break; | |
2964 } | |
2965 } | |
2966 } | |
2967 } | |
2968 } | |
2969 if(!filterNameOk) ppMode.error++; | |
2970 ppMode.error += numOfUnknownOptions; | |
2971 } | |
2972 | |
2973 #ifdef HAVE_ODIVX_POSTPROCESS | |
2974 if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H; | |
2975 if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V; | |
2976 if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H; | |
2977 if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V; | |
2978 if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y; | |
2979 if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C; | |
2980 #endif | |
2981 | |
2982 return ppMode; | |
2983 } | |
2984 | |
2985 /** | |
2986 * Obsolete, dont use it, use postprocess2() instead | |
2987 */ | |
2988 void postprocess(unsigned char * src[], int src_stride, | |
2989 unsigned char * dst[], int dst_stride, | |
2990 int horizontal_size, int vertical_size, | |
2991 QP_STORE_T *QP_store, int QP_stride, | |
2992 int mode) | |
2993 { | |
2994 struct PPMode ppMode; | |
2995 static QP_STORE_T zeroArray[2048/8]; | |
2996 /* | |
2997 static int qual=0; | |
2998 | |
2999 ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual); | |
3000 printf("OK\n"); | |
3001 qual++; | |
3002 qual%=7; | |
3003 printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error, | |
3004 qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]); | |
3005 postprocess2(src, src_stride, dst, dst_stride, | |
3006 horizontal_size, vertical_size, QP_store, QP_stride, &ppMode); | |
3007 | |
3008 return; | |
3009 */ | |
3010 if(QP_store==NULL) | |
3011 { | |
3012 QP_store= zeroArray; | |
3013 QP_stride= 0; | |
3014 } | |
3015 | |
3016 ppMode.lumMode= mode; | |
3017 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00); | |
3018 ppMode.chromMode= mode; | |
3019 ppMode.maxTmpNoise[0]= 700; | |
3020 ppMode.maxTmpNoise[1]= 1500; | |
3021 ppMode.maxTmpNoise[2]= 3000; | |
3022 | |
3023 #ifdef HAVE_ODIVX_POSTPROCESS | |
3024 // Note: I could make this shit outside of this file, but it would mean one | |
3025 // more function call... | |
3026 if(use_old_pp){ | |
3027 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode); | |
3028 return; | |
3029 } | |
3030 #endif | |
3031 | |
3032 postProcess(src[0], src_stride, dst[0], dst_stride, | |
3033 horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode); | |
3034 | |
3035 horizontal_size >>= 1; | |
3036 vertical_size >>= 1; | |
3037 src_stride >>= 1; | |
3038 dst_stride >>= 1; | |
3039 | |
3040 if(ppMode.chromMode) | |
3041 { | |
3042 postProcess(src[1], src_stride, dst[1], dst_stride, | |
3043 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode); | |
3044 postProcess(src[2], src_stride, dst[2], dst_stride, | |
3045 horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode); | |
3046 } | |
3047 else if(src_stride == dst_stride) | |
3048 { | |
3049 memcpy(dst[1], src[1], src_stride*vertical_size); | |
3050 memcpy(dst[2], src[2], src_stride*vertical_size); | |
3051 } | |
3052 else | |
3053 { | |
3054 int y; | |
3055 for(y=0; y<vertical_size; y++) | |
3056 { | |
3057 memcpy(&(dst[1][y*dst_stride]), &(src[1][y*src_stride]), horizontal_size); | |
3058 memcpy(&(dst[2][y*dst_stride]), &(src[2][y*src_stride]), horizontal_size); | |
3059 } | |
3060 } | |
3061 | |
3062 #if 0 | |
3063 memset(dst[1], 128, dst_stride*vertical_size); | |
3064 memset(dst[2], 128, dst_stride*vertical_size); | |
3065 #endif | |
3066 } | |
3067 | |
3068 void postprocess2(unsigned char * src[], int src_stride, | |
3069 unsigned char * dst[], int dst_stride, | |
3070 int horizontal_size, int vertical_size, | |
3071 QP_STORE_T *QP_store, int QP_stride, | |
3072 struct PPMode *mode) | |
3073 { | |
3074 | |
3075 static QP_STORE_T zeroArray[2048/8]; | |
3076 if(QP_store==NULL) | |
3077 { | |
3078 QP_store= zeroArray; | |
3079 QP_stride= 0; | |
3080 } | |
3081 | |
3082 #ifdef HAVE_ODIVX_POSTPROCESS | |
3083 // Note: I could make this shit outside of this file, but it would mean one | |
3084 // more function call... | |
3085 if(use_old_pp){ | |
3086 odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride, | |
3087 mode->oldMode); | |
3088 return; | |
3089 } | |
3090 #endif | |
3091 | |
3092 postProcess(src[0], src_stride, dst[0], dst_stride, | |
3093 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode); | |
3094 | |
3095 horizontal_size >>= 1; | |
3096 vertical_size >>= 1; | |
3097 src_stride >>= 1; | |
3098 dst_stride >>= 1; | |
3099 | |
3100 if(mode->chromMode) | |
3101 { | |
3102 postProcess(src[1], src_stride, dst[1], dst_stride, | |
3103 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); | |
3104 postProcess(src[2], src_stride, dst[2], dst_stride, | |
3105 horizontal_size, vertical_size, QP_store, QP_stride, 2, mode); | |
3106 } | |
3107 else if(src_stride == dst_stride) | |
3108 { | |
3109 memcpy(dst[1], src[1], src_stride*vertical_size); | |
3110 memcpy(dst[2], src[2], src_stride*vertical_size); | |
3111 } | |
3112 else | |
3113 { | |
3114 int y; | |
3115 for(y=0; y<vertical_size; y++) | |
3116 { | |
3117 memcpy(&(dst[1][y*dst_stride]), &(src[1][y*src_stride]), horizontal_size); | |
3118 memcpy(&(dst[2][y*dst_stride]), &(src[2][y*src_stride]), horizontal_size); | |
3119 } | |
3120 } | |
3121 } | |
3122 | |
3123 | |
3124 /** | |
3125 * gets the mode flags for a given quality (larger values mean slower but better postprocessing) | |
3126 * 0 <= quality <= 6 | |
3127 */ | |
3128 int getPpModeForQuality(int quality){ | |
3129 int modes[1+GET_PP_QUALITY_MAX]= { | |
3130 0, | |
3131 #if 1 | |
3132 // horizontal filters first | |
3133 LUM_H_DEBLOCK, | |
3134 LUM_H_DEBLOCK | LUM_V_DEBLOCK, | |
3135 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK, | |
3136 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK, | |
3137 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING, | |
3138 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING | |
3139 #else | |
3140 // vertical filters first | |
3141 LUM_V_DEBLOCK, | |
3142 LUM_V_DEBLOCK | LUM_H_DEBLOCK, | |
3143 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK, | |
3144 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK, | |
3145 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING, | |
3146 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING | |
3147 #endif | |
3148 }; | |
3149 | |
3150 #ifdef HAVE_ODIVX_POSTPROCESS | |
3151 int odivx_modes[1+GET_PP_QUALITY_MAX]= { | |
3152 0, | |
3153 PP_DEBLOCK_Y_H, | |
3154 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V, | |
3155 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H, | |
3156 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V, | |
3157 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y, | |
3158 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C | |
3159 }; | |
3160 if(use_old_pp) return odivx_modes[quality]; | |
3161 #endif | |
3162 return modes[quality]; | |
3163 } | |
3164 | 2446 |
3165 /** | 2447 /** |
3166 * Copies a block from src to dst and fixes the blacklevel | 2448 * Copies a block from src to dst and fixes the blacklevel |
3167 * numLines must be a multiple of 4 | 2449 * numLines must be a multiple of 4 |
3168 * levelFix == 0 -> don't touch the brightness & contrast | 2450 * levelFix == 0 -> don't touch the brightness & contrast |
3169 */ | 2451 */ |
3170 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, | 2452 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
3171 int levelFix) | 2453 int levelFix) |
3172 { | 2454 { |
3173 #ifndef HAVE_MMX | 2455 #ifndef HAVE_MMX |
3174 int i; | 2456 int i; |
3175 #endif | 2457 #endif |
3265 | 2547 |
3266 | 2548 |
3267 /** | 2549 /** |
3268 * Filters array of bytes (Y or U or V values) | 2550 * Filters array of bytes (Y or U or V values) |
3269 */ | 2551 */ |
3270 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | 2552 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
3271 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode) | 2553 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode) |
3272 { | 2554 { |
3273 int x,y; | 2555 int x,y; |
3274 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode; | 2556 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode; |
3275 | 2557 |
3461 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 2743 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
3462 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 2744 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
3463 */ | 2745 */ |
3464 #endif | 2746 #endif |
3465 | 2747 |
3466 blockCopy(dstBlock + dstStride*copyAhead, dstStride, | 2748 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
3467 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); | 2749 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); |
3468 | 2750 |
3469 if(mode & LINEAR_IPOL_DEINT_FILTER) | 2751 if(mode & LINEAR_IPOL_DEINT_FILTER) |
3470 deInterlaceInterpolateLinear(dstBlock, dstStride); | 2752 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
3471 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 2753 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
3472 deInterlaceBlendLinear(dstBlock, dstStride); | 2754 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
3473 else if(mode & MEDIAN_DEINT_FILTER) | 2755 else if(mode & MEDIAN_DEINT_FILTER) |
3474 deInterlaceMedian(dstBlock, dstStride); | 2756 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
3475 else if(mode & CUBIC_IPOL_DEINT_FILTER) | 2757 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
3476 deInterlaceInterpolateCubic(dstBlock, dstStride); | 2758 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
3477 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | 2759 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
3478 deInterlaceBlendCubic(dstBlock, dstStride); | 2760 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
3479 */ | 2761 */ |
3480 dstBlock+=8; | 2762 dstBlock+=8; |
3481 srcBlock+=8; | 2763 srcBlock+=8; |
3482 } | 2764 } |
3483 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride ); | 2765 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride ); |
3486 for(y=0; y<height; y+=BLOCK_SIZE) | 2768 for(y=0; y<height; y+=BLOCK_SIZE) |
3487 { | 2769 { |
3488 //1% speedup if these are here instead of the inner loop | 2770 //1% speedup if these are here instead of the inner loop |
3489 uint8_t *srcBlock= &(src[y*srcStride]); | 2771 uint8_t *srcBlock= &(src[y*srcStride]); |
3490 uint8_t *dstBlock= &(dst[y*dstStride]); | 2772 uint8_t *dstBlock= &(dst[y*dstStride]); |
2773 #ifdef HAVE_MMX | |
2774 uint8_t *tempBlock1= tempBlocks; | |
2775 uint8_t *tempBlock2= tempBlocks + 8; | |
2776 #endif | |
3491 #ifdef ARCH_X86 | 2777 #ifdef ARCH_X86 |
3492 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | 2778 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; |
3493 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); | 2779 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); |
3494 int QPFrac= QPDelta; | 2780 int QPFrac= QPDelta; |
3495 uint8_t *tempBlock1= tempBlocks; | |
3496 uint8_t *tempBlock2= tempBlocks + 8; | |
3497 #endif | 2781 #endif |
3498 int QP=0; | 2782 int QP=0; |
3499 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards | 2783 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
3500 if not than use a temporary buffer */ | 2784 if not than use a temporary buffer */ |
3501 if(y+15 >= height) | 2785 if(y+15 >= height) |
3525 // finish 1 block before the next otherwise we might have a problem | 2809 // finish 1 block before the next otherwise we might have a problem |
3526 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | 2810 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
3527 for(x=0; x<width; x+=BLOCK_SIZE) | 2811 for(x=0; x<width; x+=BLOCK_SIZE) |
3528 { | 2812 { |
3529 const int stride= dstStride; | 2813 const int stride= dstStride; |
2814 #ifdef HAVE_MMX | |
3530 uint8_t *tmpXchg; | 2815 uint8_t *tmpXchg; |
2816 #endif | |
3531 #ifdef ARCH_X86 | 2817 #ifdef ARCH_X86 |
3532 QP= *QPptr; | 2818 QP= *QPptr; |
3533 asm volatile( | 2819 asm volatile( |
3534 "addl %2, %1 \n\t" | 2820 "addl %2, %1 \n\t" |
3535 "sbbl %%eax, %%eax \n\t" | 2821 "sbbl %%eax, %%eax \n\t" |
3617 dstBlock= tempDstBlock; | 2903 dstBlock= tempDstBlock; |
3618 srcBlock= tempSrcBlock; | 2904 srcBlock= tempSrcBlock; |
3619 } | 2905 } |
3620 #endif | 2906 #endif |
3621 | 2907 |
3622 blockCopy(dstBlock + dstStride*copyAhead, dstStride, | 2908 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
3623 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); | 2909 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); |
3624 | 2910 |
3625 if(mode & LINEAR_IPOL_DEINT_FILTER) | 2911 if(mode & LINEAR_IPOL_DEINT_FILTER) |
3626 deInterlaceInterpolateLinear(dstBlock, dstStride); | 2912 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
3627 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 2913 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
3628 deInterlaceBlendLinear(dstBlock, dstStride); | 2914 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
3629 else if(mode & MEDIAN_DEINT_FILTER) | 2915 else if(mode & MEDIAN_DEINT_FILTER) |
3630 deInterlaceMedian(dstBlock, dstStride); | 2916 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
3631 else if(mode & CUBIC_IPOL_DEINT_FILTER) | 2917 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
3632 deInterlaceInterpolateCubic(dstBlock, dstStride); | 2918 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
3633 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | 2919 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
3634 deInterlaceBlendCubic(dstBlock, dstStride); | 2920 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
3635 */ | 2921 */ |
3636 | 2922 |
3637 /* only deblock if we have 2 blocks */ | 2923 /* only deblock if we have 2 blocks */ |
3638 if(y + 8 < height) | 2924 if(y + 8 < height) |
3639 { | 2925 { |
3641 T1= rdtsc(); | 2927 T1= rdtsc(); |
3642 memcpyTime+= T1-T0; | 2928 memcpyTime+= T1-T0; |
3643 T0=T1; | 2929 T0=T1; |
3644 #endif | 2930 #endif |
3645 if(mode & V_RK1_FILTER) | 2931 if(mode & V_RK1_FILTER) |
3646 vertRK1Filter(dstBlock, stride, QP); | 2932 RENAME(vertRK1Filter)(dstBlock, stride, QP); |
3647 else if(mode & V_X1_FILTER) | 2933 else if(mode & V_X1_FILTER) |
3648 vertX1Filter(dstBlock, stride, QP); | 2934 RENAME(vertX1Filter)(dstBlock, stride, QP); |
3649 else if(mode & V_DEBLOCK) | 2935 else if(mode & V_DEBLOCK) |
3650 { | 2936 { |
3651 if( isVertDC(dstBlock, stride)) | 2937 if( RENAME(isVertDC)(dstBlock, stride)) |
3652 { | 2938 { |
3653 if(isVertMinMaxOk(dstBlock, stride, QP)) | 2939 if(RENAME(isVertMinMaxOk)(dstBlock, stride, QP)) |
3654 doVertLowPass(dstBlock, stride, QP); | 2940 RENAME(doVertLowPass)(dstBlock, stride, QP); |
3655 } | 2941 } |
3656 else | 2942 else |
3657 doVertDefFilter(dstBlock, stride, QP); | 2943 RENAME(doVertDefFilter)(dstBlock, stride, QP); |
3658 } | 2944 } |
3659 #ifdef MORE_TIMING | 2945 #ifdef MORE_TIMING |
3660 T1= rdtsc(); | 2946 T1= rdtsc(); |
3661 vertTime+= T1-T0; | 2947 vertTime+= T1-T0; |
3662 T0=T1; | 2948 T0=T1; |
3663 #endif | 2949 #endif |
3664 } | 2950 } |
3665 | 2951 |
3666 #ifdef HAVE_MMX | 2952 #ifdef HAVE_MMX |
3667 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride); | 2953 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
3668 #endif | 2954 #endif |
3669 /* check if we have a previous block to deblock it with dstBlock */ | 2955 /* check if we have a previous block to deblock it with dstBlock */ |
3670 if(x - 8 >= 0) | 2956 if(x - 8 >= 0) |
3671 { | 2957 { |
3672 #ifdef MORE_TIMING | 2958 #ifdef MORE_TIMING |
3673 T0= rdtsc(); | 2959 T0= rdtsc(); |
3674 #endif | 2960 #endif |
3675 #ifdef HAVE_MMX | 2961 #ifdef HAVE_MMX |
3676 if(mode & H_RK1_FILTER) | 2962 if(mode & H_RK1_FILTER) |
3677 vertRK1Filter(tempBlock1, 16, QP); | 2963 RENAME(vertRK1Filter)(tempBlock1, 16, QP); |
3678 else if(mode & H_X1_FILTER) | 2964 else if(mode & H_X1_FILTER) |
3679 vertX1Filter(tempBlock1, 16, QP); | 2965 RENAME(vertX1Filter)(tempBlock1, 16, QP); |
3680 else if(mode & H_DEBLOCK) | 2966 else if(mode & H_DEBLOCK) |
3681 { | 2967 { |
3682 if( isVertDC(tempBlock1, 16) ) | 2968 if( RENAME(isVertDC)(tempBlock1, 16) ) |
3683 { | 2969 { |
3684 if(isVertMinMaxOk(tempBlock1, 16, QP)) | 2970 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, QP)) |
3685 doVertLowPass(tempBlock1, 16, QP); | 2971 RENAME(doVertLowPass)(tempBlock1, 16, QP); |
3686 } | 2972 } |
3687 else | 2973 else |
3688 doVertDefFilter(tempBlock1, 16, QP); | 2974 RENAME(doVertDefFilter)(tempBlock1, 16, QP); |
3689 } | 2975 } |
3690 | 2976 |
3691 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16); | 2977 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
3692 | 2978 |
3693 #else | 2979 #else |
3694 if(mode & H_X1_FILTER) | 2980 if(mode & H_X1_FILTER) |
3695 horizX1Filter(dstBlock-4, stride, QP); | 2981 horizX1Filter(dstBlock-4, stride, QP); |
3696 else if(mode & H_DEBLOCK) | 2982 else if(mode & H_DEBLOCK) |
3710 T0=T1; | 2996 T0=T1; |
3711 #endif | 2997 #endif |
3712 if(mode & DERING) | 2998 if(mode & DERING) |
3713 { | 2999 { |
3714 //FIXME filter first line | 3000 //FIXME filter first line |
3715 if(y>0) dering(dstBlock - stride - 8, stride, QP); | 3001 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, QP); |
3716 } | 3002 } |
3717 | 3003 |
3718 if(mode & TEMP_NOISE_FILTER) | 3004 if(mode & TEMP_NOISE_FILTER) |
3719 { | 3005 { |
3720 tempNoiseReducer(dstBlock-8, stride, | 3006 RENAME(tempNoiseReducer)(dstBlock-8, stride, |
3721 tempBlured[isColor] + y*dstStride + x, | 3007 tempBlured[isColor] + y*dstStride + x, |
3722 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | 3008 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
3723 ppMode->maxTmpNoise); | 3009 ppMode->maxTmpNoise); |
3724 } | 3010 } |
3725 } | 3011 } |
3749 #endif | 3035 #endif |
3750 } | 3036 } |
3751 | 3037 |
3752 if(mode & DERING) | 3038 if(mode & DERING) |
3753 { | 3039 { |
3754 if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP); | 3040 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, QP); |
3755 } | 3041 } |
3756 | 3042 |
3757 if((mode & TEMP_NOISE_FILTER)) | 3043 if((mode & TEMP_NOISE_FILTER)) |
3758 { | 3044 { |
3759 tempNoiseReducer(dstBlock-8, dstStride, | 3045 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
3760 tempBlured[isColor] + y*dstStride + x, | 3046 tempBlured[isColor] + y*dstStride + x, |
3761 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | 3047 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
3762 ppMode->maxTmpNoise); | 3048 ppMode->maxTmpNoise); |
3763 } | 3049 } |
3764 | 3050 |