Mercurial > mplayer.hg
comparison postproc/postprocess_template.c @ 7946:f483ab704252
postprocessing cleanup:
remove opendivx #ifdefs
remove rk1 filter
remove unused / obsolete stuff
add -1,4,2,4,-1 deinterlacing filter (ffmpeg uses that)
threadsafe / no more non-const globals
some optimizations
different strides for Y,U,V possible
remove ebx usage (someone really should fix gcc, this is really lame)
change the dering filter slightly (tell me if it's worse for any files)
author | michael |
---|---|
date | Mon, 28 Oct 2002 19:31:04 +0000 |
parents | e3ecccc7e505 |
children | 5a6cbe774760 |
comparison
equal
deleted
inserted
replaced
7945:32939f2b3d2e | 7946:f483ab704252 |
---|---|
43 "paddb " #a ", " #b " \n\t" | 43 "paddb " #a ", " #b " \n\t" |
44 #endif | 44 #endif |
45 | 45 |
46 | 46 |
47 //FIXME? |255-0| = 1 (shouldnt be a problem ...) | 47 //FIXME? |255-0| = 1 (shouldnt be a problem ...) |
48 #ifdef HAVE_MMX | |
48 /** | 49 /** |
49 * Check if the middle 8x8 Block in the given 8x16 block is flat | 50 * Check if the middle 8x8 Block in the given 8x16 block is flat |
50 */ | 51 */ |
51 static inline int RENAME(isVertDC)(uint8_t src[], int stride){ | 52 static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){ |
52 int numEq= 0; | 53 int numEq= 0; |
53 #ifndef HAVE_MMX | |
54 int y; | |
55 #endif | |
56 src+= stride*4; // src points to begin of the 8x8 Block | 54 src+= stride*4; // src points to begin of the 8x8 Block |
57 #ifdef HAVE_MMX | |
58 asm volatile( | 55 asm volatile( |
59 "leal (%1, %2), %%eax \n\t" | 56 "leal (%1, %2), %%eax \n\t" |
60 "leal (%%eax, %2, 4), %%ebx \n\t" | |
61 // 0 1 2 3 4 5 6 7 8 9 | 57 // 0 1 2 3 4 5 6 7 8 9 |
62 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 | 58 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 |
63 "movq "MANGLE(mmxDCOffset)", %%mm7 \n\t" // mm7 = 0x7F | 59 "movq %3, %%mm7 \n\t" // mm7 = 0x7F |
64 "movq "MANGLE(mmxDCThreshold)", %%mm6 \n\t" // mm6 = 0x7D | 60 "movq %4, %%mm6 \n\t" // mm6 = 0x7D |
65 "movq (%1), %%mm0 \n\t" | 61 "movq (%1), %%mm0 \n\t" |
66 "movq (%%eax), %%mm1 \n\t" | 62 "movq (%%eax), %%mm1 \n\t" |
67 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece | 63 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece |
68 "paddb %%mm7, %%mm0 \n\t" | 64 "paddb %%mm7, %%mm0 \n\t" |
69 "pcmpgtb %%mm6, %%mm0 \n\t" | 65 "pcmpgtb %%mm6, %%mm0 \n\t" |
77 "movq (%%eax, %2, 2), %%mm1 \n\t" | 73 "movq (%%eax, %2, 2), %%mm1 \n\t" |
78 "psubb %%mm1, %%mm2 \n\t" | 74 "psubb %%mm1, %%mm2 \n\t" |
79 "paddb %%mm7, %%mm2 \n\t" | 75 "paddb %%mm7, %%mm2 \n\t" |
80 "pcmpgtb %%mm6, %%mm2 \n\t" | 76 "pcmpgtb %%mm6, %%mm2 \n\t" |
81 "paddb %%mm2, %%mm0 \n\t" | 77 "paddb %%mm2, %%mm0 \n\t" |
78 | |
79 "leal (%%eax, %2, 4), %%eax \n\t" | |
82 | 80 |
83 "movq (%1, %2, 4), %%mm2 \n\t" | 81 "movq (%1, %2, 4), %%mm2 \n\t" |
84 "psubb %%mm2, %%mm1 \n\t" | 82 "psubb %%mm2, %%mm1 \n\t" |
85 "paddb %%mm7, %%mm1 \n\t" | 83 "paddb %%mm7, %%mm1 \n\t" |
86 "pcmpgtb %%mm6, %%mm1 \n\t" | 84 "pcmpgtb %%mm6, %%mm1 \n\t" |
87 "paddb %%mm1, %%mm0 \n\t" | 85 "paddb %%mm1, %%mm0 \n\t" |
88 | 86 |
89 "movq (%%ebx), %%mm1 \n\t" | 87 "movq (%%eax), %%mm1 \n\t" |
90 "psubb %%mm1, %%mm2 \n\t" | 88 "psubb %%mm1, %%mm2 \n\t" |
91 "paddb %%mm7, %%mm2 \n\t" | 89 "paddb %%mm7, %%mm2 \n\t" |
92 "pcmpgtb %%mm6, %%mm2 \n\t" | 90 "pcmpgtb %%mm6, %%mm2 \n\t" |
93 "paddb %%mm2, %%mm0 \n\t" | 91 "paddb %%mm2, %%mm0 \n\t" |
94 | 92 |
95 "movq (%%ebx, %2), %%mm2 \n\t" | 93 "movq (%%eax, %2), %%mm2 \n\t" |
96 "psubb %%mm2, %%mm1 \n\t" | 94 "psubb %%mm2, %%mm1 \n\t" |
97 "paddb %%mm7, %%mm1 \n\t" | 95 "paddb %%mm7, %%mm1 \n\t" |
98 "pcmpgtb %%mm6, %%mm1 \n\t" | 96 "pcmpgtb %%mm6, %%mm1 \n\t" |
99 "paddb %%mm1, %%mm0 \n\t" | 97 "paddb %%mm1, %%mm0 \n\t" |
100 | 98 |
101 "movq (%%ebx, %2, 2), %%mm1 \n\t" | 99 "movq (%%eax, %2, 2), %%mm1 \n\t" |
102 "psubb %%mm1, %%mm2 \n\t" | 100 "psubb %%mm1, %%mm2 \n\t" |
103 "paddb %%mm7, %%mm2 \n\t" | 101 "paddb %%mm7, %%mm2 \n\t" |
104 "pcmpgtb %%mm6, %%mm2 \n\t" | 102 "pcmpgtb %%mm6, %%mm2 \n\t" |
105 "paddb %%mm2, %%mm0 \n\t" | 103 "paddb %%mm2, %%mm0 \n\t" |
106 | 104 |
119 "psrlq $32, %%mm0 \n\t" | 117 "psrlq $32, %%mm0 \n\t" |
120 "paddb %%mm1, %%mm0 \n\t" | 118 "paddb %%mm1, %%mm0 \n\t" |
121 #endif | 119 #endif |
122 "movd %%mm0, %0 \n\t" | 120 "movd %%mm0, %0 \n\t" |
123 : "=r" (numEq) | 121 : "=r" (numEq) |
124 : "r" (src), "r" (stride) | 122 : "r" (src), "r" (stride), "m" (c->mmxDcOffset), "m" (c->mmxDcThreshold) |
125 : "%eax", "%ebx" | 123 : "%eax" |
126 ); | 124 ); |
127 numEq= (-numEq) &0xFF; | 125 numEq= (-numEq) &0xFF; |
128 | 126 return numEq > c->ppMode.flatnessThreshold; |
129 #else | |
130 for(y=0; y<BLOCK_SIZE-1; y++) | |
131 { | |
132 if(((src[0] - src[0+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
133 if(((src[1] - src[1+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
134 if(((src[2] - src[2+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
135 if(((src[3] - src[3+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
136 if(((src[4] - src[4+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
137 if(((src[5] - src[5+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
138 if(((src[6] - src[6+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
139 if(((src[7] - src[7+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
140 src+= stride; | |
141 } | |
142 #endif | |
143 /* if(abs(numEq - asmEq) > 0) | |
144 { | |
145 printf("\nasm:%d c:%d\n", asmEq, numEq); | |
146 for(int y=0; y<8; y++) | |
147 { | |
148 for(int x=0; x<8; x++) | |
149 { | |
150 printf("%d ", temp[x + y*stride]); | |
151 } | |
152 printf("\n"); | |
153 } | |
154 } | |
155 */ | |
156 // for(int i=0; i<numEq/8; i++) src[i]=255; | |
157 return (numEq > vFlatnessThreshold) ? 1 : 0; | |
158 } | 127 } |
159 | 128 #endif |
160 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP) | 129 |
130 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c) | |
161 { | 131 { |
162 #ifdef HAVE_MMX | 132 #ifdef HAVE_MMX |
163 int isOk; | 133 int isOk; |
164 src+= stride*3; | 134 src+= stride*3; |
165 asm volatile( | 135 asm volatile( |
166 // "int $3 \n\t" | |
167 "movq (%1, %2), %%mm0 \n\t" | 136 "movq (%1, %2), %%mm0 \n\t" |
168 "movq (%1, %2, 8), %%mm1 \n\t" | 137 "movq (%1, %2, 8), %%mm1 \n\t" |
169 "movq %%mm0, %%mm2 \n\t" | 138 "movq %%mm0, %%mm2 \n\t" |
170 "psubusb %%mm1, %%mm0 \n\t" | 139 "psubusb %%mm1, %%mm0 \n\t" |
171 "psubusb %%mm2, %%mm1 \n\t" | 140 "psubusb %%mm2, %%mm1 \n\t" |
172 "por %%mm1, %%mm0 \n\t" // ABS Diff | 141 "por %%mm1, %%mm0 \n\t" // ABS Diff |
173 | 142 |
174 "movq "MANGLE(pQPb)", %%mm7 \n\t" // QP,..., QP | 143 "movq %3, %%mm7 \n\t" // QP,..., QP |
175 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | 144 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
176 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 | 145 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 |
177 "pcmpeqd "MANGLE(b00)", %%mm0 \n\t" | 146 "packssdw %%mm0, %%mm0 \n\t" |
178 "psrlq $16, %%mm0 \n\t" | |
179 "pcmpeqd "MANGLE(bFF)", %%mm0 \n\t" | |
180 // "movd %%mm0, (%1, %2, 4)\n\t" | |
181 "movd %%mm0, %0 \n\t" | 147 "movd %%mm0, %0 \n\t" |
182 : "=r" (isOk) | 148 : "=r" (isOk) |
183 : "r" (src), "r" (stride) | 149 : "r" (src), "r" (stride), "m" (c->pQPb) |
184 ); | 150 ); |
185 return isOk; | 151 return isOk==0; |
186 #else | 152 #else |
187 | |
188 int isOk2= 1; | |
189 int x; | 153 int x; |
154 const int QP= c->QP; | |
190 src+= stride*3; | 155 src+= stride*3; |
191 for(x=0; x<BLOCK_SIZE; x++) | 156 for(x=0; x<BLOCK_SIZE; x++) |
192 { | 157 { |
193 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; | 158 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; |
194 } | 159 } |
195 /* if(isOk && !isOk2 || !isOk && isOk2) | 160 |
196 { | 161 return 1; |
197 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); | 162 #endif |
198 for(int y=0; y<9; y++) | |
199 { | |
200 for(int x=0; x<8; x++) | |
201 { | |
202 printf("%d ", src[x + y*stride]); | |
203 } | |
204 printf("\n"); | |
205 } | |
206 } */ | |
207 | |
208 return isOk2; | |
209 #endif | |
210 | |
211 } | 163 } |
212 | 164 |
213 /** | 165 /** |
214 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) | 166 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
215 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 | 167 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
216 */ | 168 */ |
217 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) | 169 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
218 { | 170 { |
219 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 171 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
220 src+= stride*3; | 172 src+= stride*3; |
221 asm volatile( //"movv %0 %1 %2\n\t" | 173 asm volatile( //"movv %0 %1 %2\n\t" |
222 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP | 174 "movq %2, %%mm0 \n\t" // QP,..., QP |
175 "pxor %%mm4, %%mm4 \n\t" | |
223 | 176 |
224 "movq (%0), %%mm6 \n\t" | 177 "movq (%0), %%mm6 \n\t" |
225 "movq (%0, %1), %%mm5 \n\t" | 178 "movq (%0, %1), %%mm5 \n\t" |
226 "movq %%mm5, %%mm1 \n\t" | 179 "movq %%mm5, %%mm1 \n\t" |
227 "movq %%mm6, %%mm2 \n\t" | 180 "movq %%mm6, %%mm2 \n\t" |
228 "psubusb %%mm6, %%mm5 \n\t" | 181 "psubusb %%mm6, %%mm5 \n\t" |
229 "psubusb %%mm1, %%mm2 \n\t" | 182 "psubusb %%mm1, %%mm2 \n\t" |
230 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | 183 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
231 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | 184 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
232 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF | 185 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
233 | 186 |
234 "pand %%mm2, %%mm6 \n\t" | 187 "pand %%mm2, %%mm6 \n\t" |
235 "pandn %%mm1, %%mm2 \n\t" | 188 "pandn %%mm1, %%mm2 \n\t" |
236 "por %%mm2, %%mm6 \n\t"// First Line to Filter | 189 "por %%mm2, %%mm6 \n\t"// First Line to Filter |
237 | 190 |
238 "movq (%0, %1, 8), %%mm5 \n\t" | 191 "movq (%0, %1, 8), %%mm5 \n\t" |
239 "leal (%0, %1, 4), %%eax \n\t" | 192 "leal (%0, %1, 4), %%eax \n\t" |
240 "leal (%0, %1, 8), %%ebx \n\t" | 193 "leal (%0, %1, 8), %%ecx \n\t" |
241 "subl %1, %%ebx \n\t" | 194 "subl %1, %%ecx \n\t" |
242 "addl %1, %0 \n\t" // %0 points to line 1 not 0 | 195 "addl %1, %0 \n\t" // %0 points to line 1 not 0 |
243 "movq (%0, %1, 8), %%mm7 \n\t" | 196 "movq (%0, %1, 8), %%mm7 \n\t" |
244 "movq %%mm5, %%mm1 \n\t" | 197 "movq %%mm5, %%mm1 \n\t" |
245 "movq %%mm7, %%mm2 \n\t" | 198 "movq %%mm7, %%mm2 \n\t" |
246 "psubusb %%mm7, %%mm5 \n\t" | 199 "psubusb %%mm7, %%mm5 \n\t" |
247 "psubusb %%mm1, %%mm2 \n\t" | 200 "psubusb %%mm1, %%mm2 \n\t" |
248 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | 201 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
249 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | 202 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
250 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF | 203 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
251 | 204 |
252 "pand %%mm2, %%mm7 \n\t" | 205 "pand %%mm2, %%mm7 \n\t" |
253 "pandn %%mm1, %%mm2 \n\t" | 206 "pandn %%mm1, %%mm2 \n\t" |
254 "por %%mm2, %%mm7 \n\t" // First Line to Filter | 207 "por %%mm2, %%mm7 \n\t" // First Line to Filter |
255 | 208 |
256 | 209 |
257 // 1 2 3 4 5 6 7 8 | 210 // 1 2 3 4 5 6 7 8 |
258 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 | 211 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 |
259 // 6 4 2 2 1 1 | 212 // 6 4 2 2 1 1 |
260 // 6 4 4 2 | 213 // 6 4 4 2 |
261 // 6 8 2 | 214 // 6 8 2 |
262 | 215 |
263 "movq (%0, %1), %%mm0 \n\t" // 1 | 216 "movq (%0, %1), %%mm0 \n\t" // 1 |
284 PAVGB(%%mm5, %%mm3) // 2 2211 /8 | 237 PAVGB(%%mm5, %%mm3) // 2 2211 /8 |
285 PAVGB(%%mm0, %%mm3) //4242211 /16 | 238 PAVGB(%%mm0, %%mm3) //4242211 /16 |
286 "movq %%mm3, (%0,%1) \n\t" // X | 239 "movq %%mm3, (%0,%1) \n\t" // X |
287 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 | 240 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
288 PAVGB(%%mm4, %%mm6) //11 /2 | 241 PAVGB(%%mm4, %%mm6) //11 /2 |
289 "movq (%%ebx), %%mm0 \n\t" // 1 | 242 "movq (%%ecx), %%mm0 \n\t" // 1 |
290 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 | 243 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
291 "movq %%mm0, %%mm3 \n\t" // 11/2 | 244 "movq %%mm0, %%mm3 \n\t" // 11/2 |
292 PAVGB(%%mm1, %%mm0) // 2 11/4 | 245 PAVGB(%%mm1, %%mm0) // 2 11/4 |
293 PAVGB(%%mm6, %%mm0) //222 11/8 | 246 PAVGB(%%mm6, %%mm0) //222 11/8 |
294 PAVGB(%%mm2, %%mm0) //22242211/16 | 247 PAVGB(%%mm2, %%mm0) //22242211/16 |
295 "movq (%0, %1, 2), %%mm2 \n\t" // 1 | 248 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
296 "movq %%mm0, (%0, %1, 2) \n\t" // X | 249 "movq %%mm0, (%0, %1, 2) \n\t" // X |
297 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 | 250 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
298 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | 251 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
299 PAVGB((%%ebx), %%mm0) // 11 /2 | 252 PAVGB((%%ecx), %%mm0) // 11 /2 |
300 PAVGB(%%mm0, %%mm6) //11 11 /4 | 253 PAVGB(%%mm0, %%mm6) //11 11 /4 |
301 PAVGB(%%mm1, %%mm4) // 11 /2 | 254 PAVGB(%%mm1, %%mm4) // 11 /2 |
302 PAVGB(%%mm2, %%mm1) // 11 /2 | 255 PAVGB(%%mm2, %%mm1) // 11 /2 |
303 PAVGB(%%mm1, %%mm6) //1122 11 /8 | 256 PAVGB(%%mm1, %%mm6) //1122 11 /8 |
304 PAVGB(%%mm5, %%mm6) //112242211 /16 | 257 PAVGB(%%mm5, %%mm6) //112242211 /16 |
321 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 | 274 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
322 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 | 275 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
323 PAVGB(%%mm0, %%mm1) // 11224222 /16 | 276 PAVGB(%%mm0, %%mm1) // 11224222 /16 |
324 "movq %%mm1, (%%eax, %1, 2) \n\t" // X | 277 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
325 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | 278 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
326 PAVGB((%%ebx), %%mm2) // 112 4 /8 | 279 PAVGB((%%ecx), %%mm2) // 112 4 /8 |
327 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | 280 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
328 PAVGB(%%mm0, %%mm6) // 1 1 /2 | 281 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
329 PAVGB(%%mm7, %%mm6) // 1 12 /4 | 282 PAVGB(%%mm7, %%mm6) // 1 12 /4 |
330 PAVGB(%%mm2, %%mm6) // 1122424 /4 | 283 PAVGB(%%mm2, %%mm6) // 1122424 /4 |
331 "movq %%mm6, (%%ebx) \n\t" // X | 284 "movq %%mm6, (%%ecx) \n\t" // X |
332 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | 285 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
333 PAVGB(%%mm7, %%mm5) // 11 2 /4 | 286 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
334 PAVGB(%%mm7, %%mm5) // 11 6 /8 | 287 PAVGB(%%mm7, %%mm5) // 11 6 /8 |
335 | 288 |
336 PAVGB(%%mm3, %%mm0) // 112 /4 | 289 PAVGB(%%mm3, %%mm0) // 112 /4 |
337 PAVGB(%%mm0, %%mm5) // 112246 /16 | 290 PAVGB(%%mm0, %%mm5) // 112246 /16 |
338 "movq %%mm5, (%%eax, %1, 4) \n\t" // X | 291 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
339 "subl %1, %0 \n\t" | 292 "subl %1, %0 \n\t" |
340 | 293 |
341 : | 294 : |
342 : "r" (src), "r" (stride) | 295 : "r" (src), "r" (stride), "m" (c->pQPb) |
343 : "%eax", "%ebx" | 296 : "%eax", "%ecx" |
344 ); | 297 ); |
345 #else | 298 #else |
346 const int l1= stride; | 299 const int l1= stride; |
347 const int l2= stride + l1; | 300 const int l2= stride + l1; |
348 const int l3= stride + l2; | 301 const int l3= stride + l2; |
354 const int l9= stride + l8; | 307 const int l9= stride + l8; |
355 int x; | 308 int x; |
356 src+= stride*3; | 309 src+= stride*3; |
357 for(x=0; x<BLOCK_SIZE; x++) | 310 for(x=0; x<BLOCK_SIZE; x++) |
358 { | 311 { |
359 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; | 312 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; |
360 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; | 313 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; |
361 | 314 |
362 int sums[9]; | 315 int sums[9]; |
363 sums[0] = first + src[l1]; | 316 sums[0] = first + src[l1]; |
364 sums[1] = src[l1] + src[l2]; | 317 sums[1] = src[l1] + src[l2]; |
365 sums[2] = src[l2] + src[l3]; | 318 sums[2] = src[l2] + src[l3]; |
379 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; | 332 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; |
380 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; | 333 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; |
381 | 334 |
382 src++; | 335 src++; |
383 } | 336 } |
384 | |
385 #endif | 337 #endif |
386 } | 338 } |
387 | 339 |
340 #if 0 | |
388 /** | 341 /** |
389 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | 342 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar |
390 * values are correctly clipped (MMX2) | 343 * values are correctly clipped (MMX2) |
391 * values are wraparound (C) | 344 * values are wraparound (C) |
392 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient | 345 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient |
403 // FIXME rounding | 356 // FIXME rounding |
404 asm volatile( | 357 asm volatile( |
405 "pxor %%mm7, %%mm7 \n\t" // 0 | 358 "pxor %%mm7, %%mm7 \n\t" // 0 |
406 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE | 359 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
407 "leal (%0, %1), %%eax \n\t" | 360 "leal (%0, %1), %%eax \n\t" |
408 "leal (%%eax, %1, 4), %%ebx \n\t" | 361 "leal (%%eax, %1, 4), %%ecx \n\t" |
409 // 0 1 2 3 4 5 6 7 8 9 | 362 // 0 1 2 3 4 5 6 7 8 9 |
410 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 363 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
411 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP | 364 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
412 "movq %%mm0, %%mm1 \n\t" // QP,..., QP | 365 "movq %%mm0, %%mm1 \n\t" // QP,..., QP |
413 "paddusb "MANGLE(b02)", %%mm0 \n\t" | 366 "paddusb "MANGLE(b02)", %%mm0 \n\t" |
414 "psrlw $2, %%mm0 \n\t" | 367 "psrlw $2, %%mm0 \n\t" |
415 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 | 368 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 |
416 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... | 369 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... |
417 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | 370 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 |
418 "movq (%%ebx), %%mm3 \n\t" // line 5 | 371 "movq (%%ecx), %%mm3 \n\t" // line 5 |
419 "movq %%mm2, %%mm4 \n\t" // line 4 | 372 "movq %%mm2, %%mm4 \n\t" // line 4 |
420 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | 373 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 |
421 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | 374 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 |
422 PAVGB(%%mm3, %%mm5) | 375 PAVGB(%%mm3, %%mm5) |
423 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 | 376 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 |
431 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | 384 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 |
432 "paddb %%mm5, %%mm2 \n\t" | 385 "paddb %%mm5, %%mm2 \n\t" |
433 // "psubb %%mm6, %%mm2 \n\t" | 386 // "psubb %%mm6, %%mm2 \n\t" |
434 "movq %%mm2, (%0,%1, 4) \n\t" | 387 "movq %%mm2, (%0,%1, 4) \n\t" |
435 | 388 |
436 "movq (%%ebx), %%mm2 \n\t" | 389 "movq (%%ecx), %%mm2 \n\t" |
437 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 | 390 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 |
438 "psubb %%mm5, %%mm2 \n\t" | 391 "psubb %%mm5, %%mm2 \n\t" |
439 // "psubb %%mm6, %%mm2 \n\t" | 392 // "psubb %%mm6, %%mm2 \n\t" |
440 "movq %%mm2, (%%ebx) \n\t" | 393 "movq %%mm2, (%%ecx) \n\t" |
441 | 394 |
442 "paddb %%mm6, %%mm5 \n\t" | 395 "paddb %%mm6, %%mm5 \n\t" |
443 "psrlw $2, %%mm5 \n\t" | 396 "psrlw $2, %%mm5 \n\t" |
444 "pand "MANGLE(b3F)", %%mm5 \n\t" | 397 "pand "MANGLE(b3F)", %%mm5 \n\t" |
445 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 | 398 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 |
448 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | 401 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 |
449 "paddsb %%mm5, %%mm2 \n\t" | 402 "paddsb %%mm5, %%mm2 \n\t" |
450 "psubb %%mm6, %%mm2 \n\t" | 403 "psubb %%mm6, %%mm2 \n\t" |
451 "movq %%mm2, (%%eax, %1, 2) \n\t" | 404 "movq %%mm2, (%%eax, %1, 2) \n\t" |
452 | 405 |
453 "movq (%%ebx, %1), %%mm2 \n\t" | 406 "movq (%%ecx, %1), %%mm2 \n\t" |
454 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 | 407 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 |
455 "psubsb %%mm5, %%mm2 \n\t" | 408 "psubsb %%mm5, %%mm2 \n\t" |
456 "psubb %%mm6, %%mm2 \n\t" | 409 "psubb %%mm6, %%mm2 \n\t" |
457 "movq %%mm2, (%%ebx, %1) \n\t" | 410 "movq %%mm2, (%%ecx, %1) \n\t" |
458 | 411 |
459 : | 412 : |
460 : "r" (src), "r" (stride) | 413 : "r" (src), "r" (stride) |
461 : "%eax", "%ebx" | 414 : "%eax", "%ecx" |
462 ); | 415 ); |
463 #else | 416 #else |
464 const int l1= stride; | 417 const int l1= stride; |
465 const int l2= stride + l1; | 418 const int l2= stride + l1; |
466 const int l3= stride + l2; | 419 const int l3= stride + l2; |
486 } | 439 } |
487 } | 440 } |
488 | 441 |
489 #endif | 442 #endif |
490 } | 443 } |
444 #endif | |
491 | 445 |
492 /** | 446 /** |
493 * Experimental Filter 1 | 447 * Experimental Filter 1 |
494 * will not damage linear gradients | 448 * will not damage linear gradients |
495 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | 449 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
496 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | 450 * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
497 * MMX2 version does correct clipping C version doesnt | 451 * MMX2 version does correct clipping C version doesnt |
498 */ | 452 */ |
499 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) | 453 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
500 { | 454 { |
501 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 455 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
502 src+= stride*3; | 456 src+= stride*3; |
503 | 457 |
504 asm volatile( | 458 asm volatile( |
505 "pxor %%mm7, %%mm7 \n\t" // 0 | 459 "pxor %%mm7, %%mm7 \n\t" // 0 |
506 "leal (%0, %1), %%eax \n\t" | 460 "leal (%0, %1), %%eax \n\t" |
507 "leal (%%eax, %1, 4), %%ebx \n\t" | 461 "leal (%%eax, %1, 4), %%ecx \n\t" |
508 // 0 1 2 3 4 5 6 7 8 9 | 462 // 0 1 2 3 4 5 6 7 8 9 |
509 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 463 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
510 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | 464 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
511 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 | 465 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
512 "movq %%mm1, %%mm2 \n\t" // line 4 | 466 "movq %%mm1, %%mm2 \n\t" // line 4 |
513 "psubusb %%mm0, %%mm1 \n\t" | 467 "psubusb %%mm0, %%mm1 \n\t" |
514 "psubusb %%mm2, %%mm0 \n\t" | 468 "psubusb %%mm2, %%mm0 \n\t" |
515 "por %%mm1, %%mm0 \n\t" // |l2 - l3| | 469 "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
516 "movq (%%ebx), %%mm3 \n\t" // line 5 | 470 "movq (%%ecx), %%mm3 \n\t" // line 5 |
517 "movq (%%ebx, %1), %%mm4 \n\t" // line 6 | 471 "movq (%%ecx, %1), %%mm4 \n\t" // line 6 |
518 "movq %%mm3, %%mm5 \n\t" // line 5 | 472 "movq %%mm3, %%mm5 \n\t" // line 5 |
519 "psubusb %%mm4, %%mm3 \n\t" | 473 "psubusb %%mm4, %%mm3 \n\t" |
520 "psubusb %%mm5, %%mm4 \n\t" | 474 "psubusb %%mm5, %%mm4 \n\t" |
521 "por %%mm4, %%mm3 \n\t" // |l5 - l6| | 475 "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
522 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 | 476 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
526 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 | 480 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
527 "psubusb %%mm1, %%mm5 \n\t" | 481 "psubusb %%mm1, %%mm5 \n\t" |
528 "por %%mm5, %%mm4 \n\t" // |l4 - l5| | 482 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
529 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) | 483 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
530 "movq %%mm4, %%mm3 \n\t" // d | 484 "movq %%mm4, %%mm3 \n\t" // d |
531 "movq "MANGLE(pQPb)", %%mm0 \n\t" | 485 "movq %2, %%mm0 \n\t" |
532 "paddusb %%mm0, %%mm0 \n\t" | 486 "paddusb %%mm0, %%mm0 \n\t" |
533 "psubusb %%mm0, %%mm4 \n\t" | 487 "psubusb %%mm0, %%mm4 \n\t" |
534 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 | 488 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
535 "psubusb "MANGLE(b01)", %%mm3 \n\t" | 489 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
536 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 | 490 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
544 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | 498 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
545 "psubusb %%mm3, %%mm0 \n\t" | 499 "psubusb %%mm3, %%mm0 \n\t" |
546 "pxor %%mm2, %%mm0 \n\t" | 500 "pxor %%mm2, %%mm0 \n\t" |
547 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 | 501 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
548 | 502 |
549 "movq (%%ebx), %%mm0 \n\t" // line 5 | 503 "movq (%%ecx), %%mm0 \n\t" // line 5 |
550 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | 504 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
551 "paddusb %%mm3, %%mm0 \n\t" | 505 "paddusb %%mm3, %%mm0 \n\t" |
552 "pxor %%mm2, %%mm0 \n\t" | 506 "pxor %%mm2, %%mm0 \n\t" |
553 "movq %%mm0, (%%ebx) \n\t" // line 5 | 507 "movq %%mm0, (%%ecx) \n\t" // line 5 |
554 | 508 |
555 PAVGB(%%mm7, %%mm1) // d/4 | 509 PAVGB(%%mm7, %%mm1) // d/4 |
556 | 510 |
557 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | 511 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
558 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | 512 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
559 "psubusb %%mm1, %%mm0 \n\t" | 513 "psubusb %%mm1, %%mm0 \n\t" |
560 "pxor %%mm2, %%mm0 \n\t" | 514 "pxor %%mm2, %%mm0 \n\t" |
561 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 | 515 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
562 | 516 |
563 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 | 517 "movq (%%ecx, %1), %%mm0 \n\t" // line 6 |
564 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | 518 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
565 "paddusb %%mm1, %%mm0 \n\t" | 519 "paddusb %%mm1, %%mm0 \n\t" |
566 "pxor %%mm2, %%mm0 \n\t" | 520 "pxor %%mm2, %%mm0 \n\t" |
567 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 | 521 "movq %%mm0, (%%ecx, %1) \n\t" // line 6 |
568 | 522 |
569 PAVGB(%%mm7, %%mm1) // d/8 | 523 PAVGB(%%mm7, %%mm1) // d/8 |
570 | 524 |
571 "movq (%%eax, %1), %%mm0 \n\t" // line 2 | 525 "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
572 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 | 526 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
573 "psubusb %%mm1, %%mm0 \n\t" | 527 "psubusb %%mm1, %%mm0 \n\t" |
574 "pxor %%mm2, %%mm0 \n\t" | 528 "pxor %%mm2, %%mm0 \n\t" |
575 "movq %%mm0, (%%eax, %1) \n\t" // line 2 | 529 "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
576 | 530 |
577 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 | 531 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 |
578 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 | 532 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
579 "paddusb %%mm1, %%mm0 \n\t" | 533 "paddusb %%mm1, %%mm0 \n\t" |
580 "pxor %%mm2, %%mm0 \n\t" | 534 "pxor %%mm2, %%mm0 \n\t" |
581 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 | 535 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 |
582 | 536 |
583 : | 537 : |
584 : "r" (src), "r" (stride) | 538 : "r" (src), "r" (stride), "m" (co->pQPb) |
585 : "%eax", "%ebx" | 539 : "%eax", "%ecx" |
586 ); | 540 ); |
587 #else | 541 #else |
588 | 542 |
589 const int l1= stride; | 543 const int l1= stride; |
590 const int l2= stride + l1; | 544 const int l2= stride + l1; |
605 int c= src[l5] - src[l6]; | 559 int c= src[l5] - src[l6]; |
606 | 560 |
607 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); | 561 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
608 d= MAX(d, 0); | 562 d= MAX(d, 0); |
609 | 563 |
610 if(d < QP*2) | 564 if(d < co->QP*2) |
611 { | 565 { |
612 int v = d * SIGN(-b); | 566 int v = d * SIGN(-b); |
613 | 567 |
614 src[l2] +=v>>3; | 568 src[l2] +=v>>3; |
615 src[l3] +=v>>2; | 569 src[l3] +=v>>2; |
619 src[l7] -=v>>3; | 573 src[l7] -=v>>3; |
620 | 574 |
621 } | 575 } |
622 src++; | 576 src++; |
623 } | 577 } |
624 /* | |
625 const int l1= stride; | |
626 const int l2= stride + l1; | |
627 const int l3= stride + l2; | |
628 const int l4= stride + l3; | |
629 const int l5= stride + l4; | |
630 const int l6= stride + l5; | |
631 const int l7= stride + l6; | |
632 const int l8= stride + l7; | |
633 const int l9= stride + l8; | |
634 for(int x=0; x<BLOCK_SIZE; x++) | |
635 { | |
636 int v2= src[l2]; | |
637 int v3= src[l3]; | |
638 int v4= src[l4]; | |
639 int v5= src[l5]; | |
640 int v6= src[l6]; | |
641 int v7= src[l7]; | |
642 | |
643 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 ) | |
644 { | |
645 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16; | |
646 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16; | |
647 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; | |
648 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; | |
649 } | |
650 src++; | |
651 } | |
652 */ | |
653 #endif | 578 #endif |
654 } | 579 } |
655 | 580 |
656 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) | 581 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
657 { | 582 { |
658 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 583 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
659 /* | 584 /* |
660 uint8_t tmp[16]; | 585 uint8_t tmp[16]; |
661 const int l1= stride; | 586 const int l1= stride; |
674 asm volatile( | 599 asm volatile( |
675 | 600 |
676 #if 0 //sligtly more accurate and slightly slower | 601 #if 0 //sligtly more accurate and slightly slower |
677 "pxor %%mm7, %%mm7 \n\t" // 0 | 602 "pxor %%mm7, %%mm7 \n\t" // 0 |
678 "leal (%0, %1), %%eax \n\t" | 603 "leal (%0, %1), %%eax \n\t" |
679 "leal (%%eax, %1, 4), %%ebx \n\t" | 604 "leal (%%eax, %1, 4), %%ecx \n\t" |
680 // 0 1 2 3 4 5 6 7 | 605 // 0 1 2 3 4 5 6 7 |
681 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | 606 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
682 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | 607 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 |
683 | 608 |
684 | 609 |
685 "movq (%0, %1, 2), %%mm0 \n\t" // l2 | 610 "movq (%0, %1, 2), %%mm0 \n\t" // l2 |
686 "movq (%0), %%mm1 \n\t" // l0 | 611 "movq (%0), %%mm1 \n\t" // l0 |
687 "movq %%mm0, %%mm2 \n\t" // l2 | 612 "movq %%mm0, %%mm2 \n\t" // l2 |
706 "movq %%mm0, %%mm4 \n\t" // l4 | 631 "movq %%mm0, %%mm4 \n\t" // l4 |
707 PAVGB(%%mm7, %%mm0) // ~l4/2 | 632 PAVGB(%%mm7, %%mm0) // ~l4/2 |
708 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | 633 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 |
709 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | 634 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 |
710 | 635 |
711 "movq (%%ebx), %%mm2 \n\t" // l5 | 636 "movq (%%ecx), %%mm2 \n\t" // l5 |
712 "movq %%mm3, %%mm5 \n\t" // l3 | 637 "movq %%mm3, %%mm5 \n\t" // l3 |
713 PAVGB(%%mm7, %%mm3) // ~l3/2 | 638 PAVGB(%%mm7, %%mm3) // ~l3/2 |
714 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | 639 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 |
715 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | 640 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 |
716 | 641 |
719 "psubusb %%mm6, %%mm3 \n\t" | 644 "psubusb %%mm6, %%mm3 \n\t" |
720 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | 645 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 |
721 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | 646 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) |
722 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | 647 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 |
723 | 648 |
724 "movq (%%ebx, %1), %%mm6 \n\t" // l6 | 649 "movq (%%ecx, %1), %%mm6 \n\t" // l6 |
725 "movq %%mm6, %%mm5 \n\t" // l6 | 650 "movq %%mm6, %%mm5 \n\t" // l6 |
726 PAVGB(%%mm7, %%mm6) // ~l6/2 | 651 PAVGB(%%mm7, %%mm6) // ~l6/2 |
727 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | 652 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 |
728 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | 653 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 |
729 | 654 |
730 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7 | 655 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 |
731 "movq %%mm2, %%mm4 \n\t" // l5 | 656 "movq %%mm2, %%mm4 \n\t" // l5 |
732 PAVGB(%%mm7, %%mm2) // ~l5/2 | 657 PAVGB(%%mm7, %%mm2) // ~l5/2 |
733 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | 658 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 |
734 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | 659 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 |
735 | 660 |
739 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | 664 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 |
740 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | 665 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 |
741 | 666 |
742 | 667 |
743 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | 668 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 |
744 "movq "MANGLE(pQPb)", %%mm4 \n\t" // QP //FIXME QP+1 ? | 669 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
745 "paddusb "MANGLE(b01)", %%mm4 \n\t" | 670 "paddusb "MANGLE(b01)", %%mm4 \n\t" |
746 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP | 671 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
747 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | 672 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 |
748 "pand %%mm4, %%mm3 \n\t" | 673 "pand %%mm4, %%mm3 \n\t" |
749 | 674 |
781 #endif | 706 #endif |
782 | 707 |
783 "leal (%0, %1), %%eax \n\t" | 708 "leal (%0, %1), %%eax \n\t" |
784 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | 709 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 |
785 // 0 1 2 3 4 5 6 7 | 710 // 0 1 2 3 4 5 6 7 |
786 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | 711 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
787 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | 712 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 |
788 | 713 |
789 | 714 |
790 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 | 715 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 |
791 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | 716 "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
792 "pxor %%mm6, %%mm1 \n\t" // -l3-1 | 717 "pxor %%mm6, %%mm1 \n\t" // -l3-1 |
796 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | 721 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 |
797 "movq (%%eax, %1), %%mm3 \n\t" // l2 | 722 "movq (%%eax, %1), %%mm3 \n\t" // l2 |
798 "pxor %%mm6, %%mm2 \n\t" // -l5-1 | 723 "pxor %%mm6, %%mm2 \n\t" // -l5-1 |
799 "movq %%mm2, %%mm5 \n\t" // -l5-1 | 724 "movq %%mm2, %%mm5 \n\t" // -l5-1 |
800 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 | 725 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
801 "leal (%%eax, %1, 4), %%ebx \n\t" | 726 "leal (%%eax, %1, 4), %%ecx \n\t" |
802 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 | 727 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
803 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | 728 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 |
804 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | 729 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 |
805 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | 730 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 |
806 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | 731 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 |
813 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 | 738 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
814 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | 739 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 |
815 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | 740 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 |
816 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | 741 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 |
817 | 742 |
818 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 | 743 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 |
819 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 | 744 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 |
820 "pxor %%mm6, %%mm1 \n\t" // -l7-1 | 745 "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
821 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | 746 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 |
822 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 | 747 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
823 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 | 748 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
824 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | 749 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 |
834 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | 759 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 |
835 | 760 |
836 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | 761 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 |
837 | 762 |
838 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 | 763 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
839 "movq "MANGLE(pQPb)", %%mm2 \n\t" // QP | 764 "movq %2, %%mm2 \n\t" // QP |
840 PAVGB(%%mm6, %%mm2) // 128 + QP/2 | 765 PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
841 "psubb %%mm6, %%mm2 \n\t" | 766 "psubb %%mm6, %%mm2 \n\t" |
842 | 767 |
843 "movq %%mm4, %%mm1 \n\t" | 768 "movq %%mm4, %%mm1 \n\t" |
844 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | 769 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) |
875 "pxor %%mm1, %%mm2 \n\t" | 800 "pxor %%mm1, %%mm2 \n\t" |
876 "movq %%mm0, (%%eax, %1, 2) \n\t" | 801 "movq %%mm0, (%%eax, %1, 2) \n\t" |
877 "movq %%mm2, (%0, %1, 4) \n\t" | 802 "movq %%mm2, (%0, %1, 4) \n\t" |
878 | 803 |
879 : | 804 : |
880 : "r" (src), "r" (stride) | 805 : "r" (src), "r" (stride), "m" (c->pQPb) |
881 : "%eax", "%ebx" | 806 : "%eax", "%ecx" |
882 ); | 807 ); |
883 | 808 |
884 /* | 809 /* |
885 { | 810 { |
886 int x; | 811 int x; |
949 src+= stride*4; | 874 src+= stride*4; |
950 | 875 |
951 asm volatile( | 876 asm volatile( |
952 "pxor %%mm7, %%mm7 \n\t" | 877 "pxor %%mm7, %%mm7 \n\t" |
953 "leal (%0, %1), %%eax \n\t" | 878 "leal (%0, %1), %%eax \n\t" |
954 "leal (%%eax, %1, 4), %%ebx \n\t" | 879 "leal (%%eax, %1, 4), %%edx \n\t" |
880 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars | |
881 "andl $0xFFFFFFF8, %%ecx \n\t" // align | |
955 // 0 1 2 3 4 5 6 7 | 882 // 0 1 2 3 4 5 6 7 |
956 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | 883 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
957 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | 884 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 |
958 | 885 |
959 "movq (%0), %%mm0 \n\t" | 886 "movq (%0), %%mm0 \n\t" |
960 "movq %%mm0, %%mm1 \n\t" | 887 "movq %%mm0, %%mm1 \n\t" |
961 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 | 888 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
962 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 | 889 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
990 | 917 |
991 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 | 918 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
992 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 | 919 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
993 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 920 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
994 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 921 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
995 "movq %%mm0, "MANGLE(temp0)" \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 922 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
996 "movq %%mm1, "MANGLE(temp1)" \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 923 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
997 | 924 |
998 "movq (%0, %1, 4), %%mm0 \n\t" | 925 "movq (%0, %1, 4), %%mm0 \n\t" |
999 "movq %%mm0, %%mm1 \n\t" | 926 "movq %%mm0, %%mm1 \n\t" |
1000 "punpcklbw %%mm7, %%mm0 \n\t" // L4 | 927 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
1001 "punpckhbw %%mm7, %%mm1 \n\t" // H4 | 928 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
1002 | 929 |
1003 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 | 930 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
1004 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 | 931 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
1005 "movq %%mm2, "MANGLE(temp2)" \n\t" // L3 - L4 | 932 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 |
1006 "movq %%mm3, "MANGLE(temp3)" \n\t" // H3 - H4 | 933 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 |
1007 "paddw %%mm4, %%mm4 \n\t" // 2L2 | 934 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
1008 "paddw %%mm5, %%mm5 \n\t" // 2H2 | 935 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
1009 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 | 936 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
1010 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 | 937 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
1011 | 938 |
1012 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 | 939 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
1013 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 | 940 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
1014 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 | 941 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
1015 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 | 942 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
1016 //50 opcodes so far | 943 //50 opcodes so far |
1017 "movq (%%ebx), %%mm2 \n\t" | 944 "movq (%%edx), %%mm2 \n\t" |
1018 "movq %%mm2, %%mm3 \n\t" | 945 "movq %%mm2, %%mm3 \n\t" |
1019 "punpcklbw %%mm7, %%mm2 \n\t" // L5 | 946 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
1020 "punpckhbw %%mm7, %%mm3 \n\t" // H5 | 947 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
1021 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 | 948 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
1022 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 | 949 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
1023 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 | 950 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
1024 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 | 951 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
1025 | 952 |
1026 "movq (%%ebx, %1), %%mm6 \n\t" | 953 "movq (%%edx, %1), %%mm6 \n\t" |
1027 "punpcklbw %%mm7, %%mm6 \n\t" // L6 | 954 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
1028 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 | 955 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
1029 "movq (%%ebx, %1), %%mm6 \n\t" | 956 "movq (%%edx, %1), %%mm6 \n\t" |
1030 "punpckhbw %%mm7, %%mm6 \n\t" // H6 | 957 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
1031 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 | 958 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
1032 | 959 |
1033 "paddw %%mm0, %%mm0 \n\t" // 2L4 | 960 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
1034 "paddw %%mm1, %%mm1 \n\t" // 2H4 | 961 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
1038 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 | 965 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
1039 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 | 966 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
1040 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 | 967 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
1041 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 | 968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
1042 | 969 |
1043 "movq (%%ebx, %1, 2), %%mm2 \n\t" | 970 "movq (%%edx, %1, 2), %%mm2 \n\t" |
1044 "movq %%mm2, %%mm3 \n\t" | 971 "movq %%mm2, %%mm3 \n\t" |
1045 "punpcklbw %%mm7, %%mm2 \n\t" // L7 | 972 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
1046 "punpckhbw %%mm7, %%mm3 \n\t" // H7 | 973 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
1047 | 974 |
1048 "paddw %%mm2, %%mm2 \n\t" // 2L7 | 975 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
1049 "paddw %%mm3, %%mm3 \n\t" // 2H7 | 976 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
1050 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 | 977 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
1051 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | 978 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
1052 | 979 |
1053 "movq "MANGLE(temp0)", %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 980 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
1054 "movq "MANGLE(temp1)", %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 981 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
1055 | 982 |
1056 #ifdef HAVE_MMX2 | 983 #ifdef HAVE_MMX2 |
1057 "movq %%mm7, %%mm6 \n\t" // 0 | 984 "movq %%mm7, %%mm6 \n\t" // 0 |
1058 "psubw %%mm0, %%mm6 \n\t" | 985 "psubw %%mm0, %%mm6 \n\t" |
1059 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | 986 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
1104 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | 1031 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
1105 "pxor %%mm7, %%mm5 \n\t" | 1032 "pxor %%mm7, %%mm5 \n\t" |
1106 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | 1033 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
1107 // 100 opcodes | 1034 // 100 opcodes |
1108 "movd %2, %%mm2 \n\t" // QP | 1035 "movd %2, %%mm2 \n\t" // QP |
1109 "punpcklwd %%mm2, %%mm2 \n\t" | |
1110 "punpcklwd %%mm2, %%mm2 \n\t" | |
1111 "psllw $3, %%mm2 \n\t" // 8QP | 1036 "psllw $3, %%mm2 \n\t" // 8QP |
1112 "movq %%mm2, %%mm3 \n\t" // 8QP | 1037 "movq %%mm2, %%mm3 \n\t" // 8QP |
1113 "pcmpgtw %%mm4, %%mm2 \n\t" | 1038 "pcmpgtw %%mm4, %%mm2 \n\t" |
1114 "pcmpgtw %%mm5, %%mm3 \n\t" | 1039 "pcmpgtw %%mm5, %%mm3 \n\t" |
1115 "pand %%mm2, %%mm4 \n\t" | 1040 "pand %%mm2, %%mm4 \n\t" |
1127 "paddw %%mm2, %%mm4 \n\t" | 1052 "paddw %%mm2, %%mm4 \n\t" |
1128 "paddw %%mm2, %%mm5 \n\t" | 1053 "paddw %%mm2, %%mm5 \n\t" |
1129 "psrlw $6, %%mm4 \n\t" | 1054 "psrlw $6, %%mm4 \n\t" |
1130 "psrlw $6, %%mm5 \n\t" | 1055 "psrlw $6, %%mm5 \n\t" |
1131 | 1056 |
1132 /* | 1057 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 |
1133 "movq w06, %%mm2 \n\t" // 6 | 1058 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 |
1134 "paddw %%mm2, %%mm4 \n\t" | |
1135 "paddw %%mm2, %%mm5 \n\t" | |
1136 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16 | |
1137 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120 | |
1138 "pmulhw %%mm2, %%mm4 \n\t" // hd/13 | |
1139 "pmulhw %%mm2, %%mm5 \n\t" // ld/13 | |
1140 */ | |
1141 | |
1142 "movq "MANGLE(temp2)", %%mm0 \n\t" // L3 - L4 | |
1143 "movq "MANGLE(temp3)", %%mm1 \n\t" // H3 - H4 | |
1144 | 1059 |
1145 "pxor %%mm2, %%mm2 \n\t" | 1060 "pxor %%mm2, %%mm2 \n\t" |
1146 "pxor %%mm3, %%mm3 \n\t" | 1061 "pxor %%mm3, %%mm3 \n\t" |
1147 | 1062 |
1148 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) | 1063 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
1181 "movq (%0, %1, 4), %%mm0 \n\t" | 1096 "movq (%0, %1, 4), %%mm0 \n\t" |
1182 "psubb %%mm4, %%mm0 \n\t" | 1097 "psubb %%mm4, %%mm0 \n\t" |
1183 "movq %%mm0, (%0, %1, 4) \n\t" | 1098 "movq %%mm0, (%0, %1, 4) \n\t" |
1184 | 1099 |
1185 : | 1100 : |
1186 : "r" (src), "r" (stride), "r" (QP) | 1101 : "r" (src), "r" (stride), "m" (c->pQPb) |
1187 : "%eax", "%ebx" | 1102 : "%eax", "%edx", "%ecx" |
1188 ); | 1103 ); |
1189 #else | 1104 #else |
1190 const int l1= stride; | 1105 const int l1= stride; |
1191 const int l2= stride + l1; | 1106 const int l2= stride + l1; |
1192 const int l3= stride + l2; | 1107 const int l3= stride + l2; |
1199 int x; | 1114 int x; |
1200 src+= stride*3; | 1115 src+= stride*3; |
1201 for(x=0; x<BLOCK_SIZE; x++) | 1116 for(x=0; x<BLOCK_SIZE; x++) |
1202 { | 1117 { |
1203 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | 1118 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
1204 if(ABS(middleEnergy) < 8*QP) | 1119 if(ABS(middleEnergy) < 8*c->QP) |
1205 { | 1120 { |
1206 const int q=(src[l4] - src[l5])/2; | 1121 const int q=(src[l4] - src[l5])/2; |
1207 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | 1122 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
1208 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | 1123 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
1209 | 1124 |
1230 src++; | 1145 src++; |
1231 } | 1146 } |
1232 #endif | 1147 #endif |
1233 } | 1148 } |
1234 | 1149 |
1235 static inline void RENAME(dering)(uint8_t src[], int stride, int QP) | 1150 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
1236 { | 1151 { |
1237 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1152 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1238 asm volatile( | 1153 asm volatile( |
1239 "movq "MANGLE(pQPb)", %%mm0 \n\t" | 1154 "pxor %%mm6, %%mm6 \n\t" |
1240 "paddusb %%mm0, %%mm0 \n\t" | 1155 "pcmpeqb %%mm7, %%mm7 \n\t" |
1241 "movq %%mm0, "MANGLE(pQPb2)" \n\t" | 1156 "movq %2, %%mm0 \n\t" |
1157 "punpcklbw %%mm6, %%mm0 \n\t" | |
1158 "psrlw $1, %%mm0 \n\t" | |
1159 "psubw %%mm7, %%mm0 \n\t" | |
1160 "packuswb %%mm0, %%mm0 \n\t" | |
1161 "movq %%mm0, %3 \n\t" | |
1242 | 1162 |
1243 "leal (%0, %1), %%eax \n\t" | 1163 "leal (%0, %1), %%eax \n\t" |
1244 "leal (%%eax, %1, 4), %%ebx \n\t" | 1164 "leal (%%eax, %1, 4), %%edx \n\t" |
1165 | |
1245 // 0 1 2 3 4 5 6 7 8 9 | 1166 // 0 1 2 3 4 5 6 7 8 9 |
1246 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1167 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
1247 | 1168 |
1248 "pcmpeqb %%mm7, %%mm7 \n\t" | |
1249 "pxor %%mm6, %%mm6 \n\t" | |
1250 #undef FIND_MIN_MAX | 1169 #undef FIND_MIN_MAX |
1251 #ifdef HAVE_MMX2 | 1170 #ifdef HAVE_MMX2 |
1252 #define FIND_MIN_MAX(addr)\ | 1171 #define FIND_MIN_MAX(addr)\ |
1253 "movq " #addr ", %%mm0 \n\t"\ | 1172 "movq " #addr ", %%mm0 \n\t"\ |
1254 "pminub %%mm0, %%mm7 \n\t"\ | 1173 "pminub %%mm0, %%mm7 \n\t"\ |
1265 | 1184 |
1266 FIND_MIN_MAX((%%eax)) | 1185 FIND_MIN_MAX((%%eax)) |
1267 FIND_MIN_MAX((%%eax, %1)) | 1186 FIND_MIN_MAX((%%eax, %1)) |
1268 FIND_MIN_MAX((%%eax, %1, 2)) | 1187 FIND_MIN_MAX((%%eax, %1, 2)) |
1269 FIND_MIN_MAX((%0, %1, 4)) | 1188 FIND_MIN_MAX((%0, %1, 4)) |
1270 FIND_MIN_MAX((%%ebx)) | 1189 FIND_MIN_MAX((%%edx)) |
1271 FIND_MIN_MAX((%%ebx, %1)) | 1190 FIND_MIN_MAX((%%edx, %1)) |
1272 FIND_MIN_MAX((%%ebx, %1, 2)) | 1191 FIND_MIN_MAX((%%edx, %1, 2)) |
1273 FIND_MIN_MAX((%0, %1, 8)) | 1192 FIND_MIN_MAX((%0, %1, 8)) |
1274 | 1193 |
1275 "movq %%mm7, %%mm4 \n\t" | 1194 "movq %%mm7, %%mm4 \n\t" |
1276 "psrlq $8, %%mm7 \n\t" | 1195 "psrlq $8, %%mm7 \n\t" |
1277 #ifdef HAVE_MMX2 | 1196 #ifdef HAVE_MMX2 |
1320 "movq %%mm6, %%mm0 \n\t" // max | 1239 "movq %%mm6, %%mm0 \n\t" // max |
1321 "psubb %%mm7, %%mm6 \n\t" // max - min | 1240 "psubb %%mm7, %%mm6 \n\t" // max - min |
1322 "movd %%mm6, %%ecx \n\t" | 1241 "movd %%mm6, %%ecx \n\t" |
1323 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" | 1242 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
1324 " jb 1f \n\t" | 1243 " jb 1f \n\t" |
1244 "leal -24(%%esp), %%ecx \n\t" | |
1245 "andl $0xFFFFFFF8, %%ecx \n\t" | |
1325 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 | 1246 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
1326 "punpcklbw %%mm7, %%mm7 \n\t" | 1247 "punpcklbw %%mm7, %%mm7 \n\t" |
1327 "punpcklbw %%mm7, %%mm7 \n\t" | 1248 "punpcklbw %%mm7, %%mm7 \n\t" |
1328 "punpcklbw %%mm7, %%mm7 \n\t" | 1249 "punpcklbw %%mm7, %%mm7 \n\t" |
1329 "movq %%mm7, "MANGLE(temp0)" \n\t" | 1250 "movq %%mm7, (%%ecx) \n\t" |
1330 | 1251 |
1331 "movq (%0), %%mm0 \n\t" // L10 | 1252 "movq (%0), %%mm0 \n\t" // L10 |
1332 "movq %%mm0, %%mm1 \n\t" // L10 | 1253 "movq %%mm0, %%mm1 \n\t" // L10 |
1333 "movq %%mm0, %%mm2 \n\t" // L10 | 1254 "movq %%mm0, %%mm2 \n\t" // L10 |
1334 "psllq $8, %%mm1 \n\t" | 1255 "psllq $8, %%mm1 \n\t" |
1388 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | 1309 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ |
1389 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | 1310 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ |
1390 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | 1311 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ |
1391 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | 1312 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ |
1392 PAVGB(lx, pplx) \ | 1313 PAVGB(lx, pplx) \ |
1393 "movq " #lx ", "MANGLE(temp1)" \n\t"\ | 1314 "movq " #lx ", 8(%%ecx) \n\t"\ |
1394 "movq "MANGLE(temp0)", " #lx " \n\t"\ | 1315 "movq (%%ecx), " #lx " \n\t"\ |
1395 "psubusb " #lx ", " #t1 " \n\t"\ | 1316 "psubusb " #lx ", " #t1 " \n\t"\ |
1396 "psubusb " #lx ", " #t0 " \n\t"\ | 1317 "psubusb " #lx ", " #t0 " \n\t"\ |
1397 "psubusb " #lx ", " #sx " \n\t"\ | 1318 "psubusb " #lx ", " #sx " \n\t"\ |
1398 "movq "MANGLE(b00)", " #lx " \n\t"\ | 1319 "movq "MANGLE(b00)", " #lx " \n\t"\ |
1399 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ | 1320 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
1403 "paddb " #t0 ", " #sx " \n\t"\ | 1324 "paddb " #t0 ", " #sx " \n\t"\ |
1404 \ | 1325 \ |
1405 PAVGB(plx, pplx) /* filtered */\ | 1326 PAVGB(plx, pplx) /* filtered */\ |
1406 "movq " #dst ", " #t0 " \n\t" /* dst */\ | 1327 "movq " #dst ", " #t0 " \n\t" /* dst */\ |
1407 "movq " #t0 ", " #t1 " \n\t" /* dst */\ | 1328 "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
1408 "psubusb "MANGLE(pQPb2)", " #t0 " \n\t"\ | 1329 "psubusb %3, " #t0 " \n\t"\ |
1409 "paddusb "MANGLE(pQPb2)", " #t1 " \n\t"\ | 1330 "paddusb %3, " #t1 " \n\t"\ |
1410 PMAXUB(t0, pplx)\ | 1331 PMAXUB(t0, pplx)\ |
1411 PMINUB(t1, pplx, t0)\ | 1332 PMINUB(t1, pplx, t0)\ |
1412 "paddb " #sx ", " #ppsx " \n\t"\ | 1333 "paddb " #sx ", " #ppsx " \n\t"\ |
1413 "paddb " #psx ", " #ppsx " \n\t"\ | 1334 "paddb " #psx ", " #ppsx " \n\t"\ |
1414 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ | 1335 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
1416 "pcmpeqb " #lx ", " #ppsx " \n\t"\ | 1337 "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
1417 "pand " #ppsx ", " #pplx " \n\t"\ | 1338 "pand " #ppsx ", " #pplx " \n\t"\ |
1418 "pandn " #dst ", " #ppsx " \n\t"\ | 1339 "pandn " #dst ", " #ppsx " \n\t"\ |
1419 "por " #pplx ", " #ppsx " \n\t"\ | 1340 "por " #pplx ", " #ppsx " \n\t"\ |
1420 "movq " #ppsx ", " #dst " \n\t"\ | 1341 "movq " #ppsx ", " #dst " \n\t"\ |
1421 "movq "MANGLE(temp1)", " #lx " \n\t" | 1342 "movq 8(%%ecx), " #lx " \n\t" |
1422 | 1343 |
1423 /* | 1344 /* |
1424 0000000 | 1345 0000000 |
1425 1111111 | 1346 1111111 |
1426 | 1347 |
1437 */ | 1358 */ |
1438 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | 1359 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) |
1439 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | 1360 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
1440 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | 1361 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
1441 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | 1362 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
1442 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | 1363 DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
1443 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | 1364 DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
1444 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | 1365 DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
1445 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | 1366 DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
1446 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | 1367 DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
1447 | 1368 |
1448 "1: \n\t" | 1369 "1: \n\t" |
1449 : : "r" (src), "r" (stride), "r" (QP) | 1370 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) |
1450 : "%eax", "%ebx", "%ecx" | 1371 : "%eax", "%edx", "%ecx" |
1451 ); | 1372 ); |
1452 #else | 1373 #else |
1453 int y; | 1374 int y; |
1454 int min=255; | 1375 int min=255; |
1455 int max=0; | 1376 int max=0; |
1456 int avg; | 1377 int avg; |
1457 uint8_t *p; | 1378 uint8_t *p; |
1458 int s[10]; | 1379 int s[10]; |
1380 const int QP2= c->QP/2 + 1; | |
1459 | 1381 |
1460 for(y=1; y<9; y++) | 1382 for(y=1; y<9; y++) |
1461 { | 1383 { |
1462 int x; | 1384 int x; |
1463 p= src + stride*y; | 1385 p= src + stride*y; |
1466 p++; | 1388 p++; |
1467 if(*p > max) max= *p; | 1389 if(*p > max) max= *p; |
1468 if(*p < min) min= *p; | 1390 if(*p < min) min= *p; |
1469 } | 1391 } |
1470 } | 1392 } |
1471 avg= (min + max + 1)/2; | 1393 avg= (min + max + 1)>>1; |
1472 | 1394 |
1473 if(max - min <deringThreshold) return; | 1395 if(max - min <deringThreshold) return; |
1474 | 1396 |
1475 for(y=0; y<10; y++) | 1397 for(y=0; y<10; y++) |
1476 { | 1398 { |
1477 int x; | |
1478 int t = 0; | 1399 int t = 0; |
1479 p= src + stride*y; | 1400 |
1480 for(x=0; x<10; x++) | 1401 if(src[stride*y + 0] > avg) t+= 1; |
1481 { | 1402 if(src[stride*y + 1] > avg) t+= 2; |
1482 if(*p > avg) t |= (1<<x); | 1403 if(src[stride*y + 2] > avg) t+= 4; |
1483 p++; | 1404 if(src[stride*y + 3] > avg) t+= 8; |
1484 } | 1405 if(src[stride*y + 4] > avg) t+= 16; |
1406 if(src[stride*y + 5] > avg) t+= 32; | |
1407 if(src[stride*y + 6] > avg) t+= 64; | |
1408 if(src[stride*y + 7] > avg) t+= 128; | |
1409 if(src[stride*y + 8] > avg) t+= 256; | |
1410 if(src[stride*y + 9] > avg) t+= 512; | |
1411 | |
1485 t |= (~t)<<16; | 1412 t |= (~t)<<16; |
1486 t &= (t<<1) & (t>>1); | 1413 t &= (t<<1) & (t>>1); |
1487 s[y] = t; | 1414 s[y] = t; |
1488 } | 1415 } |
1416 | |
1417 for(y=1; y<9; y++) | |
1418 { | |
1419 int t = s[y-1] & s[y] & s[y+1]; | |
1420 t|= t>>16; | |
1421 s[y-1]= t; | |
1422 } | |
1489 | 1423 |
1490 for(y=1; y<9; y++) | 1424 for(y=1; y<9; y++) |
1491 { | 1425 { |
1492 int x; | 1426 int x; |
1493 int t = s[y-1] & s[y] & s[y+1]; | 1427 int t = s[y-1]; |
1494 t|= t>>16; | |
1495 | 1428 |
1496 p= src + stride*y; | 1429 p= src + stride*y; |
1497 for(x=1; x<9; x++) | 1430 for(x=1; x<9; x++) |
1498 { | 1431 { |
1499 p++; | 1432 p++; |
1542 worstDiff, (float)numSkiped/numPixels); | 1475 worstDiff, (float)numSkiped/numPixels); |
1543 } | 1476 } |
1544 } | 1477 } |
1545 } | 1478 } |
1546 #endif | 1479 #endif |
1547 if (*p + 2*QP < f) *p= *p + 2*QP; | 1480 if (*p + QP2 < f) *p= *p + QP2; |
1548 else if(*p - 2*QP > f) *p= *p - 2*QP; | 1481 else if(*p - QP2 > f) *p= *p - QP2; |
1549 else *p=f; | 1482 else *p=f; |
1550 } | 1483 } |
1551 } | 1484 } |
1552 } | 1485 } |
1553 #ifdef DEBUG_DERING_THRESHOLD | 1486 #ifdef DEBUG_DERING_THRESHOLD |
1580 { | 1513 { |
1581 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1514 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1582 src+= 4*stride; | 1515 src+= 4*stride; |
1583 asm volatile( | 1516 asm volatile( |
1584 "leal (%0, %1), %%eax \n\t" | 1517 "leal (%0, %1), %%eax \n\t" |
1585 "leal (%%eax, %1, 4), %%ebx \n\t" | 1518 "leal (%%eax, %1, 4), %%ecx \n\t" |
1586 // 0 1 2 3 4 5 6 7 8 9 | 1519 // 0 1 2 3 4 5 6 7 8 9 |
1587 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1520 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
1588 | 1521 |
1589 "movq (%0), %%mm0 \n\t" | 1522 "movq (%0), %%mm0 \n\t" |
1590 "movq (%%eax, %1), %%mm1 \n\t" | 1523 "movq (%%eax, %1), %%mm1 \n\t" |
1591 PAVGB(%%mm1, %%mm0) | 1524 PAVGB(%%mm1, %%mm0) |
1592 "movq %%mm0, (%%eax) \n\t" | 1525 "movq %%mm0, (%%eax) \n\t" |
1593 "movq (%0, %1, 4), %%mm0 \n\t" | 1526 "movq (%0, %1, 4), %%mm0 \n\t" |
1594 PAVGB(%%mm0, %%mm1) | 1527 PAVGB(%%mm0, %%mm1) |
1595 "movq %%mm1, (%%eax, %1, 2) \n\t" | 1528 "movq %%mm1, (%%eax, %1, 2) \n\t" |
1596 "movq (%%ebx, %1), %%mm1 \n\t" | 1529 "movq (%%ecx, %1), %%mm1 \n\t" |
1597 PAVGB(%%mm1, %%mm0) | 1530 PAVGB(%%mm1, %%mm0) |
1598 "movq %%mm0, (%%ebx) \n\t" | 1531 "movq %%mm0, (%%ecx) \n\t" |
1599 "movq (%0, %1, 8), %%mm0 \n\t" | 1532 "movq (%0, %1, 8), %%mm0 \n\t" |
1600 PAVGB(%%mm0, %%mm1) | 1533 PAVGB(%%mm0, %%mm1) |
1601 "movq %%mm1, (%%ebx, %1, 2) \n\t" | 1534 "movq %%mm1, (%%ecx, %1, 2) \n\t" |
1602 | 1535 |
1603 : : "r" (src), "r" (stride) | 1536 : : "r" (src), "r" (stride) |
1604 : "%eax", "%ebx" | 1537 : "%eax", "%ecx" |
1605 ); | 1538 ); |
1606 #else | 1539 #else |
1607 int x; | 1540 int x; |
1608 src+= 4*stride; | 1541 src+= 4*stride; |
1609 for(x=0; x<8; x++) | 1542 for(x=0; x<8; x++) |
1629 { | 1562 { |
1630 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1563 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1631 src+= stride*3; | 1564 src+= stride*3; |
1632 asm volatile( | 1565 asm volatile( |
1633 "leal (%0, %1), %%eax \n\t" | 1566 "leal (%0, %1), %%eax \n\t" |
1634 "leal (%%eax, %1, 4), %%ebx \n\t" | 1567 "leal (%%eax, %1, 4), %%edx \n\t" |
1635 "leal (%%ebx, %1, 4), %%ecx \n\t" | 1568 "leal (%%edx, %1, 4), %%ecx \n\t" |
1636 "addl %1, %%ecx \n\t" | 1569 "addl %1, %%ecx \n\t" |
1637 "pxor %%mm7, %%mm7 \n\t" | 1570 "pxor %%mm7, %%mm7 \n\t" |
1638 // 0 1 2 3 4 5 6 7 8 9 10 | 1571 // 0 1 2 3 4 5 6 7 8 9 10 |
1639 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx | 1572 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
1640 | 1573 |
1641 #define DEINT_CUBIC(a,b,c,d,e)\ | 1574 #define DEINT_CUBIC(a,b,c,d,e)\ |
1642 "movq " #a ", %%mm0 \n\t"\ | 1575 "movq " #a ", %%mm0 \n\t"\ |
1643 "movq " #b ", %%mm1 \n\t"\ | 1576 "movq " #b ", %%mm1 \n\t"\ |
1644 "movq " #d ", %%mm2 \n\t"\ | 1577 "movq " #d ", %%mm2 \n\t"\ |
1658 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ | 1591 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ |
1659 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ | 1592 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ |
1660 "packuswb %%mm3, %%mm1 \n\t"\ | 1593 "packuswb %%mm3, %%mm1 \n\t"\ |
1661 "movq %%mm1, " #c " \n\t" | 1594 "movq %%mm1, " #c " \n\t" |
1662 | 1595 |
1663 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1)) | 1596 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) |
1664 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8)) | 1597 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) |
1665 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx)) | 1598 DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) |
1666 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2)) | 1599 DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) |
1667 | 1600 |
1668 : : "r" (src), "r" (stride) | 1601 : : "r" (src), "r" (stride) |
1669 : "%eax", "%ebx", "ecx" | 1602 : "%eax", "%edx", "ecx" |
1670 ); | 1603 ); |
1671 #else | 1604 #else |
1672 int x; | 1605 int x; |
1673 src+= stride*3; | 1606 src+= stride*3; |
1674 for(x=0; x<8; x++) | 1607 for(x=0; x<8; x++) |
1675 { | 1608 { |
1676 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; | 1609 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; |
1677 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; | 1610 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; |
1678 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; | 1611 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; |
1679 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4; | 1612 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4; |
1613 src++; | |
1614 } | |
1615 #endif | |
1616 } | |
1617 | |
1618 /** | |
1619 * Deinterlaces the given block | |
1620 * will be called for every 8x8 block and can read & write from line 4-15 | |
1621 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
1622 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
1623 * this filter will read lines 4-13 and write 5-11 | |
1624 * no clipping in C version | |
1625 */ | |
1626 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | |
1627 { | |
1628 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1629 src+= stride*4; | |
1630 asm volatile( | |
1631 "leal (%0, %1), %%eax \n\t" | |
1632 "leal (%%eax, %1, 4), %%edx \n\t" | |
1633 "pxor %%mm7, %%mm7 \n\t" | |
1634 "movq (%2), %%mm0 \n\t" | |
1635 // 0 1 2 3 4 5 6 7 8 9 10 | |
1636 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
1637 | |
1638 #define DEINT_FF(a,b,c,d)\ | |
1639 "movq " #a ", %%mm1 \n\t"\ | |
1640 "movq " #b ", %%mm2 \n\t"\ | |
1641 "movq " #c ", %%mm3 \n\t"\ | |
1642 "movq " #d ", %%mm4 \n\t"\ | |
1643 PAVGB(%%mm3, %%mm1) \ | |
1644 PAVGB(%%mm4, %%mm0) \ | |
1645 "movq %%mm0, %%mm3 \n\t"\ | |
1646 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1647 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1648 "movq %%mm1, %%mm4 \n\t"\ | |
1649 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1650 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
1651 "psllw $2, %%mm1 \n\t"\ | |
1652 "psllw $2, %%mm4 \n\t"\ | |
1653 "psubw %%mm0, %%mm1 \n\t"\ | |
1654 "psubw %%mm3, %%mm4 \n\t"\ | |
1655 "movq %%mm2, %%mm5 \n\t"\ | |
1656 "movq %%mm2, %%mm0 \n\t"\ | |
1657 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1658 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
1659 "paddw %%mm2, %%mm1 \n\t"\ | |
1660 "paddw %%mm5, %%mm4 \n\t"\ | |
1661 "psraw $2, %%mm1 \n\t"\ | |
1662 "psraw $2, %%mm4 \n\t"\ | |
1663 "packuswb %%mm4, %%mm1 \n\t"\ | |
1664 "movq %%mm1, " #b " \n\t"\ | |
1665 | |
1666 DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2)) | |
1667 DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) ) | |
1668 DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2)) | |
1669 DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) | |
1670 | |
1671 "movq %%mm0, (%2) \n\t" | |
1672 : : "r" (src), "r" (stride), "r"(tmp) | |
1673 : "%eax", "%edx" | |
1674 ); | |
1675 #else | |
1676 int x; | |
1677 src+= stride*4; | |
1678 for(x=0; x<8; x++) | |
1679 { | |
1680 int t1= tmp[x]; | |
1681 int t2= src[stride*1]; | |
1682 | |
1683 src[stride*1]= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3; | |
1684 t1= src[stride*4]; | |
1685 src[stride*3]= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3; | |
1686 t2= src[stride*6]; | |
1687 src[stride*5]= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3; | |
1688 t1= src[stride*8]; | |
1689 src[stride*7]= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3; | |
1690 tmp[x]= t1; | |
1691 | |
1680 src++; | 1692 src++; |
1681 } | 1693 } |
1682 #endif | 1694 #endif |
1683 } | 1695 } |
1684 | 1696 |
1694 { | 1706 { |
1695 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1696 src+= 4*stride; | 1708 src+= 4*stride; |
1697 asm volatile( | 1709 asm volatile( |
1698 "leal (%0, %1), %%eax \n\t" | 1710 "leal (%0, %1), %%eax \n\t" |
1699 "leal (%%eax, %1, 4), %%ebx \n\t" | 1711 "leal (%%eax, %1, 4), %%edx \n\t" |
1700 // 0 1 2 3 4 5 6 7 8 9 | 1712 // 0 1 2 3 4 5 6 7 8 9 |
1701 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1713 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
1702 | 1714 |
1703 "movq (%0), %%mm0 \n\t" // L0 | 1715 "movq (%0), %%mm0 \n\t" // L0 |
1704 "movq (%%eax, %1), %%mm1 \n\t" // L2 | 1716 "movq (%%eax, %1), %%mm1 \n\t" // L2 |
1705 PAVGB(%%mm1, %%mm0) // L0+L2 | 1717 PAVGB(%%mm1, %%mm0) // L0+L2 |
1706 "movq (%%eax), %%mm2 \n\t" // L1 | 1718 "movq (%%eax), %%mm2 \n\t" // L1 |
1712 "movq %%mm2, (%%eax) \n\t" | 1724 "movq %%mm2, (%%eax) \n\t" |
1713 "movq (%0, %1, 4), %%mm2 \n\t" // L4 | 1725 "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
1714 PAVGB(%%mm2, %%mm1) // L2+L4 | 1726 PAVGB(%%mm2, %%mm1) // L2+L4 |
1715 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 | 1727 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
1716 "movq %%mm1, (%%eax, %1) \n\t" | 1728 "movq %%mm1, (%%eax, %1) \n\t" |
1717 "movq (%%ebx), %%mm1 \n\t" // L5 | 1729 "movq (%%edx), %%mm1 \n\t" // L5 |
1718 PAVGB(%%mm1, %%mm0) // L3+L5 | 1730 PAVGB(%%mm1, %%mm0) // L3+L5 |
1719 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 | 1731 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
1720 "movq %%mm0, (%%eax, %1, 2) \n\t" | 1732 "movq %%mm0, (%%eax, %1, 2) \n\t" |
1721 "movq (%%ebx, %1), %%mm0 \n\t" // L6 | 1733 "movq (%%edx, %1), %%mm0 \n\t" // L6 |
1722 PAVGB(%%mm0, %%mm2) // L4+L6 | 1734 PAVGB(%%mm0, %%mm2) // L4+L6 |
1723 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 | 1735 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
1724 "movq %%mm2, (%0, %1, 4) \n\t" | 1736 "movq %%mm2, (%0, %1, 4) \n\t" |
1725 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 | 1737 "movq (%%edx, %1, 2), %%mm2 \n\t" // L7 |
1726 PAVGB(%%mm2, %%mm1) // L5+L7 | 1738 PAVGB(%%mm2, %%mm1) // L5+L7 |
1727 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 | 1739 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
1728 "movq %%mm1, (%%ebx) \n\t" | 1740 "movq %%mm1, (%%edx) \n\t" |
1729 "movq (%0, %1, 8), %%mm1 \n\t" // L8 | 1741 "movq (%0, %1, 8), %%mm1 \n\t" // L8 |
1730 PAVGB(%%mm1, %%mm0) // L6+L8 | 1742 PAVGB(%%mm1, %%mm0) // L6+L8 |
1731 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 | 1743 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
1732 "movq %%mm0, (%%ebx, %1) \n\t" | 1744 "movq %%mm0, (%%edx, %1) \n\t" |
1733 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9 | 1745 "movq (%%edx, %1, 4), %%mm0 \n\t" // L9 |
1734 PAVGB(%%mm0, %%mm2) // L7+L9 | 1746 PAVGB(%%mm0, %%mm2) // L7+L9 |
1735 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 | 1747 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
1736 "movq %%mm2, (%%ebx, %1, 2) \n\t" | 1748 "movq %%mm2, (%%edx, %1, 2) \n\t" |
1737 | 1749 |
1738 | 1750 |
1739 : : "r" (src), "r" (stride) | 1751 : : "r" (src), "r" (stride) |
1740 : "%eax", "%ebx" | 1752 : "%eax", "%edx" |
1741 ); | 1753 ); |
1742 #else | 1754 #else |
1743 int x; | 1755 int x; |
1744 src+= 4*stride; | 1756 src+= 4*stride; |
1745 for(x=0; x<8; x++) | 1757 for(x=0; x<8; x++) |
1768 #ifdef HAVE_MMX | 1780 #ifdef HAVE_MMX |
1769 src+= 4*stride; | 1781 src+= 4*stride; |
1770 #ifdef HAVE_MMX2 | 1782 #ifdef HAVE_MMX2 |
1771 asm volatile( | 1783 asm volatile( |
1772 "leal (%0, %1), %%eax \n\t" | 1784 "leal (%0, %1), %%eax \n\t" |
1773 "leal (%%eax, %1, 4), %%ebx \n\t" | 1785 "leal (%%eax, %1, 4), %%edx \n\t" |
1774 // 0 1 2 3 4 5 6 7 8 9 | 1786 // 0 1 2 3 4 5 6 7 8 9 |
1775 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1787 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
1776 | 1788 |
1777 "movq (%0), %%mm0 \n\t" // | 1789 "movq (%0), %%mm0 \n\t" // |
1778 "movq (%%eax, %1), %%mm2 \n\t" // | 1790 "movq (%%eax, %1), %%mm2 \n\t" // |
1779 "movq (%%eax), %%mm1 \n\t" // | 1791 "movq (%%eax), %%mm1 \n\t" // |
1780 "movq %%mm0, %%mm3 \n\t" | 1792 "movq %%mm0, %%mm3 \n\t" |
1791 "pminub %%mm3, %%mm1 \n\t" // | 1803 "pminub %%mm3, %%mm1 \n\t" // |
1792 "pmaxub %%mm0, %%mm1 \n\t" // | 1804 "pmaxub %%mm0, %%mm1 \n\t" // |
1793 "pminub %%mm1, %%mm2 \n\t" | 1805 "pminub %%mm1, %%mm2 \n\t" |
1794 "movq %%mm2, (%%eax, %1, 2) \n\t" | 1806 "movq %%mm2, (%%eax, %1, 2) \n\t" |
1795 | 1807 |
1796 "movq (%%ebx), %%mm2 \n\t" // | 1808 "movq (%%edx), %%mm2 \n\t" // |
1797 "movq (%%ebx, %1), %%mm1 \n\t" // | 1809 "movq (%%edx, %1), %%mm1 \n\t" // |
1798 "movq %%mm2, %%mm3 \n\t" | 1810 "movq %%mm2, %%mm3 \n\t" |
1799 "pmaxub %%mm0, %%mm2 \n\t" // | 1811 "pmaxub %%mm0, %%mm2 \n\t" // |
1800 "pminub %%mm3, %%mm0 \n\t" // | 1812 "pminub %%mm3, %%mm0 \n\t" // |
1801 "pmaxub %%mm1, %%mm0 \n\t" // | 1813 "pmaxub %%mm1, %%mm0 \n\t" // |
1802 "pminub %%mm0, %%mm2 \n\t" | 1814 "pminub %%mm0, %%mm2 \n\t" |
1803 "movq %%mm2, (%%ebx) \n\t" | 1815 "movq %%mm2, (%%edx) \n\t" |
1804 | 1816 |
1805 "movq (%%ebx, %1, 2), %%mm2 \n\t" // | 1817 "movq (%%edx, %1, 2), %%mm2 \n\t" // |
1806 "movq (%0, %1, 8), %%mm0 \n\t" // | 1818 "movq (%0, %1, 8), %%mm0 \n\t" // |
1807 "movq %%mm2, %%mm3 \n\t" | 1819 "movq %%mm2, %%mm3 \n\t" |
1808 "pmaxub %%mm0, %%mm2 \n\t" // | 1820 "pmaxub %%mm0, %%mm2 \n\t" // |
1809 "pminub %%mm3, %%mm0 \n\t" // | 1821 "pminub %%mm3, %%mm0 \n\t" // |
1810 "pmaxub %%mm1, %%mm0 \n\t" // | 1822 "pmaxub %%mm1, %%mm0 \n\t" // |
1811 "pminub %%mm0, %%mm2 \n\t" | 1823 "pminub %%mm0, %%mm2 \n\t" |
1812 "movq %%mm2, (%%ebx, %1, 2) \n\t" | 1824 "movq %%mm2, (%%edx, %1, 2) \n\t" |
1813 | 1825 |
1814 | 1826 |
1815 : : "r" (src), "r" (stride) | 1827 : : "r" (src), "r" (stride) |
1816 : "%eax", "%ebx" | 1828 : "%eax", "%edx" |
1817 ); | 1829 ); |
1818 | 1830 |
1819 #else // MMX without MMX2 | 1831 #else // MMX without MMX2 |
1820 asm volatile( | 1832 asm volatile( |
1821 "leal (%0, %1), %%eax \n\t" | 1833 "leal (%0, %1), %%eax \n\t" |
1822 "leal (%%eax, %1, 4), %%ebx \n\t" | 1834 "leal (%%eax, %1, 4), %%edx \n\t" |
1823 // 0 1 2 3 4 5 6 7 8 9 | 1835 // 0 1 2 3 4 5 6 7 8 9 |
1824 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1836 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
1825 "pxor %%mm7, %%mm7 \n\t" | 1837 "pxor %%mm7, %%mm7 \n\t" |
1826 | 1838 |
1827 #define MEDIAN(a,b,c)\ | 1839 #define MEDIAN(a,b,c)\ |
1828 "movq " #a ", %%mm0 \n\t"\ | 1840 "movq " #a ", %%mm0 \n\t"\ |
1829 "movq " #b ", %%mm2 \n\t"\ | 1841 "movq " #b ", %%mm2 \n\t"\ |
1848 "pand %%mm1, %%mm0 \n\t"\ | 1860 "pand %%mm1, %%mm0 \n\t"\ |
1849 "movq %%mm0, " #b " \n\t" | 1861 "movq %%mm0, " #b " \n\t" |
1850 | 1862 |
1851 MEDIAN((%0), (%%eax), (%%eax, %1)) | 1863 MEDIAN((%0), (%%eax), (%%eax, %1)) |
1852 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) | 1864 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) |
1853 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1)) | 1865 MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1)) |
1854 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8)) | 1866 MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) |
1855 | 1867 |
1856 : : "r" (src), "r" (stride) | 1868 : : "r" (src), "r" (stride) |
1857 : "%eax", "%ebx" | 1869 : "%eax", "%edx" |
1858 ); | 1870 ); |
1859 #endif // MMX | 1871 #endif // MMX |
1860 #else | 1872 #else |
1861 //FIXME | 1873 //FIXME |
1862 int x; | 1874 int x; |
1882 */ | 1894 */ |
1883 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) | 1895 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
1884 { | 1896 { |
1885 asm( | 1897 asm( |
1886 "leal (%0, %1), %%eax \n\t" | 1898 "leal (%0, %1), %%eax \n\t" |
1887 "leal (%%eax, %1, 4), %%ebx \n\t" | 1899 "leal (%%eax, %1, 4), %%edx \n\t" |
1888 // 0 1 2 3 4 5 6 7 8 9 | 1900 // 0 1 2 3 4 5 6 7 8 9 |
1889 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1901 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
1890 "movq (%0), %%mm0 \n\t" // 12345678 | 1902 "movq (%0), %%mm0 \n\t" // 12345678 |
1891 "movq (%%eax), %%mm1 \n\t" // abcdefgh | 1903 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
1892 "movq %%mm0, %%mm2 \n\t" // 12345678 | 1904 "movq %%mm0, %%mm2 \n\t" // 12345678 |
1893 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | 1905 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
1894 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | 1906 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
1920 "movd %%mm1, 96(%3) \n\t" | 1932 "movd %%mm1, 96(%3) \n\t" |
1921 "psrlq $32, %%mm1 \n\t" | 1933 "psrlq $32, %%mm1 \n\t" |
1922 "movd %%mm1, 112(%3) \n\t" | 1934 "movd %%mm1, 112(%3) \n\t" |
1923 | 1935 |
1924 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 | 1936 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
1925 "movq (%%ebx), %%mm1 \n\t" // abcdefgh | 1937 "movq (%%edx), %%mm1 \n\t" // abcdefgh |
1926 "movq %%mm0, %%mm2 \n\t" // 12345678 | 1938 "movq %%mm0, %%mm2 \n\t" // 12345678 |
1927 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | 1939 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
1928 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | 1940 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
1929 | 1941 |
1930 "movq (%%ebx, %1), %%mm1 \n\t" | 1942 "movq (%%edx, %1), %%mm1 \n\t" |
1931 "movq (%%ebx, %1, 2), %%mm3 \n\t" | 1943 "movq (%%edx, %1, 2), %%mm3 \n\t" |
1932 "movq %%mm1, %%mm4 \n\t" | 1944 "movq %%mm1, %%mm4 \n\t" |
1933 "punpcklbw %%mm3, %%mm1 \n\t" | 1945 "punpcklbw %%mm3, %%mm1 \n\t" |
1934 "punpckhbw %%mm3, %%mm4 \n\t" | 1946 "punpckhbw %%mm3, %%mm4 \n\t" |
1935 | 1947 |
1936 "movq %%mm0, %%mm3 \n\t" | 1948 "movq %%mm0, %%mm3 \n\t" |
1955 "psrlq $32, %%mm1 \n\t" | 1967 "psrlq $32, %%mm1 \n\t" |
1956 "movd %%mm1, 116(%3) \n\t" | 1968 "movd %%mm1, 116(%3) \n\t" |
1957 | 1969 |
1958 | 1970 |
1959 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) | 1971 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) |
1960 : "%eax", "%ebx" | 1972 : "%eax", "%edx" |
1961 ); | 1973 ); |
1962 } | 1974 } |
1963 | 1975 |
1964 /** | 1976 /** |
1965 * transposes the given 8x8 block | 1977 * transposes the given 8x8 block |
1966 */ | 1978 */ |
1967 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) | 1979 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
1968 { | 1980 { |
1969 asm( | 1981 asm( |
1970 "leal (%0, %1), %%eax \n\t" | 1982 "leal (%0, %1), %%eax \n\t" |
1971 "leal (%%eax, %1, 4), %%ebx \n\t" | 1983 "leal (%%eax, %1, 4), %%edx \n\t" |
1972 // 0 1 2 3 4 5 6 7 8 9 | 1984 // 0 1 2 3 4 5 6 7 8 9 |
1973 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1985 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
1974 "movq (%2), %%mm0 \n\t" // 12345678 | 1986 "movq (%2), %%mm0 \n\t" // 12345678 |
1975 "movq 16(%2), %%mm1 \n\t" // abcdefgh | 1987 "movq 16(%2), %%mm1 \n\t" // abcdefgh |
1976 "movq %%mm0, %%mm2 \n\t" // 12345678 | 1988 "movq %%mm0, %%mm2 \n\t" // 12345678 |
1977 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | 1989 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
1978 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | 1990 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
1996 "movd %%mm3, (%%eax, %1) \n\t" | 2008 "movd %%mm3, (%%eax, %1) \n\t" |
1997 "psrlq $32, %%mm3 \n\t" | 2009 "psrlq $32, %%mm3 \n\t" |
1998 "movd %%mm3, (%%eax, %1, 2) \n\t" | 2010 "movd %%mm3, (%%eax, %1, 2) \n\t" |
1999 "movd %%mm2, (%0, %1, 4) \n\t" | 2011 "movd %%mm2, (%0, %1, 4) \n\t" |
2000 "psrlq $32, %%mm2 \n\t" | 2012 "psrlq $32, %%mm2 \n\t" |
2001 "movd %%mm2, (%%ebx) \n\t" | 2013 "movd %%mm2, (%%edx) \n\t" |
2002 "movd %%mm1, (%%ebx, %1) \n\t" | 2014 "movd %%mm1, (%%edx, %1) \n\t" |
2003 "psrlq $32, %%mm1 \n\t" | 2015 "psrlq $32, %%mm1 \n\t" |
2004 "movd %%mm1, (%%ebx, %1, 2) \n\t" | 2016 "movd %%mm1, (%%edx, %1, 2) \n\t" |
2005 | 2017 |
2006 | 2018 |
2007 "movq 64(%2), %%mm0 \n\t" // 12345678 | 2019 "movq 64(%2), %%mm0 \n\t" // 12345678 |
2008 "movq 80(%2), %%mm1 \n\t" // abcdefgh | 2020 "movq 80(%2), %%mm1 \n\t" // abcdefgh |
2009 "movq %%mm0, %%mm2 \n\t" // 12345678 | 2021 "movq %%mm0, %%mm2 \n\t" // 12345678 |
2029 "movd %%mm3, 4(%%eax, %1) \n\t" | 2041 "movd %%mm3, 4(%%eax, %1) \n\t" |
2030 "psrlq $32, %%mm3 \n\t" | 2042 "psrlq $32, %%mm3 \n\t" |
2031 "movd %%mm3, 4(%%eax, %1, 2) \n\t" | 2043 "movd %%mm3, 4(%%eax, %1, 2) \n\t" |
2032 "movd %%mm2, 4(%0, %1, 4) \n\t" | 2044 "movd %%mm2, 4(%0, %1, 4) \n\t" |
2033 "psrlq $32, %%mm2 \n\t" | 2045 "psrlq $32, %%mm2 \n\t" |
2034 "movd %%mm2, 4(%%ebx) \n\t" | 2046 "movd %%mm2, 4(%%edx) \n\t" |
2035 "movd %%mm1, 4(%%ebx, %1) \n\t" | 2047 "movd %%mm1, 4(%%edx, %1) \n\t" |
2036 "psrlq $32, %%mm1 \n\t" | 2048 "psrlq $32, %%mm1 \n\t" |
2037 "movd %%mm1, 4(%%ebx, %1, 2) \n\t" | 2049 "movd %%mm1, 4(%%edx, %1, 2) \n\t" |
2038 | 2050 |
2039 :: "r" (dst), "r" (dstStride), "r" (src) | 2051 :: "r" (dst), "r" (dstStride), "r" (src) |
2040 : "%eax", "%ebx" | 2052 : "%eax", "%edx" |
2041 ); | 2053 ); |
2042 } | 2054 } |
2043 #endif | 2055 #endif |
2044 //static int test=0; | 2056 //static int test=0; |
2045 | 2057 |
2046 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride, | 2058 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
2047 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) | 2059 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
2048 { | 2060 { |
2061 // to save a register (FIXME do this outside of the loops) | |
2062 tempBluredPast[127]= maxNoise[0]; | |
2063 tempBluredPast[128]= maxNoise[1]; | |
2064 tempBluredPast[129]= maxNoise[2]; | |
2065 | |
2049 #define FAST_L2_DIFF | 2066 #define FAST_L2_DIFF |
2050 //#define L1_DIFF //u should change the thresholds too if u try that one | 2067 //#define L1_DIFF //u should change the thresholds too if u try that one |
2051 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2068 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2052 asm volatile( | 2069 asm volatile( |
2053 "leal (%2, %2, 2), %%eax \n\t" // 3*stride | 2070 "leal (%2, %2, 2), %%eax \n\t" // 3*stride |
2054 "leal (%2, %2, 4), %%ebx \n\t" // 5*stride | 2071 "leal (%2, %2, 4), %%edx \n\t" // 5*stride |
2055 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | 2072 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
2056 // 0 1 2 3 4 5 6 7 8 9 | 2073 // 0 1 2 3 4 5 6 7 8 9 |
2057 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+ebx %x+2eax %x+ecx %x+8%2 | 2074 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
2058 //FIXME reorder? | 2075 //FIXME reorder? |
2059 #ifdef L1_DIFF //needs mmx2 | 2076 #ifdef L1_DIFF //needs mmx2 |
2060 "movq (%0), %%mm0 \n\t" // L0 | 2077 "movq (%0), %%mm0 \n\t" // L0 |
2061 "psadbw (%1), %%mm0 \n\t" // |L0-R0| | 2078 "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
2062 "movq (%0, %2), %%mm1 \n\t" // L1 | 2079 "movq (%0, %2), %%mm1 \n\t" // L1 |
2067 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| | 2084 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| |
2068 | 2085 |
2069 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | 2086 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
2070 "paddw %%mm1, %%mm0 \n\t" | 2087 "paddw %%mm1, %%mm0 \n\t" |
2071 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| | 2088 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
2072 "movq (%0, %%ebx), %%mm5 \n\t" // L5 | 2089 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
2073 "paddw %%mm2, %%mm0 \n\t" | 2090 "paddw %%mm2, %%mm0 \n\t" |
2074 "psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5| | 2091 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5| |
2075 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 | 2092 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
2076 "paddw %%mm3, %%mm0 \n\t" | 2093 "paddw %%mm3, %%mm0 \n\t" |
2077 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| | 2094 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| |
2078 "movq (%0, %%ecx), %%mm7 \n\t" // L7 | 2095 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
2079 "paddw %%mm4, %%mm0 \n\t" | 2096 "paddw %%mm4, %%mm0 \n\t" |
2102 L2_DIFF_CORE((%0), (%1)) | 2119 L2_DIFF_CORE((%0), (%1)) |
2103 L2_DIFF_CORE((%0, %2), (%1, %2)) | 2120 L2_DIFF_CORE((%0, %2), (%1, %2)) |
2104 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) | 2121 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
2105 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) | 2122 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
2106 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) | 2123 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
2107 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx)) | 2124 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
2108 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) | 2125 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
2109 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) | 2126 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
2110 | 2127 |
2111 #else | 2128 #else |
2112 "pxor %%mm7, %%mm7 \n\t" | 2129 "pxor %%mm7, %%mm7 \n\t" |
2130 L2_DIFF_CORE((%0), (%1)) | 2147 L2_DIFF_CORE((%0), (%1)) |
2131 L2_DIFF_CORE((%0, %2), (%1, %2)) | 2148 L2_DIFF_CORE((%0, %2), (%1, %2)) |
2132 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) | 2149 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
2133 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) | 2150 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
2134 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) | 2151 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
2135 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx)) | 2152 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
2136 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) | 2153 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
2137 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) | 2154 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
2138 | 2155 |
2139 #endif | 2156 #endif |
2140 | 2157 |
2141 "movq %%mm0, %%mm4 \n\t" | 2158 "movq %%mm0, %%mm4 \n\t" |
2142 "psrlq $32, %%mm0 \n\t" | 2159 "psrlq $32, %%mm0 \n\t" |
2143 "paddd %%mm0, %%mm4 \n\t" | 2160 "paddd %%mm0, %%mm4 \n\t" |
2144 "movd %%mm4, %%ecx \n\t" | 2161 "movd %%mm4, %%ecx \n\t" |
2145 "shll $2, %%ecx \n\t" | 2162 "shll $2, %%ecx \n\t" |
2146 "movl %3, %%ebx \n\t" | 2163 "movl %3, %%edx \n\t" |
2147 "addl -4(%%ebx), %%ecx \n\t" | 2164 "addl -4(%%edx), %%ecx \n\t" |
2148 "addl 4(%%ebx), %%ecx \n\t" | 2165 "addl 4(%%edx), %%ecx \n\t" |
2149 "addl -1024(%%ebx), %%ecx \n\t" | 2166 "addl -1024(%%edx), %%ecx \n\t" |
2150 "addl $4, %%ecx \n\t" | 2167 "addl $4, %%ecx \n\t" |
2151 "addl 1024(%%ebx), %%ecx \n\t" | 2168 "addl 1024(%%edx), %%ecx \n\t" |
2152 "shrl $3, %%ecx \n\t" | 2169 "shrl $3, %%ecx \n\t" |
2153 "movl %%ecx, (%%ebx) \n\t" | 2170 "movl %%ecx, (%%edx) \n\t" |
2154 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride | |
2155 | 2171 |
2156 // "movl %3, %%ecx \n\t" | 2172 // "movl %3, %%ecx \n\t" |
2157 // "movl %%ecx, test \n\t" | 2173 // "movl %%ecx, test \n\t" |
2158 // "jmp 4f \n\t" | 2174 // "jmp 4f \n\t" |
2159 "cmpl 4+"MANGLE(maxTmpNoise)", %%ecx \n\t" | 2175 "cmpl 512(%%edx), %%ecx \n\t" |
2160 " jb 2f \n\t" | 2176 " jb 2f \n\t" |
2161 "cmpl 8+"MANGLE(maxTmpNoise)", %%ecx \n\t" | 2177 "cmpl 516(%%edx), %%ecx \n\t" |
2162 " jb 1f \n\t" | 2178 " jb 1f \n\t" |
2163 | 2179 |
2164 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | 2180 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
2181 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2165 "movq (%0), %%mm0 \n\t" // L0 | 2182 "movq (%0), %%mm0 \n\t" // L0 |
2166 "movq (%0, %2), %%mm1 \n\t" // L1 | 2183 "movq (%0, %2), %%mm1 \n\t" // L1 |
2167 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | 2184 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
2168 "movq (%0, %%eax), %%mm3 \n\t" // L3 | 2185 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
2169 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | 2186 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
2170 "movq (%0, %%ebx), %%mm5 \n\t" // L5 | 2187 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
2171 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 | 2188 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
2172 "movq (%0, %%ecx), %%mm7 \n\t" // L7 | 2189 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
2173 "movq %%mm0, (%1) \n\t" // L0 | 2190 "movq %%mm0, (%1) \n\t" // L0 |
2174 "movq %%mm1, (%1, %2) \n\t" // L1 | 2191 "movq %%mm1, (%1, %2) \n\t" // L1 |
2175 "movq %%mm2, (%1, %2, 2) \n\t" // L2 | 2192 "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
2176 "movq %%mm3, (%1, %%eax) \n\t" // L3 | 2193 "movq %%mm3, (%1, %%eax) \n\t" // L3 |
2177 "movq %%mm4, (%1, %2, 4) \n\t" // L4 | 2194 "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
2178 "movq %%mm5, (%1, %%ebx) \n\t" // L5 | 2195 "movq %%mm5, (%1, %%edx) \n\t" // L5 |
2179 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 | 2196 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 |
2180 "movq %%mm7, (%1, %%ecx) \n\t" // L7 | 2197 "movq %%mm7, (%1, %%ecx) \n\t" // L7 |
2181 "jmp 4f \n\t" | 2198 "jmp 4f \n\t" |
2182 | 2199 |
2183 "1: \n\t" | 2200 "1: \n\t" |
2184 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | 2201 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
2202 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2185 "movq (%0), %%mm0 \n\t" // L0 | 2203 "movq (%0), %%mm0 \n\t" // L0 |
2186 PAVGB((%1), %%mm0) // L0 | 2204 PAVGB((%1), %%mm0) // L0 |
2187 "movq (%0, %2), %%mm1 \n\t" // L1 | 2205 "movq (%0, %2), %%mm1 \n\t" // L1 |
2188 PAVGB((%1, %2), %%mm1) // L1 | 2206 PAVGB((%1, %2), %%mm1) // L1 |
2189 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | 2207 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
2190 PAVGB((%1, %2, 2), %%mm2) // L2 | 2208 PAVGB((%1, %2, 2), %%mm2) // L2 |
2191 "movq (%0, %%eax), %%mm3 \n\t" // L3 | 2209 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
2192 PAVGB((%1, %%eax), %%mm3) // L3 | 2210 PAVGB((%1, %%eax), %%mm3) // L3 |
2193 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | 2211 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
2194 PAVGB((%1, %2, 4), %%mm4) // L4 | 2212 PAVGB((%1, %2, 4), %%mm4) // L4 |
2195 "movq (%0, %%ebx), %%mm5 \n\t" // L5 | 2213 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
2196 PAVGB((%1, %%ebx), %%mm5) // L5 | 2214 PAVGB((%1, %%edx), %%mm5) // L5 |
2197 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 | 2215 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
2198 PAVGB((%1, %%eax, 2), %%mm6) // L6 | 2216 PAVGB((%1, %%eax, 2), %%mm6) // L6 |
2199 "movq (%0, %%ecx), %%mm7 \n\t" // L7 | 2217 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
2200 PAVGB((%1, %%ecx), %%mm7) // L7 | 2218 PAVGB((%1, %%ecx), %%mm7) // L7 |
2201 "movq %%mm0, (%1) \n\t" // R0 | 2219 "movq %%mm0, (%1) \n\t" // R0 |
2202 "movq %%mm1, (%1, %2) \n\t" // R1 | 2220 "movq %%mm1, (%1, %2) \n\t" // R1 |
2203 "movq %%mm2, (%1, %2, 2) \n\t" // R2 | 2221 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
2204 "movq %%mm3, (%1, %%eax) \n\t" // R3 | 2222 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
2205 "movq %%mm4, (%1, %2, 4) \n\t" // R4 | 2223 "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
2206 "movq %%mm5, (%1, %%ebx) \n\t" // R5 | 2224 "movq %%mm5, (%1, %%edx) \n\t" // R5 |
2207 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 | 2225 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 |
2208 "movq %%mm7, (%1, %%ecx) \n\t" // R7 | 2226 "movq %%mm7, (%1, %%ecx) \n\t" // R7 |
2209 "movq %%mm0, (%0) \n\t" // L0 | 2227 "movq %%mm0, (%0) \n\t" // L0 |
2210 "movq %%mm1, (%0, %2) \n\t" // L1 | 2228 "movq %%mm1, (%0, %2) \n\t" // L1 |
2211 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | 2229 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
2212 "movq %%mm3, (%0, %%eax) \n\t" // L3 | 2230 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
2213 "movq %%mm4, (%0, %2, 4) \n\t" // L4 | 2231 "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
2214 "movq %%mm5, (%0, %%ebx) \n\t" // L5 | 2232 "movq %%mm5, (%0, %%edx) \n\t" // L5 |
2215 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 | 2233 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
2216 "movq %%mm7, (%0, %%ecx) \n\t" // L7 | 2234 "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
2217 "jmp 4f \n\t" | 2235 "jmp 4f \n\t" |
2218 | 2236 |
2219 "2: \n\t" | 2237 "2: \n\t" |
2220 "cmpl "MANGLE(maxTmpNoise)", %%ecx \n\t" | 2238 "cmpl 508(%%edx), %%ecx \n\t" |
2221 " jb 3f \n\t" | 2239 " jb 3f \n\t" |
2222 | 2240 |
2223 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | 2241 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
2242 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2224 "movq (%0), %%mm0 \n\t" // L0 | 2243 "movq (%0), %%mm0 \n\t" // L0 |
2225 "movq (%0, %2), %%mm1 \n\t" // L1 | 2244 "movq (%0, %2), %%mm1 \n\t" // L1 |
2226 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | 2245 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
2227 "movq (%0, %%eax), %%mm3 \n\t" // L3 | 2246 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
2228 "movq (%1), %%mm4 \n\t" // R0 | 2247 "movq (%1), %%mm4 \n\t" // R0 |
2245 "movq %%mm1, (%0, %2) \n\t" // L1 | 2264 "movq %%mm1, (%0, %2) \n\t" // L1 |
2246 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | 2265 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
2247 "movq %%mm3, (%0, %%eax) \n\t" // L3 | 2266 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
2248 | 2267 |
2249 "movq (%0, %2, 4), %%mm0 \n\t" // L4 | 2268 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
2250 "movq (%0, %%ebx), %%mm1 \n\t" // L5 | 2269 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
2251 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 | 2270 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
2252 "movq (%0, %%ecx), %%mm3 \n\t" // L7 | 2271 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
2253 "movq (%1, %2, 4), %%mm4 \n\t" // R4 | 2272 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
2254 "movq (%1, %%ebx), %%mm5 \n\t" // R5 | 2273 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
2255 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 | 2274 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
2256 "movq (%1, %%ecx), %%mm7 \n\t" // R7 | 2275 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
2257 PAVGB(%%mm4, %%mm0) | 2276 PAVGB(%%mm4, %%mm0) |
2258 PAVGB(%%mm5, %%mm1) | 2277 PAVGB(%%mm5, %%mm1) |
2259 PAVGB(%%mm6, %%mm2) | 2278 PAVGB(%%mm6, %%mm2) |
2261 PAVGB(%%mm4, %%mm0) | 2280 PAVGB(%%mm4, %%mm0) |
2262 PAVGB(%%mm5, %%mm1) | 2281 PAVGB(%%mm5, %%mm1) |
2263 PAVGB(%%mm6, %%mm2) | 2282 PAVGB(%%mm6, %%mm2) |
2264 PAVGB(%%mm7, %%mm3) | 2283 PAVGB(%%mm7, %%mm3) |
2265 "movq %%mm0, (%1, %2, 4) \n\t" // R4 | 2284 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
2266 "movq %%mm1, (%1, %%ebx) \n\t" // R5 | 2285 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
2267 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 | 2286 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
2268 "movq %%mm3, (%1, %%ecx) \n\t" // R7 | 2287 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
2269 "movq %%mm0, (%0, %2, 4) \n\t" // L4 | 2288 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
2270 "movq %%mm1, (%0, %%ebx) \n\t" // L5 | 2289 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
2271 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 | 2290 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
2272 "movq %%mm3, (%0, %%ecx) \n\t" // L7 | 2291 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
2273 "jmp 4f \n\t" | 2292 "jmp 4f \n\t" |
2274 | 2293 |
2275 "3: \n\t" | 2294 "3: \n\t" |
2276 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | 2295 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
2296 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2277 "movq (%0), %%mm0 \n\t" // L0 | 2297 "movq (%0), %%mm0 \n\t" // L0 |
2278 "movq (%0, %2), %%mm1 \n\t" // L1 | 2298 "movq (%0, %2), %%mm1 \n\t" // L1 |
2279 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | 2299 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
2280 "movq (%0, %%eax), %%mm3 \n\t" // L3 | 2300 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
2281 "movq (%1), %%mm4 \n\t" // R0 | 2301 "movq (%1), %%mm4 \n\t" // R0 |
2302 "movq %%mm1, (%0, %2) \n\t" // L1 | 2322 "movq %%mm1, (%0, %2) \n\t" // L1 |
2303 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | 2323 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
2304 "movq %%mm3, (%0, %%eax) \n\t" // L3 | 2324 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
2305 | 2325 |
2306 "movq (%0, %2, 4), %%mm0 \n\t" // L4 | 2326 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
2307 "movq (%0, %%ebx), %%mm1 \n\t" // L5 | 2327 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
2308 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 | 2328 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
2309 "movq (%0, %%ecx), %%mm3 \n\t" // L7 | 2329 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
2310 "movq (%1, %2, 4), %%mm4 \n\t" // R4 | 2330 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
2311 "movq (%1, %%ebx), %%mm5 \n\t" // R5 | 2331 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
2312 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 | 2332 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
2313 "movq (%1, %%ecx), %%mm7 \n\t" // R7 | 2333 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
2314 PAVGB(%%mm4, %%mm0) | 2334 PAVGB(%%mm4, %%mm0) |
2315 PAVGB(%%mm5, %%mm1) | 2335 PAVGB(%%mm5, %%mm1) |
2316 PAVGB(%%mm6, %%mm2) | 2336 PAVGB(%%mm6, %%mm2) |
2322 PAVGB(%%mm4, %%mm0) | 2342 PAVGB(%%mm4, %%mm0) |
2323 PAVGB(%%mm5, %%mm1) | 2343 PAVGB(%%mm5, %%mm1) |
2324 PAVGB(%%mm6, %%mm2) | 2344 PAVGB(%%mm6, %%mm2) |
2325 PAVGB(%%mm7, %%mm3) | 2345 PAVGB(%%mm7, %%mm3) |
2326 "movq %%mm0, (%1, %2, 4) \n\t" // R4 | 2346 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
2327 "movq %%mm1, (%1, %%ebx) \n\t" // R5 | 2347 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
2328 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 | 2348 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
2329 "movq %%mm3, (%1, %%ecx) \n\t" // R7 | 2349 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
2330 "movq %%mm0, (%0, %2, 4) \n\t" // L4 | 2350 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
2331 "movq %%mm1, (%0, %%ebx) \n\t" // L5 | 2351 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
2332 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 | 2352 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
2333 "movq %%mm3, (%0, %%ecx) \n\t" // L7 | 2353 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
2334 | 2354 |
2335 "4: \n\t" | 2355 "4: \n\t" |
2336 | 2356 |
2337 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) | 2357 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) |
2338 : "%eax", "%ebx", "%ecx", "memory" | 2358 : "%eax", "%edx", "%ecx", "memory" |
2339 ); | 2359 ); |
2340 //printf("%d\n", test); | 2360 //printf("%d\n", test); |
2341 #else | 2361 #else |
2342 int y; | 2362 int y; |
2343 int d=0; | 2363 int d=0; |
2441 } | 2461 } |
2442 #endif | 2462 #endif |
2443 } | 2463 } |
2444 | 2464 |
2445 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | 2465 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
2446 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); | 2466 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
2447 | 2467 |
2448 /** | 2468 /** |
2449 * Copies a block from src to dst and fixes the blacklevel | 2469 * Copies a block from src to dst and fixes the blacklevel |
2450 * levelFix == 0 -> dont touch the brighness & contrast | 2470 * levelFix == 0 -> dont touch the brighness & contrast |
2451 */ | 2471 */ |
2452 #undef SCALED_CPY | 2472 #undef SCALED_CPY |
2453 | 2473 |
2454 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, | 2474 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
2455 int levelFix) | 2475 int levelFix, int64_t *packedOffsetAndScale) |
2456 { | 2476 { |
2457 #ifndef HAVE_MMX | 2477 #ifndef HAVE_MMX |
2458 int i; | 2478 int i; |
2459 #endif | 2479 #endif |
2460 if(levelFix) | 2480 if(levelFix) |
2461 { | 2481 { |
2462 #ifdef HAVE_MMX | 2482 #ifdef HAVE_MMX |
2463 asm volatile( | 2483 asm volatile( |
2464 "leal (%0,%2), %%eax \n\t" | 2484 "movq (%%eax), %%mm2 \n\t" // packedYOffset |
2465 "leal (%1,%3), %%ebx \n\t" | 2485 "movq 8(%%eax), %%mm3 \n\t" // packedYScale |
2466 "movq "MANGLE(packedYOffset)", %%mm2\n\t" | 2486 "leal (%2,%4), %%eax \n\t" |
2467 "movq "MANGLE(packedYScale)", %%mm3\n\t" | 2487 "leal (%3,%5), %%edx \n\t" |
2468 "pxor %%mm4, %%mm4 \n\t" | 2488 "pxor %%mm4, %%mm4 \n\t" |
2469 #ifdef HAVE_MMX2 | 2489 #ifdef HAVE_MMX2 |
2470 #define SCALED_CPY(src1, src2, dst1, dst2) \ | 2490 #define SCALED_CPY(src1, src2, dst1, dst2) \ |
2471 "movq " #src1 ", %%mm0 \n\t"\ | 2491 "movq " #src1 ", %%mm0 \n\t"\ |
2472 "movq " #src1 ", %%mm5 \n\t"\ | 2492 "movq " #src1 ", %%mm5 \n\t"\ |
2516 "movq %%mm0, " #dst1 " \n\t"\ | 2536 "movq %%mm0, " #dst1 " \n\t"\ |
2517 "movq %%mm1, " #dst2 " \n\t"\ | 2537 "movq %%mm1, " #dst2 " \n\t"\ |
2518 | 2538 |
2519 #endif //!HAVE_MMX2 | 2539 #endif //!HAVE_MMX2 |
2520 | 2540 |
2521 SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | 2541 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
2522 SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2)) | 2542 SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) |
2523 SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4)) | 2543 SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4)) |
2524 "leal (%%eax,%2,4), %%eax \n\t" | 2544 "leal (%%eax,%4,4), %%eax \n\t" |
2525 "leal (%%ebx,%3,4), %%ebx \n\t" | 2545 "leal (%%edx,%5,4), %%edx \n\t" |
2526 SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2)) | 2546 SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) |
2527 | 2547 |
2528 | 2548 |
2529 : : "r"(src), | 2549 : "=&a" (packedOffsetAndScale) |
2550 : "0" (packedOffsetAndScale), | |
2551 "r"(src), | |
2530 "r"(dst), | 2552 "r"(dst), |
2531 "r" (srcStride), | 2553 "r" (srcStride), |
2532 "r" (dstStride) | 2554 "r" (dstStride) |
2533 : "%eax", "%ebx" | 2555 : "%edx" |
2534 ); | 2556 ); |
2535 #else | 2557 #else |
2536 for(i=0; i<8; i++) | 2558 for(i=0; i<8; i++) |
2537 memcpy( &(dst[dstStride*i]), | 2559 memcpy( &(dst[dstStride*i]), |
2538 &(src[srcStride*i]), BLOCK_SIZE); | 2560 &(src[srcStride*i]), BLOCK_SIZE); |
2541 else | 2563 else |
2542 { | 2564 { |
2543 #ifdef HAVE_MMX | 2565 #ifdef HAVE_MMX |
2544 asm volatile( | 2566 asm volatile( |
2545 "leal (%0,%2), %%eax \n\t" | 2567 "leal (%0,%2), %%eax \n\t" |
2546 "leal (%1,%3), %%ebx \n\t" | 2568 "leal (%1,%3), %%edx \n\t" |
2547 | 2569 |
2548 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ | 2570 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ |
2549 "movq " #src1 ", %%mm0 \n\t"\ | 2571 "movq " #src1 ", %%mm0 \n\t"\ |
2550 "movq " #src2 ", %%mm1 \n\t"\ | 2572 "movq " #src2 ", %%mm1 \n\t"\ |
2551 "movq %%mm0, " #dst1 " \n\t"\ | 2573 "movq %%mm0, " #dst1 " \n\t"\ |
2552 "movq %%mm1, " #dst2 " \n\t"\ | 2574 "movq %%mm1, " #dst2 " \n\t"\ |
2553 | 2575 |
2554 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | 2576 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) |
2555 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2)) | 2577 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) |
2556 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4)) | 2578 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) |
2557 "leal (%%eax,%2,4), %%eax \n\t" | 2579 "leal (%%eax,%2,4), %%eax \n\t" |
2558 "leal (%%ebx,%3,4), %%ebx \n\t" | 2580 "leal (%%edx,%3,4), %%edx \n\t" |
2559 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2)) | 2581 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) |
2560 | 2582 |
2561 : : "r" (src), | 2583 : : "r" (src), |
2562 "r" (dst), | 2584 "r" (dst), |
2563 "r" (srcStride), | 2585 "r" (srcStride), |
2564 "r" (dstStride) | 2586 "r" (dstStride) |
2565 : "%eax", "%ebx" | 2587 : "%eax", "%edx" |
2566 ); | 2588 ); |
2567 #else | 2589 #else |
2568 for(i=0; i<8; i++) | 2590 for(i=0; i<8; i++) |
2569 memcpy( &(dst[dstStride*i]), | 2591 memcpy( &(dst[dstStride*i]), |
2570 &(src[srcStride*i]), BLOCK_SIZE); | 2592 &(src[srcStride*i]), BLOCK_SIZE); |
2600 | 2622 |
2601 /** | 2623 /** |
2602 * Filters array of bytes (Y or U or V values) | 2624 * Filters array of bytes (Y or U or V values) |
2603 */ | 2625 */ |
2604 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | 2626 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
2605 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode) | 2627 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
2606 { | 2628 { |
2629 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access | |
2607 int x,y; | 2630 int x,y; |
2608 #ifdef COMPILE_TIME_MODE | 2631 #ifdef COMPILE_TIME_MODE |
2609 const int mode= COMPILE_TIME_MODE; | 2632 const int mode= COMPILE_TIME_MODE; |
2610 #else | 2633 #else |
2611 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode; | 2634 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
2612 #endif | 2635 #endif |
2613 /* we need 64bit here otherwise we'll going to have a problem | |
2614 after watching a black picture for 5 hours*/ | |
2615 static uint64_t *yHistogram= NULL; | |
2616 int black=0, white=255; // blackest black and whitest white in the picture | 2636 int black=0, white=255; // blackest black and whitest white in the picture |
2617 int QPCorrecture= 256*256; | 2637 int QPCorrecture= 256*256; |
2618 | 2638 |
2619 /* Temporary buffers for handling the last row(s) */ | |
2620 static uint8_t *tempDst= NULL; | |
2621 static uint8_t *tempSrc= NULL; | |
2622 | |
2623 /* Temporary buffers for handling the last block */ | |
2624 static uint8_t *tempDstBlock= NULL; | |
2625 static uint8_t *tempSrcBlock= NULL; | |
2626 | |
2627 /* Temporal noise reducing buffers */ | |
2628 static uint8_t *tempBlured[3]= {NULL,NULL,NULL}; | |
2629 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL}; | |
2630 | |
2631 int copyAhead; | 2639 int copyAhead; |
2632 | 2640 |
2633 #ifdef PP_FUNNY_STRIDE | 2641 //FIXME remove |
2634 uint8_t *dstBlockPtrBackup; | 2642 uint64_t * const yHistogram= c.yHistogram; |
2635 uint8_t *srcBlockPtrBackup; | 2643 uint8_t * const tempSrc= c.tempSrc; |
2636 #endif | 2644 uint8_t * const tempDst= c.tempDst; |
2637 | 2645 |
2638 #ifdef MORE_TIMING | 2646 c.dcOffset= c.ppMode.maxDcDiff; |
2639 long long T0, T1, diffTime=0; | 2647 c.dcThreshold= c.ppMode.maxDcDiff*2 + 1; |
2640 #endif | |
2641 #ifdef TIMING | |
2642 long long memcpyTime=0, vertTime=0, horizTime=0, sumTime; | |
2643 sumTime= rdtsc(); | |
2644 #endif | |
2645 dcOffset= ppMode->maxDcDiff; | |
2646 dcThreshold= ppMode->maxDcDiff*2 + 1; | |
2647 | 2648 |
2648 #ifdef HAVE_MMX | 2649 #ifdef HAVE_MMX |
2649 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; | 2650 c.mmxDcOffset= 0x7F - c.dcOffset; |
2650 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; | 2651 c.mmxDcThreshold= 0x7F - c.dcThreshold; |
2651 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; | 2652 |
2652 | 2653 c.mmxDcOffset*= 0x0101010101010101LL; |
2653 mmxDCOffset= 0x7F - dcOffset; | 2654 c.mmxDcThreshold*= 0x0101010101010101LL; |
2654 mmxDCThreshold= 0x7F - dcThreshold; | |
2655 | |
2656 mmxDCOffset*= 0x0101010101010101LL; | |
2657 mmxDCThreshold*= 0x0101010101010101LL; | |
2658 #endif | 2655 #endif |
2659 | 2656 |
2660 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; | 2657 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
2661 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14; | 2658 else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
2659 || (mode & FFMPEG_DEINT_FILTER)) copyAhead=14; | |
2662 else if( (mode & V_DEBLOCK) | 2660 else if( (mode & V_DEBLOCK) |
2663 || (mode & LINEAR_IPOL_DEINT_FILTER) | 2661 || (mode & LINEAR_IPOL_DEINT_FILTER) |
2664 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; | 2662 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; |
2665 else if(mode & V_X1_FILTER) copyAhead=11; | 2663 else if(mode & V_X1_FILTER) copyAhead=11; |
2666 else if(mode & V_RK1_FILTER) copyAhead=10; | 2664 // else if(mode & V_RK1_FILTER) copyAhead=10; |
2667 else if(mode & DERING) copyAhead=9; | 2665 else if(mode & DERING) copyAhead=9; |
2668 else copyAhead=8; | 2666 else copyAhead=8; |
2669 | 2667 |
2670 copyAhead-= 8; | 2668 copyAhead-= 8; |
2671 | |
2672 if(tempDst==NULL) | |
2673 { | |
2674 tempDst= (uint8_t*)memalign(8, 1024*24); | |
2675 tempSrc= (uint8_t*)memalign(8, 1024*24); | |
2676 tempDstBlock= (uint8_t*)memalign(8, 1024*24); | |
2677 tempSrcBlock= (uint8_t*)memalign(8, 1024*24); | |
2678 } | |
2679 | |
2680 if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER)) | |
2681 { | |
2682 // printf("%d %d %d\n", isColor, dstStride, height); | |
2683 //FIXME works only as long as the size doesnt increase | |
2684 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end | |
2685 tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024); | |
2686 tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024); | |
2687 | |
2688 memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024); | |
2689 memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024); | |
2690 } | |
2691 | |
2692 if(!yHistogram) | |
2693 { | |
2694 int i; | |
2695 yHistogram= (uint64_t*)malloc(8*256); | |
2696 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256; | |
2697 | |
2698 if(mode & FULL_Y_RANGE) | |
2699 { | |
2700 ppMode->maxAllowedY=255; | |
2701 ppMode->minAllowedY=0; | |
2702 } | |
2703 } | |
2704 | 2669 |
2705 if(!isColor) | 2670 if(!isColor) |
2706 { | 2671 { |
2707 uint64_t sum= 0; | 2672 uint64_t sum= 0; |
2708 int i; | 2673 int i; |
2709 static int framenum= -1; | |
2710 uint64_t maxClipped; | 2674 uint64_t maxClipped; |
2711 uint64_t clipped; | 2675 uint64_t clipped; |
2712 double scale; | 2676 double scale; |
2713 | 2677 |
2714 framenum++; | 2678 c.frameNum++; |
2715 if(framenum == 1) yHistogram[0]= width*height/64*15/256; | 2679 // first frame is fscked so we ignore it |
2680 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; | |
2716 | 2681 |
2717 for(i=0; i<256; i++) | 2682 for(i=0; i<256; i++) |
2718 { | 2683 { |
2719 sum+= yHistogram[i]; | 2684 sum+= yHistogram[i]; |
2720 // printf("%d ", yHistogram[i]); | 2685 // printf("%d ", yHistogram[i]); |
2736 { | 2701 { |
2737 if(clipped < maxClipped) break; | 2702 if(clipped < maxClipped) break; |
2738 clipped-= yHistogram[white]; | 2703 clipped-= yHistogram[white]; |
2739 } | 2704 } |
2740 | 2705 |
2741 scale= (double)(ppMode->maxAllowedY - ppMode->minAllowedY) / (double)(white-black); | 2706 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
2742 | 2707 |
2743 #ifdef HAVE_MMX2 | 2708 #ifdef HAVE_MMX2 |
2744 packedYScale= (uint16_t)(scale*256.0 + 0.5); | 2709 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
2745 packedYOffset= (((black*packedYScale)>>8) - ppMode->minAllowedY) & 0xFFFF; | 2710 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; |
2746 #else | 2711 #else |
2747 packedYScale= (uint16_t)(scale*1024.0 + 0.5); | 2712 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
2748 packedYOffset= (black - ppMode->minAllowedY) & 0xFFFF; | 2713 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; |
2749 #endif | 2714 #endif |
2750 | 2715 |
2751 packedYOffset|= packedYOffset<<32; | 2716 c.packedYOffset|= c.packedYOffset<<32; |
2752 packedYOffset|= packedYOffset<<16; | 2717 c.packedYOffset|= c.packedYOffset<<16; |
2753 | 2718 |
2754 packedYScale|= packedYScale<<32; | 2719 c.packedYScale|= c.packedYScale<<32; |
2755 packedYScale|= packedYScale<<16; | 2720 c.packedYScale|= c.packedYScale<<16; |
2756 | 2721 |
2757 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); | 2722 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); |
2758 else QPCorrecture= 256*256; | 2723 else QPCorrecture= 256*256; |
2759 } | 2724 } |
2760 else | 2725 else |
2761 { | 2726 { |
2762 packedYScale= 0x0100010001000100LL; | 2727 c.packedYScale= 0x0100010001000100LL; |
2763 packedYOffset= 0; | 2728 c.packedYOffset= 0; |
2764 QPCorrecture= 256*256; | 2729 QPCorrecture= 256*256; |
2765 } | 2730 } |
2766 | 2731 |
2767 /* copy & deinterlace first row of blocks */ | 2732 /* copy & deinterlace first row of blocks */ |
2768 y=-BLOCK_SIZE; | 2733 y=-BLOCK_SIZE; |
2787 asm( | 2752 asm( |
2788 "movl %4, %%eax \n\t" | 2753 "movl %4, %%eax \n\t" |
2789 "shrl $2, %%eax \n\t" | 2754 "shrl $2, %%eax \n\t" |
2790 "andl $6, %%eax \n\t" | 2755 "andl $6, %%eax \n\t" |
2791 "addl %5, %%eax \n\t" | 2756 "addl %5, %%eax \n\t" |
2792 "movl %%eax, %%ebx \n\t" | 2757 "movl %%eax, %%edx \n\t" |
2793 "imul %1, %%eax \n\t" | 2758 "imul %1, %%eax \n\t" |
2794 "imul %3, %%ebx \n\t" | 2759 "imul %3, %%edx \n\t" |
2795 "prefetchnta 32(%%eax, %0) \n\t" | 2760 "prefetchnta 32(%%eax, %0) \n\t" |
2796 "prefetcht0 32(%%ebx, %2) \n\t" | 2761 "prefetcht0 32(%%edx, %2) \n\t" |
2797 "addl %1, %%eax \n\t" | 2762 "addl %1, %%eax \n\t" |
2798 "addl %3, %%ebx \n\t" | 2763 "addl %3, %%edx \n\t" |
2799 "prefetchnta 32(%%eax, %0) \n\t" | 2764 "prefetchnta 32(%%eax, %0) \n\t" |
2800 "prefetcht0 32(%%ebx, %2) \n\t" | 2765 "prefetcht0 32(%%edx, %2) \n\t" |
2801 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | 2766 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
2802 "m" (x), "m" (copyAhead) | 2767 "m" (x), "m" (copyAhead) |
2803 : "%eax", "%ebx" | 2768 : "%eax", "%edx" |
2804 ); | 2769 ); |
2805 | 2770 |
2806 #elif defined(HAVE_3DNOW) | 2771 #elif defined(HAVE_3DNOW) |
2807 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | 2772 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
2808 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | 2773 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
2811 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 2776 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
2812 */ | 2777 */ |
2813 #endif | 2778 #endif |
2814 | 2779 |
2815 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, | 2780 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
2816 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX); | 2781 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
2817 | 2782 |
2818 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | 2783 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); |
2819 | 2784 |
2820 if(mode & LINEAR_IPOL_DEINT_FILTER) | 2785 if(mode & LINEAR_IPOL_DEINT_FILTER) |
2821 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); | 2786 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
2823 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); | 2788 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
2824 else if(mode & MEDIAN_DEINT_FILTER) | 2789 else if(mode & MEDIAN_DEINT_FILTER) |
2825 RENAME(deInterlaceMedian)(dstBlock, dstStride); | 2790 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
2826 else if(mode & CUBIC_IPOL_DEINT_FILTER) | 2791 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
2827 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); | 2792 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
2793 else if(mode & FFMPEG_DEINT_FILTER) | |
2794 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
2828 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | 2795 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
2829 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); | 2796 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
2830 */ | 2797 */ |
2831 dstBlock+=8; | 2798 dstBlock+=8; |
2832 srcBlock+=8; | 2799 srcBlock+=8; |
2833 } | 2800 } |
2834 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride ); | 2801 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride ); |
2835 } | 2802 } |
2836 | 2803 |
2804 //printf("\n"); | |
2837 for(y=0; y<height; y+=BLOCK_SIZE) | 2805 for(y=0; y<height; y+=BLOCK_SIZE) |
2838 { | 2806 { |
2839 //1% speedup if these are here instead of the inner loop | 2807 //1% speedup if these are here instead of the inner loop |
2840 uint8_t *srcBlock= &(src[y*srcStride]); | 2808 uint8_t *srcBlock= &(src[y*srcStride]); |
2841 uint8_t *dstBlock= &(dst[y*dstStride]); | 2809 uint8_t *dstBlock= &(dst[y*dstStride]); |
2842 #ifdef HAVE_MMX | 2810 #ifdef HAVE_MMX |
2843 uint8_t *tempBlock1= tempBlocks; | 2811 uint8_t *tempBlock1= c.tempBlocks; |
2844 uint8_t *tempBlock2= tempBlocks + 8; | 2812 uint8_t *tempBlock2= c.tempBlocks + 8; |
2845 #endif | 2813 #endif |
2846 #ifdef ARCH_X86 | 2814 #ifdef ARCH_X86 |
2847 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | 2815 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; |
2848 int QPDelta= isColor ? (-1) : 1<<31; | 2816 int QPDelta= isColor ? (-1) : 1<<31; |
2849 int QPFrac= 1<<30; | 2817 int QPFrac= 1<<30; |
2871 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); | 2839 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
2872 | 2840 |
2873 dstBlock= tempDst + dstStride; | 2841 dstBlock= tempDst + dstStride; |
2874 srcBlock= tempSrc; | 2842 srcBlock= tempSrc; |
2875 } | 2843 } |
2844 //printf("\n"); | |
2876 | 2845 |
2877 // From this point on it is guranteed that we can read and write 16 lines downward | 2846 // From this point on it is guranteed that we can read and write 16 lines downward |
2878 // finish 1 block before the next otherwise we'll might have a problem | 2847 // finish 1 block before the next otherwise we'll might have a problem |
2879 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing | 2848 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
2880 for(x=0; x<width; x+=BLOCK_SIZE) | 2849 for(x=0; x<width; x+=BLOCK_SIZE) |
2902 if(!isColor) | 2871 if(!isColor) |
2903 { | 2872 { |
2904 QP= (QP* QPCorrecture + 256*128)>>16; | 2873 QP= (QP* QPCorrecture + 256*128)>>16; |
2905 yHistogram[ srcBlock[srcStride*12 + 4] ]++; | 2874 yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
2906 } | 2875 } |
2876 //printf("%d ", QP); | |
2877 c.QP= QP; | |
2907 #ifdef HAVE_MMX | 2878 #ifdef HAVE_MMX |
2908 asm volatile( | 2879 asm volatile( |
2909 "movd %0, %%mm7 \n\t" | 2880 "movd %1, %%mm7 \n\t" |
2910 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | 2881 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
2911 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | 2882 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
2912 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | 2883 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP |
2913 "movq %%mm7, "MANGLE(pQPb)" \n\t" | 2884 "movq %%mm7, %0 \n\t" |
2914 : : "r" (QP) | 2885 : "=m" (c.pQPb) |
2886 : "r" (QP) | |
2915 ); | 2887 ); |
2916 #endif | 2888 #endif |
2917 | 2889 |
2918 #ifdef MORE_TIMING | |
2919 T0= rdtsc(); | |
2920 #endif | |
2921 | 2890 |
2922 #ifdef HAVE_MMX2 | 2891 #ifdef HAVE_MMX2 |
2923 /* | 2892 /* |
2924 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | 2893 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
2925 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | 2894 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
2930 asm( | 2899 asm( |
2931 "movl %4, %%eax \n\t" | 2900 "movl %4, %%eax \n\t" |
2932 "shrl $2, %%eax \n\t" | 2901 "shrl $2, %%eax \n\t" |
2933 "andl $6, %%eax \n\t" | 2902 "andl $6, %%eax \n\t" |
2934 "addl %5, %%eax \n\t" | 2903 "addl %5, %%eax \n\t" |
2935 "movl %%eax, %%ebx \n\t" | 2904 "movl %%eax, %%edx \n\t" |
2936 "imul %1, %%eax \n\t" | 2905 "imul %1, %%eax \n\t" |
2937 "imul %3, %%ebx \n\t" | 2906 "imul %3, %%edx \n\t" |
2938 "prefetchnta 32(%%eax, %0) \n\t" | 2907 "prefetchnta 32(%%eax, %0) \n\t" |
2939 "prefetcht0 32(%%ebx, %2) \n\t" | 2908 "prefetcht0 32(%%edx, %2) \n\t" |
2940 "addl %1, %%eax \n\t" | 2909 "addl %1, %%eax \n\t" |
2941 "addl %3, %%ebx \n\t" | 2910 "addl %3, %%edx \n\t" |
2942 "prefetchnta 32(%%eax, %0) \n\t" | 2911 "prefetchnta 32(%%eax, %0) \n\t" |
2943 "prefetcht0 32(%%ebx, %2) \n\t" | 2912 "prefetcht0 32(%%edx, %2) \n\t" |
2944 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | 2913 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
2945 "m" (x), "m" (copyAhead) | 2914 "m" (x), "m" (copyAhead) |
2946 : "%eax", "%ebx" | 2915 : "%eax", "%edx" |
2947 ); | 2916 ); |
2948 | 2917 |
2949 #elif defined(HAVE_3DNOW) | 2918 #elif defined(HAVE_3DNOW) |
2950 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | 2919 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
2951 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | 2920 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
2953 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 2922 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
2954 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 2923 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
2955 */ | 2924 */ |
2956 #endif | 2925 #endif |
2957 | 2926 |
2958 #ifdef PP_FUNNY_STRIDE | |
2959 //can we mess with a 8x16 block, if not use a temp buffer, yes again | |
2960 if(x+7 >= width) | |
2961 { | |
2962 int i; | |
2963 dstBlockPtrBackup= dstBlock; | |
2964 srcBlockPtrBackup= srcBlock; | |
2965 | |
2966 for(i=0;i<BLOCK_SIZE*2; i++) | |
2967 { | |
2968 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x); | |
2969 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x); | |
2970 } | |
2971 | |
2972 dstBlock= tempDstBlock; | |
2973 srcBlock= tempSrcBlock; | |
2974 } | |
2975 #endif | |
2976 | |
2977 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, | 2927 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
2978 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); | 2928 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
2979 | 2929 |
2980 if(mode & LINEAR_IPOL_DEINT_FILTER) | 2930 if(mode & LINEAR_IPOL_DEINT_FILTER) |
2981 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); | 2931 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
2982 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 2932 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
2983 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); | 2933 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
2984 else if(mode & MEDIAN_DEINT_FILTER) | 2934 else if(mode & MEDIAN_DEINT_FILTER) |
2985 RENAME(deInterlaceMedian)(dstBlock, dstStride); | 2935 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
2986 else if(mode & CUBIC_IPOL_DEINT_FILTER) | 2936 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
2987 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); | 2937 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
2938 else if(mode & FFMPEG_DEINT_FILTER) | |
2939 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
2988 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | 2940 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
2989 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); | 2941 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
2990 */ | 2942 */ |
2991 | 2943 |
2992 /* only deblock if we have 2 blocks */ | 2944 /* only deblock if we have 2 blocks */ |
2993 if(y + 8 < height) | 2945 if(y + 8 < height) |
2994 { | 2946 { |
2995 #ifdef MORE_TIMING | 2947 if(mode & V_X1_FILTER) |
2996 T1= rdtsc(); | 2948 RENAME(vertX1Filter)(dstBlock, stride, &c); |
2997 memcpyTime+= T1-T0; | |
2998 T0=T1; | |
2999 #endif | |
3000 if(mode & V_RK1_FILTER) | |
3001 RENAME(vertRK1Filter)(dstBlock, stride, QP); | |
3002 else if(mode & V_X1_FILTER) | |
3003 RENAME(vertX1Filter)(dstBlock, stride, QP); | |
3004 else if(mode & V_DEBLOCK) | 2949 else if(mode & V_DEBLOCK) |
3005 { | 2950 { |
3006 if( RENAME(isVertDC)(dstBlock, stride)) | 2951 if( RENAME(isVertDC)(dstBlock, stride, &c)) |
3007 { | 2952 { |
3008 if(RENAME(isVertMinMaxOk)(dstBlock, stride, QP)) | 2953 if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c)) |
3009 RENAME(doVertLowPass)(dstBlock, stride, QP); | 2954 RENAME(doVertLowPass)(dstBlock, stride, &c); |
3010 } | 2955 } |
3011 else | 2956 else |
3012 RENAME(doVertDefFilter)(dstBlock, stride, QP); | 2957 RENAME(doVertDefFilter)(dstBlock, stride, &c); |
3013 } | 2958 } |
3014 #ifdef MORE_TIMING | |
3015 T1= rdtsc(); | |
3016 vertTime+= T1-T0; | |
3017 T0=T1; | |
3018 #endif | |
3019 } | 2959 } |
3020 | 2960 |
3021 #ifdef HAVE_MMX | 2961 #ifdef HAVE_MMX |
3022 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); | 2962 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
3023 #endif | 2963 #endif |
3024 /* check if we have a previous block to deblock it with dstBlock */ | 2964 /* check if we have a previous block to deblock it with dstBlock */ |
3025 if(x - 8 >= 0) | 2965 if(x - 8 >= 0) |
3026 { | 2966 { |
3027 #ifdef MORE_TIMING | |
3028 T0= rdtsc(); | |
3029 #endif | |
3030 #ifdef HAVE_MMX | 2967 #ifdef HAVE_MMX |
3031 if(mode & H_RK1_FILTER) | 2968 if(mode & H_X1_FILTER) |
3032 RENAME(vertRK1Filter)(tempBlock1, 16, QP); | 2969 RENAME(vertX1Filter)(tempBlock1, 16, &c); |
3033 else if(mode & H_X1_FILTER) | |
3034 RENAME(vertX1Filter)(tempBlock1, 16, QP); | |
3035 else if(mode & H_DEBLOCK) | 2970 else if(mode & H_DEBLOCK) |
3036 { | 2971 { |
3037 if( RENAME(isVertDC)(tempBlock1, 16) ) | 2972 if( RENAME(isVertDC)(tempBlock1, 16, &c)) |
3038 { | 2973 { |
3039 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, QP)) | 2974 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c)) |
3040 RENAME(doVertLowPass)(tempBlock1, 16, QP); | 2975 RENAME(doVertLowPass)(tempBlock1, 16, &c); |
3041 } | 2976 } |
3042 else | 2977 else |
3043 RENAME(doVertDefFilter)(tempBlock1, 16, QP); | 2978 RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
3044 } | 2979 } |
3045 | 2980 |
3046 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); | 2981 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
3047 | 2982 |
3048 #else | 2983 #else |
3049 if(mode & H_X1_FILTER) | 2984 if(mode & H_X1_FILTER) |
3050 horizX1Filter(dstBlock-4, stride, QP); | 2985 horizX1Filter(dstBlock-4, stride, QP); |
3051 else if(mode & H_DEBLOCK) | 2986 else if(mode & H_DEBLOCK) |
3052 { | 2987 { |
3053 if( isHorizDC(dstBlock-4, stride)) | 2988 if( isHorizDC(dstBlock-4, stride, &c)) |
3054 { | 2989 { |
3055 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) | 2990 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) |
3056 doHorizLowPass(dstBlock-4, stride, QP); | 2991 doHorizLowPass(dstBlock-4, stride, QP); |
3057 } | 2992 } |
3058 else | 2993 else |
3059 doHorizDefFilter(dstBlock-4, stride, QP); | 2994 doHorizDefFilter(dstBlock-4, stride, QP); |
3060 } | 2995 } |
3061 #endif | 2996 #endif |
3062 #ifdef MORE_TIMING | |
3063 T1= rdtsc(); | |
3064 horizTime+= T1-T0; | |
3065 T0=T1; | |
3066 #endif | |
3067 if(mode & DERING) | 2997 if(mode & DERING) |
3068 { | 2998 { |
3069 //FIXME filter first line | 2999 //FIXME filter first line |
3070 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, QP); | 3000 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
3071 } | 3001 } |
3072 | 3002 |
3073 if(mode & TEMP_NOISE_FILTER) | 3003 if(mode & TEMP_NOISE_FILTER) |
3074 { | 3004 { |
3075 RENAME(tempNoiseReducer)(dstBlock-8, stride, | 3005 RENAME(tempNoiseReducer)(dstBlock-8, stride, |
3076 tempBlured[isColor] + y*dstStride + x, | 3006 c.tempBlured[isColor] + y*dstStride + x, |
3077 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | 3007 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
3078 ppMode->maxTmpNoise); | 3008 c.ppMode.maxTmpNoise); |
3079 } | 3009 } |
3080 } | 3010 } |
3081 | |
3082 #ifdef PP_FUNNY_STRIDE | |
3083 /* did we use a tmp-block buffer */ | |
3084 if(x+7 >= width) | |
3085 { | |
3086 int i; | |
3087 dstBlock= dstBlockPtrBackup; | |
3088 srcBlock= srcBlockPtrBackup; | |
3089 | |
3090 for(i=0;i<BLOCK_SIZE*2; i++) | |
3091 { | |
3092 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x); | |
3093 } | |
3094 } | |
3095 #endif | |
3096 | 3011 |
3097 dstBlock+=8; | 3012 dstBlock+=8; |
3098 srcBlock+=8; | 3013 srcBlock+=8; |
3099 | 3014 |
3100 #ifdef HAVE_MMX | 3015 #ifdef HAVE_MMX |
3104 #endif | 3019 #endif |
3105 } | 3020 } |
3106 | 3021 |
3107 if(mode & DERING) | 3022 if(mode & DERING) |
3108 { | 3023 { |
3109 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, QP); | 3024 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
3110 } | 3025 } |
3111 | 3026 |
3112 if((mode & TEMP_NOISE_FILTER)) | 3027 if((mode & TEMP_NOISE_FILTER)) |
3113 { | 3028 { |
3114 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, | 3029 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
3115 tempBlured[isColor] + y*dstStride + x, | 3030 c.tempBlured[isColor] + y*dstStride + x, |
3116 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | 3031 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
3117 ppMode->maxTmpNoise); | 3032 c.ppMode.maxTmpNoise); |
3118 } | 3033 } |
3119 | 3034 |
3120 /* did we use a tmp buffer for the last lines*/ | 3035 /* did we use a tmp buffer for the last lines*/ |
3121 if(y+15 >= height) | 3036 if(y+15 >= height) |
3122 { | 3037 { |
3138 asm volatile("femms"); | 3053 asm volatile("femms"); |
3139 #elif defined (HAVE_MMX) | 3054 #elif defined (HAVE_MMX) |
3140 asm volatile("emms"); | 3055 asm volatile("emms"); |
3141 #endif | 3056 #endif |
3142 | 3057 |
3143 #ifdef TIMING | |
3144 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) | |
3145 sumTime= rdtsc() - sumTime; | |
3146 if(!isColor) | |
3147 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", | |
3148 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), | |
3149 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) | |
3150 , black, white); | |
3151 #endif | |
3152 #ifdef DEBUG_BRIGHTNESS | 3058 #ifdef DEBUG_BRIGHTNESS |
3153 if(!isColor) | 3059 if(!isColor) |
3154 { | 3060 { |
3155 int max=1; | 3061 int max=1; |
3156 int i; | 3062 int i; |
3174 } | 3080 } |
3175 | 3081 |
3176 } | 3082 } |
3177 #endif | 3083 #endif |
3178 | 3084 |
3085 *c2= c; //copy local context back | |
3086 | |
3179 } | 3087 } |