comparison postproc/postprocess_template.c @ 7946:f483ab704252

postprocessing cleanup: remove opendivx #ifdefs; remove rk1 filter; remove unused / obsolete stuff; add -1,4,2,4,-1 deinterlacing filter (ffmpeg uses that); threadsafe / no more non-const globals; some optimizations; different strides for Y,U,V possible; remove ebx usage (someone really should fix gcc, this is really lame); change the dering filter slightly (tell me if it's worse for any files)
author michael
date Mon, 28 Oct 2002 19:31:04 +0000
parents e3ecccc7e505
children 5a6cbe774760
comparison
equal deleted inserted replaced
7945:32939f2b3d2e 7946:f483ab704252
43 "paddb " #a ", " #b " \n\t" 43 "paddb " #a ", " #b " \n\t"
44 #endif 44 #endif
45 45
46 46
47 //FIXME? |255-0| = 1 (shouldnt be a problem ...) 47 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
48 #ifdef HAVE_MMX
48 /** 49 /**
49 * Check if the middle 8x8 Block in the given 8x16 block is flat 50 * Check if the middle 8x8 Block in the given 8x16 block is flat
50 */ 51 */
51 static inline int RENAME(isVertDC)(uint8_t src[], int stride){ 52 static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
52 int numEq= 0; 53 int numEq= 0;
53 #ifndef HAVE_MMX
54 int y;
55 #endif
56 src+= stride*4; // src points to begin of the 8x8 Block 54 src+= stride*4; // src points to begin of the 8x8 Block
57 #ifdef HAVE_MMX
58 asm volatile( 55 asm volatile(
59 "leal (%1, %2), %%eax \n\t" 56 "leal (%1, %2), %%eax \n\t"
60 "leal (%%eax, %2, 4), %%ebx \n\t"
61 // 0 1 2 3 4 5 6 7 8 9 57 // 0 1 2 3 4 5 6 7 8 9
62 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 58 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
63 "movq "MANGLE(mmxDCOffset)", %%mm7 \n\t" // mm7 = 0x7F 59 "movq %3, %%mm7 \n\t" // mm7 = 0x7F
64 "movq "MANGLE(mmxDCThreshold)", %%mm6 \n\t" // mm6 = 0x7D 60 "movq %4, %%mm6 \n\t" // mm6 = 0x7D
65 "movq (%1), %%mm0 \n\t" 61 "movq (%1), %%mm0 \n\t"
66 "movq (%%eax), %%mm1 \n\t" 62 "movq (%%eax), %%mm1 \n\t"
67 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece 63 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
68 "paddb %%mm7, %%mm0 \n\t" 64 "paddb %%mm7, %%mm0 \n\t"
69 "pcmpgtb %%mm6, %%mm0 \n\t" 65 "pcmpgtb %%mm6, %%mm0 \n\t"
77 "movq (%%eax, %2, 2), %%mm1 \n\t" 73 "movq (%%eax, %2, 2), %%mm1 \n\t"
78 "psubb %%mm1, %%mm2 \n\t" 74 "psubb %%mm1, %%mm2 \n\t"
79 "paddb %%mm7, %%mm2 \n\t" 75 "paddb %%mm7, %%mm2 \n\t"
80 "pcmpgtb %%mm6, %%mm2 \n\t" 76 "pcmpgtb %%mm6, %%mm2 \n\t"
81 "paddb %%mm2, %%mm0 \n\t" 77 "paddb %%mm2, %%mm0 \n\t"
78
79 "leal (%%eax, %2, 4), %%eax \n\t"
82 80
83 "movq (%1, %2, 4), %%mm2 \n\t" 81 "movq (%1, %2, 4), %%mm2 \n\t"
84 "psubb %%mm2, %%mm1 \n\t" 82 "psubb %%mm2, %%mm1 \n\t"
85 "paddb %%mm7, %%mm1 \n\t" 83 "paddb %%mm7, %%mm1 \n\t"
86 "pcmpgtb %%mm6, %%mm1 \n\t" 84 "pcmpgtb %%mm6, %%mm1 \n\t"
87 "paddb %%mm1, %%mm0 \n\t" 85 "paddb %%mm1, %%mm0 \n\t"
88 86
89 "movq (%%ebx), %%mm1 \n\t" 87 "movq (%%eax), %%mm1 \n\t"
90 "psubb %%mm1, %%mm2 \n\t" 88 "psubb %%mm1, %%mm2 \n\t"
91 "paddb %%mm7, %%mm2 \n\t" 89 "paddb %%mm7, %%mm2 \n\t"
92 "pcmpgtb %%mm6, %%mm2 \n\t" 90 "pcmpgtb %%mm6, %%mm2 \n\t"
93 "paddb %%mm2, %%mm0 \n\t" 91 "paddb %%mm2, %%mm0 \n\t"
94 92
95 "movq (%%ebx, %2), %%mm2 \n\t" 93 "movq (%%eax, %2), %%mm2 \n\t"
96 "psubb %%mm2, %%mm1 \n\t" 94 "psubb %%mm2, %%mm1 \n\t"
97 "paddb %%mm7, %%mm1 \n\t" 95 "paddb %%mm7, %%mm1 \n\t"
98 "pcmpgtb %%mm6, %%mm1 \n\t" 96 "pcmpgtb %%mm6, %%mm1 \n\t"
99 "paddb %%mm1, %%mm0 \n\t" 97 "paddb %%mm1, %%mm0 \n\t"
100 98
101 "movq (%%ebx, %2, 2), %%mm1 \n\t" 99 "movq (%%eax, %2, 2), %%mm1 \n\t"
102 "psubb %%mm1, %%mm2 \n\t" 100 "psubb %%mm1, %%mm2 \n\t"
103 "paddb %%mm7, %%mm2 \n\t" 101 "paddb %%mm7, %%mm2 \n\t"
104 "pcmpgtb %%mm6, %%mm2 \n\t" 102 "pcmpgtb %%mm6, %%mm2 \n\t"
105 "paddb %%mm2, %%mm0 \n\t" 103 "paddb %%mm2, %%mm0 \n\t"
106 104
119 "psrlq $32, %%mm0 \n\t" 117 "psrlq $32, %%mm0 \n\t"
120 "paddb %%mm1, %%mm0 \n\t" 118 "paddb %%mm1, %%mm0 \n\t"
121 #endif 119 #endif
122 "movd %%mm0, %0 \n\t" 120 "movd %%mm0, %0 \n\t"
123 : "=r" (numEq) 121 : "=r" (numEq)
124 : "r" (src), "r" (stride) 122 : "r" (src), "r" (stride), "m" (c->mmxDcOffset), "m" (c->mmxDcThreshold)
125 : "%eax", "%ebx" 123 : "%eax"
126 ); 124 );
127 numEq= (-numEq) &0xFF; 125 numEq= (-numEq) &0xFF;
128 126 return numEq > c->ppMode.flatnessThreshold;
129 #else
130 for(y=0; y<BLOCK_SIZE-1; y++)
131 {
132 if(((src[0] - src[0+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
133 if(((src[1] - src[1+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
134 if(((src[2] - src[2+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
135 if(((src[3] - src[3+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
136 if(((src[4] - src[4+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
137 if(((src[5] - src[5+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
138 if(((src[6] - src[6+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
139 if(((src[7] - src[7+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++;
140 src+= stride;
141 }
142 #endif
143 /* if(abs(numEq - asmEq) > 0)
144 {
145 printf("\nasm:%d c:%d\n", asmEq, numEq);
146 for(int y=0; y<8; y++)
147 {
148 for(int x=0; x<8; x++)
149 {
150 printf("%d ", temp[x + y*stride]);
151 }
152 printf("\n");
153 }
154 }
155 */
156 // for(int i=0; i<numEq/8; i++) src[i]=255;
157 return (numEq > vFlatnessThreshold) ? 1 : 0;
158 } 127 }
159 128 #endif
160 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP) 129
130 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
161 { 131 {
162 #ifdef HAVE_MMX 132 #ifdef HAVE_MMX
163 int isOk; 133 int isOk;
164 src+= stride*3; 134 src+= stride*3;
165 asm volatile( 135 asm volatile(
166 // "int $3 \n\t"
167 "movq (%1, %2), %%mm0 \n\t" 136 "movq (%1, %2), %%mm0 \n\t"
168 "movq (%1, %2, 8), %%mm1 \n\t" 137 "movq (%1, %2, 8), %%mm1 \n\t"
169 "movq %%mm0, %%mm2 \n\t" 138 "movq %%mm0, %%mm2 \n\t"
170 "psubusb %%mm1, %%mm0 \n\t" 139 "psubusb %%mm1, %%mm0 \n\t"
171 "psubusb %%mm2, %%mm1 \n\t" 140 "psubusb %%mm2, %%mm1 \n\t"
172 "por %%mm1, %%mm0 \n\t" // ABS Diff 141 "por %%mm1, %%mm0 \n\t" // ABS Diff
173 142
174 "movq "MANGLE(pQPb)", %%mm7 \n\t" // QP,..., QP 143 "movq %3, %%mm7 \n\t" // QP,..., QP
175 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 144 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
176 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 145 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
177 "pcmpeqd "MANGLE(b00)", %%mm0 \n\t" 146 "packssdw %%mm0, %%mm0 \n\t"
178 "psrlq $16, %%mm0 \n\t"
179 "pcmpeqd "MANGLE(bFF)", %%mm0 \n\t"
180 // "movd %%mm0, (%1, %2, 4)\n\t"
181 "movd %%mm0, %0 \n\t" 147 "movd %%mm0, %0 \n\t"
182 : "=r" (isOk) 148 : "=r" (isOk)
183 : "r" (src), "r" (stride) 149 : "r" (src), "r" (stride), "m" (c->pQPb)
184 ); 150 );
185 return isOk; 151 return isOk==0;
186 #else 152 #else
187
188 int isOk2= 1;
189 int x; 153 int x;
154 const int QP= c->QP;
190 src+= stride*3; 155 src+= stride*3;
191 for(x=0; x<BLOCK_SIZE; x++) 156 for(x=0; x<BLOCK_SIZE; x++)
192 { 157 {
193 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; 158 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
194 } 159 }
195 /* if(isOk && !isOk2 || !isOk && isOk2) 160
196 { 161 return 1;
197 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); 162 #endif
198 for(int y=0; y<9; y++)
199 {
200 for(int x=0; x<8; x++)
201 {
202 printf("%d ", src[x + y*stride]);
203 }
204 printf("\n");
205 }
206 } */
207
208 return isOk2;
209 #endif
210
211 } 163 }
212 164
213 /** 165 /**
214 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) 166 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
215 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 167 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
216 */ 168 */
217 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) 169 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
218 { 170 {
219 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 171 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
220 src+= stride*3; 172 src+= stride*3;
221 asm volatile( //"movv %0 %1 %2\n\t" 173 asm volatile( //"movv %0 %1 %2\n\t"
222 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP 174 "movq %2, %%mm0 \n\t" // QP,..., QP
175 "pxor %%mm4, %%mm4 \n\t"
223 176
224 "movq (%0), %%mm6 \n\t" 177 "movq (%0), %%mm6 \n\t"
225 "movq (%0, %1), %%mm5 \n\t" 178 "movq (%0, %1), %%mm5 \n\t"
226 "movq %%mm5, %%mm1 \n\t" 179 "movq %%mm5, %%mm1 \n\t"
227 "movq %%mm6, %%mm2 \n\t" 180 "movq %%mm6, %%mm2 \n\t"
228 "psubusb %%mm6, %%mm5 \n\t" 181 "psubusb %%mm6, %%mm5 \n\t"
229 "psubusb %%mm1, %%mm2 \n\t" 182 "psubusb %%mm1, %%mm2 \n\t"
230 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 183 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
231 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 184 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
232 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF 185 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
233 186
234 "pand %%mm2, %%mm6 \n\t" 187 "pand %%mm2, %%mm6 \n\t"
235 "pandn %%mm1, %%mm2 \n\t" 188 "pandn %%mm1, %%mm2 \n\t"
236 "por %%mm2, %%mm6 \n\t"// First Line to Filter 189 "por %%mm2, %%mm6 \n\t"// First Line to Filter
237 190
238 "movq (%0, %1, 8), %%mm5 \n\t" 191 "movq (%0, %1, 8), %%mm5 \n\t"
239 "leal (%0, %1, 4), %%eax \n\t" 192 "leal (%0, %1, 4), %%eax \n\t"
240 "leal (%0, %1, 8), %%ebx \n\t" 193 "leal (%0, %1, 8), %%ecx \n\t"
241 "subl %1, %%ebx \n\t" 194 "subl %1, %%ecx \n\t"
242 "addl %1, %0 \n\t" // %0 points to line 1 not 0 195 "addl %1, %0 \n\t" // %0 points to line 1 not 0
243 "movq (%0, %1, 8), %%mm7 \n\t" 196 "movq (%0, %1, 8), %%mm7 \n\t"
244 "movq %%mm5, %%mm1 \n\t" 197 "movq %%mm5, %%mm1 \n\t"
245 "movq %%mm7, %%mm2 \n\t" 198 "movq %%mm7, %%mm2 \n\t"
246 "psubusb %%mm7, %%mm5 \n\t" 199 "psubusb %%mm7, %%mm5 \n\t"
247 "psubusb %%mm1, %%mm2 \n\t" 200 "psubusb %%mm1, %%mm2 \n\t"
248 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 201 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
249 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 202 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
250 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF 203 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
251 204
252 "pand %%mm2, %%mm7 \n\t" 205 "pand %%mm2, %%mm7 \n\t"
253 "pandn %%mm1, %%mm2 \n\t" 206 "pandn %%mm1, %%mm2 \n\t"
254 "por %%mm2, %%mm7 \n\t" // First Line to Filter 207 "por %%mm2, %%mm7 \n\t" // First Line to Filter
255 208
256 209
257 // 1 2 3 4 5 6 7 8 210 // 1 2 3 4 5 6 7 8
258 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 211 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
259 // 6 4 2 2 1 1 212 // 6 4 2 2 1 1
260 // 6 4 4 2 213 // 6 4 4 2
261 // 6 8 2 214 // 6 8 2
262 215
263 "movq (%0, %1), %%mm0 \n\t" // 1 216 "movq (%0, %1), %%mm0 \n\t" // 1
284 PAVGB(%%mm5, %%mm3) // 2 2211 /8 237 PAVGB(%%mm5, %%mm3) // 2 2211 /8
285 PAVGB(%%mm0, %%mm3) //4242211 /16 238 PAVGB(%%mm0, %%mm3) //4242211 /16
286 "movq %%mm3, (%0,%1) \n\t" // X 239 "movq %%mm3, (%0,%1) \n\t" // X
287 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 240 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
288 PAVGB(%%mm4, %%mm6) //11 /2 241 PAVGB(%%mm4, %%mm6) //11 /2
289 "movq (%%ebx), %%mm0 \n\t" // 1 242 "movq (%%ecx), %%mm0 \n\t" // 1
290 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 243 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
291 "movq %%mm0, %%mm3 \n\t" // 11/2 244 "movq %%mm0, %%mm3 \n\t" // 11/2
292 PAVGB(%%mm1, %%mm0) // 2 11/4 245 PAVGB(%%mm1, %%mm0) // 2 11/4
293 PAVGB(%%mm6, %%mm0) //222 11/8 246 PAVGB(%%mm6, %%mm0) //222 11/8
294 PAVGB(%%mm2, %%mm0) //22242211/16 247 PAVGB(%%mm2, %%mm0) //22242211/16
295 "movq (%0, %1, 2), %%mm2 \n\t" // 1 248 "movq (%0, %1, 2), %%mm2 \n\t" // 1
296 "movq %%mm0, (%0, %1, 2) \n\t" // X 249 "movq %%mm0, (%0, %1, 2) \n\t" // X
297 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 250 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
298 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 251 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
299 PAVGB((%%ebx), %%mm0) // 11 /2 252 PAVGB((%%ecx), %%mm0) // 11 /2
300 PAVGB(%%mm0, %%mm6) //11 11 /4 253 PAVGB(%%mm0, %%mm6) //11 11 /4
301 PAVGB(%%mm1, %%mm4) // 11 /2 254 PAVGB(%%mm1, %%mm4) // 11 /2
302 PAVGB(%%mm2, %%mm1) // 11 /2 255 PAVGB(%%mm2, %%mm1) // 11 /2
303 PAVGB(%%mm1, %%mm6) //1122 11 /8 256 PAVGB(%%mm1, %%mm6) //1122 11 /8
304 PAVGB(%%mm5, %%mm6) //112242211 /16 257 PAVGB(%%mm5, %%mm6) //112242211 /16
321 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 274 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
322 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 275 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
323 PAVGB(%%mm0, %%mm1) // 11224222 /16 276 PAVGB(%%mm0, %%mm1) // 11224222 /16
324 "movq %%mm1, (%%eax, %1, 2) \n\t" // X 277 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
325 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 278 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
326 PAVGB((%%ebx), %%mm2) // 112 4 /8 279 PAVGB((%%ecx), %%mm2) // 112 4 /8
327 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 280 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
328 PAVGB(%%mm0, %%mm6) // 1 1 /2 281 PAVGB(%%mm0, %%mm6) // 1 1 /2
329 PAVGB(%%mm7, %%mm6) // 1 12 /4 282 PAVGB(%%mm7, %%mm6) // 1 12 /4
330 PAVGB(%%mm2, %%mm6) // 1122424 /4 283 PAVGB(%%mm2, %%mm6) // 1122424 /4
331 "movq %%mm6, (%%ebx) \n\t" // X 284 "movq %%mm6, (%%ecx) \n\t" // X
332 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 285 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
333 PAVGB(%%mm7, %%mm5) // 11 2 /4 286 PAVGB(%%mm7, %%mm5) // 11 2 /4
334 PAVGB(%%mm7, %%mm5) // 11 6 /8 287 PAVGB(%%mm7, %%mm5) // 11 6 /8
335 288
336 PAVGB(%%mm3, %%mm0) // 112 /4 289 PAVGB(%%mm3, %%mm0) // 112 /4
337 PAVGB(%%mm0, %%mm5) // 112246 /16 290 PAVGB(%%mm0, %%mm5) // 112246 /16
338 "movq %%mm5, (%%eax, %1, 4) \n\t" // X 291 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
339 "subl %1, %0 \n\t" 292 "subl %1, %0 \n\t"
340 293
341 : 294 :
342 : "r" (src), "r" (stride) 295 : "r" (src), "r" (stride), "m" (c->pQPb)
343 : "%eax", "%ebx" 296 : "%eax", "%ecx"
344 ); 297 );
345 #else 298 #else
346 const int l1= stride; 299 const int l1= stride;
347 const int l2= stride + l1; 300 const int l2= stride + l1;
348 const int l3= stride + l2; 301 const int l3= stride + l2;
354 const int l9= stride + l8; 307 const int l9= stride + l8;
355 int x; 308 int x;
356 src+= stride*3; 309 src+= stride*3;
357 for(x=0; x<BLOCK_SIZE; x++) 310 for(x=0; x<BLOCK_SIZE; x++)
358 { 311 {
359 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; 312 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
360 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; 313 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
361 314
362 int sums[9]; 315 int sums[9];
363 sums[0] = first + src[l1]; 316 sums[0] = first + src[l1];
364 sums[1] = src[l1] + src[l2]; 317 sums[1] = src[l1] + src[l2];
365 sums[2] = src[l2] + src[l3]; 318 sums[2] = src[l2] + src[l3];
379 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; 332 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
380 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; 333 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
381 334
382 src++; 335 src++;
383 } 336 }
384
385 #endif 337 #endif
386 } 338 }
387 339
340 #if 0
388 /** 341 /**
389 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar 342 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
390 * values are correctly clipped (MMX2) 343 * values are correctly clipped (MMX2)
391 * values are wraparound (C) 344 * values are wraparound (C)
392 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient 345 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
403 // FIXME rounding 356 // FIXME rounding
404 asm volatile( 357 asm volatile(
405 "pxor %%mm7, %%mm7 \n\t" // 0 358 "pxor %%mm7, %%mm7 \n\t" // 0
406 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE 359 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
407 "leal (%0, %1), %%eax \n\t" 360 "leal (%0, %1), %%eax \n\t"
408 "leal (%%eax, %1, 4), %%ebx \n\t" 361 "leal (%%eax, %1, 4), %%ecx \n\t"
409 // 0 1 2 3 4 5 6 7 8 9 362 // 0 1 2 3 4 5 6 7 8 9
410 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 363 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
411 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP 364 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
412 "movq %%mm0, %%mm1 \n\t" // QP,..., QP 365 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
413 "paddusb "MANGLE(b02)", %%mm0 \n\t" 366 "paddusb "MANGLE(b02)", %%mm0 \n\t"
414 "psrlw $2, %%mm0 \n\t" 367 "psrlw $2, %%mm0 \n\t"
415 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 368 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
416 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... 369 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
417 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 370 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
418 "movq (%%ebx), %%mm3 \n\t" // line 5 371 "movq (%%ecx), %%mm3 \n\t" // line 5
419 "movq %%mm2, %%mm4 \n\t" // line 4 372 "movq %%mm2, %%mm4 \n\t" // line 4
420 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 373 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
421 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 374 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
422 PAVGB(%%mm3, %%mm5) 375 PAVGB(%%mm3, %%mm5)
423 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 376 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
431 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 384 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
432 "paddb %%mm5, %%mm2 \n\t" 385 "paddb %%mm5, %%mm2 \n\t"
433 // "psubb %%mm6, %%mm2 \n\t" 386 // "psubb %%mm6, %%mm2 \n\t"
434 "movq %%mm2, (%0,%1, 4) \n\t" 387 "movq %%mm2, (%0,%1, 4) \n\t"
435 388
436 "movq (%%ebx), %%mm2 \n\t" 389 "movq (%%ecx), %%mm2 \n\t"
437 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 390 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
438 "psubb %%mm5, %%mm2 \n\t" 391 "psubb %%mm5, %%mm2 \n\t"
439 // "psubb %%mm6, %%mm2 \n\t" 392 // "psubb %%mm6, %%mm2 \n\t"
440 "movq %%mm2, (%%ebx) \n\t" 393 "movq %%mm2, (%%ecx) \n\t"
441 394
442 "paddb %%mm6, %%mm5 \n\t" 395 "paddb %%mm6, %%mm5 \n\t"
443 "psrlw $2, %%mm5 \n\t" 396 "psrlw $2, %%mm5 \n\t"
444 "pand "MANGLE(b3F)", %%mm5 \n\t" 397 "pand "MANGLE(b3F)", %%mm5 \n\t"
445 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 398 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8
448 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 401 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
449 "paddsb %%mm5, %%mm2 \n\t" 402 "paddsb %%mm5, %%mm2 \n\t"
450 "psubb %%mm6, %%mm2 \n\t" 403 "psubb %%mm6, %%mm2 \n\t"
451 "movq %%mm2, (%%eax, %1, 2) \n\t" 404 "movq %%mm2, (%%eax, %1, 2) \n\t"
452 405
453 "movq (%%ebx, %1), %%mm2 \n\t" 406 "movq (%%ecx, %1), %%mm2 \n\t"
454 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 407 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
455 "psubsb %%mm5, %%mm2 \n\t" 408 "psubsb %%mm5, %%mm2 \n\t"
456 "psubb %%mm6, %%mm2 \n\t" 409 "psubb %%mm6, %%mm2 \n\t"
457 "movq %%mm2, (%%ebx, %1) \n\t" 410 "movq %%mm2, (%%ecx, %1) \n\t"
458 411
459 : 412 :
460 : "r" (src), "r" (stride) 413 : "r" (src), "r" (stride)
461 : "%eax", "%ebx" 414 : "%eax", "%ecx"
462 ); 415 );
463 #else 416 #else
464 const int l1= stride; 417 const int l1= stride;
465 const int l2= stride + l1; 418 const int l2= stride + l1;
466 const int l3= stride + l2; 419 const int l3= stride + l2;
486 } 439 }
487 } 440 }
488 441
489 #endif 442 #endif
490 } 443 }
444 #endif
491 445
492 /** 446 /**
493 * Experimental Filter 1 447 * Experimental Filter 1
494 * will not damage linear gradients 448 * will not damage linear gradients
495 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter 449 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
496 * can only smooth blocks at the expected locations (it cant smooth them if they did move) 450 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
497 * MMX2 version does correct clipping C version doesnt 451 * MMX2 version does correct clipping C version doesnt
498 */ 452 */
499 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) 453 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
500 { 454 {
501 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 455 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
502 src+= stride*3; 456 src+= stride*3;
503 457
504 asm volatile( 458 asm volatile(
505 "pxor %%mm7, %%mm7 \n\t" // 0 459 "pxor %%mm7, %%mm7 \n\t" // 0
506 "leal (%0, %1), %%eax \n\t" 460 "leal (%0, %1), %%eax \n\t"
507 "leal (%%eax, %1, 4), %%ebx \n\t" 461 "leal (%%eax, %1, 4), %%ecx \n\t"
508 // 0 1 2 3 4 5 6 7 8 9 462 // 0 1 2 3 4 5 6 7 8 9
509 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 463 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
510 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 464 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
511 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 465 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
512 "movq %%mm1, %%mm2 \n\t" // line 4 466 "movq %%mm1, %%mm2 \n\t" // line 4
513 "psubusb %%mm0, %%mm1 \n\t" 467 "psubusb %%mm0, %%mm1 \n\t"
514 "psubusb %%mm2, %%mm0 \n\t" 468 "psubusb %%mm2, %%mm0 \n\t"
515 "por %%mm1, %%mm0 \n\t" // |l2 - l3| 469 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
516 "movq (%%ebx), %%mm3 \n\t" // line 5 470 "movq (%%ecx), %%mm3 \n\t" // line 5
517 "movq (%%ebx, %1), %%mm4 \n\t" // line 6 471 "movq (%%ecx, %1), %%mm4 \n\t" // line 6
518 "movq %%mm3, %%mm5 \n\t" // line 5 472 "movq %%mm3, %%mm5 \n\t" // line 5
519 "psubusb %%mm4, %%mm3 \n\t" 473 "psubusb %%mm4, %%mm3 \n\t"
520 "psubusb %%mm5, %%mm4 \n\t" 474 "psubusb %%mm5, %%mm4 \n\t"
521 "por %%mm4, %%mm3 \n\t" // |l5 - l6| 475 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
522 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 476 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
526 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 480 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
527 "psubusb %%mm1, %%mm5 \n\t" 481 "psubusb %%mm1, %%mm5 \n\t"
528 "por %%mm5, %%mm4 \n\t" // |l4 - l5| 482 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
529 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) 483 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
530 "movq %%mm4, %%mm3 \n\t" // d 484 "movq %%mm4, %%mm3 \n\t" // d
531 "movq "MANGLE(pQPb)", %%mm0 \n\t" 485 "movq %2, %%mm0 \n\t"
532 "paddusb %%mm0, %%mm0 \n\t" 486 "paddusb %%mm0, %%mm0 \n\t"
533 "psubusb %%mm0, %%mm4 \n\t" 487 "psubusb %%mm0, %%mm4 \n\t"
534 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 488 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
535 "psubusb "MANGLE(b01)", %%mm3 \n\t" 489 "psubusb "MANGLE(b01)", %%mm3 \n\t"
536 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 490 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
544 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 498 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
545 "psubusb %%mm3, %%mm0 \n\t" 499 "psubusb %%mm3, %%mm0 \n\t"
546 "pxor %%mm2, %%mm0 \n\t" 500 "pxor %%mm2, %%mm0 \n\t"
547 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 501 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
548 502
549 "movq (%%ebx), %%mm0 \n\t" // line 5 503 "movq (%%ecx), %%mm0 \n\t" // line 5
550 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 504 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
551 "paddusb %%mm3, %%mm0 \n\t" 505 "paddusb %%mm3, %%mm0 \n\t"
552 "pxor %%mm2, %%mm0 \n\t" 506 "pxor %%mm2, %%mm0 \n\t"
553 "movq %%mm0, (%%ebx) \n\t" // line 5 507 "movq %%mm0, (%%ecx) \n\t" // line 5
554 508
555 PAVGB(%%mm7, %%mm1) // d/4 509 PAVGB(%%mm7, %%mm1) // d/4
556 510
557 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 511 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
558 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 512 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
559 "psubusb %%mm1, %%mm0 \n\t" 513 "psubusb %%mm1, %%mm0 \n\t"
560 "pxor %%mm2, %%mm0 \n\t" 514 "pxor %%mm2, %%mm0 \n\t"
561 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 515 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
562 516
563 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 517 "movq (%%ecx, %1), %%mm0 \n\t" // line 6
564 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 518 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
565 "paddusb %%mm1, %%mm0 \n\t" 519 "paddusb %%mm1, %%mm0 \n\t"
566 "pxor %%mm2, %%mm0 \n\t" 520 "pxor %%mm2, %%mm0 \n\t"
567 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 521 "movq %%mm0, (%%ecx, %1) \n\t" // line 6
568 522
569 PAVGB(%%mm7, %%mm1) // d/8 523 PAVGB(%%mm7, %%mm1) // d/8
570 524
571 "movq (%%eax, %1), %%mm0 \n\t" // line 2 525 "movq (%%eax, %1), %%mm0 \n\t" // line 2
572 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 526 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
573 "psubusb %%mm1, %%mm0 \n\t" 527 "psubusb %%mm1, %%mm0 \n\t"
574 "pxor %%mm2, %%mm0 \n\t" 528 "pxor %%mm2, %%mm0 \n\t"
575 "movq %%mm0, (%%eax, %1) \n\t" // line 2 529 "movq %%mm0, (%%eax, %1) \n\t" // line 2
576 530
577 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 531 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7
578 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 532 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
579 "paddusb %%mm1, %%mm0 \n\t" 533 "paddusb %%mm1, %%mm0 \n\t"
580 "pxor %%mm2, %%mm0 \n\t" 534 "pxor %%mm2, %%mm0 \n\t"
581 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 535 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7
582 536
583 : 537 :
584 : "r" (src), "r" (stride) 538 : "r" (src), "r" (stride), "m" (co->pQPb)
585 : "%eax", "%ebx" 539 : "%eax", "%ecx"
586 ); 540 );
587 #else 541 #else
588 542
589 const int l1= stride; 543 const int l1= stride;
590 const int l2= stride + l1; 544 const int l2= stride + l1;
605 int c= src[l5] - src[l6]; 559 int c= src[l5] - src[l6];
606 560
607 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); 561 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
608 d= MAX(d, 0); 562 d= MAX(d, 0);
609 563
610 if(d < QP*2) 564 if(d < co->QP*2)
611 { 565 {
612 int v = d * SIGN(-b); 566 int v = d * SIGN(-b);
613 567
614 src[l2] +=v>>3; 568 src[l2] +=v>>3;
615 src[l3] +=v>>2; 569 src[l3] +=v>>2;
619 src[l7] -=v>>3; 573 src[l7] -=v>>3;
620 574
621 } 575 }
622 src++; 576 src++;
623 } 577 }
624 /*
625 const int l1= stride;
626 const int l2= stride + l1;
627 const int l3= stride + l2;
628 const int l4= stride + l3;
629 const int l5= stride + l4;
630 const int l6= stride + l5;
631 const int l7= stride + l6;
632 const int l8= stride + l7;
633 const int l9= stride + l8;
634 for(int x=0; x<BLOCK_SIZE; x++)
635 {
636 int v2= src[l2];
637 int v3= src[l3];
638 int v4= src[l4];
639 int v5= src[l5];
640 int v6= src[l6];
641 int v7= src[l7];
642
643 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
644 {
645 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16;
646 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16;
647 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
648 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
649 }
650 src++;
651 }
652 */
653 #endif 578 #endif
654 } 579 }
655 580
656 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) 581 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
657 { 582 {
658 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 583 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
659 /* 584 /*
660 uint8_t tmp[16]; 585 uint8_t tmp[16];
661 const int l1= stride; 586 const int l1= stride;
674 asm volatile( 599 asm volatile(
675 600
676 #if 0 //sligtly more accurate and slightly slower 601 #if 0 //sligtly more accurate and slightly slower
677 "pxor %%mm7, %%mm7 \n\t" // 0 602 "pxor %%mm7, %%mm7 \n\t" // 0
678 "leal (%0, %1), %%eax \n\t" 603 "leal (%0, %1), %%eax \n\t"
679 "leal (%%eax, %1, 4), %%ebx \n\t" 604 "leal (%%eax, %1, 4), %%ecx \n\t"
680 // 0 1 2 3 4 5 6 7 605 // 0 1 2 3 4 5 6 7
681 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 606 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
682 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 607 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
683 608
684 609
685 "movq (%0, %1, 2), %%mm0 \n\t" // l2 610 "movq (%0, %1, 2), %%mm0 \n\t" // l2
686 "movq (%0), %%mm1 \n\t" // l0 611 "movq (%0), %%mm1 \n\t" // l0
687 "movq %%mm0, %%mm2 \n\t" // l2 612 "movq %%mm0, %%mm2 \n\t" // l2
706 "movq %%mm0, %%mm4 \n\t" // l4 631 "movq %%mm0, %%mm4 \n\t" // l4
707 PAVGB(%%mm7, %%mm0) // ~l4/2 632 PAVGB(%%mm7, %%mm0) // ~l4/2
708 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 633 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
709 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 634 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
710 635
711 "movq (%%ebx), %%mm2 \n\t" // l5 636 "movq (%%ecx), %%mm2 \n\t" // l5
712 "movq %%mm3, %%mm5 \n\t" // l3 637 "movq %%mm3, %%mm5 \n\t" // l3
713 PAVGB(%%mm7, %%mm3) // ~l3/2 638 PAVGB(%%mm7, %%mm3) // ~l3/2
714 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 639 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
715 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 640 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
716 641
719 "psubusb %%mm6, %%mm3 \n\t" 644 "psubusb %%mm6, %%mm3 \n\t"
720 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 645 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
721 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) 646 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
722 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 647 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
723 648
724 "movq (%%ebx, %1), %%mm6 \n\t" // l6 649 "movq (%%ecx, %1), %%mm6 \n\t" // l6
725 "movq %%mm6, %%mm5 \n\t" // l6 650 "movq %%mm6, %%mm5 \n\t" // l6
726 PAVGB(%%mm7, %%mm6) // ~l6/2 651 PAVGB(%%mm7, %%mm6) // ~l6/2
727 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 652 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
728 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 653 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
729 654
730 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7 655 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7
731 "movq %%mm2, %%mm4 \n\t" // l5 656 "movq %%mm2, %%mm4 \n\t" // l5
732 PAVGB(%%mm7, %%mm2) // ~l5/2 657 PAVGB(%%mm7, %%mm2) // ~l5/2
733 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 658 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
734 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 659 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
735 660
739 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 664 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
740 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 665 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
741 666
742 667
743 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 668 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
744 "movq "MANGLE(pQPb)", %%mm4 \n\t" // QP //FIXME QP+1 ? 669 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
745 "paddusb "MANGLE(b01)", %%mm4 \n\t" 670 "paddusb "MANGLE(b01)", %%mm4 \n\t"
746 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP 671 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
747 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 672 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
748 "pand %%mm4, %%mm3 \n\t" 673 "pand %%mm4, %%mm3 \n\t"
749 674
781 #endif 706 #endif
782 707
783 "leal (%0, %1), %%eax \n\t" 708 "leal (%0, %1), %%eax \n\t"
784 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 709 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
785 // 0 1 2 3 4 5 6 7 710 // 0 1 2 3 4 5 6 7
786 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 711 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
787 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 712 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
788 713
789 714
790 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 715 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
791 "movq (%0, %1, 4), %%mm0 \n\t" // l4 716 "movq (%0, %1, 4), %%mm0 \n\t" // l4
792 "pxor %%mm6, %%mm1 \n\t" // -l3-1 717 "pxor %%mm6, %%mm1 \n\t" // -l3-1
796 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 721 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
797 "movq (%%eax, %1), %%mm3 \n\t" // l2 722 "movq (%%eax, %1), %%mm3 \n\t" // l2
798 "pxor %%mm6, %%mm2 \n\t" // -l5-1 723 "pxor %%mm6, %%mm2 \n\t" // -l5-1
799 "movq %%mm2, %%mm5 \n\t" // -l5-1 724 "movq %%mm2, %%mm5 \n\t" // -l5-1
800 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 725 "movq "MANGLE(b80)", %%mm4 \n\t" // 128
801 "leal (%%eax, %1, 4), %%ebx \n\t" 726 "leal (%%eax, %1, 4), %%ecx \n\t"
802 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 727 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
803 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 728 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
804 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 729 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
805 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 730 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
806 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 731 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
813 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 738 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
814 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 739 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
815 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 740 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
816 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 741 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
817 742
818 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 743 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2
819 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 744 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7
820 "pxor %%mm6, %%mm1 \n\t" // -l7-1 745 "pxor %%mm6, %%mm1 \n\t" // -l7-1
821 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 746 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
822 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 747 "movq "MANGLE(b80)", %%mm2 \n\t" // 128
823 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 748 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
824 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 749 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
834 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 759 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
835 760
836 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 761 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
837 762
838 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 763 "movq "MANGLE(b00)", %%mm7 \n\t" // 0
839 "movq "MANGLE(pQPb)", %%mm2 \n\t" // QP 764 "movq %2, %%mm2 \n\t" // QP
840 PAVGB(%%mm6, %%mm2) // 128 + QP/2 765 PAVGB(%%mm6, %%mm2) // 128 + QP/2
841 "psubb %%mm6, %%mm2 \n\t" 766 "psubb %%mm6, %%mm2 \n\t"
842 767
843 "movq %%mm4, %%mm1 \n\t" 768 "movq %%mm4, %%mm1 \n\t"
844 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) 769 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
875 "pxor %%mm1, %%mm2 \n\t" 800 "pxor %%mm1, %%mm2 \n\t"
876 "movq %%mm0, (%%eax, %1, 2) \n\t" 801 "movq %%mm0, (%%eax, %1, 2) \n\t"
877 "movq %%mm2, (%0, %1, 4) \n\t" 802 "movq %%mm2, (%0, %1, 4) \n\t"
878 803
879 : 804 :
880 : "r" (src), "r" (stride) 805 : "r" (src), "r" (stride), "m" (c->pQPb)
881 : "%eax", "%ebx" 806 : "%eax", "%ecx"
882 ); 807 );
883 808
884 /* 809 /*
885 { 810 {
886 int x; 811 int x;
949 src+= stride*4; 874 src+= stride*4;
950 875
951 asm volatile( 876 asm volatile(
952 "pxor %%mm7, %%mm7 \n\t" 877 "pxor %%mm7, %%mm7 \n\t"
953 "leal (%0, %1), %%eax \n\t" 878 "leal (%0, %1), %%eax \n\t"
954 "leal (%%eax, %1, 4), %%ebx \n\t" 879 "leal (%%eax, %1, 4), %%edx \n\t"
880 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars
881 "andl $0xFFFFFFF8, %%ecx \n\t" // align
955 // 0 1 2 3 4 5 6 7 882 // 0 1 2 3 4 5 6 7
956 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 883 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
957 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 884 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
958 885
959 "movq (%0), %%mm0 \n\t" 886 "movq (%0), %%mm0 \n\t"
960 "movq %%mm0, %%mm1 \n\t" 887 "movq %%mm0, %%mm1 \n\t"
961 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 888 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
962 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 889 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
990 917
991 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 918 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
992 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 919 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
993 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 920 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
994 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 921 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
995 "movq %%mm0, "MANGLE(temp0)" \n\t" // 2L0 - 5L1 + 5L2 - 2L3 922 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
996 "movq %%mm1, "MANGLE(temp1)" \n\t" // 2H0 - 5H1 + 5H2 - 2H3 923 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
997 924
998 "movq (%0, %1, 4), %%mm0 \n\t" 925 "movq (%0, %1, 4), %%mm0 \n\t"
999 "movq %%mm0, %%mm1 \n\t" 926 "movq %%mm0, %%mm1 \n\t"
1000 "punpcklbw %%mm7, %%mm0 \n\t" // L4 927 "punpcklbw %%mm7, %%mm0 \n\t" // L4
1001 "punpckhbw %%mm7, %%mm1 \n\t" // H4 928 "punpckhbw %%mm7, %%mm1 \n\t" // H4
1002 929
1003 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 930 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
1004 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 931 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
1005 "movq %%mm2, "MANGLE(temp2)" \n\t" // L3 - L4 932 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4
1006 "movq %%mm3, "MANGLE(temp3)" \n\t" // H3 - H4 933 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4
1007 "paddw %%mm4, %%mm4 \n\t" // 2L2 934 "paddw %%mm4, %%mm4 \n\t" // 2L2
1008 "paddw %%mm5, %%mm5 \n\t" // 2H2 935 "paddw %%mm5, %%mm5 \n\t" // 2H2
1009 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 936 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
1010 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 937 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
1011 938
1012 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 939 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
1013 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 940 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
1014 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 941 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
1015 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 942 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
1016 //50 opcodes so far 943 //50 opcodes so far
1017 "movq (%%ebx), %%mm2 \n\t" 944 "movq (%%edx), %%mm2 \n\t"
1018 "movq %%mm2, %%mm3 \n\t" 945 "movq %%mm2, %%mm3 \n\t"
1019 "punpcklbw %%mm7, %%mm2 \n\t" // L5 946 "punpcklbw %%mm7, %%mm2 \n\t" // L5
1020 "punpckhbw %%mm7, %%mm3 \n\t" // H5 947 "punpckhbw %%mm7, %%mm3 \n\t" // H5
1021 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 948 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
1022 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 949 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
1023 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 950 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1024 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 951 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1025 952
1026 "movq (%%ebx, %1), %%mm6 \n\t" 953 "movq (%%edx, %1), %%mm6 \n\t"
1027 "punpcklbw %%mm7, %%mm6 \n\t" // L6 954 "punpcklbw %%mm7, %%mm6 \n\t" // L6
1028 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 955 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
1029 "movq (%%ebx, %1), %%mm6 \n\t" 956 "movq (%%edx, %1), %%mm6 \n\t"
1030 "punpckhbw %%mm7, %%mm6 \n\t" // H6 957 "punpckhbw %%mm7, %%mm6 \n\t" // H6
1031 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 958 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
1032 959
1033 "paddw %%mm0, %%mm0 \n\t" // 2L4 960 "paddw %%mm0, %%mm0 \n\t" // 2L4
1034 "paddw %%mm1, %%mm1 \n\t" // 2H4 961 "paddw %%mm1, %%mm1 \n\t" // 2H4
1038 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 965 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
1039 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 966 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
1040 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 967 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
1041 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
1042 969
1043 "movq (%%ebx, %1, 2), %%mm2 \n\t" 970 "movq (%%edx, %1, 2), %%mm2 \n\t"
1044 "movq %%mm2, %%mm3 \n\t" 971 "movq %%mm2, %%mm3 \n\t"
1045 "punpcklbw %%mm7, %%mm2 \n\t" // L7 972 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1046 "punpckhbw %%mm7, %%mm3 \n\t" // H7 973 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1047 974
1048 "paddw %%mm2, %%mm2 \n\t" // 2L7 975 "paddw %%mm2, %%mm2 \n\t" // 2L7
1049 "paddw %%mm3, %%mm3 \n\t" // 2H7 976 "paddw %%mm3, %%mm3 \n\t" // 2H7
1050 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 977 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1051 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 978 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1052 979
1053 "movq "MANGLE(temp0)", %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 980 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1054 "movq "MANGLE(temp1)", %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 981 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1055 982
1056 #ifdef HAVE_MMX2 983 #ifdef HAVE_MMX2
1057 "movq %%mm7, %%mm6 \n\t" // 0 984 "movq %%mm7, %%mm6 \n\t" // 0
1058 "psubw %%mm0, %%mm6 \n\t" 985 "psubw %%mm0, %%mm6 \n\t"
1059 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 986 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1104 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 1031 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1105 "pxor %%mm7, %%mm5 \n\t" 1032 "pxor %%mm7, %%mm5 \n\t"
1106 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 1033 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1107 // 100 opcodes 1034 // 100 opcodes
1108 "movd %2, %%mm2 \n\t" // QP 1035 "movd %2, %%mm2 \n\t" // QP
1109 "punpcklwd %%mm2, %%mm2 \n\t"
1110 "punpcklwd %%mm2, %%mm2 \n\t"
1111 "psllw $3, %%mm2 \n\t" // 8QP 1036 "psllw $3, %%mm2 \n\t" // 8QP
1112 "movq %%mm2, %%mm3 \n\t" // 8QP 1037 "movq %%mm2, %%mm3 \n\t" // 8QP
1113 "pcmpgtw %%mm4, %%mm2 \n\t" 1038 "pcmpgtw %%mm4, %%mm2 \n\t"
1114 "pcmpgtw %%mm5, %%mm3 \n\t" 1039 "pcmpgtw %%mm5, %%mm3 \n\t"
1115 "pand %%mm2, %%mm4 \n\t" 1040 "pand %%mm2, %%mm4 \n\t"
1127 "paddw %%mm2, %%mm4 \n\t" 1052 "paddw %%mm2, %%mm4 \n\t"
1128 "paddw %%mm2, %%mm5 \n\t" 1053 "paddw %%mm2, %%mm5 \n\t"
1129 "psrlw $6, %%mm4 \n\t" 1054 "psrlw $6, %%mm4 \n\t"
1130 "psrlw $6, %%mm5 \n\t" 1055 "psrlw $6, %%mm5 \n\t"
1131 1056
1132 /* 1057 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4
1133 "movq w06, %%mm2 \n\t" // 6 1058 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4
1134 "paddw %%mm2, %%mm4 \n\t"
1135 "paddw %%mm2, %%mm5 \n\t"
1136 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
1137 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1138 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
1139 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
1140 */
1141
1142 "movq "MANGLE(temp2)", %%mm0 \n\t" // L3 - L4
1143 "movq "MANGLE(temp3)", %%mm1 \n\t" // H3 - H4
1144 1059
1145 "pxor %%mm2, %%mm2 \n\t" 1060 "pxor %%mm2, %%mm2 \n\t"
1146 "pxor %%mm3, %%mm3 \n\t" 1061 "pxor %%mm3, %%mm3 \n\t"
1147 1062
1148 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 1063 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1181 "movq (%0, %1, 4), %%mm0 \n\t" 1096 "movq (%0, %1, 4), %%mm0 \n\t"
1182 "psubb %%mm4, %%mm0 \n\t" 1097 "psubb %%mm4, %%mm0 \n\t"
1183 "movq %%mm0, (%0, %1, 4) \n\t" 1098 "movq %%mm0, (%0, %1, 4) \n\t"
1184 1099
1185 : 1100 :
1186 : "r" (src), "r" (stride), "r" (QP) 1101 : "r" (src), "r" (stride), "m" (c->pQPb)
1187 : "%eax", "%ebx" 1102 : "%eax", "%edx", "%ecx"
1188 ); 1103 );
1189 #else 1104 #else
1190 const int l1= stride; 1105 const int l1= stride;
1191 const int l2= stride + l1; 1106 const int l2= stride + l1;
1192 const int l3= stride + l2; 1107 const int l3= stride + l2;
1199 int x; 1114 int x;
1200 src+= stride*3; 1115 src+= stride*3;
1201 for(x=0; x<BLOCK_SIZE; x++) 1116 for(x=0; x<BLOCK_SIZE; x++)
1202 { 1117 {
1203 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 1118 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1204 if(ABS(middleEnergy) < 8*QP) 1119 if(ABS(middleEnergy) < 8*c->QP)
1205 { 1120 {
1206 const int q=(src[l4] - src[l5])/2; 1121 const int q=(src[l4] - src[l5])/2;
1207 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 1122 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1208 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 1123 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1209 1124
1230 src++; 1145 src++;
1231 } 1146 }
1232 #endif 1147 #endif
1233 } 1148 }
1234 1149
1235 static inline void RENAME(dering)(uint8_t src[], int stride, int QP) 1150 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1236 { 1151 {
1237 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1152 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1238 asm volatile( 1153 asm volatile(
1239 "movq "MANGLE(pQPb)", %%mm0 \n\t" 1154 "pxor %%mm6, %%mm6 \n\t"
1240 "paddusb %%mm0, %%mm0 \n\t" 1155 "pcmpeqb %%mm7, %%mm7 \n\t"
1241 "movq %%mm0, "MANGLE(pQPb2)" \n\t" 1156 "movq %2, %%mm0 \n\t"
1157 "punpcklbw %%mm6, %%mm0 \n\t"
1158 "psrlw $1, %%mm0 \n\t"
1159 "psubw %%mm7, %%mm0 \n\t"
1160 "packuswb %%mm0, %%mm0 \n\t"
1161 "movq %%mm0, %3 \n\t"
1242 1162
1243 "leal (%0, %1), %%eax \n\t" 1163 "leal (%0, %1), %%eax \n\t"
1244 "leal (%%eax, %1, 4), %%ebx \n\t" 1164 "leal (%%eax, %1, 4), %%edx \n\t"
1165
1245 // 0 1 2 3 4 5 6 7 8 9 1166 // 0 1 2 3 4 5 6 7 8 9
1246 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1167 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1247 1168
1248 "pcmpeqb %%mm7, %%mm7 \n\t"
1249 "pxor %%mm6, %%mm6 \n\t"
1250 #undef FIND_MIN_MAX 1169 #undef FIND_MIN_MAX
1251 #ifdef HAVE_MMX2 1170 #ifdef HAVE_MMX2
1252 #define FIND_MIN_MAX(addr)\ 1171 #define FIND_MIN_MAX(addr)\
1253 "movq " #addr ", %%mm0 \n\t"\ 1172 "movq " #addr ", %%mm0 \n\t"\
1254 "pminub %%mm0, %%mm7 \n\t"\ 1173 "pminub %%mm0, %%mm7 \n\t"\
1265 1184
1266 FIND_MIN_MAX((%%eax)) 1185 FIND_MIN_MAX((%%eax))
1267 FIND_MIN_MAX((%%eax, %1)) 1186 FIND_MIN_MAX((%%eax, %1))
1268 FIND_MIN_MAX((%%eax, %1, 2)) 1187 FIND_MIN_MAX((%%eax, %1, 2))
1269 FIND_MIN_MAX((%0, %1, 4)) 1188 FIND_MIN_MAX((%0, %1, 4))
1270 FIND_MIN_MAX((%%ebx)) 1189 FIND_MIN_MAX((%%edx))
1271 FIND_MIN_MAX((%%ebx, %1)) 1190 FIND_MIN_MAX((%%edx, %1))
1272 FIND_MIN_MAX((%%ebx, %1, 2)) 1191 FIND_MIN_MAX((%%edx, %1, 2))
1273 FIND_MIN_MAX((%0, %1, 8)) 1192 FIND_MIN_MAX((%0, %1, 8))
1274 1193
1275 "movq %%mm7, %%mm4 \n\t" 1194 "movq %%mm7, %%mm4 \n\t"
1276 "psrlq $8, %%mm7 \n\t" 1195 "psrlq $8, %%mm7 \n\t"
1277 #ifdef HAVE_MMX2 1196 #ifdef HAVE_MMX2
1320 "movq %%mm6, %%mm0 \n\t" // max 1239 "movq %%mm6, %%mm0 \n\t" // max
1321 "psubb %%mm7, %%mm6 \n\t" // max - min 1240 "psubb %%mm7, %%mm6 \n\t" // max - min
1322 "movd %%mm6, %%ecx \n\t" 1241 "movd %%mm6, %%ecx \n\t"
1323 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" 1242 "cmpb "MANGLE(deringThreshold)", %%cl \n\t"
1324 " jb 1f \n\t" 1243 " jb 1f \n\t"
1244 "leal -24(%%esp), %%ecx \n\t"
1245 "andl $0xFFFFFFF8, %%ecx \n\t"
1325 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 1246 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
1326 "punpcklbw %%mm7, %%mm7 \n\t" 1247 "punpcklbw %%mm7, %%mm7 \n\t"
1327 "punpcklbw %%mm7, %%mm7 \n\t" 1248 "punpcklbw %%mm7, %%mm7 \n\t"
1328 "punpcklbw %%mm7, %%mm7 \n\t" 1249 "punpcklbw %%mm7, %%mm7 \n\t"
1329 "movq %%mm7, "MANGLE(temp0)" \n\t" 1250 "movq %%mm7, (%%ecx) \n\t"
1330 1251
1331 "movq (%0), %%mm0 \n\t" // L10 1252 "movq (%0), %%mm0 \n\t" // L10
1332 "movq %%mm0, %%mm1 \n\t" // L10 1253 "movq %%mm0, %%mm1 \n\t" // L10
1333 "movq %%mm0, %%mm2 \n\t" // L10 1254 "movq %%mm0, %%mm2 \n\t" // L10
1334 "psllq $8, %%mm1 \n\t" 1255 "psllq $8, %%mm1 \n\t"
1388 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ 1309 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1389 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ 1310 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1390 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ 1311 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1391 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ 1312 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1392 PAVGB(lx, pplx) \ 1313 PAVGB(lx, pplx) \
1393 "movq " #lx ", "MANGLE(temp1)" \n\t"\ 1314 "movq " #lx ", 8(%%ecx) \n\t"\
1394 "movq "MANGLE(temp0)", " #lx " \n\t"\ 1315 "movq (%%ecx), " #lx " \n\t"\
1395 "psubusb " #lx ", " #t1 " \n\t"\ 1316 "psubusb " #lx ", " #t1 " \n\t"\
1396 "psubusb " #lx ", " #t0 " \n\t"\ 1317 "psubusb " #lx ", " #t0 " \n\t"\
1397 "psubusb " #lx ", " #sx " \n\t"\ 1318 "psubusb " #lx ", " #sx " \n\t"\
1398 "movq "MANGLE(b00)", " #lx " \n\t"\ 1319 "movq "MANGLE(b00)", " #lx " \n\t"\
1399 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ 1320 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1403 "paddb " #t0 ", " #sx " \n\t"\ 1324 "paddb " #t0 ", " #sx " \n\t"\
1404 \ 1325 \
1405 PAVGB(plx, pplx) /* filtered */\ 1326 PAVGB(plx, pplx) /* filtered */\
1406 "movq " #dst ", " #t0 " \n\t" /* dst */\ 1327 "movq " #dst ", " #t0 " \n\t" /* dst */\
1407 "movq " #t0 ", " #t1 " \n\t" /* dst */\ 1328 "movq " #t0 ", " #t1 " \n\t" /* dst */\
1408 "psubusb "MANGLE(pQPb2)", " #t0 " \n\t"\ 1329 "psubusb %3, " #t0 " \n\t"\
1409 "paddusb "MANGLE(pQPb2)", " #t1 " \n\t"\ 1330 "paddusb %3, " #t1 " \n\t"\
1410 PMAXUB(t0, pplx)\ 1331 PMAXUB(t0, pplx)\
1411 PMINUB(t1, pplx, t0)\ 1332 PMINUB(t1, pplx, t0)\
1412 "paddb " #sx ", " #ppsx " \n\t"\ 1333 "paddb " #sx ", " #ppsx " \n\t"\
1413 "paddb " #psx ", " #ppsx " \n\t"\ 1334 "paddb " #psx ", " #ppsx " \n\t"\
1414 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ 1335 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1416 "pcmpeqb " #lx ", " #ppsx " \n\t"\ 1337 "pcmpeqb " #lx ", " #ppsx " \n\t"\
1417 "pand " #ppsx ", " #pplx " \n\t"\ 1338 "pand " #ppsx ", " #pplx " \n\t"\
1418 "pandn " #dst ", " #ppsx " \n\t"\ 1339 "pandn " #dst ", " #ppsx " \n\t"\
1419 "por " #pplx ", " #ppsx " \n\t"\ 1340 "por " #pplx ", " #ppsx " \n\t"\
1420 "movq " #ppsx ", " #dst " \n\t"\ 1341 "movq " #ppsx ", " #dst " \n\t"\
1421 "movq "MANGLE(temp1)", " #lx " \n\t" 1342 "movq 8(%%ecx), " #lx " \n\t"
1422 1343
1423 /* 1344 /*
1424 0000000 1345 0000000
1425 1111111 1346 1111111
1426 1347
1437 */ 1358 */
1438 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) 1359 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1439 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1360 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1440 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1361 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1441 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 1362 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1442 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1363 DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1443 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1364 DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1444 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 1365 DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1445 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1366 DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1446 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1367 DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1447 1368
1448 "1: \n\t" 1369 "1: \n\t"
1449 : : "r" (src), "r" (stride), "r" (QP) 1370 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
1450 : "%eax", "%ebx", "%ecx" 1371 : "%eax", "%edx", "%ecx"
1451 ); 1372 );
1452 #else 1373 #else
1453 int y; 1374 int y;
1454 int min=255; 1375 int min=255;
1455 int max=0; 1376 int max=0;
1456 int avg; 1377 int avg;
1457 uint8_t *p; 1378 uint8_t *p;
1458 int s[10]; 1379 int s[10];
1380 const int QP2= c->QP/2 + 1;
1459 1381
1460 for(y=1; y<9; y++) 1382 for(y=1; y<9; y++)
1461 { 1383 {
1462 int x; 1384 int x;
1463 p= src + stride*y; 1385 p= src + stride*y;
1466 p++; 1388 p++;
1467 if(*p > max) max= *p; 1389 if(*p > max) max= *p;
1468 if(*p < min) min= *p; 1390 if(*p < min) min= *p;
1469 } 1391 }
1470 } 1392 }
1471 avg= (min + max + 1)/2; 1393 avg= (min + max + 1)>>1;
1472 1394
1473 if(max - min <deringThreshold) return; 1395 if(max - min <deringThreshold) return;
1474 1396
1475 for(y=0; y<10; y++) 1397 for(y=0; y<10; y++)
1476 { 1398 {
1477 int x;
1478 int t = 0; 1399 int t = 0;
1479 p= src + stride*y; 1400
1480 for(x=0; x<10; x++) 1401 if(src[stride*y + 0] > avg) t+= 1;
1481 { 1402 if(src[stride*y + 1] > avg) t+= 2;
1482 if(*p > avg) t |= (1<<x); 1403 if(src[stride*y + 2] > avg) t+= 4;
1483 p++; 1404 if(src[stride*y + 3] > avg) t+= 8;
1484 } 1405 if(src[stride*y + 4] > avg) t+= 16;
1406 if(src[stride*y + 5] > avg) t+= 32;
1407 if(src[stride*y + 6] > avg) t+= 64;
1408 if(src[stride*y + 7] > avg) t+= 128;
1409 if(src[stride*y + 8] > avg) t+= 256;
1410 if(src[stride*y + 9] > avg) t+= 512;
1411
1485 t |= (~t)<<16; 1412 t |= (~t)<<16;
1486 t &= (t<<1) & (t>>1); 1413 t &= (t<<1) & (t>>1);
1487 s[y] = t; 1414 s[y] = t;
1488 } 1415 }
1416
1417 for(y=1; y<9; y++)
1418 {
1419 int t = s[y-1] & s[y] & s[y+1];
1420 t|= t>>16;
1421 s[y-1]= t;
1422 }
1489 1423
1490 for(y=1; y<9; y++) 1424 for(y=1; y<9; y++)
1491 { 1425 {
1492 int x; 1426 int x;
1493 int t = s[y-1] & s[y] & s[y+1]; 1427 int t = s[y-1];
1494 t|= t>>16;
1495 1428
1496 p= src + stride*y; 1429 p= src + stride*y;
1497 for(x=1; x<9; x++) 1430 for(x=1; x<9; x++)
1498 { 1431 {
1499 p++; 1432 p++;
1542 worstDiff, (float)numSkiped/numPixels); 1475 worstDiff, (float)numSkiped/numPixels);
1543 } 1476 }
1544 } 1477 }
1545 } 1478 }
1546 #endif 1479 #endif
1547 if (*p + 2*QP < f) *p= *p + 2*QP; 1480 if (*p + QP2 < f) *p= *p + QP2;
1548 else if(*p - 2*QP > f) *p= *p - 2*QP; 1481 else if(*p - QP2 > f) *p= *p - QP2;
1549 else *p=f; 1482 else *p=f;
1550 } 1483 }
1551 } 1484 }
1552 } 1485 }
1553 #ifdef DEBUG_DERING_THRESHOLD 1486 #ifdef DEBUG_DERING_THRESHOLD
1580 { 1513 {
1581 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1514 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1582 src+= 4*stride; 1515 src+= 4*stride;
1583 asm volatile( 1516 asm volatile(
1584 "leal (%0, %1), %%eax \n\t" 1517 "leal (%0, %1), %%eax \n\t"
1585 "leal (%%eax, %1, 4), %%ebx \n\t" 1518 "leal (%%eax, %1, 4), %%ecx \n\t"
1586 // 0 1 2 3 4 5 6 7 8 9 1519 // 0 1 2 3 4 5 6 7 8 9
1587 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1520 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1588 1521
1589 "movq (%0), %%mm0 \n\t" 1522 "movq (%0), %%mm0 \n\t"
1590 "movq (%%eax, %1), %%mm1 \n\t" 1523 "movq (%%eax, %1), %%mm1 \n\t"
1591 PAVGB(%%mm1, %%mm0) 1524 PAVGB(%%mm1, %%mm0)
1592 "movq %%mm0, (%%eax) \n\t" 1525 "movq %%mm0, (%%eax) \n\t"
1593 "movq (%0, %1, 4), %%mm0 \n\t" 1526 "movq (%0, %1, 4), %%mm0 \n\t"
1594 PAVGB(%%mm0, %%mm1) 1527 PAVGB(%%mm0, %%mm1)
1595 "movq %%mm1, (%%eax, %1, 2) \n\t" 1528 "movq %%mm1, (%%eax, %1, 2) \n\t"
1596 "movq (%%ebx, %1), %%mm1 \n\t" 1529 "movq (%%ecx, %1), %%mm1 \n\t"
1597 PAVGB(%%mm1, %%mm0) 1530 PAVGB(%%mm1, %%mm0)
1598 "movq %%mm0, (%%ebx) \n\t" 1531 "movq %%mm0, (%%ecx) \n\t"
1599 "movq (%0, %1, 8), %%mm0 \n\t" 1532 "movq (%0, %1, 8), %%mm0 \n\t"
1600 PAVGB(%%mm0, %%mm1) 1533 PAVGB(%%mm0, %%mm1)
1601 "movq %%mm1, (%%ebx, %1, 2) \n\t" 1534 "movq %%mm1, (%%ecx, %1, 2) \n\t"
1602 1535
1603 : : "r" (src), "r" (stride) 1536 : : "r" (src), "r" (stride)
1604 : "%eax", "%ebx" 1537 : "%eax", "%ecx"
1605 ); 1538 );
1606 #else 1539 #else
1607 int x; 1540 int x;
1608 src+= 4*stride; 1541 src+= 4*stride;
1609 for(x=0; x<8; x++) 1542 for(x=0; x<8; x++)
1629 { 1562 {
1630 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1563 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1631 src+= stride*3; 1564 src+= stride*3;
1632 asm volatile( 1565 asm volatile(
1633 "leal (%0, %1), %%eax \n\t" 1566 "leal (%0, %1), %%eax \n\t"
1634 "leal (%%eax, %1, 4), %%ebx \n\t" 1567 "leal (%%eax, %1, 4), %%edx \n\t"
1635 "leal (%%ebx, %1, 4), %%ecx \n\t" 1568 "leal (%%edx, %1, 4), %%ecx \n\t"
1636 "addl %1, %%ecx \n\t" 1569 "addl %1, %%ecx \n\t"
1637 "pxor %%mm7, %%mm7 \n\t" 1570 "pxor %%mm7, %%mm7 \n\t"
1638 // 0 1 2 3 4 5 6 7 8 9 10 1571 // 0 1 2 3 4 5 6 7 8 9 10
1639 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx 1572 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1640 1573
1641 #define DEINT_CUBIC(a,b,c,d,e)\ 1574 #define DEINT_CUBIC(a,b,c,d,e)\
1642 "movq " #a ", %%mm0 \n\t"\ 1575 "movq " #a ", %%mm0 \n\t"\
1643 "movq " #b ", %%mm1 \n\t"\ 1576 "movq " #b ", %%mm1 \n\t"\
1644 "movq " #d ", %%mm2 \n\t"\ 1577 "movq " #d ", %%mm2 \n\t"\
1658 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ 1591 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1659 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ 1592 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1660 "packuswb %%mm3, %%mm1 \n\t"\ 1593 "packuswb %%mm3, %%mm1 \n\t"\
1661 "movq %%mm1, " #c " \n\t" 1594 "movq %%mm1, " #c " \n\t"
1662 1595
1663 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1)) 1596 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
1664 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8)) 1597 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
1665 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx)) 1598 DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
1666 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2)) 1599 DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1667 1600
1668 : : "r" (src), "r" (stride) 1601 : : "r" (src), "r" (stride)
1669 : "%eax", "%ebx", "ecx" 1602 : "%eax", "%edx", "ecx"
1670 ); 1603 );
1671 #else 1604 #else
1672 int x; 1605 int x;
1673 src+= stride*3; 1606 src+= stride*3;
1674 for(x=0; x<8; x++) 1607 for(x=0; x<8; x++)
1675 { 1608 {
1676 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; 1609 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
1677 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; 1610 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1678 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; 1611 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1679 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4; 1612 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
1613 src++;
1614 }
1615 #endif
1616 }
1617
1618 /**
1619 * Deinterlaces the given block
1620 * will be called for every 8x8 block and can read & write from line 4-15
1621 * lines 0-3 have already been passed through the deblock / dering filters, but can be read too
1622 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1623 * this filter will read lines 4-13 and write 5-11
1624 * no clipping in C version
1625 */
1626 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1627 {
1628 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1629 src+= stride*4;
1630 asm volatile(
1631 "leal (%0, %1), %%eax \n\t"
1632 "leal (%%eax, %1, 4), %%edx \n\t"
1633 "pxor %%mm7, %%mm7 \n\t"
1634 "movq (%2), %%mm0 \n\t"
1635 // 0 1 2 3 4 5 6 7 8 9 10
1636 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1637
1638 #define DEINT_FF(a,b,c,d)\
1639 "movq " #a ", %%mm1 \n\t"\
1640 "movq " #b ", %%mm2 \n\t"\
1641 "movq " #c ", %%mm3 \n\t"\
1642 "movq " #d ", %%mm4 \n\t"\
1643 PAVGB(%%mm3, %%mm1) \
1644 PAVGB(%%mm4, %%mm0) \
1645 "movq %%mm0, %%mm3 \n\t"\
1646 "punpcklbw %%mm7, %%mm0 \n\t"\
1647 "punpckhbw %%mm7, %%mm3 \n\t"\
1648 "movq %%mm1, %%mm4 \n\t"\
1649 "punpcklbw %%mm7, %%mm1 \n\t"\
1650 "punpckhbw %%mm7, %%mm4 \n\t"\
1651 "psllw $2, %%mm1 \n\t"\
1652 "psllw $2, %%mm4 \n\t"\
1653 "psubw %%mm0, %%mm1 \n\t"\
1654 "psubw %%mm3, %%mm4 \n\t"\
1655 "movq %%mm2, %%mm5 \n\t"\
1656 "movq %%mm2, %%mm0 \n\t"\
1657 "punpcklbw %%mm7, %%mm2 \n\t"\
1658 "punpckhbw %%mm7, %%mm5 \n\t"\
1659 "paddw %%mm2, %%mm1 \n\t"\
1660 "paddw %%mm5, %%mm4 \n\t"\
1661 "psraw $2, %%mm1 \n\t"\
1662 "psraw $2, %%mm4 \n\t"\
1663 "packuswb %%mm4, %%mm1 \n\t"\
1664 "movq %%mm1, " #b " \n\t"\
1665
1666 DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2))
1667 DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) )
1668 DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2))
1669 DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
1670
1671 "movq %%mm0, (%2) \n\t"
1672 : : "r" (src), "r" (stride), "r"(tmp)
1673 : "%eax", "%edx"
1674 );
1675 #else
1676 int x;
1677 src+= stride*4;
1678 for(x=0; x<8; x++)
1679 {
1680 int t1= tmp[x];
1681 int t2= src[stride*1];
1682
1683 src[stride*1]= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3;
1684 t1= src[stride*4];
1685 src[stride*3]= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3;
1686 t2= src[stride*6];
1687 src[stride*5]= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3;
1688 t1= src[stride*8];
1689 src[stride*7]= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3;
1690 tmp[x]= t1;
1691
1680 src++; 1692 src++;
1681 } 1693 }
1682 #endif 1694 #endif
1683 } 1695 }
1684 1696
1694 { 1706 {
1695 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1696 src+= 4*stride; 1708 src+= 4*stride;
1697 asm volatile( 1709 asm volatile(
1698 "leal (%0, %1), %%eax \n\t" 1710 "leal (%0, %1), %%eax \n\t"
1699 "leal (%%eax, %1, 4), %%ebx \n\t" 1711 "leal (%%eax, %1, 4), %%edx \n\t"
1700 // 0 1 2 3 4 5 6 7 8 9 1712 // 0 1 2 3 4 5 6 7 8 9
1701 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1713 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1702 1714
1703 "movq (%0), %%mm0 \n\t" // L0 1715 "movq (%0), %%mm0 \n\t" // L0
1704 "movq (%%eax, %1), %%mm1 \n\t" // L2 1716 "movq (%%eax, %1), %%mm1 \n\t" // L2
1705 PAVGB(%%mm1, %%mm0) // L0+L2 1717 PAVGB(%%mm1, %%mm0) // L0+L2
1706 "movq (%%eax), %%mm2 \n\t" // L1 1718 "movq (%%eax), %%mm2 \n\t" // L1
1712 "movq %%mm2, (%%eax) \n\t" 1724 "movq %%mm2, (%%eax) \n\t"
1713 "movq (%0, %1, 4), %%mm2 \n\t" // L4 1725 "movq (%0, %1, 4), %%mm2 \n\t" // L4
1714 PAVGB(%%mm2, %%mm1) // L2+L4 1726 PAVGB(%%mm2, %%mm1) // L2+L4
1715 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 1727 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1716 "movq %%mm1, (%%eax, %1) \n\t" 1728 "movq %%mm1, (%%eax, %1) \n\t"
1717 "movq (%%ebx), %%mm1 \n\t" // L5 1729 "movq (%%edx), %%mm1 \n\t" // L5
1718 PAVGB(%%mm1, %%mm0) // L3+L5 1730 PAVGB(%%mm1, %%mm0) // L3+L5
1719 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 1731 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1720 "movq %%mm0, (%%eax, %1, 2) \n\t" 1732 "movq %%mm0, (%%eax, %1, 2) \n\t"
1721 "movq (%%ebx, %1), %%mm0 \n\t" // L6 1733 "movq (%%edx, %1), %%mm0 \n\t" // L6
1722 PAVGB(%%mm0, %%mm2) // L4+L6 1734 PAVGB(%%mm0, %%mm2) // L4+L6
1723 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 1735 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1724 "movq %%mm2, (%0, %1, 4) \n\t" 1736 "movq %%mm2, (%0, %1, 4) \n\t"
1725 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 1737 "movq (%%edx, %1, 2), %%mm2 \n\t" // L7
1726 PAVGB(%%mm2, %%mm1) // L5+L7 1738 PAVGB(%%mm2, %%mm1) // L5+L7
1727 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 1739 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1728 "movq %%mm1, (%%ebx) \n\t" 1740 "movq %%mm1, (%%edx) \n\t"
1729 "movq (%0, %1, 8), %%mm1 \n\t" // L8 1741 "movq (%0, %1, 8), %%mm1 \n\t" // L8
1730 PAVGB(%%mm1, %%mm0) // L6+L8 1742 PAVGB(%%mm1, %%mm0) // L6+L8
1731 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 1743 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1732 "movq %%mm0, (%%ebx, %1) \n\t" 1744 "movq %%mm0, (%%edx, %1) \n\t"
1733 "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9 1745 "movq (%%edx, %1, 4), %%mm0 \n\t" // L9
1734 PAVGB(%%mm0, %%mm2) // L7+L9 1746 PAVGB(%%mm0, %%mm2) // L7+L9
1735 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 1747 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1736 "movq %%mm2, (%%ebx, %1, 2) \n\t" 1748 "movq %%mm2, (%%edx, %1, 2) \n\t"
1737 1749
1738 1750
1739 : : "r" (src), "r" (stride) 1751 : : "r" (src), "r" (stride)
1740 : "%eax", "%ebx" 1752 : "%eax", "%edx"
1741 ); 1753 );
1742 #else 1754 #else
1743 int x; 1755 int x;
1744 src+= 4*stride; 1756 src+= 4*stride;
1745 for(x=0; x<8; x++) 1757 for(x=0; x<8; x++)
1768 #ifdef HAVE_MMX 1780 #ifdef HAVE_MMX
1769 src+= 4*stride; 1781 src+= 4*stride;
1770 #ifdef HAVE_MMX2 1782 #ifdef HAVE_MMX2
1771 asm volatile( 1783 asm volatile(
1772 "leal (%0, %1), %%eax \n\t" 1784 "leal (%0, %1), %%eax \n\t"
1773 "leal (%%eax, %1, 4), %%ebx \n\t" 1785 "leal (%%eax, %1, 4), %%edx \n\t"
1774 // 0 1 2 3 4 5 6 7 8 9 1786 // 0 1 2 3 4 5 6 7 8 9
1775 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1787 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1776 1788
1777 "movq (%0), %%mm0 \n\t" // 1789 "movq (%0), %%mm0 \n\t" //
1778 "movq (%%eax, %1), %%mm2 \n\t" // 1790 "movq (%%eax, %1), %%mm2 \n\t" //
1779 "movq (%%eax), %%mm1 \n\t" // 1791 "movq (%%eax), %%mm1 \n\t" //
1780 "movq %%mm0, %%mm3 \n\t" 1792 "movq %%mm0, %%mm3 \n\t"
1791 "pminub %%mm3, %%mm1 \n\t" // 1803 "pminub %%mm3, %%mm1 \n\t" //
1792 "pmaxub %%mm0, %%mm1 \n\t" // 1804 "pmaxub %%mm0, %%mm1 \n\t" //
1793 "pminub %%mm1, %%mm2 \n\t" 1805 "pminub %%mm1, %%mm2 \n\t"
1794 "movq %%mm2, (%%eax, %1, 2) \n\t" 1806 "movq %%mm2, (%%eax, %1, 2) \n\t"
1795 1807
1796 "movq (%%ebx), %%mm2 \n\t" // 1808 "movq (%%edx), %%mm2 \n\t" //
1797 "movq (%%ebx, %1), %%mm1 \n\t" // 1809 "movq (%%edx, %1), %%mm1 \n\t" //
1798 "movq %%mm2, %%mm3 \n\t" 1810 "movq %%mm2, %%mm3 \n\t"
1799 "pmaxub %%mm0, %%mm2 \n\t" // 1811 "pmaxub %%mm0, %%mm2 \n\t" //
1800 "pminub %%mm3, %%mm0 \n\t" // 1812 "pminub %%mm3, %%mm0 \n\t" //
1801 "pmaxub %%mm1, %%mm0 \n\t" // 1813 "pmaxub %%mm1, %%mm0 \n\t" //
1802 "pminub %%mm0, %%mm2 \n\t" 1814 "pminub %%mm0, %%mm2 \n\t"
1803 "movq %%mm2, (%%ebx) \n\t" 1815 "movq %%mm2, (%%edx) \n\t"
1804 1816
1805 "movq (%%ebx, %1, 2), %%mm2 \n\t" // 1817 "movq (%%edx, %1, 2), %%mm2 \n\t" //
1806 "movq (%0, %1, 8), %%mm0 \n\t" // 1818 "movq (%0, %1, 8), %%mm0 \n\t" //
1807 "movq %%mm2, %%mm3 \n\t" 1819 "movq %%mm2, %%mm3 \n\t"
1808 "pmaxub %%mm0, %%mm2 \n\t" // 1820 "pmaxub %%mm0, %%mm2 \n\t" //
1809 "pminub %%mm3, %%mm0 \n\t" // 1821 "pminub %%mm3, %%mm0 \n\t" //
1810 "pmaxub %%mm1, %%mm0 \n\t" // 1822 "pmaxub %%mm1, %%mm0 \n\t" //
1811 "pminub %%mm0, %%mm2 \n\t" 1823 "pminub %%mm0, %%mm2 \n\t"
1812 "movq %%mm2, (%%ebx, %1, 2) \n\t" 1824 "movq %%mm2, (%%edx, %1, 2) \n\t"
1813 1825
1814 1826
1815 : : "r" (src), "r" (stride) 1827 : : "r" (src), "r" (stride)
1816 : "%eax", "%ebx" 1828 : "%eax", "%edx"
1817 ); 1829 );
1818 1830
1819 #else // MMX without MMX2 1831 #else // MMX without MMX2
1820 asm volatile( 1832 asm volatile(
1821 "leal (%0, %1), %%eax \n\t" 1833 "leal (%0, %1), %%eax \n\t"
1822 "leal (%%eax, %1, 4), %%ebx \n\t" 1834 "leal (%%eax, %1, 4), %%edx \n\t"
1823 // 0 1 2 3 4 5 6 7 8 9 1835 // 0 1 2 3 4 5 6 7 8 9
1824 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1836 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1825 "pxor %%mm7, %%mm7 \n\t" 1837 "pxor %%mm7, %%mm7 \n\t"
1826 1838
1827 #define MEDIAN(a,b,c)\ 1839 #define MEDIAN(a,b,c)\
1828 "movq " #a ", %%mm0 \n\t"\ 1840 "movq " #a ", %%mm0 \n\t"\
1829 "movq " #b ", %%mm2 \n\t"\ 1841 "movq " #b ", %%mm2 \n\t"\
1848 "pand %%mm1, %%mm0 \n\t"\ 1860 "pand %%mm1, %%mm0 \n\t"\
1849 "movq %%mm0, " #b " \n\t" 1861 "movq %%mm0, " #b " \n\t"
1850 1862
1851 MEDIAN((%0), (%%eax), (%%eax, %1)) 1863 MEDIAN((%0), (%%eax), (%%eax, %1))
1852 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) 1864 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
1853 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1)) 1865 MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
1854 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8)) 1866 MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
1855 1867
1856 : : "r" (src), "r" (stride) 1868 : : "r" (src), "r" (stride)
1857 : "%eax", "%ebx" 1869 : "%eax", "%edx"
1858 ); 1870 );
1859 #endif // MMX 1871 #endif // MMX
1860 #else 1872 #else
1861 //FIXME 1873 //FIXME
1862 int x; 1874 int x;
1882 */ 1894 */
1883 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) 1895 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
1884 { 1896 {
1885 asm( 1897 asm(
1886 "leal (%0, %1), %%eax \n\t" 1898 "leal (%0, %1), %%eax \n\t"
1887 "leal (%%eax, %1, 4), %%ebx \n\t" 1899 "leal (%%eax, %1, 4), %%edx \n\t"
1888 // 0 1 2 3 4 5 6 7 8 9 1900 // 0 1 2 3 4 5 6 7 8 9
1889 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1901 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1890 "movq (%0), %%mm0 \n\t" // 12345678 1902 "movq (%0), %%mm0 \n\t" // 12345678
1891 "movq (%%eax), %%mm1 \n\t" // abcdefgh 1903 "movq (%%eax), %%mm1 \n\t" // abcdefgh
1892 "movq %%mm0, %%mm2 \n\t" // 12345678 1904 "movq %%mm0, %%mm2 \n\t" // 12345678
1893 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 1905 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1894 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 1906 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1920 "movd %%mm1, 96(%3) \n\t" 1932 "movd %%mm1, 96(%3) \n\t"
1921 "psrlq $32, %%mm1 \n\t" 1933 "psrlq $32, %%mm1 \n\t"
1922 "movd %%mm1, 112(%3) \n\t" 1934 "movd %%mm1, 112(%3) \n\t"
1923 1935
1924 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 1936 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
1925 "movq (%%ebx), %%mm1 \n\t" // abcdefgh 1937 "movq (%%edx), %%mm1 \n\t" // abcdefgh
1926 "movq %%mm0, %%mm2 \n\t" // 12345678 1938 "movq %%mm0, %%mm2 \n\t" // 12345678
1927 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 1939 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1928 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 1940 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1929 1941
1930 "movq (%%ebx, %1), %%mm1 \n\t" 1942 "movq (%%edx, %1), %%mm1 \n\t"
1931 "movq (%%ebx, %1, 2), %%mm3 \n\t" 1943 "movq (%%edx, %1, 2), %%mm3 \n\t"
1932 "movq %%mm1, %%mm4 \n\t" 1944 "movq %%mm1, %%mm4 \n\t"
1933 "punpcklbw %%mm3, %%mm1 \n\t" 1945 "punpcklbw %%mm3, %%mm1 \n\t"
1934 "punpckhbw %%mm3, %%mm4 \n\t" 1946 "punpckhbw %%mm3, %%mm4 \n\t"
1935 1947
1936 "movq %%mm0, %%mm3 \n\t" 1948 "movq %%mm0, %%mm3 \n\t"
1955 "psrlq $32, %%mm1 \n\t" 1967 "psrlq $32, %%mm1 \n\t"
1956 "movd %%mm1, 116(%3) \n\t" 1968 "movd %%mm1, 116(%3) \n\t"
1957 1969
1958 1970
1959 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) 1971 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
1960 : "%eax", "%ebx" 1972 : "%eax", "%edx"
1961 ); 1973 );
1962 } 1974 }
1963 1975
1964 /** 1976 /**
1965 * transposes the given 8x8 block 1977 * transposes the given 8x8 block
1966 */ 1978 */
1967 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) 1979 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
1968 { 1980 {
1969 asm( 1981 asm(
1970 "leal (%0, %1), %%eax \n\t" 1982 "leal (%0, %1), %%eax \n\t"
1971 "leal (%%eax, %1, 4), %%ebx \n\t" 1983 "leal (%%eax, %1, 4), %%edx \n\t"
1972 // 0 1 2 3 4 5 6 7 8 9 1984 // 0 1 2 3 4 5 6 7 8 9
1973 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1985 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1974 "movq (%2), %%mm0 \n\t" // 12345678 1986 "movq (%2), %%mm0 \n\t" // 12345678
1975 "movq 16(%2), %%mm1 \n\t" // abcdefgh 1987 "movq 16(%2), %%mm1 \n\t" // abcdefgh
1976 "movq %%mm0, %%mm2 \n\t" // 12345678 1988 "movq %%mm0, %%mm2 \n\t" // 12345678
1977 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 1989 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
1978 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 1990 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
1996 "movd %%mm3, (%%eax, %1) \n\t" 2008 "movd %%mm3, (%%eax, %1) \n\t"
1997 "psrlq $32, %%mm3 \n\t" 2009 "psrlq $32, %%mm3 \n\t"
1998 "movd %%mm3, (%%eax, %1, 2) \n\t" 2010 "movd %%mm3, (%%eax, %1, 2) \n\t"
1999 "movd %%mm2, (%0, %1, 4) \n\t" 2011 "movd %%mm2, (%0, %1, 4) \n\t"
2000 "psrlq $32, %%mm2 \n\t" 2012 "psrlq $32, %%mm2 \n\t"
2001 "movd %%mm2, (%%ebx) \n\t" 2013 "movd %%mm2, (%%edx) \n\t"
2002 "movd %%mm1, (%%ebx, %1) \n\t" 2014 "movd %%mm1, (%%edx, %1) \n\t"
2003 "psrlq $32, %%mm1 \n\t" 2015 "psrlq $32, %%mm1 \n\t"
2004 "movd %%mm1, (%%ebx, %1, 2) \n\t" 2016 "movd %%mm1, (%%edx, %1, 2) \n\t"
2005 2017
2006 2018
2007 "movq 64(%2), %%mm0 \n\t" // 12345678 2019 "movq 64(%2), %%mm0 \n\t" // 12345678
2008 "movq 80(%2), %%mm1 \n\t" // abcdefgh 2020 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2009 "movq %%mm0, %%mm2 \n\t" // 12345678 2021 "movq %%mm0, %%mm2 \n\t" // 12345678
2029 "movd %%mm3, 4(%%eax, %1) \n\t" 2041 "movd %%mm3, 4(%%eax, %1) \n\t"
2030 "psrlq $32, %%mm3 \n\t" 2042 "psrlq $32, %%mm3 \n\t"
2031 "movd %%mm3, 4(%%eax, %1, 2) \n\t" 2043 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2032 "movd %%mm2, 4(%0, %1, 4) \n\t" 2044 "movd %%mm2, 4(%0, %1, 4) \n\t"
2033 "psrlq $32, %%mm2 \n\t" 2045 "psrlq $32, %%mm2 \n\t"
2034 "movd %%mm2, 4(%%ebx) \n\t" 2046 "movd %%mm2, 4(%%edx) \n\t"
2035 "movd %%mm1, 4(%%ebx, %1) \n\t" 2047 "movd %%mm1, 4(%%edx, %1) \n\t"
2036 "psrlq $32, %%mm1 \n\t" 2048 "psrlq $32, %%mm1 \n\t"
2037 "movd %%mm1, 4(%%ebx, %1, 2) \n\t" 2049 "movd %%mm1, 4(%%edx, %1, 2) \n\t"
2038 2050
2039 :: "r" (dst), "r" (dstStride), "r" (src) 2051 :: "r" (dst), "r" (dstStride), "r" (src)
2040 : "%eax", "%ebx" 2052 : "%eax", "%edx"
2041 ); 2053 );
2042 } 2054 }
2043 #endif 2055 #endif
2044 //static int test=0; 2056 //static int test=0;
2045 2057
2046 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride, 2058 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2047 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) 2059 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2048 { 2060 {
2061 // to save a register (FIXME do this outside of the loops)
2062 tempBluredPast[127]= maxNoise[0];
2063 tempBluredPast[128]= maxNoise[1];
2064 tempBluredPast[129]= maxNoise[2];
2065
2049 #define FAST_L2_DIFF 2066 #define FAST_L2_DIFF
2050 //#define L1_DIFF //u should change the thresholds too if u try that one 2067 //#define L1_DIFF //u should change the thresholds too if u try that one
2051 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2068 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2052 asm volatile( 2069 asm volatile(
2053 "leal (%2, %2, 2), %%eax \n\t" // 3*stride 2070 "leal (%2, %2, 2), %%eax \n\t" // 3*stride
2054 "leal (%2, %2, 4), %%ebx \n\t" // 5*stride 2071 "leal (%2, %2, 4), %%edx \n\t" // 5*stride
2055 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride 2072 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2056 // 0 1 2 3 4 5 6 7 8 9 2073 // 0 1 2 3 4 5 6 7 8 9
2057 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+ebx %x+2eax %x+ecx %x+8%2 2074 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2058 //FIXME reorder? 2075 //FIXME reorder?
2059 #ifdef L1_DIFF //needs mmx2 2076 #ifdef L1_DIFF //needs mmx2
2060 "movq (%0), %%mm0 \n\t" // L0 2077 "movq (%0), %%mm0 \n\t" // L0
2061 "psadbw (%1), %%mm0 \n\t" // |L0-R0| 2078 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2062 "movq (%0, %2), %%mm1 \n\t" // L1 2079 "movq (%0, %2), %%mm1 \n\t" // L1
2067 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| 2084 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|
2068 2085
2069 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2086 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2070 "paddw %%mm1, %%mm0 \n\t" 2087 "paddw %%mm1, %%mm0 \n\t"
2071 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| 2088 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2072 "movq (%0, %%ebx), %%mm5 \n\t" // L5 2089 "movq (%0, %%edx), %%mm5 \n\t" // L5
2073 "paddw %%mm2, %%mm0 \n\t" 2090 "paddw %%mm2, %%mm0 \n\t"
2074 "psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5| 2091 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5|
2075 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 2092 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2076 "paddw %%mm3, %%mm0 \n\t" 2093 "paddw %%mm3, %%mm0 \n\t"
2077 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| 2094 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
2078 "movq (%0, %%ecx), %%mm7 \n\t" // L7 2095 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2079 "paddw %%mm4, %%mm0 \n\t" 2096 "paddw %%mm4, %%mm0 \n\t"
2102 L2_DIFF_CORE((%0), (%1)) 2119 L2_DIFF_CORE((%0), (%1))
2103 L2_DIFF_CORE((%0, %2), (%1, %2)) 2120 L2_DIFF_CORE((%0, %2), (%1, %2))
2104 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) 2121 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2105 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) 2122 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2106 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) 2123 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2107 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx)) 2124 L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2108 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) 2125 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2109 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) 2126 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2110 2127
2111 #else 2128 #else
2112 "pxor %%mm7, %%mm7 \n\t" 2129 "pxor %%mm7, %%mm7 \n\t"
2130 L2_DIFF_CORE((%0), (%1)) 2147 L2_DIFF_CORE((%0), (%1))
2131 L2_DIFF_CORE((%0, %2), (%1, %2)) 2148 L2_DIFF_CORE((%0, %2), (%1, %2))
2132 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) 2149 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2133 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) 2150 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2134 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) 2151 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2135 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx)) 2152 L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2136 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) 2153 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2137 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) 2154 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2138 2155
2139 #endif 2156 #endif
2140 2157
2141 "movq %%mm0, %%mm4 \n\t" 2158 "movq %%mm0, %%mm4 \n\t"
2142 "psrlq $32, %%mm0 \n\t" 2159 "psrlq $32, %%mm0 \n\t"
2143 "paddd %%mm0, %%mm4 \n\t" 2160 "paddd %%mm0, %%mm4 \n\t"
2144 "movd %%mm4, %%ecx \n\t" 2161 "movd %%mm4, %%ecx \n\t"
2145 "shll $2, %%ecx \n\t" 2162 "shll $2, %%ecx \n\t"
2146 "movl %3, %%ebx \n\t" 2163 "movl %3, %%edx \n\t"
2147 "addl -4(%%ebx), %%ecx \n\t" 2164 "addl -4(%%edx), %%ecx \n\t"
2148 "addl 4(%%ebx), %%ecx \n\t" 2165 "addl 4(%%edx), %%ecx \n\t"
2149 "addl -1024(%%ebx), %%ecx \n\t" 2166 "addl -1024(%%edx), %%ecx \n\t"
2150 "addl $4, %%ecx \n\t" 2167 "addl $4, %%ecx \n\t"
2151 "addl 1024(%%ebx), %%ecx \n\t" 2168 "addl 1024(%%edx), %%ecx \n\t"
2152 "shrl $3, %%ecx \n\t" 2169 "shrl $3, %%ecx \n\t"
2153 "movl %%ecx, (%%ebx) \n\t" 2170 "movl %%ecx, (%%edx) \n\t"
2154 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride
2155 2171
2156 // "movl %3, %%ecx \n\t" 2172 // "movl %3, %%ecx \n\t"
2157 // "movl %%ecx, test \n\t" 2173 // "movl %%ecx, test \n\t"
2158 // "jmp 4f \n\t" 2174 // "jmp 4f \n\t"
2159 "cmpl 4+"MANGLE(maxTmpNoise)", %%ecx \n\t" 2175 "cmpl 512(%%edx), %%ecx \n\t"
2160 " jb 2f \n\t" 2176 " jb 2f \n\t"
2161 "cmpl 8+"MANGLE(maxTmpNoise)", %%ecx \n\t" 2177 "cmpl 516(%%edx), %%ecx \n\t"
2162 " jb 1f \n\t" 2178 " jb 1f \n\t"
2163 2179
2164 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride 2180 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2181 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2165 "movq (%0), %%mm0 \n\t" // L0 2182 "movq (%0), %%mm0 \n\t" // L0
2166 "movq (%0, %2), %%mm1 \n\t" // L1 2183 "movq (%0, %2), %%mm1 \n\t" // L1
2167 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2184 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2168 "movq (%0, %%eax), %%mm3 \n\t" // L3 2185 "movq (%0, %%eax), %%mm3 \n\t" // L3
2169 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2186 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2170 "movq (%0, %%ebx), %%mm5 \n\t" // L5 2187 "movq (%0, %%edx), %%mm5 \n\t" // L5
2171 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 2188 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2172 "movq (%0, %%ecx), %%mm7 \n\t" // L7 2189 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2173 "movq %%mm0, (%1) \n\t" // L0 2190 "movq %%mm0, (%1) \n\t" // L0
2174 "movq %%mm1, (%1, %2) \n\t" // L1 2191 "movq %%mm1, (%1, %2) \n\t" // L1
2175 "movq %%mm2, (%1, %2, 2) \n\t" // L2 2192 "movq %%mm2, (%1, %2, 2) \n\t" // L2
2176 "movq %%mm3, (%1, %%eax) \n\t" // L3 2193 "movq %%mm3, (%1, %%eax) \n\t" // L3
2177 "movq %%mm4, (%1, %2, 4) \n\t" // L4 2194 "movq %%mm4, (%1, %2, 4) \n\t" // L4
2178 "movq %%mm5, (%1, %%ebx) \n\t" // L5 2195 "movq %%mm5, (%1, %%edx) \n\t" // L5
2179 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 2196 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6
2180 "movq %%mm7, (%1, %%ecx) \n\t" // L7 2197 "movq %%mm7, (%1, %%ecx) \n\t" // L7
2181 "jmp 4f \n\t" 2198 "jmp 4f \n\t"
2182 2199
2183 "1: \n\t" 2200 "1: \n\t"
2184 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride 2201 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2202 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2185 "movq (%0), %%mm0 \n\t" // L0 2203 "movq (%0), %%mm0 \n\t" // L0
2186 PAVGB((%1), %%mm0) // L0 2204 PAVGB((%1), %%mm0) // L0
2187 "movq (%0, %2), %%mm1 \n\t" // L1 2205 "movq (%0, %2), %%mm1 \n\t" // L1
2188 PAVGB((%1, %2), %%mm1) // L1 2206 PAVGB((%1, %2), %%mm1) // L1
2189 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2207 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2190 PAVGB((%1, %2, 2), %%mm2) // L2 2208 PAVGB((%1, %2, 2), %%mm2) // L2
2191 "movq (%0, %%eax), %%mm3 \n\t" // L3 2209 "movq (%0, %%eax), %%mm3 \n\t" // L3
2192 PAVGB((%1, %%eax), %%mm3) // L3 2210 PAVGB((%1, %%eax), %%mm3) // L3
2193 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2211 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2194 PAVGB((%1, %2, 4), %%mm4) // L4 2212 PAVGB((%1, %2, 4), %%mm4) // L4
2195 "movq (%0, %%ebx), %%mm5 \n\t" // L5 2213 "movq (%0, %%edx), %%mm5 \n\t" // L5
2196 PAVGB((%1, %%ebx), %%mm5) // L5 2214 PAVGB((%1, %%edx), %%mm5) // L5
2197 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 2215 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2198 PAVGB((%1, %%eax, 2), %%mm6) // L6 2216 PAVGB((%1, %%eax, 2), %%mm6) // L6
2199 "movq (%0, %%ecx), %%mm7 \n\t" // L7 2217 "movq (%0, %%ecx), %%mm7 \n\t" // L7
2200 PAVGB((%1, %%ecx), %%mm7) // L7 2218 PAVGB((%1, %%ecx), %%mm7) // L7
2201 "movq %%mm0, (%1) \n\t" // R0 2219 "movq %%mm0, (%1) \n\t" // R0
2202 "movq %%mm1, (%1, %2) \n\t" // R1 2220 "movq %%mm1, (%1, %2) \n\t" // R1
2203 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2221 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2204 "movq %%mm3, (%1, %%eax) \n\t" // R3 2222 "movq %%mm3, (%1, %%eax) \n\t" // R3
2205 "movq %%mm4, (%1, %2, 4) \n\t" // R4 2223 "movq %%mm4, (%1, %2, 4) \n\t" // R4
2206 "movq %%mm5, (%1, %%ebx) \n\t" // R5 2224 "movq %%mm5, (%1, %%edx) \n\t" // R5
2207 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 2225 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6
2208 "movq %%mm7, (%1, %%ecx) \n\t" // R7 2226 "movq %%mm7, (%1, %%ecx) \n\t" // R7
2209 "movq %%mm0, (%0) \n\t" // L0 2227 "movq %%mm0, (%0) \n\t" // L0
2210 "movq %%mm1, (%0, %2) \n\t" // L1 2228 "movq %%mm1, (%0, %2) \n\t" // L1
2211 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2229 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2212 "movq %%mm3, (%0, %%eax) \n\t" // L3 2230 "movq %%mm3, (%0, %%eax) \n\t" // L3
2213 "movq %%mm4, (%0, %2, 4) \n\t" // L4 2231 "movq %%mm4, (%0, %2, 4) \n\t" // L4
2214 "movq %%mm5, (%0, %%ebx) \n\t" // L5 2232 "movq %%mm5, (%0, %%edx) \n\t" // L5
2215 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 2233 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6
2216 "movq %%mm7, (%0, %%ecx) \n\t" // L7 2234 "movq %%mm7, (%0, %%ecx) \n\t" // L7
2217 "jmp 4f \n\t" 2235 "jmp 4f \n\t"
2218 2236
2219 "2: \n\t" 2237 "2: \n\t"
2220 "cmpl "MANGLE(maxTmpNoise)", %%ecx \n\t" 2238 "cmpl 508(%%edx), %%ecx \n\t"
2221 " jb 3f \n\t" 2239 " jb 3f \n\t"
2222 2240
2223 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride 2241 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2242 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2224 "movq (%0), %%mm0 \n\t" // L0 2243 "movq (%0), %%mm0 \n\t" // L0
2225 "movq (%0, %2), %%mm1 \n\t" // L1 2244 "movq (%0, %2), %%mm1 \n\t" // L1
2226 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2245 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2227 "movq (%0, %%eax), %%mm3 \n\t" // L3 2246 "movq (%0, %%eax), %%mm3 \n\t" // L3
2228 "movq (%1), %%mm4 \n\t" // R0 2247 "movq (%1), %%mm4 \n\t" // R0
2245 "movq %%mm1, (%0, %2) \n\t" // L1 2264 "movq %%mm1, (%0, %2) \n\t" // L1
2246 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2265 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2247 "movq %%mm3, (%0, %%eax) \n\t" // L3 2266 "movq %%mm3, (%0, %%eax) \n\t" // L3
2248 2267
2249 "movq (%0, %2, 4), %%mm0 \n\t" // L4 2268 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2250 "movq (%0, %%ebx), %%mm1 \n\t" // L5 2269 "movq (%0, %%edx), %%mm1 \n\t" // L5
2251 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 2270 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2252 "movq (%0, %%ecx), %%mm3 \n\t" // L7 2271 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2253 "movq (%1, %2, 4), %%mm4 \n\t" // R4 2272 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2254 "movq (%1, %%ebx), %%mm5 \n\t" // R5 2273 "movq (%1, %%edx), %%mm5 \n\t" // R5
2255 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 2274 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2256 "movq (%1, %%ecx), %%mm7 \n\t" // R7 2275 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2257 PAVGB(%%mm4, %%mm0) 2276 PAVGB(%%mm4, %%mm0)
2258 PAVGB(%%mm5, %%mm1) 2277 PAVGB(%%mm5, %%mm1)
2259 PAVGB(%%mm6, %%mm2) 2278 PAVGB(%%mm6, %%mm2)
2261 PAVGB(%%mm4, %%mm0) 2280 PAVGB(%%mm4, %%mm0)
2262 PAVGB(%%mm5, %%mm1) 2281 PAVGB(%%mm5, %%mm1)
2263 PAVGB(%%mm6, %%mm2) 2282 PAVGB(%%mm6, %%mm2)
2264 PAVGB(%%mm7, %%mm3) 2283 PAVGB(%%mm7, %%mm3)
2265 "movq %%mm0, (%1, %2, 4) \n\t" // R4 2284 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2266 "movq %%mm1, (%1, %%ebx) \n\t" // R5 2285 "movq %%mm1, (%1, %%edx) \n\t" // R5
2267 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 2286 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2268 "movq %%mm3, (%1, %%ecx) \n\t" // R7 2287 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2269 "movq %%mm0, (%0, %2, 4) \n\t" // L4 2288 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2270 "movq %%mm1, (%0, %%ebx) \n\t" // L5 2289 "movq %%mm1, (%0, %%edx) \n\t" // L5
2271 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 2290 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2272 "movq %%mm3, (%0, %%ecx) \n\t" // L7 2291 "movq %%mm3, (%0, %%ecx) \n\t" // L7
2273 "jmp 4f \n\t" 2292 "jmp 4f \n\t"
2274 2293
2275 "3: \n\t" 2294 "3: \n\t"
2276 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride 2295 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2296 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2277 "movq (%0), %%mm0 \n\t" // L0 2297 "movq (%0), %%mm0 \n\t" // L0
2278 "movq (%0, %2), %%mm1 \n\t" // L1 2298 "movq (%0, %2), %%mm1 \n\t" // L1
2279 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2299 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2280 "movq (%0, %%eax), %%mm3 \n\t" // L3 2300 "movq (%0, %%eax), %%mm3 \n\t" // L3
2281 "movq (%1), %%mm4 \n\t" // R0 2301 "movq (%1), %%mm4 \n\t" // R0
2302 "movq %%mm1, (%0, %2) \n\t" // L1 2322 "movq %%mm1, (%0, %2) \n\t" // L1
2303 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2323 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2304 "movq %%mm3, (%0, %%eax) \n\t" // L3 2324 "movq %%mm3, (%0, %%eax) \n\t" // L3
2305 2325
2306 "movq (%0, %2, 4), %%mm0 \n\t" // L4 2326 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2307 "movq (%0, %%ebx), %%mm1 \n\t" // L5 2327 "movq (%0, %%edx), %%mm1 \n\t" // L5
2308 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 2328 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
2309 "movq (%0, %%ecx), %%mm3 \n\t" // L7 2329 "movq (%0, %%ecx), %%mm3 \n\t" // L7
2310 "movq (%1, %2, 4), %%mm4 \n\t" // R4 2330 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2311 "movq (%1, %%ebx), %%mm5 \n\t" // R5 2331 "movq (%1, %%edx), %%mm5 \n\t" // R5
2312 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 2332 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
2313 "movq (%1, %%ecx), %%mm7 \n\t" // R7 2333 "movq (%1, %%ecx), %%mm7 \n\t" // R7
2314 PAVGB(%%mm4, %%mm0) 2334 PAVGB(%%mm4, %%mm0)
2315 PAVGB(%%mm5, %%mm1) 2335 PAVGB(%%mm5, %%mm1)
2316 PAVGB(%%mm6, %%mm2) 2336 PAVGB(%%mm6, %%mm2)
2322 PAVGB(%%mm4, %%mm0) 2342 PAVGB(%%mm4, %%mm0)
2323 PAVGB(%%mm5, %%mm1) 2343 PAVGB(%%mm5, %%mm1)
2324 PAVGB(%%mm6, %%mm2) 2344 PAVGB(%%mm6, %%mm2)
2325 PAVGB(%%mm7, %%mm3) 2345 PAVGB(%%mm7, %%mm3)
2326 "movq %%mm0, (%1, %2, 4) \n\t" // R4 2346 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2327 "movq %%mm1, (%1, %%ebx) \n\t" // R5 2347 "movq %%mm1, (%1, %%edx) \n\t" // R5
2328 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 2348 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
2329 "movq %%mm3, (%1, %%ecx) \n\t" // R7 2349 "movq %%mm3, (%1, %%ecx) \n\t" // R7
2330 "movq %%mm0, (%0, %2, 4) \n\t" // L4 2350 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2331 "movq %%mm1, (%0, %%ebx) \n\t" // L5 2351 "movq %%mm1, (%0, %%edx) \n\t" // L5
2332 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 2352 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
2333 "movq %%mm3, (%0, %%ecx) \n\t" // L7 2353 "movq %%mm3, (%0, %%ecx) \n\t" // L7
2334 2354
2335 "4: \n\t" 2355 "4: \n\t"
2336 2356
2337 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) 2357 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2338 : "%eax", "%ebx", "%ecx", "memory" 2358 : "%eax", "%edx", "%ecx", "memory"
2339 ); 2359 );
2340 //printf("%d\n", test); 2360 //printf("%d\n", test);
2341 #else 2361 #else
2342 int y; 2362 int y;
2343 int d=0; 2363 int d=0;
2441 } 2461 }
2442 #endif 2462 #endif
2443 } 2463 }
2444 2464
2445 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 2465 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2446 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); 2466 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
2447 2467
2448 /** 2468 /**
2449 * Copies a block from src to dst and fixes the blacklevel 2469 * Copies a block from src to dst and fixes the blacklevel
2450 * levelFix == 0 -> don't touch the brightness & contrast 2470 * levelFix == 0 -> don't touch the brightness & contrast
2451 */ 2471 */
2452 #undef SCALED_CPY 2472 #undef SCALED_CPY
2453 2473
2454 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, 2474 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2455 int levelFix) 2475 int levelFix, int64_t *packedOffsetAndScale)
2456 { 2476 {
2457 #ifndef HAVE_MMX 2477 #ifndef HAVE_MMX
2458 int i; 2478 int i;
2459 #endif 2479 #endif
2460 if(levelFix) 2480 if(levelFix)
2461 { 2481 {
2462 #ifdef HAVE_MMX 2482 #ifdef HAVE_MMX
2463 asm volatile( 2483 asm volatile(
2464 "leal (%0,%2), %%eax \n\t" 2484 "movq (%%eax), %%mm2 \n\t" // packedYOffset
2465 "leal (%1,%3), %%ebx \n\t" 2485 "movq 8(%%eax), %%mm3 \n\t" // packedYScale
2466 "movq "MANGLE(packedYOffset)", %%mm2\n\t" 2486 "leal (%2,%4), %%eax \n\t"
2467 "movq "MANGLE(packedYScale)", %%mm3\n\t" 2487 "leal (%3,%5), %%edx \n\t"
2468 "pxor %%mm4, %%mm4 \n\t" 2488 "pxor %%mm4, %%mm4 \n\t"
2469 #ifdef HAVE_MMX2 2489 #ifdef HAVE_MMX2
2470 #define SCALED_CPY(src1, src2, dst1, dst2) \ 2490 #define SCALED_CPY(src1, src2, dst1, dst2) \
2471 "movq " #src1 ", %%mm0 \n\t"\ 2491 "movq " #src1 ", %%mm0 \n\t"\
2472 "movq " #src1 ", %%mm5 \n\t"\ 2492 "movq " #src1 ", %%mm5 \n\t"\
2516 "movq %%mm0, " #dst1 " \n\t"\ 2536 "movq %%mm0, " #dst1 " \n\t"\
2517 "movq %%mm1, " #dst2 " \n\t"\ 2537 "movq %%mm1, " #dst2 " \n\t"\
2518 2538
2519 #endif //!HAVE_MMX2 2539 #endif //!HAVE_MMX2
2520 2540
2521 SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) 2541 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
2522 SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2)) 2542 SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
2523 SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4)) 2543 SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
2524 "leal (%%eax,%2,4), %%eax \n\t" 2544 "leal (%%eax,%4,4), %%eax \n\t"
2525 "leal (%%ebx,%3,4), %%ebx \n\t" 2545 "leal (%%edx,%5,4), %%edx \n\t"
2526 SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2)) 2546 SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
2527 2547
2528 2548
2529 : : "r"(src), 2549 : "=&a" (packedOffsetAndScale)
2550 : "0" (packedOffsetAndScale),
2551 "r"(src),
2530 "r"(dst), 2552 "r"(dst),
2531 "r" (srcStride), 2553 "r" (srcStride),
2532 "r" (dstStride) 2554 "r" (dstStride)
2533 : "%eax", "%ebx" 2555 : "%edx"
2534 ); 2556 );
2535 #else 2557 #else
2536 for(i=0; i<8; i++) 2558 for(i=0; i<8; i++)
2537 memcpy( &(dst[dstStride*i]), 2559 memcpy( &(dst[dstStride*i]),
2538 &(src[srcStride*i]), BLOCK_SIZE); 2560 &(src[srcStride*i]), BLOCK_SIZE);
2541 else 2563 else
2542 { 2564 {
2543 #ifdef HAVE_MMX 2565 #ifdef HAVE_MMX
2544 asm volatile( 2566 asm volatile(
2545 "leal (%0,%2), %%eax \n\t" 2567 "leal (%0,%2), %%eax \n\t"
2546 "leal (%1,%3), %%ebx \n\t" 2568 "leal (%1,%3), %%edx \n\t"
2547 2569
2548 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ 2570 #define SIMPLE_CPY(src1, src2, dst1, dst2) \
2549 "movq " #src1 ", %%mm0 \n\t"\ 2571 "movq " #src1 ", %%mm0 \n\t"\
2550 "movq " #src2 ", %%mm1 \n\t"\ 2572 "movq " #src2 ", %%mm1 \n\t"\
2551 "movq %%mm0, " #dst1 " \n\t"\ 2573 "movq %%mm0, " #dst1 " \n\t"\
2552 "movq %%mm1, " #dst2 " \n\t"\ 2574 "movq %%mm1, " #dst2 " \n\t"\
2553 2575
2554 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) 2576 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
2555 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2)) 2577 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
2556 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4)) 2578 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
2557 "leal (%%eax,%2,4), %%eax \n\t" 2579 "leal (%%eax,%2,4), %%eax \n\t"
2558 "leal (%%ebx,%3,4), %%ebx \n\t" 2580 "leal (%%edx,%3,4), %%edx \n\t"
2559 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2)) 2581 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
2560 2582
2561 : : "r" (src), 2583 : : "r" (src),
2562 "r" (dst), 2584 "r" (dst),
2563 "r" (srcStride), 2585 "r" (srcStride),
2564 "r" (dstStride) 2586 "r" (dstStride)
2565 : "%eax", "%ebx" 2587 : "%eax", "%edx"
2566 ); 2588 );
2567 #else 2589 #else
2568 for(i=0; i<8; i++) 2590 for(i=0; i<8; i++)
2569 memcpy( &(dst[dstStride*i]), 2591 memcpy( &(dst[dstStride*i]),
2570 &(src[srcStride*i]), BLOCK_SIZE); 2592 &(src[srcStride*i]), BLOCK_SIZE);
2600 2622
2601 /** 2623 /**
2602 * Filters array of bytes (Y or U or V values) 2624 * Filters array of bytes (Y or U or V values)
2603 */ 2625 */
2604 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 2626 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2605 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode) 2627 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
2606 { 2628 {
2629 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
2607 int x,y; 2630 int x,y;
2608 #ifdef COMPILE_TIME_MODE 2631 #ifdef COMPILE_TIME_MODE
2609 const int mode= COMPILE_TIME_MODE; 2632 const int mode= COMPILE_TIME_MODE;
2610 #else 2633 #else
2611 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode; 2634 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
2612 #endif 2635 #endif
2613 /* we need 64bit here otherwise we're going to have a problem
2614 after watching a black picture for 5 hours*/
2615 static uint64_t *yHistogram= NULL;
2616 int black=0, white=255; // blackest black and whitest white in the picture 2636 int black=0, white=255; // blackest black and whitest white in the picture
2617 int QPCorrecture= 256*256; 2637 int QPCorrecture= 256*256;
2618 2638
2619 /* Temporary buffers for handling the last row(s) */
2620 static uint8_t *tempDst= NULL;
2621 static uint8_t *tempSrc= NULL;
2622
2623 /* Temporary buffers for handling the last block */
2624 static uint8_t *tempDstBlock= NULL;
2625 static uint8_t *tempSrcBlock= NULL;
2626
2627 /* Temporal noise reducing buffers */
2628 static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
2629 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
2630
2631 int copyAhead; 2639 int copyAhead;
2632 2640
2633 #ifdef PP_FUNNY_STRIDE 2641 //FIXME remove
2634 uint8_t *dstBlockPtrBackup; 2642 uint64_t * const yHistogram= c.yHistogram;
2635 uint8_t *srcBlockPtrBackup; 2643 uint8_t * const tempSrc= c.tempSrc;
2636 #endif 2644 uint8_t * const tempDst= c.tempDst;
2637 2645
2638 #ifdef MORE_TIMING 2646 c.dcOffset= c.ppMode.maxDcDiff;
2639 long long T0, T1, diffTime=0; 2647 c.dcThreshold= c.ppMode.maxDcDiff*2 + 1;
2640 #endif
2641 #ifdef TIMING
2642 long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
2643 sumTime= rdtsc();
2644 #endif
2645 dcOffset= ppMode->maxDcDiff;
2646 dcThreshold= ppMode->maxDcDiff*2 + 1;
2647 2648
2648 #ifdef HAVE_MMX 2649 #ifdef HAVE_MMX
2649 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; 2650 c.mmxDcOffset= 0x7F - c.dcOffset;
2650 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; 2651 c.mmxDcThreshold= 0x7F - c.dcThreshold;
2651 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; 2652
2652 2653 c.mmxDcOffset*= 0x0101010101010101LL;
2653 mmxDCOffset= 0x7F - dcOffset; 2654 c.mmxDcThreshold*= 0x0101010101010101LL;
2654 mmxDCThreshold= 0x7F - dcThreshold;
2655
2656 mmxDCOffset*= 0x0101010101010101LL;
2657 mmxDCThreshold*= 0x0101010101010101LL;
2658 #endif 2655 #endif
2659 2656
2660 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; 2657 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2661 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14; 2658 else if( (mode & LINEAR_BLEND_DEINT_FILTER)
2659 || (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
2662 else if( (mode & V_DEBLOCK) 2660 else if( (mode & V_DEBLOCK)
2663 || (mode & LINEAR_IPOL_DEINT_FILTER) 2661 || (mode & LINEAR_IPOL_DEINT_FILTER)
2664 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; 2662 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
2665 else if(mode & V_X1_FILTER) copyAhead=11; 2663 else if(mode & V_X1_FILTER) copyAhead=11;
2666 else if(mode & V_RK1_FILTER) copyAhead=10; 2664 // else if(mode & V_RK1_FILTER) copyAhead=10;
2667 else if(mode & DERING) copyAhead=9; 2665 else if(mode & DERING) copyAhead=9;
2668 else copyAhead=8; 2666 else copyAhead=8;
2669 2667
2670 copyAhead-= 8; 2668 copyAhead-= 8;
2671
2672 if(tempDst==NULL)
2673 {
2674 tempDst= (uint8_t*)memalign(8, 1024*24);
2675 tempSrc= (uint8_t*)memalign(8, 1024*24);
2676 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
2677 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
2678 }
2679
2680 if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
2681 {
2682 // printf("%d %d %d\n", isColor, dstStride, height);
2683 //FIXME works only as long as the size doesn't increase
2684 //Note: the +17*1024 is just there so I don't have to worry about r/w over the end
2685 tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
2686 tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
2687
2688 memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
2689 memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
2690 }
2691
2692 if(!yHistogram)
2693 {
2694 int i;
2695 yHistogram= (uint64_t*)malloc(8*256);
2696 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2697
2698 if(mode & FULL_Y_RANGE)
2699 {
2700 ppMode->maxAllowedY=255;
2701 ppMode->minAllowedY=0;
2702 }
2703 }
2704 2669
2705 if(!isColor) 2670 if(!isColor)
2706 { 2671 {
2707 uint64_t sum= 0; 2672 uint64_t sum= 0;
2708 int i; 2673 int i;
2709 static int framenum= -1;
2710 uint64_t maxClipped; 2674 uint64_t maxClipped;
2711 uint64_t clipped; 2675 uint64_t clipped;
2712 double scale; 2676 double scale;
2713 2677
2714 framenum++; 2678 c.frameNum++;
2715 if(framenum == 1) yHistogram[0]= width*height/64*15/256; 2679 // first frame is fscked so we ignore it
2680 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
2716 2681
2717 for(i=0; i<256; i++) 2682 for(i=0; i<256; i++)
2718 { 2683 {
2719 sum+= yHistogram[i]; 2684 sum+= yHistogram[i];
2720 // printf("%d ", yHistogram[i]); 2685 // printf("%d ", yHistogram[i]);
2736 { 2701 {
2737 if(clipped < maxClipped) break; 2702 if(clipped < maxClipped) break;
2738 clipped-= yHistogram[white]; 2703 clipped-= yHistogram[white];
2739 } 2704 }
2740 2705
2741 scale= (double)(ppMode->maxAllowedY - ppMode->minAllowedY) / (double)(white-black); 2706 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
2742 2707
2743 #ifdef HAVE_MMX2 2708 #ifdef HAVE_MMX2
2744 packedYScale= (uint16_t)(scale*256.0 + 0.5); 2709 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
2745 packedYOffset= (((black*packedYScale)>>8) - ppMode->minAllowedY) & 0xFFFF; 2710 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
2746 #else 2711 #else
2747 packedYScale= (uint16_t)(scale*1024.0 + 0.5); 2712 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
2748 packedYOffset= (black - ppMode->minAllowedY) & 0xFFFF; 2713 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
2749 #endif 2714 #endif
2750 2715
2751 packedYOffset|= packedYOffset<<32; 2716 c.packedYOffset|= c.packedYOffset<<32;
2752 packedYOffset|= packedYOffset<<16; 2717 c.packedYOffset|= c.packedYOffset<<16;
2753 2718
2754 packedYScale|= packedYScale<<32; 2719 c.packedYScale|= c.packedYScale<<32;
2755 packedYScale|= packedYScale<<16; 2720 c.packedYScale|= c.packedYScale<<16;
2756 2721
2757 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); 2722 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
2758 else QPCorrecture= 256*256; 2723 else QPCorrecture= 256*256;
2759 } 2724 }
2760 else 2725 else
2761 { 2726 {
2762 packedYScale= 0x0100010001000100LL; 2727 c.packedYScale= 0x0100010001000100LL;
2763 packedYOffset= 0; 2728 c.packedYOffset= 0;
2764 QPCorrecture= 256*256; 2729 QPCorrecture= 256*256;
2765 } 2730 }
2766 2731
2767 /* copy & deinterlace first row of blocks */ 2732 /* copy & deinterlace first row of blocks */
2768 y=-BLOCK_SIZE; 2733 y=-BLOCK_SIZE;
2787 asm( 2752 asm(
2788 "movl %4, %%eax \n\t" 2753 "movl %4, %%eax \n\t"
2789 "shrl $2, %%eax \n\t" 2754 "shrl $2, %%eax \n\t"
2790 "andl $6, %%eax \n\t" 2755 "andl $6, %%eax \n\t"
2791 "addl %5, %%eax \n\t" 2756 "addl %5, %%eax \n\t"
2792 "movl %%eax, %%ebx \n\t" 2757 "movl %%eax, %%edx \n\t"
2793 "imul %1, %%eax \n\t" 2758 "imul %1, %%eax \n\t"
2794 "imul %3, %%ebx \n\t" 2759 "imul %3, %%edx \n\t"
2795 "prefetchnta 32(%%eax, %0) \n\t" 2760 "prefetchnta 32(%%eax, %0) \n\t"
2796 "prefetcht0 32(%%ebx, %2) \n\t" 2761 "prefetcht0 32(%%edx, %2) \n\t"
2797 "addl %1, %%eax \n\t" 2762 "addl %1, %%eax \n\t"
2798 "addl %3, %%ebx \n\t" 2763 "addl %3, %%edx \n\t"
2799 "prefetchnta 32(%%eax, %0) \n\t" 2764 "prefetchnta 32(%%eax, %0) \n\t"
2800 "prefetcht0 32(%%ebx, %2) \n\t" 2765 "prefetcht0 32(%%edx, %2) \n\t"
2801 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), 2766 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2802 "m" (x), "m" (copyAhead) 2767 "m" (x), "m" (copyAhead)
2803 : "%eax", "%ebx" 2768 : "%eax", "%edx"
2804 ); 2769 );
2805 2770
2806 #elif defined(HAVE_3DNOW) 2771 #elif defined(HAVE_3DNOW)
2807 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... 2772 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2808 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 2773 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2811 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 2776 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2812 */ 2777 */
2813 #endif 2778 #endif
2814 2779
2815 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, 2780 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2816 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX); 2781 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2817 2782
2818 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); 2783 RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2819 2784
2820 if(mode & LINEAR_IPOL_DEINT_FILTER) 2785 if(mode & LINEAR_IPOL_DEINT_FILTER)
2821 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 2786 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2823 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); 2788 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2824 else if(mode & MEDIAN_DEINT_FILTER) 2789 else if(mode & MEDIAN_DEINT_FILTER)
2825 RENAME(deInterlaceMedian)(dstBlock, dstStride); 2790 RENAME(deInterlaceMedian)(dstBlock, dstStride);
2826 else if(mode & CUBIC_IPOL_DEINT_FILTER) 2791 else if(mode & CUBIC_IPOL_DEINT_FILTER)
2827 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 2792 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2793 else if(mode & FFMPEG_DEINT_FILTER)
2794 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2828 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 2795 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
2829 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 2796 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2830 */ 2797 */
2831 dstBlock+=8; 2798 dstBlock+=8;
2832 srcBlock+=8; 2799 srcBlock+=8;
2833 } 2800 }
2834 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride ); 2801 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride );
2835 } 2802 }
2836 2803
2804 //printf("\n");
2837 for(y=0; y<height; y+=BLOCK_SIZE) 2805 for(y=0; y<height; y+=BLOCK_SIZE)
2838 { 2806 {
2839 //1% speedup if these are here instead of the inner loop 2807 //1% speedup if these are here instead of the inner loop
2840 uint8_t *srcBlock= &(src[y*srcStride]); 2808 uint8_t *srcBlock= &(src[y*srcStride]);
2841 uint8_t *dstBlock= &(dst[y*dstStride]); 2809 uint8_t *dstBlock= &(dst[y*dstStride]);
2842 #ifdef HAVE_MMX 2810 #ifdef HAVE_MMX
2843 uint8_t *tempBlock1= tempBlocks; 2811 uint8_t *tempBlock1= c.tempBlocks;
2844 uint8_t *tempBlock2= tempBlocks + 8; 2812 uint8_t *tempBlock2= c.tempBlocks + 8;
2845 #endif 2813 #endif
2846 #ifdef ARCH_X86 2814 #ifdef ARCH_X86
2847 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; 2815 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
2848 int QPDelta= isColor ? (-1) : 1<<31; 2816 int QPDelta= isColor ? (-1) : 1<<31;
2849 int QPFrac= 1<<30; 2817 int QPFrac= 1<<30;
2871 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); 2839 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
2872 2840
2873 dstBlock= tempDst + dstStride; 2841 dstBlock= tempDst + dstStride;
2874 srcBlock= tempSrc; 2842 srcBlock= tempSrc;
2875 } 2843 }
2844 //printf("\n");
2876 2845
2877 // From this point on it is guaranteed that we can read and write 16 lines downward 2846 // From this point on it is guaranteed that we can read and write 16 lines downward
2878 // finish 1 block before the next otherwise we might have a problem 2847 // finish 1 block before the next otherwise we might have a problem
2879 // with the L1 Cache of the P4 ... or only a few blocks at a time or something 2848 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2880 for(x=0; x<width; x+=BLOCK_SIZE) 2849 for(x=0; x<width; x+=BLOCK_SIZE)
2902 if(!isColor) 2871 if(!isColor)
2903 { 2872 {
2904 QP= (QP* QPCorrecture + 256*128)>>16; 2873 QP= (QP* QPCorrecture + 256*128)>>16;
2905 yHistogram[ srcBlock[srcStride*12 + 4] ]++; 2874 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
2906 } 2875 }
2876 //printf("%d ", QP);
2877 c.QP= QP;
2907 #ifdef HAVE_MMX 2878 #ifdef HAVE_MMX
2908 asm volatile( 2879 asm volatile(
2909 "movd %0, %%mm7 \n\t" 2880 "movd %1, %%mm7 \n\t"
2910 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 2881 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2911 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP 2882 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2912 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP 2883 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
2913 "movq %%mm7, "MANGLE(pQPb)" \n\t" 2884 "movq %%mm7, %0 \n\t"
2914 : : "r" (QP) 2885 : "=m" (c.pQPb)
2886 : "r" (QP)
2915 ); 2887 );
2916 #endif 2888 #endif
2917 2889
2918 #ifdef MORE_TIMING
2919 T0= rdtsc();
2920 #endif
2921 2890
2922 #ifdef HAVE_MMX2 2891 #ifdef HAVE_MMX2
2923 /* 2892 /*
2924 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 2893 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2925 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 2894 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2930 asm( 2899 asm(
2931 "movl %4, %%eax \n\t" 2900 "movl %4, %%eax \n\t"
2932 "shrl $2, %%eax \n\t" 2901 "shrl $2, %%eax \n\t"
2933 "andl $6, %%eax \n\t" 2902 "andl $6, %%eax \n\t"
2934 "addl %5, %%eax \n\t" 2903 "addl %5, %%eax \n\t"
2935 "movl %%eax, %%ebx \n\t" 2904 "movl %%eax, %%edx \n\t"
2936 "imul %1, %%eax \n\t" 2905 "imul %1, %%eax \n\t"
2937 "imul %3, %%ebx \n\t" 2906 "imul %3, %%edx \n\t"
2938 "prefetchnta 32(%%eax, %0) \n\t" 2907 "prefetchnta 32(%%eax, %0) \n\t"
2939 "prefetcht0 32(%%ebx, %2) \n\t" 2908 "prefetcht0 32(%%edx, %2) \n\t"
2940 "addl %1, %%eax \n\t" 2909 "addl %1, %%eax \n\t"
2941 "addl %3, %%ebx \n\t" 2910 "addl %3, %%edx \n\t"
2942 "prefetchnta 32(%%eax, %0) \n\t" 2911 "prefetchnta 32(%%eax, %0) \n\t"
2943 "prefetcht0 32(%%ebx, %2) \n\t" 2912 "prefetcht0 32(%%edx, %2) \n\t"
2944 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), 2913 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2945 "m" (x), "m" (copyAhead) 2914 "m" (x), "m" (copyAhead)
2946 : "%eax", "%ebx" 2915 : "%eax", "%edx"
2947 ); 2916 );
2948 2917
2949 #elif defined(HAVE_3DNOW) 2918 #elif defined(HAVE_3DNOW)
2950 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... 2919 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2951 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 2920 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2953 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 2922 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2954 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 2923 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2955 */ 2924 */
2956 #endif 2925 #endif
2957 2926
2958 #ifdef PP_FUNNY_STRIDE
2959 //can we mess with a 8x16 block, if not use a temp buffer, yes again
2960 if(x+7 >= width)
2961 {
2962 int i;
2963 dstBlockPtrBackup= dstBlock;
2964 srcBlockPtrBackup= srcBlock;
2965
2966 for(i=0;i<BLOCK_SIZE*2; i++)
2967 {
2968 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
2969 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
2970 }
2971
2972 dstBlock= tempDstBlock;
2973 srcBlock= tempSrcBlock;
2974 }
2975 #endif
2976
2977 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, 2927 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
2978 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); 2928 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2979 2929
2980 if(mode & LINEAR_IPOL_DEINT_FILTER) 2930 if(mode & LINEAR_IPOL_DEINT_FILTER)
2981 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 2931 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2982 else if(mode & LINEAR_BLEND_DEINT_FILTER) 2932 else if(mode & LINEAR_BLEND_DEINT_FILTER)
2983 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); 2933 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2984 else if(mode & MEDIAN_DEINT_FILTER) 2934 else if(mode & MEDIAN_DEINT_FILTER)
2985 RENAME(deInterlaceMedian)(dstBlock, dstStride); 2935 RENAME(deInterlaceMedian)(dstBlock, dstStride);
2986 else if(mode & CUBIC_IPOL_DEINT_FILTER) 2936 else if(mode & CUBIC_IPOL_DEINT_FILTER)
2987 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 2937 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2938 else if(mode & FFMPEG_DEINT_FILTER)
2939 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2988 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 2940 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
2989 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 2941 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2990 */ 2942 */
2991 2943
2992 /* only deblock if we have 2 blocks */ 2944 /* only deblock if we have 2 blocks */
2993 if(y + 8 < height) 2945 if(y + 8 < height)
2994 { 2946 {
2995 #ifdef MORE_TIMING 2947 if(mode & V_X1_FILTER)
2996 T1= rdtsc(); 2948 RENAME(vertX1Filter)(dstBlock, stride, &c);
2997 memcpyTime+= T1-T0;
2998 T0=T1;
2999 #endif
3000 if(mode & V_RK1_FILTER)
3001 RENAME(vertRK1Filter)(dstBlock, stride, QP);
3002 else if(mode & V_X1_FILTER)
3003 RENAME(vertX1Filter)(dstBlock, stride, QP);
3004 else if(mode & V_DEBLOCK) 2949 else if(mode & V_DEBLOCK)
3005 { 2950 {
3006 if( RENAME(isVertDC)(dstBlock, stride)) 2951 if( RENAME(isVertDC)(dstBlock, stride, &c))
3007 { 2952 {
3008 if(RENAME(isVertMinMaxOk)(dstBlock, stride, QP)) 2953 if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
3009 RENAME(doVertLowPass)(dstBlock, stride, QP); 2954 RENAME(doVertLowPass)(dstBlock, stride, &c);
3010 } 2955 }
3011 else 2956 else
3012 RENAME(doVertDefFilter)(dstBlock, stride, QP); 2957 RENAME(doVertDefFilter)(dstBlock, stride, &c);
3013 } 2958 }
3014 #ifdef MORE_TIMING
3015 T1= rdtsc();
3016 vertTime+= T1-T0;
3017 T0=T1;
3018 #endif
3019 } 2959 }
3020 2960
3021 #ifdef HAVE_MMX 2961 #ifdef HAVE_MMX
3022 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); 2962 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3023 #endif 2963 #endif
3024 /* check if we have a previous block to deblock it with dstBlock */ 2964 /* check if we have a previous block to deblock it with dstBlock */
3025 if(x - 8 >= 0) 2965 if(x - 8 >= 0)
3026 { 2966 {
3027 #ifdef MORE_TIMING
3028 T0= rdtsc();
3029 #endif
3030 #ifdef HAVE_MMX 2967 #ifdef HAVE_MMX
3031 if(mode & H_RK1_FILTER) 2968 if(mode & H_X1_FILTER)
3032 RENAME(vertRK1Filter)(tempBlock1, 16, QP); 2969 RENAME(vertX1Filter)(tempBlock1, 16, &c);
3033 else if(mode & H_X1_FILTER)
3034 RENAME(vertX1Filter)(tempBlock1, 16, QP);
3035 else if(mode & H_DEBLOCK) 2970 else if(mode & H_DEBLOCK)
3036 { 2971 {
3037 if( RENAME(isVertDC)(tempBlock1, 16) ) 2972 if( RENAME(isVertDC)(tempBlock1, 16, &c))
3038 { 2973 {
3039 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, QP)) 2974 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
3040 RENAME(doVertLowPass)(tempBlock1, 16, QP); 2975 RENAME(doVertLowPass)(tempBlock1, 16, &c);
3041 } 2976 }
3042 else 2977 else
3043 RENAME(doVertDefFilter)(tempBlock1, 16, QP); 2978 RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3044 } 2979 }
3045 2980
3046 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); 2981 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3047 2982
3048 #else 2983 #else
3049 if(mode & H_X1_FILTER) 2984 if(mode & H_X1_FILTER)
3050 horizX1Filter(dstBlock-4, stride, QP); 2985 horizX1Filter(dstBlock-4, stride, QP);
3051 else if(mode & H_DEBLOCK) 2986 else if(mode & H_DEBLOCK)
3052 { 2987 {
3053 if( isHorizDC(dstBlock-4, stride)) 2988 if( isHorizDC(dstBlock-4, stride, &c))
3054 { 2989 {
3055 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) 2990 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3056 doHorizLowPass(dstBlock-4, stride, QP); 2991 doHorizLowPass(dstBlock-4, stride, QP);
3057 } 2992 }
3058 else 2993 else
3059 doHorizDefFilter(dstBlock-4, stride, QP); 2994 doHorizDefFilter(dstBlock-4, stride, QP);
3060 } 2995 }
3061 #endif 2996 #endif
3062 #ifdef MORE_TIMING
3063 T1= rdtsc();
3064 horizTime+= T1-T0;
3065 T0=T1;
3066 #endif
3067 if(mode & DERING) 2997 if(mode & DERING)
3068 { 2998 {
3069 //FIXME filter first line 2999 //FIXME filter first line
3070 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, QP); 3000 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3071 } 3001 }
3072 3002
3073 if(mode & TEMP_NOISE_FILTER) 3003 if(mode & TEMP_NOISE_FILTER)
3074 { 3004 {
3075 RENAME(tempNoiseReducer)(dstBlock-8, stride, 3005 RENAME(tempNoiseReducer)(dstBlock-8, stride,
3076 tempBlured[isColor] + y*dstStride + x, 3006 c.tempBlured[isColor] + y*dstStride + x,
3077 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), 3007 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3078 ppMode->maxTmpNoise); 3008 c.ppMode.maxTmpNoise);
3079 } 3009 }
3080 } 3010 }
3081
3082 #ifdef PP_FUNNY_STRIDE
3083 /* did we use a tmp-block buffer */
3084 if(x+7 >= width)
3085 {
3086 int i;
3087 dstBlock= dstBlockPtrBackup;
3088 srcBlock= srcBlockPtrBackup;
3089
3090 for(i=0;i<BLOCK_SIZE*2; i++)
3091 {
3092 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
3093 }
3094 }
3095 #endif
3096 3011
3097 dstBlock+=8; 3012 dstBlock+=8;
3098 srcBlock+=8; 3013 srcBlock+=8;
3099 3014
3100 #ifdef HAVE_MMX 3015 #ifdef HAVE_MMX
3104 #endif 3019 #endif
3105 } 3020 }
3106 3021
3107 if(mode & DERING) 3022 if(mode & DERING)
3108 { 3023 {
3109 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, QP); 3024 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3110 } 3025 }
3111 3026
3112 if((mode & TEMP_NOISE_FILTER)) 3027 if((mode & TEMP_NOISE_FILTER))
3113 { 3028 {
3114 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, 3029 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3115 tempBlured[isColor] + y*dstStride + x, 3030 c.tempBlured[isColor] + y*dstStride + x,
3116 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), 3031 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3117 ppMode->maxTmpNoise); 3032 c.ppMode.maxTmpNoise);
3118 } 3033 }
3119 3034
3120 /* did we use a tmp buffer for the last lines*/ 3035 /* did we use a tmp buffer for the last lines*/
3121 if(y+15 >= height) 3036 if(y+15 >= height)
3122 { 3037 {
3138 asm volatile("femms"); 3053 asm volatile("femms");
3139 #elif defined (HAVE_MMX) 3054 #elif defined (HAVE_MMX)
3140 asm volatile("emms"); 3055 asm volatile("emms");
3141 #endif 3056 #endif
3142 3057
3143 #ifdef TIMING
3144 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
3145 sumTime= rdtsc() - sumTime;
3146 if(!isColor)
3147 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
3148 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
3149 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
3150 , black, white);
3151 #endif
3152 #ifdef DEBUG_BRIGHTNESS 3058 #ifdef DEBUG_BRIGHTNESS
3153 if(!isColor) 3059 if(!isColor)
3154 { 3060 {
3155 int max=1; 3061 int max=1;
3156 int i; 3062 int i;
3174 } 3080 }
3175 3081
3176 } 3082 }
3177 #endif 3083 #endif
3178 3084
3085 *c2= c; //copy local context back
3086
3179 } 3087 }