comparison i386/dsputil_mmx.c @ 2979:bfabfdf9ce55 libavcodec

COSMETICS: tabs --> spaces, some prettyprinting
author diego
date Thu, 22 Dec 2005 01:10:11 +0000
parents ef2149182f1c
children 0b546eab515d
comparison
equal deleted inserted replaced
2978:403183bbb505 2979:bfabfdf9ce55
87 87
88 // using regr as temporary and for the output result 88 // using regr as temporary and for the output result
89 // first argument is unmodified and second is trashed 89 // first argument is unmodified and second is trashed
90 // regfe is supposed to contain 0xfefefefefefefefe 90 // regfe is supposed to contain 0xfefefefefefefefe
91 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ 91 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
92 "movq " #rega ", " #regr " \n\t"\ 92 "movq " #rega ", " #regr " \n\t"\
93 "pand " #regb ", " #regr " \n\t"\ 93 "pand " #regb ", " #regr " \n\t"\
94 "pxor " #rega ", " #regb " \n\t"\ 94 "pxor " #rega ", " #regb " \n\t"\
95 "pand " #regfe "," #regb " \n\t"\ 95 "pand " #regfe "," #regb " \n\t"\
96 "psrlq $1, " #regb " \n\t"\ 96 "psrlq $1, " #regb " \n\t"\
97 "paddb " #regb ", " #regr " \n\t" 97 "paddb " #regb ", " #regr " \n\t"
98 98
99 #define PAVGB_MMX(rega, regb, regr, regfe) \ 99 #define PAVGB_MMX(rega, regb, regr, regfe) \
100 "movq " #rega ", " #regr " \n\t"\ 100 "movq " #rega ", " #regr " \n\t"\
101 "por " #regb ", " #regr " \n\t"\ 101 "por " #regb ", " #regr " \n\t"\
102 "pxor " #rega ", " #regb " \n\t"\ 102 "pxor " #rega ", " #regb " \n\t"\
103 "pand " #regfe "," #regb " \n\t"\ 103 "pand " #regfe "," #regb " \n\t"\
104 "psrlq $1, " #regb " \n\t"\ 104 "psrlq $1, " #regb " \n\t"\
105 "psubb " #regb ", " #regr " \n\t" 105 "psubb " #regb ", " #regr " \n\t"
106 106
107 // mm6 is supposed to contain 0xfefefefefefefefe 107 // mm6 is supposed to contain 0xfefefefefefefefe
108 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ 108 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
109 "movq " #rega ", " #regr " \n\t"\ 109 "movq " #rega ", " #regr " \n\t"\
110 "movq " #regc ", " #regp " \n\t"\ 110 "movq " #regc ", " #regp " \n\t"\
111 "pand " #regb ", " #regr " \n\t"\ 111 "pand " #regb ", " #regr " \n\t"\
112 "pand " #regd ", " #regp " \n\t"\ 112 "pand " #regd ", " #regp " \n\t"\
113 "pxor " #rega ", " #regb " \n\t"\ 113 "pxor " #rega ", " #regb " \n\t"\
114 "pxor " #regc ", " #regd " \n\t"\ 114 "pxor " #regc ", " #regd " \n\t"\
115 "pand %%mm6, " #regb " \n\t"\ 115 "pand %%mm6, " #regb " \n\t"\
116 "pand %%mm6, " #regd " \n\t"\ 116 "pand %%mm6, " #regd " \n\t"\
117 "psrlq $1, " #regb " \n\t"\ 117 "psrlq $1, " #regb " \n\t"\
118 "psrlq $1, " #regd " \n\t"\ 118 "psrlq $1, " #regd " \n\t"\
119 "paddb " #regb ", " #regr " \n\t"\ 119 "paddb " #regb ", " #regr " \n\t"\
120 "paddb " #regd ", " #regp " \n\t" 120 "paddb " #regd ", " #regp " \n\t"
121 121
122 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ 122 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
123 "movq " #rega ", " #regr " \n\t"\ 123 "movq " #rega ", " #regr " \n\t"\
124 "movq " #regc ", " #regp " \n\t"\ 124 "movq " #regc ", " #regp " \n\t"\
125 "por " #regb ", " #regr " \n\t"\ 125 "por " #regb ", " #regr " \n\t"\
126 "por " #regd ", " #regp " \n\t"\ 126 "por " #regd ", " #regp " \n\t"\
127 "pxor " #rega ", " #regb " \n\t"\ 127 "pxor " #rega ", " #regb " \n\t"\
128 "pxor " #regc ", " #regd " \n\t"\ 128 "pxor " #regc ", " #regd " \n\t"\
129 "pand %%mm6, " #regb " \n\t"\ 129 "pand %%mm6, " #regb " \n\t"\
130 "pand %%mm6, " #regd " \n\t"\ 130 "pand %%mm6, " #regd " \n\t"\
131 "psrlq $1, " #regd " \n\t"\ 131 "psrlq $1, " #regd " \n\t"\
132 "psrlq $1, " #regb " \n\t"\ 132 "psrlq $1, " #regb " \n\t"\
133 "psubb " #regb ", " #regr " \n\t"\ 133 "psubb " #regb ", " #regr " \n\t"\
134 "psubb " #regd ", " #regp " \n\t" 134 "psubb " #regd ", " #regp " \n\t"
135 135
136 /***********************************/ 136 /***********************************/
137 /* MMX no rounding */ 137 /* MMX no rounding */
138 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx 138 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
139 #define SET_RND MOVQ_WONE 139 #define SET_RND MOVQ_WONE
140 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) 140 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
141 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) 141 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
142 142
143 #include "dsputil_mmx_rnd.h" 143 #include "dsputil_mmx_rnd.h"
144 144
145 #undef DEF 145 #undef DEF
146 #undef SET_RND 146 #undef SET_RND
149 /***********************************/ 149 /***********************************/
150 /* MMX rounding */ 150 /* MMX rounding */
151 151
152 #define DEF(x, y) x ## _ ## y ##_mmx 152 #define DEF(x, y) x ## _ ## y ##_mmx
153 #define SET_RND MOVQ_WTWO 153 #define SET_RND MOVQ_WTWO
154 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) 154 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
155 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) 155 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
156 156
157 #include "dsputil_mmx_rnd.h" 157 #include "dsputil_mmx_rnd.h"
158 158
159 #undef DEF 159 #undef DEF
160 #undef SET_RND 160 #undef SET_RND
191 191
192 #ifdef CONFIG_ENCODERS 192 #ifdef CONFIG_ENCODERS
193 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) 193 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
194 { 194 {
195 asm volatile( 195 asm volatile(
196 "mov $-128, %%"REG_a" \n\t" 196 "mov $-128, %%"REG_a" \n\t"
197 "pxor %%mm7, %%mm7 \n\t" 197 "pxor %%mm7, %%mm7 \n\t"
198 ".balign 16 \n\t" 198 ".balign 16 \n\t"
199 "1: \n\t" 199 "1: \n\t"
200 "movq (%0), %%mm0 \n\t" 200 "movq (%0), %%mm0 \n\t"
201 "movq (%0, %2), %%mm2 \n\t" 201 "movq (%0, %2), %%mm2 \n\t"
202 "movq %%mm0, %%mm1 \n\t" 202 "movq %%mm0, %%mm1 \n\t"
203 "movq %%mm2, %%mm3 \n\t" 203 "movq %%mm2, %%mm3 \n\t"
204 "punpcklbw %%mm7, %%mm0 \n\t" 204 "punpcklbw %%mm7, %%mm0 \n\t"
205 "punpckhbw %%mm7, %%mm1 \n\t" 205 "punpckhbw %%mm7, %%mm1 \n\t"
206 "punpcklbw %%mm7, %%mm2 \n\t" 206 "punpcklbw %%mm7, %%mm2 \n\t"
207 "punpckhbw %%mm7, %%mm3 \n\t" 207 "punpckhbw %%mm7, %%mm3 \n\t"
208 "movq %%mm0, (%1, %%"REG_a")\n\t" 208 "movq %%mm0, (%1, %%"REG_a") \n\t"
209 "movq %%mm1, 8(%1, %%"REG_a")\n\t" 209 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
210 "movq %%mm2, 16(%1, %%"REG_a")\n\t" 210 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
211 "movq %%mm3, 24(%1, %%"REG_a")\n\t" 211 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
212 "add %3, %0 \n\t" 212 "add %3, %0 \n\t"
213 "add $32, %%"REG_a" \n\t" 213 "add $32, %%"REG_a" \n\t"
214 "js 1b \n\t" 214 "js 1b \n\t"
215 : "+r" (pixels) 215 : "+r" (pixels)
216 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2) 216 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
217 : "%"REG_a 217 : "%"REG_a
218 ); 218 );
219 } 219 }
220 220
221 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) 221 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
222 { 222 {
223 asm volatile( 223 asm volatile(
224 "pxor %%mm7, %%mm7 \n\t" 224 "pxor %%mm7, %%mm7 \n\t"
225 "mov $-128, %%"REG_a" \n\t" 225 "mov $-128, %%"REG_a" \n\t"
226 ".balign 16 \n\t" 226 ".balign 16 \n\t"
227 "1: \n\t" 227 "1: \n\t"
228 "movq (%0), %%mm0 \n\t" 228 "movq (%0), %%mm0 \n\t"
229 "movq (%1), %%mm2 \n\t" 229 "movq (%1), %%mm2 \n\t"
230 "movq %%mm0, %%mm1 \n\t" 230 "movq %%mm0, %%mm1 \n\t"
231 "movq %%mm2, %%mm3 \n\t" 231 "movq %%mm2, %%mm3 \n\t"
232 "punpcklbw %%mm7, %%mm0 \n\t" 232 "punpcklbw %%mm7, %%mm0 \n\t"
233 "punpckhbw %%mm7, %%mm1 \n\t" 233 "punpckhbw %%mm7, %%mm1 \n\t"
234 "punpcklbw %%mm7, %%mm2 \n\t" 234 "punpcklbw %%mm7, %%mm2 \n\t"
235 "punpckhbw %%mm7, %%mm3 \n\t" 235 "punpckhbw %%mm7, %%mm3 \n\t"
236 "psubw %%mm2, %%mm0 \n\t" 236 "psubw %%mm2, %%mm0 \n\t"
237 "psubw %%mm3, %%mm1 \n\t" 237 "psubw %%mm3, %%mm1 \n\t"
238 "movq %%mm0, (%2, %%"REG_a")\n\t" 238 "movq %%mm0, (%2, %%"REG_a") \n\t"
239 "movq %%mm1, 8(%2, %%"REG_a")\n\t" 239 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
240 "add %3, %0 \n\t" 240 "add %3, %0 \n\t"
241 "add %3, %1 \n\t" 241 "add %3, %1 \n\t"
242 "add $16, %%"REG_a" \n\t" 242 "add $16, %%"REG_a" \n\t"
243 "jnz 1b \n\t" 243 "jnz 1b \n\t"
244 : "+r" (s1), "+r" (s2) 244 : "+r" (s1), "+r" (s2)
245 : "r" (block+64), "r" ((long)stride) 245 : "r" (block+64), "r" ((long)stride)
246 : "%"REG_a 246 : "%"REG_a
247 ); 247 );
248 } 248 }
255 255
256 /* read the pixels */ 256 /* read the pixels */
257 p = block; 257 p = block;
258 pix = pixels; 258 pix = pixels;
259 /* unrolled loop */ 259 /* unrolled loop */
260 __asm __volatile( 260 __asm __volatile(
261 "movq %3, %%mm0\n\t" 261 "movq %3, %%mm0 \n\t"
262 "movq 8%3, %%mm1\n\t" 262 "movq 8%3, %%mm1 \n\t"
263 "movq 16%3, %%mm2\n\t" 263 "movq 16%3, %%mm2 \n\t"
264 "movq 24%3, %%mm3\n\t" 264 "movq 24%3, %%mm3 \n\t"
265 "movq 32%3, %%mm4\n\t" 265 "movq 32%3, %%mm4 \n\t"
266 "movq 40%3, %%mm5\n\t" 266 "movq 40%3, %%mm5 \n\t"
267 "movq 48%3, %%mm6\n\t" 267 "movq 48%3, %%mm6 \n\t"
268 "movq 56%3, %%mm7\n\t" 268 "movq 56%3, %%mm7 \n\t"
269 "packuswb %%mm1, %%mm0\n\t" 269 "packuswb %%mm1, %%mm0 \n\t"
270 "packuswb %%mm3, %%mm2\n\t" 270 "packuswb %%mm3, %%mm2 \n\t"
271 "packuswb %%mm5, %%mm4\n\t" 271 "packuswb %%mm5, %%mm4 \n\t"
272 "packuswb %%mm7, %%mm6\n\t" 272 "packuswb %%mm7, %%mm6 \n\t"
273 "movq %%mm0, (%0)\n\t" 273 "movq %%mm0, (%0) \n\t"
274 "movq %%mm2, (%0, %1)\n\t" 274 "movq %%mm2, (%0, %1) \n\t"
275 "movq %%mm4, (%0, %1, 2)\n\t" 275 "movq %%mm4, (%0, %1, 2) \n\t"
276 "movq %%mm6, (%0, %2)\n\t" 276 "movq %%mm6, (%0, %2) \n\t"
277 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p) 277 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
278 :"memory"); 278 :"memory");
279 pix += line_size*4; 279 pix += line_size*4;
280 p += 32; 280 p += 32;
281 281
282 // an exact copy of the code above placed here would make the 282 // an exact copy of the code above placed here would make the
283 // compiler generate some very strange code, 283 // compiler generate some very strange code,
284 // thus using "r" instead of "m" for the block pointer 284 // thus using "r" instead of "m" for the block pointer
285 __asm __volatile( 285 __asm __volatile(
286 "movq (%3), %%mm0\n\t" 286 "movq (%3), %%mm0 \n\t"
287 "movq 8(%3), %%mm1\n\t" 287 "movq 8(%3), %%mm1 \n\t"
288 "movq 16(%3), %%mm2\n\t" 288 "movq 16(%3), %%mm2 \n\t"
289 "movq 24(%3), %%mm3\n\t" 289 "movq 24(%3), %%mm3 \n\t"
290 "movq 32(%3), %%mm4\n\t" 290 "movq 32(%3), %%mm4 \n\t"
291 "movq 40(%3), %%mm5\n\t" 291 "movq 40(%3), %%mm5 \n\t"
292 "movq 48(%3), %%mm6\n\t" 292 "movq 48(%3), %%mm6 \n\t"
293 "movq 56(%3), %%mm7\n\t" 293 "movq 56(%3), %%mm7 \n\t"
294 "packuswb %%mm1, %%mm0\n\t" 294 "packuswb %%mm1, %%mm0 \n\t"
295 "packuswb %%mm3, %%mm2\n\t" 295 "packuswb %%mm3, %%mm2 \n\t"
296 "packuswb %%mm5, %%mm4\n\t" 296 "packuswb %%mm5, %%mm4 \n\t"
297 "packuswb %%mm7, %%mm6\n\t" 297 "packuswb %%mm7, %%mm6 \n\t"
298 "movq %%mm0, (%0)\n\t" 298 "movq %%mm0, (%0) \n\t"
299 "movq %%mm2, (%0, %1)\n\t" 299 "movq %%mm2, (%0, %1) \n\t"
300 "movq %%mm4, (%0, %1, 2)\n\t" 300 "movq %%mm4, (%0, %1, 2) \n\t"
301 "movq %%mm6, (%0, %2)\n\t" 301 "movq %%mm6, (%0, %2) \n\t"
302 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p) 302 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
303 :"memory"); 303 :"memory");
304 } 304 }
305 305
306 static const unsigned char __align8 vector128[8] = 306 static const unsigned char __align8 vector128[8] =
307 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; 307 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
308 308
331 p = block; 331 p = block;
332 pix = pixels; 332 pix = pixels;
333 MOVQ_ZERO(mm7); 333 MOVQ_ZERO(mm7);
334 i = 4; 334 i = 4;
335 do { 335 do {
336 __asm __volatile( 336 __asm __volatile(
337 "movq (%2), %%mm0\n\t" 337 "movq (%2), %%mm0 \n\t"
338 "movq 8(%2), %%mm1\n\t" 338 "movq 8(%2), %%mm1 \n\t"
339 "movq 16(%2), %%mm2\n\t" 339 "movq 16(%2), %%mm2 \n\t"
340 "movq 24(%2), %%mm3\n\t" 340 "movq 24(%2), %%mm3 \n\t"
341 "movq %0, %%mm4\n\t" 341 "movq %0, %%mm4 \n\t"
342 "movq %1, %%mm6\n\t" 342 "movq %1, %%mm6 \n\t"
343 "movq %%mm4, %%mm5\n\t" 343 "movq %%mm4, %%mm5 \n\t"
344 "punpcklbw %%mm7, %%mm4\n\t" 344 "punpcklbw %%mm7, %%mm4 \n\t"
345 "punpckhbw %%mm7, %%mm5\n\t" 345 "punpckhbw %%mm7, %%mm5 \n\t"
346 "paddsw %%mm4, %%mm0\n\t" 346 "paddsw %%mm4, %%mm0 \n\t"
347 "paddsw %%mm5, %%mm1\n\t" 347 "paddsw %%mm5, %%mm1 \n\t"
348 "movq %%mm6, %%mm5\n\t" 348 "movq %%mm6, %%mm5 \n\t"
349 "punpcklbw %%mm7, %%mm6\n\t" 349 "punpcklbw %%mm7, %%mm6 \n\t"
350 "punpckhbw %%mm7, %%mm5\n\t" 350 "punpckhbw %%mm7, %%mm5 \n\t"
351 "paddsw %%mm6, %%mm2\n\t" 351 "paddsw %%mm6, %%mm2 \n\t"
352 "paddsw %%mm5, %%mm3\n\t" 352 "paddsw %%mm5, %%mm3 \n\t"
353 "packuswb %%mm1, %%mm0\n\t" 353 "packuswb %%mm1, %%mm0 \n\t"
354 "packuswb %%mm3, %%mm2\n\t" 354 "packuswb %%mm3, %%mm2 \n\t"
355 "movq %%mm0, %0\n\t" 355 "movq %%mm0, %0 \n\t"
356 "movq %%mm2, %1\n\t" 356 "movq %%mm2, %1 \n\t"
357 :"+m"(*pix), "+m"(*(pix+line_size)) 357 :"+m"(*pix), "+m"(*(pix+line_size))
358 :"r"(p) 358 :"r"(p)
359 :"memory"); 359 :"memory");
360 pix += line_size*2; 360 pix += line_size*2;
361 p += 16; 361 p += 16;
362 } while (--i); 362 } while (--i);
363 } 363 }
364 364
365 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 365 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
366 { 366 {
367 __asm __volatile( 367 __asm __volatile(
368 "lea (%3, %3), %%"REG_a" \n\t" 368 "lea (%3, %3), %%"REG_a" \n\t"
369 ".balign 8 \n\t" 369 ".balign 8 \n\t"
370 "1: \n\t" 370 "1: \n\t"
371 "movd (%1), %%mm0 \n\t" 371 "movd (%1), %%mm0 \n\t"
372 "movd (%1, %3), %%mm1 \n\t" 372 "movd (%1, %3), %%mm1 \n\t"
373 "movd %%mm0, (%2) \n\t" 373 "movd %%mm0, (%2) \n\t"
374 "movd %%mm1, (%2, %3) \n\t" 374 "movd %%mm1, (%2, %3) \n\t"
375 "add %%"REG_a", %1 \n\t" 375 "add %%"REG_a", %1 \n\t"
376 "add %%"REG_a", %2 \n\t" 376 "add %%"REG_a", %2 \n\t"
377 "movd (%1), %%mm0 \n\t" 377 "movd (%1), %%mm0 \n\t"
378 "movd (%1, %3), %%mm1 \n\t" 378 "movd (%1, %3), %%mm1 \n\t"
379 "movd %%mm0, (%2) \n\t" 379 "movd %%mm0, (%2) \n\t"
380 "movd %%mm1, (%2, %3) \n\t" 380 "movd %%mm1, (%2, %3) \n\t"
381 "add %%"REG_a", %1 \n\t" 381 "add %%"REG_a", %1 \n\t"
382 "add %%"REG_a", %2 \n\t" 382 "add %%"REG_a", %2 \n\t"
383 "subl $4, %0 \n\t" 383 "subl $4, %0 \n\t"
384 "jnz 1b \n\t" 384 "jnz 1b \n\t"
385 : "+g"(h), "+r" (pixels), "+r" (block) 385 : "+g"(h), "+r" (pixels), "+r" (block)
386 : "r"((long)line_size) 386 : "r"((long)line_size)
387 : "%"REG_a, "memory" 387 : "%"REG_a, "memory"
388 ); 388 );
389 } 389 }
390 390
391 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 391 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
392 { 392 {
393 __asm __volatile( 393 __asm __volatile(
394 "lea (%3, %3), %%"REG_a" \n\t" 394 "lea (%3, %3), %%"REG_a" \n\t"
395 ".balign 8 \n\t" 395 ".balign 8 \n\t"
396 "1: \n\t" 396 "1: \n\t"
397 "movq (%1), %%mm0 \n\t" 397 "movq (%1), %%mm0 \n\t"
398 "movq (%1, %3), %%mm1 \n\t" 398 "movq (%1, %3), %%mm1 \n\t"
399 "movq %%mm0, (%2) \n\t" 399 "movq %%mm0, (%2) \n\t"
400 "movq %%mm1, (%2, %3) \n\t" 400 "movq %%mm1, (%2, %3) \n\t"
401 "add %%"REG_a", %1 \n\t" 401 "add %%"REG_a", %1 \n\t"
402 "add %%"REG_a", %2 \n\t" 402 "add %%"REG_a", %2 \n\t"
403 "movq (%1), %%mm0 \n\t" 403 "movq (%1), %%mm0 \n\t"
404 "movq (%1, %3), %%mm1 \n\t" 404 "movq (%1, %3), %%mm1 \n\t"
405 "movq %%mm0, (%2) \n\t" 405 "movq %%mm0, (%2) \n\t"
406 "movq %%mm1, (%2, %3) \n\t" 406 "movq %%mm1, (%2, %3) \n\t"
407 "add %%"REG_a", %1 \n\t" 407 "add %%"REG_a", %1 \n\t"
408 "add %%"REG_a", %2 \n\t" 408 "add %%"REG_a", %2 \n\t"
409 "subl $4, %0 \n\t" 409 "subl $4, %0 \n\t"
410 "jnz 1b \n\t" 410 "jnz 1b \n\t"
411 : "+g"(h), "+r" (pixels), "+r" (block) 411 : "+g"(h), "+r" (pixels), "+r" (block)
412 : "r"((long)line_size) 412 : "r"((long)line_size)
413 : "%"REG_a, "memory" 413 : "%"REG_a, "memory"
414 ); 414 );
415 } 415 }
416 416
417 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 417 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
418 { 418 {
419 __asm __volatile( 419 __asm __volatile(
420 "lea (%3, %3), %%"REG_a" \n\t" 420 "lea (%3, %3), %%"REG_a" \n\t"
421 ".balign 8 \n\t" 421 ".balign 8 \n\t"
422 "1: \n\t" 422 "1: \n\t"
423 "movq (%1), %%mm0 \n\t" 423 "movq (%1), %%mm0 \n\t"
424 "movq 8(%1), %%mm4 \n\t" 424 "movq 8(%1), %%mm4 \n\t"
425 "movq (%1, %3), %%mm1 \n\t" 425 "movq (%1, %3), %%mm1 \n\t"
426 "movq 8(%1, %3), %%mm5 \n\t" 426 "movq 8(%1, %3), %%mm5 \n\t"
427 "movq %%mm0, (%2) \n\t" 427 "movq %%mm0, (%2) \n\t"
428 "movq %%mm4, 8(%2) \n\t" 428 "movq %%mm4, 8(%2) \n\t"
429 "movq %%mm1, (%2, %3) \n\t" 429 "movq %%mm1, (%2, %3) \n\t"
430 "movq %%mm5, 8(%2, %3) \n\t" 430 "movq %%mm5, 8(%2, %3) \n\t"
431 "add %%"REG_a", %1 \n\t" 431 "add %%"REG_a", %1 \n\t"
432 "add %%"REG_a", %2 \n\t" 432 "add %%"REG_a", %2 \n\t"
433 "movq (%1), %%mm0 \n\t" 433 "movq (%1), %%mm0 \n\t"
434 "movq 8(%1), %%mm4 \n\t" 434 "movq 8(%1), %%mm4 \n\t"
435 "movq (%1, %3), %%mm1 \n\t" 435 "movq (%1, %3), %%mm1 \n\t"
436 "movq 8(%1, %3), %%mm5 \n\t" 436 "movq 8(%1, %3), %%mm5 \n\t"
437 "movq %%mm0, (%2) \n\t" 437 "movq %%mm0, (%2) \n\t"
438 "movq %%mm4, 8(%2) \n\t" 438 "movq %%mm4, 8(%2) \n\t"
439 "movq %%mm1, (%2, %3) \n\t" 439 "movq %%mm1, (%2, %3) \n\t"
440 "movq %%mm5, 8(%2, %3) \n\t" 440 "movq %%mm5, 8(%2, %3) \n\t"
441 "add %%"REG_a", %1 \n\t" 441 "add %%"REG_a", %1 \n\t"
442 "add %%"REG_a", %2 \n\t" 442 "add %%"REG_a", %2 \n\t"
443 "subl $4, %0 \n\t" 443 "subl $4, %0 \n\t"
444 "jnz 1b \n\t" 444 "jnz 1b \n\t"
445 : "+g"(h), "+r" (pixels), "+r" (block) 445 : "+g"(h), "+r" (pixels), "+r" (block)
446 : "r"((long)line_size) 446 : "r"((long)line_size)
447 : "%"REG_a, "memory" 447 : "%"REG_a, "memory"
448 ); 448 );
449 } 449 }
450 450
451 static void clear_blocks_mmx(DCTELEM *blocks) 451 static void clear_blocks_mmx(DCTELEM *blocks)
452 { 452 {
453 __asm __volatile( 453 __asm __volatile(
454 "pxor %%mm7, %%mm7 \n\t" 454 "pxor %%mm7, %%mm7 \n\t"
455 "mov $-128*6, %%"REG_a" \n\t" 455 "mov $-128*6, %%"REG_a" \n\t"
456 "1: \n\t" 456 "1: \n\t"
457 "movq %%mm7, (%0, %%"REG_a") \n\t" 457 "movq %%mm7, (%0, %%"REG_a") \n\t"
458 "movq %%mm7, 8(%0, %%"REG_a") \n\t" 458 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
459 "movq %%mm7, 16(%0, %%"REG_a") \n\t" 459 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
460 "movq %%mm7, 24(%0, %%"REG_a") \n\t" 460 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
461 "add $32, %%"REG_a" \n\t" 461 "add $32, %%"REG_a" \n\t"
462 " js 1b \n\t" 462 " js 1b \n\t"
463 : : "r" (((uint8_t *)blocks)+128*6) 463 : : "r" (((uint8_t *)blocks)+128*6)
464 : "%"REG_a 464 : "%"REG_a
465 ); 465 );
466 } 466 }
467 467
470 const int h=16; 470 const int h=16;
471 int sum; 471 int sum;
472 long index= -line_size*h; 472 long index= -line_size*h;
473 473
474 __asm __volatile( 474 __asm __volatile(
475 "pxor %%mm7, %%mm7 \n\t" 475 "pxor %%mm7, %%mm7 \n\t"
476 "pxor %%mm6, %%mm6 \n\t" 476 "pxor %%mm6, %%mm6 \n\t"
477 "1: \n\t" 477 "1: \n\t"
478 "movq (%2, %1), %%mm0 \n\t" 478 "movq (%2, %1), %%mm0 \n\t"
479 "movq (%2, %1), %%mm1 \n\t" 479 "movq (%2, %1), %%mm1 \n\t"
480 "movq 8(%2, %1), %%mm2 \n\t" 480 "movq 8(%2, %1), %%mm2 \n\t"
481 "movq 8(%2, %1), %%mm3 \n\t" 481 "movq 8(%2, %1), %%mm3 \n\t"
482 "punpcklbw %%mm7, %%mm0 \n\t" 482 "punpcklbw %%mm7, %%mm0 \n\t"
483 "punpckhbw %%mm7, %%mm1 \n\t" 483 "punpckhbw %%mm7, %%mm1 \n\t"
484 "punpcklbw %%mm7, %%mm2 \n\t" 484 "punpcklbw %%mm7, %%mm2 \n\t"
485 "punpckhbw %%mm7, %%mm3 \n\t" 485 "punpckhbw %%mm7, %%mm3 \n\t"
486 "paddw %%mm0, %%mm1 \n\t" 486 "paddw %%mm0, %%mm1 \n\t"
487 "paddw %%mm2, %%mm3 \n\t" 487 "paddw %%mm2, %%mm3 \n\t"
488 "paddw %%mm1, %%mm3 \n\t" 488 "paddw %%mm1, %%mm3 \n\t"
489 "paddw %%mm3, %%mm6 \n\t" 489 "paddw %%mm3, %%mm6 \n\t"
490 "add %3, %1 \n\t" 490 "add %3, %1 \n\t"
491 " js 1b \n\t" 491 " js 1b \n\t"
492 "movq %%mm6, %%mm5 \n\t" 492 "movq %%mm6, %%mm5 \n\t"
493 "psrlq $32, %%mm6 \n\t" 493 "psrlq $32, %%mm6 \n\t"
494 "paddw %%mm5, %%mm6 \n\t" 494 "paddw %%mm5, %%mm6 \n\t"
495 "movq %%mm6, %%mm5 \n\t" 495 "movq %%mm6, %%mm5 \n\t"
496 "psrlq $16, %%mm6 \n\t" 496 "psrlq $16, %%mm6 \n\t"
497 "paddw %%mm5, %%mm6 \n\t" 497 "paddw %%mm5, %%mm6 \n\t"
498 "movd %%mm6, %0 \n\t" 498 "movd %%mm6, %0 \n\t"
499 "andl $0xFFFF, %0 \n\t" 499 "andl $0xFFFF, %0 \n\t"
500 : "=&r" (sum), "+r" (index) 500 : "=&r" (sum), "+r" (index)
501 : "r" (pix - index), "r" ((long)line_size) 501 : "r" (pix - index), "r" ((long)line_size)
502 ); 502 );
503 503
504 return sum; 504 return sum;
506 #endif //CONFIG_ENCODERS 506 #endif //CONFIG_ENCODERS
507 507
508 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ 508 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
509 long i=0; 509 long i=0;
510 asm volatile( 510 asm volatile(
511 "1: \n\t" 511 "1: \n\t"
512 "movq (%1, %0), %%mm0 \n\t" 512 "movq (%1, %0), %%mm0 \n\t"
513 "movq (%2, %0), %%mm1 \n\t" 513 "movq (%2, %0), %%mm1 \n\t"
514 "paddb %%mm0, %%mm1 \n\t" 514 "paddb %%mm0, %%mm1 \n\t"
515 "movq %%mm1, (%2, %0) \n\t" 515 "movq %%mm1, (%2, %0) \n\t"
516 "movq 8(%1, %0), %%mm0 \n\t" 516 "movq 8(%1, %0), %%mm0 \n\t"
517 "movq 8(%2, %0), %%mm1 \n\t" 517 "movq 8(%2, %0), %%mm1 \n\t"
518 "paddb %%mm0, %%mm1 \n\t" 518 "paddb %%mm0, %%mm1 \n\t"
519 "movq %%mm1, 8(%2, %0) \n\t" 519 "movq %%mm1, 8(%2, %0) \n\t"
520 "add $16, %0 \n\t" 520 "add $16, %0 \n\t"
521 "cmp %3, %0 \n\t" 521 "cmp %3, %0 \n\t"
522 " jb 1b \n\t" 522 " jb 1b \n\t"
523 : "+r" (i) 523 : "+r" (i)
524 : "r"(src), "r"(dst), "r"((long)w-15) 524 : "r"(src), "r"(dst), "r"((long)w-15)
525 ); 525 );
526 for(; i<w; i++) 526 for(; i<w; i++)
527 dst[i+0] += src[i+0]; 527 dst[i+0] += src[i+0];
528 } 528 }
529 529
530 #define H263_LOOP_FILTER \ 530 #define H263_LOOP_FILTER \
531 "pxor %%mm7, %%mm7 \n\t"\ 531 "pxor %%mm7, %%mm7 \n\t"\
532 "movq %0, %%mm0 \n\t"\ 532 "movq %0, %%mm0 \n\t"\
533 "movq %0, %%mm1 \n\t"\ 533 "movq %0, %%mm1 \n\t"\
534 "movq %3, %%mm2 \n\t"\ 534 "movq %3, %%mm2 \n\t"\
535 "movq %3, %%mm3 \n\t"\ 535 "movq %3, %%mm3 \n\t"\
536 "punpcklbw %%mm7, %%mm0 \n\t"\ 536 "punpcklbw %%mm7, %%mm0 \n\t"\
537 "punpckhbw %%mm7, %%mm1 \n\t"\ 537 "punpckhbw %%mm7, %%mm1 \n\t"\
538 "punpcklbw %%mm7, %%mm2 \n\t"\ 538 "punpcklbw %%mm7, %%mm2 \n\t"\
539 "punpckhbw %%mm7, %%mm3 \n\t"\ 539 "punpckhbw %%mm7, %%mm3 \n\t"\
540 "psubw %%mm2, %%mm0 \n\t"\ 540 "psubw %%mm2, %%mm0 \n\t"\
541 "psubw %%mm3, %%mm1 \n\t"\ 541 "psubw %%mm3, %%mm1 \n\t"\
542 "movq %1, %%mm2 \n\t"\ 542 "movq %1, %%mm2 \n\t"\
543 "movq %1, %%mm3 \n\t"\ 543 "movq %1, %%mm3 \n\t"\
544 "movq %2, %%mm4 \n\t"\ 544 "movq %2, %%mm4 \n\t"\
545 "movq %2, %%mm5 \n\t"\ 545 "movq %2, %%mm5 \n\t"\
546 "punpcklbw %%mm7, %%mm2 \n\t"\ 546 "punpcklbw %%mm7, %%mm2 \n\t"\
547 "punpckhbw %%mm7, %%mm3 \n\t"\ 547 "punpckhbw %%mm7, %%mm3 \n\t"\
548 "punpcklbw %%mm7, %%mm4 \n\t"\ 548 "punpcklbw %%mm7, %%mm4 \n\t"\
549 "punpckhbw %%mm7, %%mm5 \n\t"\ 549 "punpckhbw %%mm7, %%mm5 \n\t"\
550 "psubw %%mm2, %%mm4 \n\t"\ 550 "psubw %%mm2, %%mm4 \n\t"\
551 "psubw %%mm3, %%mm5 \n\t"\ 551 "psubw %%mm3, %%mm5 \n\t"\
552 "psllw $2, %%mm4 \n\t"\ 552 "psllw $2, %%mm4 \n\t"\
553 "psllw $2, %%mm5 \n\t"\ 553 "psllw $2, %%mm5 \n\t"\
554 "paddw %%mm0, %%mm4 \n\t"\ 554 "paddw %%mm0, %%mm4 \n\t"\
555 "paddw %%mm1, %%mm5 \n\t"\ 555 "paddw %%mm1, %%mm5 \n\t"\
556 "pxor %%mm6, %%mm6 \n\t"\ 556 "pxor %%mm6, %%mm6 \n\t"\
557 "pcmpgtw %%mm4, %%mm6 \n\t"\ 557 "pcmpgtw %%mm4, %%mm6 \n\t"\
558 "pcmpgtw %%mm5, %%mm7 \n\t"\ 558 "pcmpgtw %%mm5, %%mm7 \n\t"\
559 "pxor %%mm6, %%mm4 \n\t"\ 559 "pxor %%mm6, %%mm4 \n\t"\
560 "pxor %%mm7, %%mm5 \n\t"\ 560 "pxor %%mm7, %%mm5 \n\t"\
561 "psubw %%mm6, %%mm4 \n\t"\ 561 "psubw %%mm6, %%mm4 \n\t"\
562 "psubw %%mm7, %%mm5 \n\t"\ 562 "psubw %%mm7, %%mm5 \n\t"\
563 "psrlw $3, %%mm4 \n\t"\ 563 "psrlw $3, %%mm4 \n\t"\
564 "psrlw $3, %%mm5 \n\t"\ 564 "psrlw $3, %%mm5 \n\t"\
565 "packuswb %%mm5, %%mm4 \n\t"\ 565 "packuswb %%mm5, %%mm4 \n\t"\
566 "packsswb %%mm7, %%mm6 \n\t"\ 566 "packsswb %%mm7, %%mm6 \n\t"\
567 "pxor %%mm7, %%mm7 \n\t"\ 567 "pxor %%mm7, %%mm7 \n\t"\
568 "movd %4, %%mm2 \n\t"\ 568 "movd %4, %%mm2 \n\t"\
569 "punpcklbw %%mm2, %%mm2 \n\t"\ 569 "punpcklbw %%mm2, %%mm2 \n\t"\
570 "punpcklbw %%mm2, %%mm2 \n\t"\ 570 "punpcklbw %%mm2, %%mm2 \n\t"\
571 "punpcklbw %%mm2, %%mm2 \n\t"\ 571 "punpcklbw %%mm2, %%mm2 \n\t"\
572 "psubusb %%mm4, %%mm2 \n\t"\ 572 "psubusb %%mm4, %%mm2 \n\t"\
573 "movq %%mm2, %%mm3 \n\t"\ 573 "movq %%mm2, %%mm3 \n\t"\
574 "psubusb %%mm4, %%mm3 \n\t"\ 574 "psubusb %%mm4, %%mm3 \n\t"\
575 "psubb %%mm3, %%mm2 \n\t"\ 575 "psubb %%mm3, %%mm2 \n\t"\
576 "movq %1, %%mm3 \n\t"\ 576 "movq %1, %%mm3 \n\t"\
577 "movq %2, %%mm4 \n\t"\ 577 "movq %2, %%mm4 \n\t"\
578 "pxor %%mm6, %%mm3 \n\t"\ 578 "pxor %%mm6, %%mm3 \n\t"\
579 "pxor %%mm6, %%mm4 \n\t"\ 579 "pxor %%mm6, %%mm4 \n\t"\
580 "paddusb %%mm2, %%mm3 \n\t"\ 580 "paddusb %%mm2, %%mm3 \n\t"\
581 "psubusb %%mm2, %%mm4 \n\t"\ 581 "psubusb %%mm2, %%mm4 \n\t"\
582 "pxor %%mm6, %%mm3 \n\t"\ 582 "pxor %%mm6, %%mm3 \n\t"\
583 "pxor %%mm6, %%mm4 \n\t"\ 583 "pxor %%mm6, %%mm4 \n\t"\
584 "paddusb %%mm2, %%mm2 \n\t"\ 584 "paddusb %%mm2, %%mm2 \n\t"\
585 "packsswb %%mm1, %%mm0 \n\t"\ 585 "packsswb %%mm1, %%mm0 \n\t"\
586 "pcmpgtb %%mm0, %%mm7 \n\t"\ 586 "pcmpgtb %%mm0, %%mm7 \n\t"\
587 "pxor %%mm7, %%mm0 \n\t"\ 587 "pxor %%mm7, %%mm0 \n\t"\
588 "psubb %%mm7, %%mm0 \n\t"\ 588 "psubb %%mm7, %%mm0 \n\t"\
589 "movq %%mm0, %%mm1 \n\t"\ 589 "movq %%mm0, %%mm1 \n\t"\
590 "psubusb %%mm2, %%mm0 \n\t"\ 590 "psubusb %%mm2, %%mm0 \n\t"\
591 "psubb %%mm0, %%mm1 \n\t"\ 591 "psubb %%mm0, %%mm1 \n\t"\
592 "pand %5, %%mm1 \n\t"\ 592 "pand %5, %%mm1 \n\t"\
593 "psrlw $2, %%mm1 \n\t"\ 593 "psrlw $2, %%mm1 \n\t"\
594 "pxor %%mm7, %%mm1 \n\t"\ 594 "pxor %%mm7, %%mm1 \n\t"\
595 "psubb %%mm7, %%mm1 \n\t"\ 595 "psubb %%mm7, %%mm1 \n\t"\
596 "movq %0, %%mm5 \n\t"\ 596 "movq %0, %%mm5 \n\t"\
597 "movq %3, %%mm6 \n\t"\ 597 "movq %3, %%mm6 \n\t"\
598 "psubb %%mm1, %%mm5 \n\t"\ 598 "psubb %%mm1, %%mm5 \n\t"\
599 "paddb %%mm1, %%mm6 \n\t" 599 "paddb %%mm1, %%mm6 \n\t"
600 600
601 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ 601 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
602 const int strength= ff_h263_loop_filter_strength[qscale]; 602 const int strength= ff_h263_loop_filter_strength[qscale];
603 603
604 asm volatile( 604 asm volatile(
605 605
606 H263_LOOP_FILTER 606 H263_LOOP_FILTER
607 607
608 "movq %%mm3, %1 \n\t" 608 "movq %%mm3, %1 \n\t"
609 "movq %%mm4, %2 \n\t" 609 "movq %%mm4, %2 \n\t"
610 "movq %%mm5, %0 \n\t" 610 "movq %%mm5, %0 \n\t"
611 "movq %%mm6, %3 \n\t" 611 "movq %%mm6, %3 \n\t"
612 : "+m" (*(uint64_t*)(src - 2*stride)), 612 : "+m" (*(uint64_t*)(src - 2*stride)),
613 "+m" (*(uint64_t*)(src - 1*stride)), 613 "+m" (*(uint64_t*)(src - 1*stride)),
614 "+m" (*(uint64_t*)(src + 0*stride)), 614 "+m" (*(uint64_t*)(src + 0*stride)),
615 "+m" (*(uint64_t*)(src + 1*stride)) 615 "+m" (*(uint64_t*)(src + 1*stride))
616 : "g" (2*strength), "m"(ff_pb_FC) 616 : "g" (2*strength), "m"(ff_pb_FC)
617 ); 617 );
618 } 618 }
619 619
620 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ 620 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
621 asm volatile( //FIXME could save 1 instruction if done as 8x4 ... 621 asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
622 "movd %4, %%mm0 \n\t" 622 "movd %4, %%mm0 \n\t"
623 "movd %5, %%mm1 \n\t" 623 "movd %5, %%mm1 \n\t"
624 "movd %6, %%mm2 \n\t" 624 "movd %6, %%mm2 \n\t"
625 "movd %7, %%mm3 \n\t" 625 "movd %7, %%mm3 \n\t"
626 "punpcklbw %%mm1, %%mm0 \n\t" 626 "punpcklbw %%mm1, %%mm0 \n\t"
627 "punpcklbw %%mm3, %%mm2 \n\t" 627 "punpcklbw %%mm3, %%mm2 \n\t"
628 "movq %%mm0, %%mm1 \n\t" 628 "movq %%mm0, %%mm1 \n\t"
629 "punpcklwd %%mm2, %%mm0 \n\t" 629 "punpcklwd %%mm2, %%mm0 \n\t"
630 "punpckhwd %%mm2, %%mm1 \n\t" 630 "punpckhwd %%mm2, %%mm1 \n\t"
631 "movd %%mm0, %0 \n\t" 631 "movd %%mm0, %0 \n\t"
632 "punpckhdq %%mm0, %%mm0 \n\t" 632 "punpckhdq %%mm0, %%mm0 \n\t"
633 "movd %%mm0, %1 \n\t" 633 "movd %%mm0, %1 \n\t"
634 "movd %%mm1, %2 \n\t" 634 "movd %%mm1, %2 \n\t"
635 "punpckhdq %%mm1, %%mm1 \n\t" 635 "punpckhdq %%mm1, %%mm1 \n\t"
636 "movd %%mm1, %3 \n\t" 636 "movd %%mm1, %3 \n\t"
637 637
638 : "=m" (*(uint32_t*)(dst + 0*dst_stride)), 638 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
639 "=m" (*(uint32_t*)(dst + 1*dst_stride)), 639 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
640 "=m" (*(uint32_t*)(dst + 2*dst_stride)), 640 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
641 "=m" (*(uint32_t*)(dst + 3*dst_stride)) 641 "=m" (*(uint32_t*)(dst + 3*dst_stride))
664 "+m" (temp[3]) 664 "+m" (temp[3])
665 : "g" (2*strength), "m"(ff_pb_FC) 665 : "g" (2*strength), "m"(ff_pb_FC)
666 ); 666 );
667 667
668 asm volatile( 668 asm volatile(
669 "movq %%mm5, %%mm1 \n\t" 669 "movq %%mm5, %%mm1 \n\t"
670 "movq %%mm4, %%mm0 \n\t" 670 "movq %%mm4, %%mm0 \n\t"
671 "punpcklbw %%mm3, %%mm5 \n\t" 671 "punpcklbw %%mm3, %%mm5 \n\t"
672 "punpcklbw %%mm6, %%mm4 \n\t" 672 "punpcklbw %%mm6, %%mm4 \n\t"
673 "punpckhbw %%mm3, %%mm1 \n\t" 673 "punpckhbw %%mm3, %%mm1 \n\t"
674 "punpckhbw %%mm6, %%mm0 \n\t" 674 "punpckhbw %%mm6, %%mm0 \n\t"
675 "movq %%mm5, %%mm3 \n\t" 675 "movq %%mm5, %%mm3 \n\t"
676 "movq %%mm1, %%mm6 \n\t" 676 "movq %%mm1, %%mm6 \n\t"
677 "punpcklwd %%mm4, %%mm5 \n\t" 677 "punpcklwd %%mm4, %%mm5 \n\t"
678 "punpcklwd %%mm0, %%mm1 \n\t" 678 "punpcklwd %%mm0, %%mm1 \n\t"
679 "punpckhwd %%mm4, %%mm3 \n\t" 679 "punpckhwd %%mm4, %%mm3 \n\t"
680 "punpckhwd %%mm0, %%mm6 \n\t" 680 "punpckhwd %%mm0, %%mm6 \n\t"
681 "movd %%mm5, (%0) \n\t" 681 "movd %%mm5, (%0) \n\t"
682 "punpckhdq %%mm5, %%mm5 \n\t" 682 "punpckhdq %%mm5, %%mm5 \n\t"
683 "movd %%mm5, (%0,%2) \n\t" 683 "movd %%mm5, (%0,%2) \n\t"
684 "movd %%mm3, (%0,%2,2) \n\t" 684 "movd %%mm3, (%0,%2,2) \n\t"
685 "punpckhdq %%mm3, %%mm3 \n\t" 685 "punpckhdq %%mm3, %%mm3 \n\t"
686 "movd %%mm3, (%0,%3) \n\t" 686 "movd %%mm3, (%0,%3) \n\t"
687 "movd %%mm1, (%1) \n\t" 687 "movd %%mm1, (%1) \n\t"
688 "punpckhdq %%mm1, %%mm1 \n\t" 688 "punpckhdq %%mm1, %%mm1 \n\t"
689 "movd %%mm1, (%1,%2) \n\t" 689 "movd %%mm1, (%1,%2) \n\t"
690 "movd %%mm6, (%1,%2,2) \n\t" 690 "movd %%mm6, (%1,%2,2) \n\t"
691 "punpckhdq %%mm6, %%mm6 \n\t" 691 "punpckhdq %%mm6, %%mm6 \n\t"
692 "movd %%mm6, (%1,%3) \n\t" 692 "movd %%mm6, (%1,%3) \n\t"
693 :: "r" (src), 693 :: "r" (src),
694 "r" (src + 4*stride), 694 "r" (src + 4*stride),
695 "r" ((long) stride ), 695 "r" ((long) stride ),
696 "r" ((long)(3*stride)) 696 "r" ((long)(3*stride))
697 ); 697 );
703 asm volatile ( 703 asm volatile (
704 "movl $16,%%ecx\n" 704 "movl $16,%%ecx\n"
705 "pxor %%mm0,%%mm0\n" 705 "pxor %%mm0,%%mm0\n"
706 "pxor %%mm7,%%mm7\n" 706 "pxor %%mm7,%%mm7\n"
707 "1:\n" 707 "1:\n"
708 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ 708 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
709 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ 709 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
710 710
711 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ 711 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
712 712
713 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ 713 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
714 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ 714 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
715 715
716 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ 716 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
717 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ 717 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
718 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ 718 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
719 719
720 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */ 720 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
721 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */ 721 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */
722 722
723 "pmaddwd %%mm3,%%mm3\n" 723 "pmaddwd %%mm3,%%mm3\n"
724 "pmaddwd %%mm4,%%mm4\n" 724 "pmaddwd %%mm4,%%mm4\n"
725 725
726 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 726 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
727 pix2^2+pix3^2+pix6^2+pix7^2) */ 727 pix2^2+pix3^2+pix6^2+pix7^2) */
728 "paddd %%mm3,%%mm4\n" 728 "paddd %%mm3,%%mm4\n"
729 "paddd %%mm2,%%mm7\n" 729 "paddd %%mm2,%%mm7\n"
730 730
731 "add %2, %0\n" 731 "add %2, %0\n"
732 "paddd %%mm4,%%mm7\n" 732 "paddd %%mm4,%%mm7\n"
733 "dec %%ecx\n" 733 "dec %%ecx\n"
734 "jnz 1b\n" 734 "jnz 1b\n"
735 735
736 "movq %%mm7,%%mm1\n" 736 "movq %%mm7,%%mm1\n"
737 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 737 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
738 "paddd %%mm7,%%mm1\n" 738 "paddd %%mm7,%%mm1\n"
739 "movd %%mm1,%1\n" 739 "movd %%mm1,%1\n"
740 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" ); 740 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
741 return tmp; 741 return tmp;
742 } 742 }
744 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 744 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
745 int tmp; 745 int tmp;
746 asm volatile ( 746 asm volatile (
747 "movl %4,%%ecx\n" 747 "movl %4,%%ecx\n"
748 "shr $1,%%ecx\n" 748 "shr $1,%%ecx\n"
749 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 749 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
750 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 750 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
751 "1:\n" 751 "1:\n"
752 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ 752 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
753 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ 753 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
754 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ 754 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
755 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ 755 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
756 756
757 /* todo: mm1-mm2, mm3-mm4 */ 757 /* todo: mm1-mm2, mm3-mm4 */
758 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 758 /* algo: subtract mm1 from mm2 with saturation and vice versa */
759 /* OR the results to get absolute difference */ 759 /* OR the results to get absolute difference */
760 "movq %%mm1,%%mm5\n" 760 "movq %%mm1,%%mm5\n"
771 "movq %%mm2,%%mm1\n" 771 "movq %%mm2,%%mm1\n"
772 "movq %%mm4,%%mm3\n" 772 "movq %%mm4,%%mm3\n"
773 773
774 "punpckhbw %%mm0,%%mm2\n" 774 "punpckhbw %%mm0,%%mm2\n"
775 "punpckhbw %%mm0,%%mm4\n" 775 "punpckhbw %%mm0,%%mm4\n"
776 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 776 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
777 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 777 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
778 778
779 "pmaddwd %%mm2,%%mm2\n" 779 "pmaddwd %%mm2,%%mm2\n"
780 "pmaddwd %%mm4,%%mm4\n" 780 "pmaddwd %%mm4,%%mm4\n"
781 "pmaddwd %%mm1,%%mm1\n" 781 "pmaddwd %%mm1,%%mm1\n"
782 "pmaddwd %%mm3,%%mm3\n" 782 "pmaddwd %%mm3,%%mm3\n"
783 783
784 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ 784 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
785 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ 785 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
786 786
787 "paddd %%mm2,%%mm1\n" 787 "paddd %%mm2,%%mm1\n"
788 "paddd %%mm4,%%mm3\n" 788 "paddd %%mm4,%%mm3\n"
789 "paddd %%mm1,%%mm7\n" 789 "paddd %%mm1,%%mm7\n"
790 "paddd %%mm3,%%mm7\n" 790 "paddd %%mm3,%%mm7\n"
791 791
792 "decl %%ecx\n" 792 "decl %%ecx\n"
793 "jnz 1b\n" 793 "jnz 1b\n"
794 794
795 "movq %%mm7,%%mm1\n" 795 "movq %%mm7,%%mm1\n"
796 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 796 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
797 "paddd %%mm7,%%mm1\n" 797 "paddd %%mm7,%%mm1\n"
798 "movd %%mm1,%2\n" 798 "movd %%mm1,%2\n"
799 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 799 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
800 : "r" ((long)line_size) , "m" (h) 800 : "r" ((long)line_size) , "m" (h)
801 : "%ecx"); 801 : "%ecx");
804 804
805 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 805 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
806 int tmp; 806 int tmp;
807 asm volatile ( 807 asm volatile (
808 "movl %4,%%ecx\n" 808 "movl %4,%%ecx\n"
809 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 809 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
810 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 810 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
811 "1:\n" 811 "1:\n"
812 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ 812 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
813 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ 813 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
814 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ 814 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
815 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ 815 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
816 816
817 /* todo: mm1-mm2, mm3-mm4 */ 817 /* todo: mm1-mm2, mm3-mm4 */
818 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 818 /* algo: subtract mm1 from mm2 with saturation and vice versa */
819 /* OR the results to get absolute difference */ 819 /* OR the results to get absolute difference */
820 "movq %%mm1,%%mm5\n" 820 "movq %%mm1,%%mm5\n"
831 "movq %%mm2,%%mm1\n" 831 "movq %%mm2,%%mm1\n"
832 "movq %%mm4,%%mm3\n" 832 "movq %%mm4,%%mm3\n"
833 833
834 "punpckhbw %%mm0,%%mm2\n" 834 "punpckhbw %%mm0,%%mm2\n"
835 "punpckhbw %%mm0,%%mm4\n" 835 "punpckhbw %%mm0,%%mm4\n"
836 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 836 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
837 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 837 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
838 838
839 "pmaddwd %%mm2,%%mm2\n" 839 "pmaddwd %%mm2,%%mm2\n"
840 "pmaddwd %%mm4,%%mm4\n" 840 "pmaddwd %%mm4,%%mm4\n"
841 "pmaddwd %%mm1,%%mm1\n" 841 "pmaddwd %%mm1,%%mm1\n"
842 "pmaddwd %%mm3,%%mm3\n" 842 "pmaddwd %%mm3,%%mm3\n"
851 851
852 "decl %%ecx\n" 852 "decl %%ecx\n"
853 "jnz 1b\n" 853 "jnz 1b\n"
854 854
855 "movq %%mm7,%%mm1\n" 855 "movq %%mm7,%%mm1\n"
856 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 856 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
857 "paddd %%mm7,%%mm1\n" 857 "paddd %%mm7,%%mm1\n"
858 "movd %%mm1,%2\n" 858 "movd %%mm1,%2\n"
859 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 859 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
860 : "r" ((long)line_size) , "m" (h) 860 : "r" ((long)line_size) , "m" (h)
861 : "%ecx"); 861 : "%ecx");
864 864
865 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 865 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
866 int tmp; 866 int tmp;
867 asm volatile ( 867 asm volatile (
868 "shr $1,%2\n" 868 "shr $1,%2\n"
869 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ 869 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
870 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ 870 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
871 "1:\n" 871 "1:\n"
872 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */ 872 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
873 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */ 873 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
874 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */ 874 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
875 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */ 875 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
876 876
877 /* todo: mm1-mm2, mm3-mm4 */ 877 /* todo: mm1-mm2, mm3-mm4 */
878 /* algo: subtract mm1 from mm2 with saturation and vice versa */ 878 /* algo: subtract mm1 from mm2 with saturation and vice versa */
879 /* OR the results to get absolute difference */ 879 /* OR the results to get absolute difference */
880 "movdqa %%xmm1,%%xmm5\n" 880 "movdqa %%xmm1,%%xmm5\n"
891 "movdqa %%xmm2,%%xmm1\n" 891 "movdqa %%xmm2,%%xmm1\n"
892 "movdqa %%xmm4,%%xmm3\n" 892 "movdqa %%xmm4,%%xmm3\n"
893 893
894 "punpckhbw %%xmm0,%%xmm2\n" 894 "punpckhbw %%xmm0,%%xmm2\n"
895 "punpckhbw %%xmm0,%%xmm4\n" 895 "punpckhbw %%xmm0,%%xmm4\n"
896 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ 896 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
897 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ 897 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
898 898
899 "pmaddwd %%xmm2,%%xmm2\n" 899 "pmaddwd %%xmm2,%%xmm2\n"
900 "pmaddwd %%xmm4,%%xmm4\n" 900 "pmaddwd %%xmm4,%%xmm4\n"
901 "pmaddwd %%xmm1,%%xmm1\n" 901 "pmaddwd %%xmm1,%%xmm1\n"
902 "pmaddwd %%xmm3,%%xmm3\n" 902 "pmaddwd %%xmm3,%%xmm3\n"
903 903
904 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ 904 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
905 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ 905 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
906 906
907 "paddd %%xmm2,%%xmm1\n" 907 "paddd %%xmm2,%%xmm1\n"
908 "paddd %%xmm4,%%xmm3\n" 908 "paddd %%xmm4,%%xmm3\n"
909 "paddd %%xmm1,%%xmm7\n" 909 "paddd %%xmm1,%%xmm7\n"
910 "paddd %%xmm3,%%xmm7\n" 910 "paddd %%xmm3,%%xmm7\n"
911 911
912 "decl %2\n" 912 "decl %2\n"
913 "jnz 1b\n" 913 "jnz 1b\n"
914 914
915 "movdqa %%xmm7,%%xmm1\n" 915 "movdqa %%xmm7,%%xmm1\n"
916 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */ 916 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
917 "paddd %%xmm1,%%xmm7\n" 917 "paddd %%xmm1,%%xmm7\n"
918 "movdqa %%xmm7,%%xmm1\n" 918 "movdqa %%xmm7,%%xmm1\n"
919 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ 919 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
920 "paddd %%xmm1,%%xmm7\n" 920 "paddd %%xmm1,%%xmm7\n"
921 "movd %%xmm7,%3\n" 921 "movd %%xmm7,%3\n"
922 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) 922 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
923 : "r" ((long)line_size)); 923 : "r" ((long)line_size));
924 return tmp; 924 return tmp;
1425 #undef SUM 1425 #undef SUM
1426 1426
1427 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 1427 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1428 long i=0; 1428 long i=0;
1429 asm volatile( 1429 asm volatile(
1430 "1: \n\t" 1430 "1: \n\t"
1431 "movq (%2, %0), %%mm0 \n\t" 1431 "movq (%2, %0), %%mm0 \n\t"
1432 "movq (%1, %0), %%mm1 \n\t" 1432 "movq (%1, %0), %%mm1 \n\t"
1433 "psubb %%mm0, %%mm1 \n\t" 1433 "psubb %%mm0, %%mm1 \n\t"
1434 "movq %%mm1, (%3, %0) \n\t" 1434 "movq %%mm1, (%3, %0) \n\t"
1435 "movq 8(%2, %0), %%mm0 \n\t" 1435 "movq 8(%2, %0), %%mm0 \n\t"
1436 "movq 8(%1, %0), %%mm1 \n\t" 1436 "movq 8(%1, %0), %%mm1 \n\t"
1437 "psubb %%mm0, %%mm1 \n\t" 1437 "psubb %%mm0, %%mm1 \n\t"
1438 "movq %%mm1, 8(%3, %0) \n\t" 1438 "movq %%mm1, 8(%3, %0) \n\t"
1439 "add $16, %0 \n\t" 1439 "add $16, %0 \n\t"
1440 "cmp %4, %0 \n\t" 1440 "cmp %4, %0 \n\t"
1441 " jb 1b \n\t" 1441 " jb 1b \n\t"
1442 : "+r" (i) 1442 : "+r" (i)
1443 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15) 1443 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1444 ); 1444 );
1445 for(; i<w; i++) 1445 for(; i<w; i++)
1446 dst[i+0] = src1[i+0]-src2[i+0]; 1446 dst[i+0] = src1[i+0]-src2[i+0];
1449 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ 1449 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1450 long i=0; 1450 long i=0;
1451 uint8_t l, lt; 1451 uint8_t l, lt;
1452 1452
1453 asm volatile( 1453 asm volatile(
1454 "1: \n\t" 1454 "1: \n\t"
1455 "movq -1(%1, %0), %%mm0 \n\t" // LT 1455 "movq -1(%1, %0), %%mm0 \n\t" // LT
1456 "movq (%1, %0), %%mm1 \n\t" // T 1456 "movq (%1, %0), %%mm1 \n\t" // T
1457 "movq -1(%2, %0), %%mm2 \n\t" // L 1457 "movq -1(%2, %0), %%mm2 \n\t" // L
1458 "movq (%2, %0), %%mm3 \n\t" // X 1458 "movq (%2, %0), %%mm3 \n\t" // X
1459 "movq %%mm2, %%mm4 \n\t" // L 1459 "movq %%mm2, %%mm4 \n\t" // L
1460 "psubb %%mm0, %%mm2 \n\t" 1460 "psubb %%mm0, %%mm2 \n\t"
1461 "paddb %%mm1, %%mm2 \n\t" // L + T - LT 1461 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
1462 "movq %%mm4, %%mm5 \n\t" // L 1462 "movq %%mm4, %%mm5 \n\t" // L
1463 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) 1463 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
1464 "pminub %%mm5, %%mm1 \n\t" // min(T, L) 1464 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1465 "pminub %%mm2, %%mm4 \n\t" 1465 "pminub %%mm2, %%mm4 \n\t"
1466 "pmaxub %%mm1, %%mm4 \n\t" 1466 "pmaxub %%mm1, %%mm4 \n\t"
1467 "psubb %%mm4, %%mm3 \n\t" // dst - pred 1467 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1468 "movq %%mm3, (%3, %0) \n\t" 1468 "movq %%mm3, (%3, %0) \n\t"
1469 "add $8, %0 \n\t" 1469 "add $8, %0 \n\t"
1470 "cmp %4, %0 \n\t" 1470 "cmp %4, %0 \n\t"
1471 " jb 1b \n\t" 1471 " jb 1b \n\t"
1472 : "+r" (i) 1472 : "+r" (i)
1473 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w) 1473 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1474 ); 1474 );
1475 1475
1476 l= *left; 1476 l= *left;
1481 *left_top= src1[w-1]; 1481 *left_top= src1[w-1];
1482 *left = src2[w-1]; 1482 *left = src2[w-1];
1483 } 1483 }
1484 1484
// LBUTTERFLY2: two independent in-place butterflies on packed 16-bit words.
// For each pair (a, b) the emitted instructions compute, in AT&T operand order:
//   a = a + b          (paddw b, a)
//   b = 2*b - (a + b)  (paddw b, b ; psubw a, b)  i.e. b = b - a (old a)
// The two pairs (a1,b1) and (a2,b2) are interleaved instruction by instruction.
// No scratch register is used; arithmetic is wrapping (paddw/psubw).
1485 #define LBUTTERFLY2(a1,b1,a2,b2)\ 1485 #define LBUTTERFLY2(a1,b1,a2,b2)\
1486 "paddw " #b1 ", " #a1 " \n\t"\ 1486 "paddw " #b1 ", " #a1 " \n\t"\
1487 "paddw " #b2 ", " #a2 " \n\t"\ 1487 "paddw " #b2 ", " #a2 " \n\t"\
1488 "paddw " #b1 ", " #b1 " \n\t"\ 1488 "paddw " #b1 ", " #b1 " \n\t"\
1489 "paddw " #b2 ", " #b2 " \n\t"\ 1489 "paddw " #b2 ", " #b2 " \n\t"\
1490 "psubw " #a1 ", " #b1 " \n\t"\ 1490 "psubw " #a1 ", " #b1 " \n\t"\
1491 "psubw " #a2 ", " #b2 " \n\t" 1491 "psubw " #a2 ", " #b2 " \n\t"
1492 1492
// HADAMARD48: three butterfly stages across the eight registers mm0..mm7,
// all done in place with LBUTTERFLY2:
//   stage 1 pairs (0,1) (2,3) (4,5) (6,7)
//   stage 2 pairs (0,2) (1,3) (4,6) (5,7)
//   stage 3 pairs (0,4) (1,5) (2,6) (3,7)
// Each register holds packed 16-bit words, so every word lane is transformed
// independently. NOTE(review): the "48" presumably refers to 4 lanes x 8
// registers - confirm.
1493 #define HADAMARD48\ 1493 #define HADAMARD48\
1494 LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\ 1494 LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
1495 LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\ 1495 LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
1496 LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\ 1496 LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
1497 LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\ 1497 LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
1498 LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\ 1498 LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
1499 LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\ 1499 LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
1500 1500
// MMABS(a, z): replace each signed 16-bit word of a with its absolute value.
// z is scratch and is clobbered: it is zeroed, then pcmpgtw turns it into an
// all-ones mask in every lane where 0 > a. XOR-ing a with the mask and then
// subtracting the mask negates exactly the negative lanes (two's complement).
// psubw wraps, so a lane holding -32768 stays -32768.
1501 #define MMABS(a,z)\ 1501 #define MMABS(a,z)\
1502 "pxor " #z ", " #z " \n\t"\ 1502 "pxor " #z ", " #z " \n\t"\
1503 "pcmpgtw " #a ", " #z " \n\t"\ 1503 "pcmpgtw " #a ", " #z " \n\t"\
1504 "pxor " #z ", " #a " \n\t"\ 1504 "pxor " #z ", " #a " \n\t"\
1505 "psubw " #z ", " #a " \n\t" 1505 "psubw " #z ", " #a " \n\t"
1506 1506
// MMABS_SUM(a, z, sum): same absolute-value sequence as MMABS (a becomes |a|
// per signed 16-bit word, z is clobbered as the sign mask), followed by
// sum += a using paddusw, i.e. an unsigned saturating add per word.
1507 #define MMABS_SUM(a,z, sum)\ 1507 #define MMABS_SUM(a,z, sum)\
1508 "pxor " #z ", " #z " \n\t"\ 1508 "pxor " #z ", " #z " \n\t"\
1509 "pcmpgtw " #a ", " #z " \n\t"\ 1509 "pcmpgtw " #a ", " #z " \n\t"\
1510 "pxor " #z ", " #a " \n\t"\ 1510 "pxor " #z ", " #a " \n\t"\
1511 "psubw " #z ", " #a " \n\t"\ 1511 "psubw " #z ", " #a " \n\t"\
1512 "paddusw " #a ", " #sum " \n\t" 1512 "paddusw " #a ", " #sum " \n\t"
1513 1513
// MMABS_MMX2(a, z): absolute value per signed 16-bit word in three
// instructions. z is clobbered: it is zeroed and then set to 0 - a, after
// which pmaxsw keeps the larger of a and -a in a.
1514 #define MMABS_MMX2(a,z)\ 1514 #define MMABS_MMX2(a,z)\
1515 "pxor " #z ", " #z " \n\t"\ 1515 "pxor " #z ", " #z " \n\t"\
1516 "psubw " #a ", " #z " \n\t"\ 1516 "psubw " #a ", " #z " \n\t"\
1517 "pmaxsw " #z ", " #a " \n\t" 1517 "pmaxsw " #z ", " #a " \n\t"
1518 1518
// MMABS_SUM_MMX2(a, z, sum): the pmaxsw-based absolute value of MMABS_MMX2
// (a becomes max(a, -a) per signed word, z is clobbered), followed by
// sum += a with unsigned saturation (paddusw).
1519 #define MMABS_SUM_MMX2(a,z, sum)\ 1519 #define MMABS_SUM_MMX2(a,z, sum)\
1520 "pxor " #z ", " #z " \n\t"\ 1520 "pxor " #z ", " #z " \n\t"\
1521 "psubw " #a ", " #z " \n\t"\ 1521 "psubw " #a ", " #z " \n\t"\
1522 "pmaxsw " #z ", " #a " \n\t"\ 1522 "pmaxsw " #z ", " #a " \n\t"\
1523 "paddusw " #a ", " #sum " \n\t" 1523 "paddusw " #a ", " #sum " \n\t"
1524 1524
// SBUTTERFLY(a, b, t, n): interleave registers a and b at element size n,
// where n is the punpck suffix pasted into the mnemonic (wd or dq here).
// On exit a holds the interleaved low halves of (a, b) and t holds the
// interleaved high halves; b is left unchanged. t receives a copy of a first,
// so it must be a different register from a and b.
1525 #define SBUTTERFLY(a,b,t,n)\ 1525 #define SBUTTERFLY(a,b,t,n)\
1526 "movq " #a ", " #t " \n\t" /* abcd */\ 1526 "movq " #a ", " #t " \n\t" /* abcd */\
1527 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ 1527 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
1528 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ 1528 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
1529 1529
// TRANSPOSE4(a, b, c, d, t): transpose a 4x4 block of 16-bit words whose rows
// are in a, b, c, d, using t as an extra register. Two word-level interleaves
// are followed by two dword-level interleaves. The transposed rows do NOT
// come back in a, b, c, d: as the step comments show, they end up in
// a, d, t, c (in that order), and b is overwritten with an intermediate.
// Callers must therefore store or use the registers in the order a, d, t, c.
1530 #define TRANSPOSE4(a,b,c,d,t)\ 1530 #define TRANSPOSE4(a,b,c,d,t)\
1531 SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\ 1531 SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
1532 SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\ 1532 SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
1533 SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\ 1533 SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
1534 SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */ 1534 SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
1535 1535
// LOAD4(o, a, b, c, d): load four 64-bit values into a, b, c, d from the
// buffer addressed by asm operand %1, at byte offsets o, o+16, o+32, o+48.
// The offset o is pasted textually into the addressing expression, so it
// must be an assembler-time constant.
1536 #define LOAD4(o, a, b, c, d)\ 1536 #define LOAD4(o, a, b, c, d)\
1537 "movq "#o"(%1), " #a " \n\t"\ 1537 "movq "#o"(%1), " #a " \n\t"\
1538 "movq "#o"+16(%1), " #b " \n\t"\ 1538 "movq "#o"+16(%1), " #b " \n\t"\
1539 "movq "#o"+32(%1), " #c " \n\t"\ 1539 "movq "#o"+32(%1), " #c " \n\t"\
1540 "movq "#o"+48(%1), " #d " \n\t" 1540 "movq "#o"+48(%1), " #d " \n\t"
1541 1541
// STORE4(o, a, b, c, d): counterpart of LOAD4. Stores the 64-bit registers
// a, b, c, d to the buffer addressed by asm operand %1 at byte offsets
// o, o+16, o+32, o+48. The offset o is pasted textually and must be an
// assembler-time constant.
1542 #define STORE4(o, a, b, c, d)\ 1542 #define STORE4(o, a, b, c, d)\
1543 "movq "#a", "#o"(%1) \n\t"\ 1543 "movq "#a", "#o"(%1) \n\t"\
1544 "movq "#b", "#o"+16(%1) \n\t"\ 1544 "movq "#b", "#o"+16(%1) \n\t"\
1545 "movq "#c", "#o"+32(%1) \n\t"\ 1545 "movq "#c", "#o"+32(%1) \n\t"\
1546 "movq "#d", "#o"+48(%1) \n\t"\ 1546 "movq "#d", "#o"+48(%1) \n\t"\
1547 1547
1548 static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ 1548 static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
1549 uint64_t temp[16] __align8; 1549 uint64_t temp[16] __align8;
1550 int sum=0; 1550 int sum=0;
1551 1551
1557 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) 1557 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1558 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) 1558 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
1559 1559
1560 HADAMARD48 1560 HADAMARD48
1561 1561
1562 "movq %%mm7, 112(%1) \n\t" 1562 "movq %%mm7, 112(%1) \n\t"
1563 1563
1564 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) 1564 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1565 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) 1565 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
1566 1566
1567 "movq 112(%1), %%mm7 \n\t" 1567 "movq 112(%1), %%mm7 \n\t"
1568 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) 1568 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1569 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) 1569 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
1570 1570
1571 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) 1571 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
1572 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) 1572 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1573 1573
1574 HADAMARD48 1574 HADAMARD48
1575 1575
1576 "movq %%mm7, 120(%1) \n\t" 1576 "movq %%mm7, 120(%1) \n\t"
1577 1577
1578 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) 1578 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1579 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) 1579 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
1580 1580
1581 "movq 120(%1), %%mm7 \n\t" 1581 "movq 120(%1), %%mm7 \n\t"
1582 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) 1582 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1583 "movq %%mm7, %%mm5 \n\t"//FIXME remove 1583 "movq %%mm7, %%mm5 \n\t"//FIXME remove
1584 "movq %%mm6, %%mm7 \n\t" 1584 "movq %%mm6, %%mm7 \n\t"
1585 "movq %%mm0, %%mm6 \n\t" 1585 "movq %%mm0, %%mm6 \n\t"
1586 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove 1586 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
1587 1587
1588 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3) 1588 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
1589 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) 1589 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1590 1590
1591 HADAMARD48 1591 HADAMARD48
1592 "movq %%mm7, 64(%1) \n\t" 1592 "movq %%mm7, 64(%1) \n\t"
1593 MMABS(%%mm0, %%mm7) 1593 MMABS(%%mm0, %%mm7)
1594 MMABS_SUM(%%mm1, %%mm7, %%mm0) 1594 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1595 MMABS_SUM(%%mm2, %%mm7, %%mm0) 1595 MMABS_SUM(%%mm2, %%mm7, %%mm0)
1596 MMABS_SUM(%%mm3, %%mm7, %%mm0) 1596 MMABS_SUM(%%mm3, %%mm7, %%mm0)
1597 MMABS_SUM(%%mm4, %%mm7, %%mm0) 1597 MMABS_SUM(%%mm4, %%mm7, %%mm0)
1598 MMABS_SUM(%%mm5, %%mm7, %%mm0) 1598 MMABS_SUM(%%mm5, %%mm7, %%mm0)
1599 MMABS_SUM(%%mm6, %%mm7, %%mm0) 1599 MMABS_SUM(%%mm6, %%mm7, %%mm0)
1600 "movq 64(%1), %%mm1 \n\t" 1600 "movq 64(%1), %%mm1 \n\t"
1601 MMABS_SUM(%%mm1, %%mm7, %%mm0) 1601 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1602 "movq %%mm0, 64(%1) \n\t" 1602 "movq %%mm0, 64(%1) \n\t"
1603 1603
1604 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) 1604 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1605 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7) 1605 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
1606 1606
1607 HADAMARD48 1607 HADAMARD48
1608 "movq %%mm7, (%1) \n\t" 1608 "movq %%mm7, (%1) \n\t"
1609 MMABS(%%mm0, %%mm7) 1609 MMABS(%%mm0, %%mm7)
1610 MMABS_SUM(%%mm1, %%mm7, %%mm0) 1610 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1611 MMABS_SUM(%%mm2, %%mm7, %%mm0) 1611 MMABS_SUM(%%mm2, %%mm7, %%mm0)
1612 MMABS_SUM(%%mm3, %%mm7, %%mm0) 1612 MMABS_SUM(%%mm3, %%mm7, %%mm0)
1613 MMABS_SUM(%%mm4, %%mm7, %%mm0) 1613 MMABS_SUM(%%mm4, %%mm7, %%mm0)
1614 MMABS_SUM(%%mm5, %%mm7, %%mm0) 1614 MMABS_SUM(%%mm5, %%mm7, %%mm0)
1615 MMABS_SUM(%%mm6, %%mm7, %%mm0) 1615 MMABS_SUM(%%mm6, %%mm7, %%mm0)
1616 "movq (%1), %%mm1 \n\t" 1616 "movq (%1), %%mm1 \n\t"
1617 MMABS_SUM(%%mm1, %%mm7, %%mm0) 1617 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1618 "movq 64(%1), %%mm1 \n\t" 1618 "movq 64(%1), %%mm1 \n\t"
1619 MMABS_SUM(%%mm1, %%mm7, %%mm0) 1619 MMABS_SUM(%%mm1, %%mm7, %%mm0)
1620 1620
1621 "movq %%mm0, %%mm1 \n\t" 1621 "movq %%mm0, %%mm1 \n\t"
1622 "psrlq $32, %%mm0 \n\t" 1622 "psrlq $32, %%mm0 \n\t"
1623 "paddusw %%mm1, %%mm0 \n\t" 1623 "paddusw %%mm1, %%mm0 \n\t"
1624 "movq %%mm0, %%mm1 \n\t" 1624 "movq %%mm0, %%mm1 \n\t"
1625 "psrlq $16, %%mm0 \n\t" 1625 "psrlq $16, %%mm0 \n\t"
1626 "paddusw %%mm1, %%mm0 \n\t" 1626 "paddusw %%mm1, %%mm0 \n\t"
1627 "movd %%mm0, %0 \n\t" 1627 "movd %%mm0, %0 \n\t"
1628 1628
1629 : "=r" (sum) 1629 : "=r" (sum)
1630 : "r"(temp) 1630 : "r"(temp)
1631 ); 1631 );
1632 return sum&0xFFFF; 1632 return sum&0xFFFF;
1644 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) 1644 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1645 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) 1645 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
1646 1646
1647 HADAMARD48 1647 HADAMARD48
1648 1648
1649 "movq %%mm7, 112(%1) \n\t" 1649 "movq %%mm7, 112(%1) \n\t"
1650 1650
1651 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) 1651 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1652 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) 1652 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
1653 1653
1654 "movq 112(%1), %%mm7 \n\t" 1654 "movq 112(%1), %%mm7 \n\t"
1655 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) 1655 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1656 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) 1656 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
1657 1657
1658 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) 1658 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
1659 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) 1659 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1660 1660
1661 HADAMARD48 1661 HADAMARD48
1662 1662
1663 "movq %%mm7, 120(%1) \n\t" 1663 "movq %%mm7, 120(%1) \n\t"
1664 1664
1665 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) 1665 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
1666 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) 1666 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
1667 1667
1668 "movq 120(%1), %%mm7 \n\t" 1668 "movq 120(%1), %%mm7 \n\t"
1669 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) 1669 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
1670 "movq %%mm7, %%mm5 \n\t"//FIXME remove 1670 "movq %%mm7, %%mm5 \n\t"//FIXME remove
1671 "movq %%mm6, %%mm7 \n\t" 1671 "movq %%mm6, %%mm7 \n\t"
1672 "movq %%mm0, %%mm6 \n\t" 1672 "movq %%mm0, %%mm6 \n\t"
1673 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove 1673 // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
1674 1674
1675 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3) 1675 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
1676 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) 1676 // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
1677 1677
1678 HADAMARD48 1678 HADAMARD48
1679 "movq %%mm7, 64(%1) \n\t" 1679 "movq %%mm7, 64(%1) \n\t"
1680 MMABS_MMX2(%%mm0, %%mm7) 1680 MMABS_MMX2(%%mm0, %%mm7)
1681 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) 1681 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1682 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0) 1682 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
1683 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0) 1683 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
1684 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0) 1684 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
1685 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0) 1685 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
1686 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0) 1686 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
1687 "movq 64(%1), %%mm1 \n\t" 1687 "movq 64(%1), %%mm1 \n\t"
1688 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) 1688 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1689 "movq %%mm0, 64(%1) \n\t" 1689 "movq %%mm0, 64(%1) \n\t"
1690 1690
1691 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) 1691 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
1692 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7) 1692 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
1693 1693
1694 HADAMARD48 1694 HADAMARD48
1695 "movq %%mm7, (%1) \n\t" 1695 "movq %%mm7, (%1) \n\t"
1696 MMABS_MMX2(%%mm0, %%mm7) 1696 MMABS_MMX2(%%mm0, %%mm7)
1697 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) 1697 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1698 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0) 1698 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
1699 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0) 1699 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
1700 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0) 1700 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
1701 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0) 1701 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
1702 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0) 1702 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
1703 "movq (%1), %%mm1 \n\t" 1703 "movq (%1), %%mm1 \n\t"
1704 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) 1704 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1705 "movq 64(%1), %%mm1 \n\t" 1705 "movq 64(%1), %%mm1 \n\t"
1706 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) 1706 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
1707 1707
1708 "pshufw $0x0E, %%mm0, %%mm1 \n\t" 1708 "pshufw $0x0E, %%mm0, %%mm1 \n\t"
1709 "paddusw %%mm1, %%mm0 \n\t" 1709 "paddusw %%mm1, %%mm0 \n\t"
1710 "pshufw $0x01, %%mm0, %%mm1 \n\t" 1710 "pshufw $0x01, %%mm0, %%mm1 \n\t"
1711 "paddusw %%mm1, %%mm0 \n\t" 1711 "paddusw %%mm1, %%mm0 \n\t"
1712 "movd %%mm0, %0 \n\t" 1712 "movd %%mm0, %0 \n\t"
1713 1713
1714 : "=r" (sum) 1714 : "=r" (sum)
1715 : "r"(temp) 1715 : "r"(temp)
1716 ); 1716 );
1717 return sum&0xFFFF; 1717 return sum&0xFFFF;
1724 1724
// The "no_rnd" put variants for 8- and 16-pixel-wide blocks are plain aliases
// of the ordinary put_pixels*_mmx functions; all four arguments are forwarded
// unchanged. NOTE(review): presumably valid because a straight copy involves
// no averaging, so rounding mode cannot matter - confirm against
// put_pixels8_mmx / put_pixels16_mmx.
1725 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) 1725 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
1726 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d) 1726 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
1727 1727
// QPEL_V_LOW: emits one output step of an 8-tap lowpass filter on packed
// 16-bit words, computing
//   (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5
// then packing to unsigned bytes and handing the result to OP.
//   x1 = m3 + m4            (m3 is overwritten with the sum, then with in7)
//   x2 = in2 + m5
//   x3 = in1 + m6
//   x4 = in0 + in7
//   rnd = rounding constant added before the shift
//   out = destination operand passed to OP(%%mm5, out, %%mm7, d)
// Things a caller must know:
//   - %%mm4, %%mm5 and %%mm6 are used as scratch by literal name, regardless
//     of which registers are passed as m3..m6.
//   - The pw_20 and pw_3 parameters are not referenced in the body; the
//     constants are always read via MANGLE(ff_pw_20) and MANGLE(ff_pw_3).
//   - On exit the register passed as m3 holds the value loaded from in7.
//   - %%mm7 is only passed through to OP as its third argument.
1728 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ 1728 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
1729 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ 1729 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1730 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ 1730 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1731 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ 1731 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1732 "movq "#in7", " #m3 " \n\t" /* d */\ 1732 "movq "#in7", " #m3 " \n\t" /* d */\
1733 "movq "#in0", %%mm5 \n\t" /* D */\ 1733 "movq "#in0", %%mm5 \n\t" /* D */\
1734 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ 1734 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1735 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ 1735 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1736 "movq "#in1", %%mm5 \n\t" /* C */\ 1736 "movq "#in1", %%mm5 \n\t" /* C */\
1737 "movq "#in2", %%mm6 \n\t" /* B */\ 1737 "movq "#in2", %%mm6 \n\t" /* B */\
1738 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ 1738 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1739 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ 1739 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1740 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ 1740 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1741 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ 1741 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1742 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ 1742 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1743 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\ 1743 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
1744 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ 1744 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1745 "psraw $5, %%mm5 \n\t"\ 1745 "psraw $5, %%mm5 \n\t"\
1746 "packuswb %%mm5, %%mm5 \n\t"\ 1746 "packuswb %%mm5, %%mm5 \n\t"\
1747 OP(%%mm5, out, %%mm7, d) 1747 OP(%%mm5, out, %%mm7, d)
1748 1748
1749 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ 1749 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1750 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1750 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1751 uint64_t temp;\ 1751 uint64_t temp;\
1752 \ 1752 \
1753 asm volatile(\ 1753 asm volatile(\
1754 "pxor %%mm7, %%mm7 \n\t"\ 1754 "pxor %%mm7, %%mm7 \n\t"\
1755 "1: \n\t"\ 1755 "1: \n\t"\
1756 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ 1756 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1757 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ 1757 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1758 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ 1758 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1759 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ 1759 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1760 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ 1760 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1761 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ 1761 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1762 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ 1762 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1763 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ 1763 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1764 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ 1764 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1765 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ 1765 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1766 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ 1766 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1767 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ 1767 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1768 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ 1768 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1769 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ 1769 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1770 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ 1770 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1771 "paddw %%mm3, %%mm5 \n\t" /* b */\ 1771 "paddw %%mm3, %%mm5 \n\t" /* b */\
1772 "paddw %%mm2, %%mm6 \n\t" /* c */\ 1772 "paddw %%mm2, %%mm6 \n\t" /* c */\
1773 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ 1773 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1774 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ 1774 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1775 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ 1775 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1776 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ 1776 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1777 "paddw %%mm4, %%mm0 \n\t" /* a */\ 1777 "paddw %%mm4, %%mm0 \n\t" /* a */\
1778 "paddw %%mm1, %%mm5 \n\t" /* d */\ 1778 "paddw %%mm1, %%mm5 \n\t" /* d */\
1779 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ 1779 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1780 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ 1780 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1781 "paddw %6, %%mm6 \n\t"\ 1781 "paddw %6, %%mm6 \n\t"\
1782 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ 1782 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1783 "psraw $5, %%mm0 \n\t"\ 1783 "psraw $5, %%mm0 \n\t"\
1784 "movq %%mm0, %5 \n\t"\ 1784 "movq %%mm0, %5 \n\t"\
1785 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ 1785 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1786 \ 1786 \
1787 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ 1787 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1788 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ 1788 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1789 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ 1789 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1790 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ 1790 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1791 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ 1791 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1792 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ 1792 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1793 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ 1793 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1794 "paddw %%mm0, %%mm2 \n\t" /* b */\ 1794 "paddw %%mm0, %%mm2 \n\t" /* b */\
1795 "paddw %%mm5, %%mm3 \n\t" /* c */\ 1795 "paddw %%mm5, %%mm3 \n\t" /* c */\
1796 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ 1796 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1797 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ 1797 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1798 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ 1798 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1799 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ 1799 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1800 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ 1800 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1801 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ 1801 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1802 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ 1802 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1803 "paddw %%mm2, %%mm1 \n\t" /* a */\ 1803 "paddw %%mm2, %%mm1 \n\t" /* a */\
1804 "paddw %%mm6, %%mm4 \n\t" /* d */\ 1804 "paddw %%mm6, %%mm4 \n\t" /* d */\
1805 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ 1805 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1806 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ 1806 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1807 "paddw %6, %%mm1 \n\t"\ 1807 "paddw %6, %%mm1 \n\t"\
1808 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ 1808 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1809 "psraw $5, %%mm3 \n\t"\ 1809 "psraw $5, %%mm3 \n\t"\
1810 "movq %5, %%mm1 \n\t"\ 1810 "movq %5, %%mm1 \n\t"\
1811 "packuswb %%mm3, %%mm1 \n\t"\ 1811 "packuswb %%mm3, %%mm1 \n\t"\
1812 OP_MMX2(%%mm1, (%1),%%mm4, q)\ 1812 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1813 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ 1813 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1814 \ 1814 \
1815 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ 1815 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1816 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ 1816 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1817 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ 1817 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1818 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ 1818 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1819 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ 1819 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1820 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ 1820 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1821 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ 1821 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1822 "paddw %%mm1, %%mm5 \n\t" /* b */\ 1822 "paddw %%mm1, %%mm5 \n\t" /* b */\
1823 "paddw %%mm4, %%mm0 \n\t" /* c */\ 1823 "paddw %%mm4, %%mm0 \n\t" /* c */\
1824 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ 1824 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1825 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ 1825 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1826 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ 1826 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1827 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ 1827 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1828 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ 1828 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1829 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ 1829 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1830 "paddw %%mm3, %%mm2 \n\t" /* d */\ 1830 "paddw %%mm3, %%mm2 \n\t" /* d */\
1831 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ 1831 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1832 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ 1832 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1833 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ 1833 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1834 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ 1834 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1835 "paddw %%mm2, %%mm6 \n\t" /* a */\ 1835 "paddw %%mm2, %%mm6 \n\t" /* a */\
1836 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ 1836 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1837 "paddw %6, %%mm0 \n\t"\ 1837 "paddw %6, %%mm0 \n\t"\
1838 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ 1838 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1839 "psraw $5, %%mm0 \n\t"\ 1839 "psraw $5, %%mm0 \n\t"\
1840 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ 1840 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1841 \ 1841 \
1842 "paddw %%mm5, %%mm3 \n\t" /* a */\ 1842 "paddw %%mm5, %%mm3 \n\t" /* a */\
1843 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ 1843 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1844 "paddw %%mm4, %%mm6 \n\t" /* b */\ 1844 "paddw %%mm4, %%mm6 \n\t" /* b */\
1845 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ 1845 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1846 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ 1846 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1847 "paddw %%mm1, %%mm4 \n\t" /* c */\ 1847 "paddw %%mm1, %%mm4 \n\t" /* c */\
1848 "paddw %%mm2, %%mm5 \n\t" /* d */\ 1848 "paddw %%mm2, %%mm5 \n\t" /* d */\
1849 "paddw %%mm6, %%mm6 \n\t" /* 2b */\ 1849 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
1850 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ 1850 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
1851 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ 1851 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1852 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ 1852 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
1853 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ 1853 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
1854 "paddw %6, %%mm4 \n\t"\ 1854 "paddw %6, %%mm4 \n\t"\
1855 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ 1855 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1856 "psraw $5, %%mm4 \n\t"\ 1856 "psraw $5, %%mm4 \n\t"\
1857 "packuswb %%mm4, %%mm0 \n\t"\ 1857 "packuswb %%mm4, %%mm0 \n\t"\
1858 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ 1858 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
1859 \ 1859 \
1860 "add %3, %0 \n\t"\ 1860 "add %3, %0 \n\t"\
1861 "add %4, %1 \n\t"\ 1861 "add %4, %1 \n\t"\
1862 "decl %2 \n\t"\ 1862 "decl %2 \n\t"\
1863 " jnz 1b \n\t"\ 1863 " jnz 1b \n\t"\
1864 : "+a"(src), "+c"(dst), "+m"(h)\ 1864 : "+a"(src), "+c"(dst), "+m"(h)\
1865 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ 1865 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1866 : "memory"\ 1866 : "memory"\
1867 );\ 1867 );\
1868 }\ 1868 }\
1888 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ 1888 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1889 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ 1889 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1890 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ 1890 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1891 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ 1891 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1892 asm volatile(\ 1892 asm volatile(\
1893 "movq (%0), %%mm0 \n\t"\ 1893 "movq (%0), %%mm0 \n\t"\
1894 "movq 8(%0), %%mm1 \n\t"\ 1894 "movq 8(%0), %%mm1 \n\t"\
1895 "paddw %2, %%mm0 \n\t"\ 1895 "paddw %2, %%mm0 \n\t"\
1896 "paddw %2, %%mm1 \n\t"\ 1896 "paddw %2, %%mm1 \n\t"\
1897 "psraw $5, %%mm0 \n\t"\ 1897 "psraw $5, %%mm0 \n\t"\
1898 "psraw $5, %%mm1 \n\t"\ 1898 "psraw $5, %%mm1 \n\t"\
1899 "packuswb %%mm1, %%mm0 \n\t"\ 1899 "packuswb %%mm1, %%mm0 \n\t"\
1900 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ 1900 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1901 "movq 16(%0), %%mm0 \n\t"\ 1901 "movq 16(%0), %%mm0 \n\t"\
1902 "movq 24(%0), %%mm1 \n\t"\ 1902 "movq 24(%0), %%mm1 \n\t"\
1903 "paddw %2, %%mm0 \n\t"\ 1903 "paddw %2, %%mm0 \n\t"\
1904 "paddw %2, %%mm1 \n\t"\ 1904 "paddw %2, %%mm1 \n\t"\
1905 "psraw $5, %%mm0 \n\t"\ 1905 "psraw $5, %%mm0 \n\t"\
1906 "psraw $5, %%mm1 \n\t"\ 1906 "psraw $5, %%mm1 \n\t"\
1907 "packuswb %%mm1, %%mm0 \n\t"\ 1907 "packuswb %%mm1, %%mm0 \n\t"\
1908 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ 1908 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1909 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ 1909 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1910 : "memory"\ 1910 : "memory"\
1911 );\ 1911 );\
1912 dst+=dstStride;\ 1912 dst+=dstStride;\
1916 \ 1916 \
1917 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1917 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1918 uint64_t temp;\ 1918 uint64_t temp;\
1919 \ 1919 \
1920 asm volatile(\ 1920 asm volatile(\
1921 "pxor %%mm7, %%mm7 \n\t"\ 1921 "pxor %%mm7, %%mm7 \n\t"\
1922 "1: \n\t"\ 1922 "1: \n\t"\
1923 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ 1923 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1924 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ 1924 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1925 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ 1925 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1926 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ 1926 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1927 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ 1927 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1928 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ 1928 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1929 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ 1929 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1930 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ 1930 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1931 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ 1931 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1932 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ 1932 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1933 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ 1933 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1934 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ 1934 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1935 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ 1935 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1936 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ 1936 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1937 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ 1937 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1938 "paddw %%mm3, %%mm5 \n\t" /* b */\ 1938 "paddw %%mm3, %%mm5 \n\t" /* b */\
1939 "paddw %%mm2, %%mm6 \n\t" /* c */\ 1939 "paddw %%mm2, %%mm6 \n\t" /* c */\
1940 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ 1940 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1941 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ 1941 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1942 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ 1942 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1943 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ 1943 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1944 "paddw %%mm4, %%mm0 \n\t" /* a */\ 1944 "paddw %%mm4, %%mm0 \n\t" /* a */\
1945 "paddw %%mm1, %%mm5 \n\t" /* d */\ 1945 "paddw %%mm1, %%mm5 \n\t" /* d */\
1946 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ 1946 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1947 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ 1947 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1948 "paddw %6, %%mm6 \n\t"\ 1948 "paddw %6, %%mm6 \n\t"\
1949 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ 1949 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1950 "psraw $5, %%mm0 \n\t"\ 1950 "psraw $5, %%mm0 \n\t"\
1951 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ 1951 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1952 \ 1952 \
1953 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ 1953 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1954 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ 1954 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1955 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ 1955 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1956 "paddw %%mm5, %%mm1 \n\t" /* a */\ 1956 "paddw %%mm5, %%mm1 \n\t" /* a */\
1957 "paddw %%mm6, %%mm2 \n\t" /* b */\ 1957 "paddw %%mm6, %%mm2 \n\t" /* b */\
1958 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ 1958 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1959 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ 1959 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1960 "paddw %%mm6, %%mm3 \n\t" /* c */\ 1960 "paddw %%mm6, %%mm3 \n\t" /* c */\
1961 "paddw %%mm5, %%mm4 \n\t" /* d */\ 1961 "paddw %%mm5, %%mm4 \n\t" /* d */\
1962 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ 1962 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1963 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ 1963 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1964 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ 1964 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1965 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ 1965 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1966 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ 1966 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1967 "paddw %6, %%mm1 \n\t"\ 1967 "paddw %6, %%mm1 \n\t"\
1968 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ 1968 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1969 "psraw $5, %%mm3 \n\t"\ 1969 "psraw $5, %%mm3 \n\t"\
1970 "packuswb %%mm3, %%mm0 \n\t"\ 1970 "packuswb %%mm3, %%mm0 \n\t"\
1971 OP_MMX2(%%mm0, (%1), %%mm4, q)\ 1971 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1972 \ 1972 \
1973 "add %3, %0 \n\t"\ 1973 "add %3, %0 \n\t"\
1974 "add %4, %1 \n\t"\ 1974 "add %4, %1 \n\t"\
1975 "decl %2 \n\t"\ 1975 "decl %2 \n\t"\
1976 " jnz 1b \n\t"\ 1976 " jnz 1b \n\t"\
1977 : "+a"(src), "+c"(dst), "+m"(h)\ 1977 : "+a"(src), "+c"(dst), "+m"(h)\
1978 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ 1978 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1979 : "memory"\ 1979 : "memory"\
1980 );\ 1980 );\
1981 }\ 1981 }\
1993 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ 1993 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1994 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ 1994 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1995 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ 1995 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1996 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ 1996 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1997 asm volatile(\ 1997 asm volatile(\
1998 "movq (%0), %%mm0 \n\t"\ 1998 "movq (%0), %%mm0 \n\t"\
1999 "movq 8(%0), %%mm1 \n\t"\ 1999 "movq 8(%0), %%mm1 \n\t"\
2000 "paddw %2, %%mm0 \n\t"\ 2000 "paddw %2, %%mm0 \n\t"\
2001 "paddw %2, %%mm1 \n\t"\ 2001 "paddw %2, %%mm1 \n\t"\
2002 "psraw $5, %%mm0 \n\t"\ 2002 "psraw $5, %%mm0 \n\t"\
2003 "psraw $5, %%mm1 \n\t"\ 2003 "psraw $5, %%mm1 \n\t"\
2004 "packuswb %%mm1, %%mm0 \n\t"\ 2004 "packuswb %%mm1, %%mm0 \n\t"\
2005 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ 2005 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2006 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ 2006 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
2007 :"memory"\ 2007 :"memory"\
2008 );\ 2008 );\
2009 dst+=dstStride;\ 2009 dst+=dstStride;\
2018 uint64_t *temp_ptr= temp;\ 2018 uint64_t *temp_ptr= temp;\
2019 int count= 17;\ 2019 int count= 17;\
2020 \ 2020 \
2021 /*FIXME unroll */\ 2021 /*FIXME unroll */\
2022 asm volatile(\ 2022 asm volatile(\
2023 "pxor %%mm7, %%mm7 \n\t"\ 2023 "pxor %%mm7, %%mm7 \n\t"\
2024 "1: \n\t"\ 2024 "1: \n\t"\
2025 "movq (%0), %%mm0 \n\t"\ 2025 "movq (%0), %%mm0 \n\t"\
2026 "movq (%0), %%mm1 \n\t"\ 2026 "movq (%0), %%mm1 \n\t"\
2027 "movq 8(%0), %%mm2 \n\t"\ 2027 "movq 8(%0), %%mm2 \n\t"\
2028 "movq 8(%0), %%mm3 \n\t"\ 2028 "movq 8(%0), %%mm3 \n\t"\
2029 "punpcklbw %%mm7, %%mm0 \n\t"\ 2029 "punpcklbw %%mm7, %%mm0 \n\t"\
2030 "punpckhbw %%mm7, %%mm1 \n\t"\ 2030 "punpckhbw %%mm7, %%mm1 \n\t"\
2031 "punpcklbw %%mm7, %%mm2 \n\t"\ 2031 "punpcklbw %%mm7, %%mm2 \n\t"\
2032 "punpckhbw %%mm7, %%mm3 \n\t"\ 2032 "punpckhbw %%mm7, %%mm3 \n\t"\
2033 "movq %%mm0, (%1) \n\t"\ 2033 "movq %%mm0, (%1) \n\t"\
2034 "movq %%mm1, 17*8(%1) \n\t"\ 2034 "movq %%mm1, 17*8(%1) \n\t"\
2035 "movq %%mm2, 2*17*8(%1) \n\t"\ 2035 "movq %%mm2, 2*17*8(%1) \n\t"\
2036 "movq %%mm3, 3*17*8(%1) \n\t"\ 2036 "movq %%mm3, 3*17*8(%1) \n\t"\
2037 "add $8, %1 \n\t"\ 2037 "add $8, %1 \n\t"\
2038 "add %3, %0 \n\t"\ 2038 "add %3, %0 \n\t"\
2039 "decl %2 \n\t"\ 2039 "decl %2 \n\t"\
2040 " jnz 1b \n\t"\ 2040 " jnz 1b \n\t"\
2041 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ 2041 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2042 : "r" ((long)srcStride)\ 2042 : "r" ((long)srcStride)\
2043 : "memory"\ 2043 : "memory"\
2044 );\ 2044 );\
2045 \ 2045 \
2046 temp_ptr= temp;\ 2046 temp_ptr= temp;\
2047 count=4;\ 2047 count=4;\
2048 \ 2048 \
2049 /*FIXME reorder for speed */\ 2049 /*FIXME reorder for speed */\
2050 asm volatile(\ 2050 asm volatile(\
2051 /*"pxor %%mm7, %%mm7 \n\t"*/\ 2051 /*"pxor %%mm7, %%mm7 \n\t"*/\
2052 "1: \n\t"\ 2052 "1: \n\t"\
2053 "movq (%0), %%mm0 \n\t"\ 2053 "movq (%0), %%mm0 \n\t"\
2054 "movq 8(%0), %%mm1 \n\t"\ 2054 "movq 8(%0), %%mm1 \n\t"\
2055 "movq 16(%0), %%mm2 \n\t"\ 2055 "movq 16(%0), %%mm2 \n\t"\
2056 "movq 24(%0), %%mm3 \n\t"\ 2056 "movq 24(%0), %%mm3 \n\t"\
2057 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ 2057 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2058 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ 2058 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2059 "add %4, %1 \n\t"\ 2059 "add %4, %1 \n\t"\
2060 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ 2060 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2061 \ 2061 \
2062 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ 2062 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2063 "add %4, %1 \n\t"\ 2063 "add %4, %1 \n\t"\
2064 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ 2064 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2065 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ 2065 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2066 "add %4, %1 \n\t"\ 2066 "add %4, %1 \n\t"\
2067 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ 2067 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2068 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ 2068 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2069 "add %4, %1 \n\t"\ 2069 "add %4, %1 \n\t"\
2070 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ 2070 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2071 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ 2071 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2072 "add %4, %1 \n\t"\ 2072 "add %4, %1 \n\t"\
2073 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ 2073 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2074 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ 2074 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2075 "add %4, %1 \n\t"\ 2075 "add %4, %1 \n\t"\
2076 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ 2076 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
2077 \ 2077 \
2078 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ 2078 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2079 "add %4, %1 \n\t" \ 2079 "add %4, %1 \n\t" \
2080 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ 2080 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2081 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ 2081 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
2082 \ 2082 \
2083 "add $136, %0 \n\t"\ 2083 "add $136, %0 \n\t"\
2084 "add %6, %1 \n\t"\ 2084 "add %6, %1 \n\t"\
2085 "decl %2 \n\t"\ 2085 "decl %2 \n\t"\
2086 " jnz 1b \n\t"\ 2086 " jnz 1b \n\t"\
2087 \ 2087 \
2088 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ 2088 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2089 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\ 2089 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2090 :"memory"\ 2090 :"memory"\
2091 );\ 2091 );\
2096 uint64_t *temp_ptr= temp;\ 2096 uint64_t *temp_ptr= temp;\
2097 int count= 9;\ 2097 int count= 9;\
2098 \ 2098 \
2099 /*FIXME unroll */\ 2099 /*FIXME unroll */\
2100 asm volatile(\ 2100 asm volatile(\
2101 "pxor %%mm7, %%mm7 \n\t"\ 2101 "pxor %%mm7, %%mm7 \n\t"\
2102 "1: \n\t"\ 2102 "1: \n\t"\
2103 "movq (%0), %%mm0 \n\t"\ 2103 "movq (%0), %%mm0 \n\t"\
2104 "movq (%0), %%mm1 \n\t"\ 2104 "movq (%0), %%mm1 \n\t"\
2105 "punpcklbw %%mm7, %%mm0 \n\t"\ 2105 "punpcklbw %%mm7, %%mm0 \n\t"\
2106 "punpckhbw %%mm7, %%mm1 \n\t"\ 2106 "punpckhbw %%mm7, %%mm1 \n\t"\
2107 "movq %%mm0, (%1) \n\t"\ 2107 "movq %%mm0, (%1) \n\t"\
2108 "movq %%mm1, 9*8(%1) \n\t"\ 2108 "movq %%mm1, 9*8(%1) \n\t"\
2109 "add $8, %1 \n\t"\ 2109 "add $8, %1 \n\t"\
2110 "add %3, %0 \n\t"\ 2110 "add %3, %0 \n\t"\
2111 "decl %2 \n\t"\ 2111 "decl %2 \n\t"\
2112 " jnz 1b \n\t"\ 2112 " jnz 1b \n\t"\
2113 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ 2113 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2114 : "r" ((long)srcStride)\ 2114 : "r" ((long)srcStride)\
2115 : "memory"\ 2115 : "memory"\
2116 );\ 2116 );\
2117 \ 2117 \
2118 temp_ptr= temp;\ 2118 temp_ptr= temp;\
2119 count=2;\ 2119 count=2;\
2120 \ 2120 \
2121 /*FIXME reorder for speed */\ 2121 /*FIXME reorder for speed */\
2122 asm volatile(\ 2122 asm volatile(\
2123 /*"pxor %%mm7, %%mm7 \n\t"*/\ 2123 /*"pxor %%mm7, %%mm7 \n\t"*/\
2124 "1: \n\t"\ 2124 "1: \n\t"\
2125 "movq (%0), %%mm0 \n\t"\ 2125 "movq (%0), %%mm0 \n\t"\
2126 "movq 8(%0), %%mm1 \n\t"\ 2126 "movq 8(%0), %%mm1 \n\t"\
2127 "movq 16(%0), %%mm2 \n\t"\ 2127 "movq 16(%0), %%mm2 \n\t"\
2128 "movq 24(%0), %%mm3 \n\t"\ 2128 "movq 24(%0), %%mm3 \n\t"\
2129 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ 2129 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2130 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ 2130 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2131 "add %4, %1 \n\t"\ 2131 "add %4, %1 \n\t"\
2132 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ 2132 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2133 \ 2133 \
2134 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ 2134 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2135 "add %4, %1 \n\t"\ 2135 "add %4, %1 \n\t"\
2136 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ 2136 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2137 \ 2137 \
2138 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ 2138 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2139 "add %4, %1 \n\t"\ 2139 "add %4, %1 \n\t"\
2140 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ 2140 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2141 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ 2141 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2142 \ 2142 \
2143 "add $72, %0 \n\t"\ 2143 "add $72, %0 \n\t"\
2144 "add %6, %1 \n\t"\ 2144 "add %6, %1 \n\t"\
2145 "decl %2 \n\t"\ 2145 "decl %2 \n\t"\
2146 " jnz 1b \n\t"\ 2146 " jnz 1b \n\t"\
2147 \ 2147 \
2148 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ 2148 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2149 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\ 2149 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2150 : "memory"\ 2150 : "memory"\
2151 );\ 2151 );\
2372 uint8_t * const halfH= ((uint8_t*)half);\ 2372 uint8_t * const halfH= ((uint8_t*)half);\
2373 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 2373 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2374 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 2374 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2375 } 2375 }
2376 2376
/*
 * Store helpers plugged into the qpel templates as their OP argument.
 * Each one expands to a string-literal fragment of inline assembly:
 *   a    - register holding the freshly computed pixels
 *   b    - destination memory operand
 *   temp - scratch register (ignored by PUT_OP)
 *   size - mov suffix, pasted directly after "mov" (e.g. q -> movq)
 */

/* Plain store: mov<size> a, b. */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"

/* Averaging store using pavgusb: load b into temp, average temp into a,
 * then write a back to b. Note that a is overwritten with the average. */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"

/* Same load / average / store sequence as AVG_3DNOW_OP, using pavgb. */
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
2386 2386
2387 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) 2387 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
2388 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) 2388 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
2389 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) 2389 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2390 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) 2390 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
2408 2408
2409 assert(ABS(scale) < 256); 2409 assert(ABS(scale) < 256);
2410 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; 2410 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2411 2411
2412 asm volatile( 2412 asm volatile(
2413 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w 2413 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2414 "psrlw $15, %%mm6 \n\t" // 1w 2414 "psrlw $15, %%mm6 \n\t" // 1w
2415 "pxor %%mm7, %%mm7 \n\t" 2415 "pxor %%mm7, %%mm7 \n\t"
2416 "movd %4, %%mm5 \n\t" 2416 "movd %4, %%mm5 \n\t"
2417 "punpcklwd %%mm5, %%mm5 \n\t" 2417 "punpcklwd %%mm5, %%mm5 \n\t"
2418 "punpcklwd %%mm5, %%mm5 \n\t" 2418 "punpcklwd %%mm5, %%mm5 \n\t"
2419 "1: \n\t" 2419 "1: \n\t"
2420 "movq (%1, %0), %%mm0 \n\t" 2420 "movq (%1, %0), %%mm0 \n\t"
2421 "movq 8(%1, %0), %%mm1 \n\t" 2421 "movq 8(%1, %0), %%mm1 \n\t"
2422 "pmulhw %%mm5, %%mm0 \n\t" 2422 "pmulhw %%mm5, %%mm0 \n\t"
2423 "pmulhw %%mm5, %%mm1 \n\t" 2423 "pmulhw %%mm5, %%mm1 \n\t"
2424 "paddw %%mm6, %%mm0 \n\t" 2424 "paddw %%mm6, %%mm0 \n\t"
2425 "paddw %%mm6, %%mm1 \n\t" 2425 "paddw %%mm6, %%mm1 \n\t"
2426 "psraw $1, %%mm0 \n\t" 2426 "psraw $1, %%mm0 \n\t"
2427 "psraw $1, %%mm1 \n\t" 2427 "psraw $1, %%mm1 \n\t"
2428 "paddw (%2, %0), %%mm0 \n\t" 2428 "paddw (%2, %0), %%mm0 \n\t"
2429 "paddw 8(%2, %0), %%mm1 \n\t" 2429 "paddw 8(%2, %0), %%mm1 \n\t"
2430 "psraw $6, %%mm0 \n\t" 2430 "psraw $6, %%mm0 \n\t"
2431 "psraw $6, %%mm1 \n\t" 2431 "psraw $6, %%mm1 \n\t"
2432 "pmullw (%3, %0), %%mm0 \n\t" 2432 "pmullw (%3, %0), %%mm0 \n\t"
2433 "pmullw 8(%3, %0), %%mm1 \n\t" 2433 "pmullw 8(%3, %0), %%mm1 \n\t"
2434 "pmaddwd %%mm0, %%mm0 \n\t" 2434 "pmaddwd %%mm0, %%mm0 \n\t"
2435 "pmaddwd %%mm1, %%mm1 \n\t" 2435 "pmaddwd %%mm1, %%mm1 \n\t"
2436 "paddd %%mm1, %%mm0 \n\t" 2436 "paddd %%mm1, %%mm0 \n\t"
2437 "psrld $4, %%mm0 \n\t" 2437 "psrld $4, %%mm0 \n\t"
2438 "paddd %%mm0, %%mm7 \n\t" 2438 "paddd %%mm0, %%mm7 \n\t"
2439 "add $16, %0 \n\t" 2439 "add $16, %0 \n\t"
2440 "cmp $128, %0 \n\t" //FIXME optimize & bench 2440 "cmp $128, %0 \n\t" //FIXME optimize & bench
2441 " jb 1b \n\t" 2441 " jb 1b \n\t"
2442 "movq %%mm7, %%mm6 \n\t" 2442 "movq %%mm7, %%mm6 \n\t"
2443 "psrlq $32, %%mm7 \n\t" 2443 "psrlq $32, %%mm7 \n\t"
2444 "paddd %%mm6, %%mm7 \n\t" 2444 "paddd %%mm6, %%mm7 \n\t"
2445 "psrld $2, %%mm7 \n\t" 2445 "psrld $2, %%mm7 \n\t"
2446 "movd %%mm7, %0 \n\t" 2446 "movd %%mm7, %0 \n\t"
2447 2447
2448 : "+r" (i) 2448 : "+r" (i)
2449 : "r"(basis), "r"(rem), "r"(weight), "g"(scale) 2449 : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
2450 ); 2450 );
2451 return i; 2451 return i;
2455 long i=0; 2455 long i=0;
2456 2456
2457 if(ABS(scale) < 256){ 2457 if(ABS(scale) < 256){
2458 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; 2458 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2459 asm volatile( 2459 asm volatile(
2460 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w 2460 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2461 "psrlw $15, %%mm6 \n\t" // 1w 2461 "psrlw $15, %%mm6 \n\t" // 1w
2462 "movd %3, %%mm5 \n\t" 2462 "movd %3, %%mm5 \n\t"
2463 "punpcklwd %%mm5, %%mm5 \n\t" 2463 "punpcklwd %%mm5, %%mm5 \n\t"
2464 "punpcklwd %%mm5, %%mm5 \n\t" 2464 "punpcklwd %%mm5, %%mm5 \n\t"
2465 "1: \n\t" 2465 "1: \n\t"
2466 "movq (%1, %0), %%mm0 \n\t" 2466 "movq (%1, %0), %%mm0 \n\t"
2467 "movq 8(%1, %0), %%mm1 \n\t" 2467 "movq 8(%1, %0), %%mm1 \n\t"
2468 "pmulhw %%mm5, %%mm0 \n\t" 2468 "pmulhw %%mm5, %%mm0 \n\t"
2469 "pmulhw %%mm5, %%mm1 \n\t" 2469 "pmulhw %%mm5, %%mm1 \n\t"
2470 "paddw %%mm6, %%mm0 \n\t" 2470 "paddw %%mm6, %%mm0 \n\t"
2471 "paddw %%mm6, %%mm1 \n\t" 2471 "paddw %%mm6, %%mm1 \n\t"
2472 "psraw $1, %%mm0 \n\t" 2472 "psraw $1, %%mm0 \n\t"
2473 "psraw $1, %%mm1 \n\t" 2473 "psraw $1, %%mm1 \n\t"
2474 "paddw (%2, %0), %%mm0 \n\t" 2474 "paddw (%2, %0), %%mm0 \n\t"
2475 "paddw 8(%2, %0), %%mm1 \n\t" 2475 "paddw 8(%2, %0), %%mm1 \n\t"
2476 "movq %%mm0, (%2, %0) \n\t" 2476 "movq %%mm0, (%2, %0) \n\t"
2477 "movq %%mm1, 8(%2, %0) \n\t" 2477 "movq %%mm1, 8(%2, %0) \n\t"
2478 "add $16, %0 \n\t" 2478 "add $16, %0 \n\t"
2479 "cmp $128, %0 \n\t" //FIXME optimize & bench 2479 "cmp $128, %0 \n\t" //FIXME optimize & bench
2480 " jb 1b \n\t" 2480 " jb 1b \n\t"
2481 2481
2482 : "+r" (i) 2482 : "+r" (i)
2483 : "r"(basis), "r"(rem), "g"(scale) 2483 : "r"(basis), "r"(rem), "g"(scale)
2484 ); 2484 );
2485 }else{ 2485 }else{
2567 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) 2567 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2568 { 2568 {
2569 mm_flags = mm_support(); 2569 mm_flags = mm_support();
2570 2570
2571 if (avctx->dsp_mask) { 2571 if (avctx->dsp_mask) {
2572 if (avctx->dsp_mask & FF_MM_FORCE) 2572 if (avctx->dsp_mask & FF_MM_FORCE)
2573 mm_flags |= (avctx->dsp_mask & 0xffff); 2573 mm_flags |= (avctx->dsp_mask & 0xffff);
2574 else 2574 else
2575 mm_flags &= ~(avctx->dsp_mask & 0xffff); 2575 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2576 } 2576 }
2577 2577
2578 #if 0 2578 #if 0
2579 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); 2579 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2580 if (mm_flags & MM_MMX) 2580 if (mm_flags & MM_MMX)
2596 #ifdef CONFIG_ENCODERS 2596 #ifdef CONFIG_ENCODERS
2597 const int dct_algo = avctx->dct_algo; 2597 const int dct_algo = avctx->dct_algo;
2598 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ 2598 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
2599 if(mm_flags & MM_SSE2){ 2599 if(mm_flags & MM_SSE2){
2600 c->fdct = ff_fdct_sse2; 2600 c->fdct = ff_fdct_sse2;
2601 }else if(mm_flags & MM_MMXEXT){ 2601 }else if(mm_flags & MM_MMXEXT){
2602 c->fdct = ff_fdct_mmx2; 2602 c->fdct = ff_fdct_mmx2;
2603 }else{ 2603 }else{
2604 c->fdct = ff_fdct_mmx; 2604 c->fdct = ff_fdct_mmx;
2605 } 2605 }
2606 } 2606 }
2707 c->diff_bytes= diff_bytes_mmx; 2707 c->diff_bytes= diff_bytes_mmx;
2708 2708
2709 c->hadamard8_diff[0]= hadamard8_diff16_mmx; 2709 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
2710 c->hadamard8_diff[1]= hadamard8_diff_mmx; 2710 c->hadamard8_diff[1]= hadamard8_diff_mmx;
2711 2711
2712 c->pix_norm1 = pix_norm1_mmx; 2712 c->pix_norm1 = pix_norm1_mmx;
2713 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx; 2713 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
2714 c->sse[1] = sse8_mmx; 2714 c->sse[1] = sse8_mmx;
2715 c->vsad[4]= vsad_intra16_mmx; 2715 c->vsad[4]= vsad_intra16_mmx;
2716 2716
2717 c->nsse[0] = nsse16_mmx; 2717 c->nsse[0] = nsse16_mmx;
2718 c->nsse[1] = nsse8_mmx; 2718 c->nsse[1] = nsse8_mmx;
2719 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 2719 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2720 c->vsad[0] = vsad16_mmx; 2720 c->vsad[0] = vsad16_mmx;
2721 } 2721 }
2722 2722
2723 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 2723 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2727 2727
2728 #endif //CONFIG_ENCODERS 2728 #endif //CONFIG_ENCODERS
2729 2729
2730 c->h263_v_loop_filter= h263_v_loop_filter_mmx; 2730 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2731 c->h263_h_loop_filter= h263_h_loop_filter_mmx; 2731 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2732 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx; 2732 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
2733 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; 2733 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
2734 2734
2735 if (mm_flags & MM_MMXEXT) { 2735 if (mm_flags & MM_MMXEXT) {
2736 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; 2736 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2737 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; 2737 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2827 dspfunc(avg_h264_qpel, 0, 16); 2827 dspfunc(avg_h264_qpel, 0, 16);
2828 dspfunc(avg_h264_qpel, 1, 8); 2828 dspfunc(avg_h264_qpel, 1, 8);
2829 dspfunc(avg_h264_qpel, 2, 4); 2829 dspfunc(avg_h264_qpel, 2, 4);
2830 #undef dspfunc 2830 #undef dspfunc
2831 2831
2832 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2; 2832 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
2833 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; 2833 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
2834 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; 2834 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
2835 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; 2835 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
2836 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; 2836 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
2837 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; 2837 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
2939 dspfunc(put_h264_qpel, 2, 4); 2939 dspfunc(put_h264_qpel, 2, 4);
2940 dspfunc(avg_h264_qpel, 0, 16); 2940 dspfunc(avg_h264_qpel, 0, 16);
2941 dspfunc(avg_h264_qpel, 1, 8); 2941 dspfunc(avg_h264_qpel, 1, 8);
2942 dspfunc(avg_h264_qpel, 2, 4); 2942 dspfunc(avg_h264_qpel, 2, 4);
2943 2943
2944 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow; 2944 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
2945 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; 2945 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
2946 } 2946 }
2947 } 2947 }
2948 2948
2949 #ifdef CONFIG_ENCODERS 2949 #ifdef CONFIG_ENCODERS