Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 2293:15cfba1b97b5 libavcodec
Adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64; patch by Aurelien Jacobs <aurel at gnuage dot org>.
author | michael |
---|---|
date | Mon, 11 Oct 2004 02:19:29 +0000 |
parents | 7e0b2e86afa9 |
children | 86e2b1424801 |
comparison
equal
deleted
inserted
replaced
2292:8556f080fcc2 | 2293:15cfba1b97b5 |
---|---|
185 | 185 |
186 #ifdef CONFIG_ENCODERS | 186 #ifdef CONFIG_ENCODERS |
187 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) | 187 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) |
188 { | 188 { |
189 asm volatile( | 189 asm volatile( |
190 "movl $-128, %%eax \n\t" | 190 "mov $-128, %%"REG_a" \n\t" |
191 "pxor %%mm7, %%mm7 \n\t" | 191 "pxor %%mm7, %%mm7 \n\t" |
192 ".balign 16 \n\t" | 192 ".balign 16 \n\t" |
193 "1: \n\t" | 193 "1: \n\t" |
194 "movq (%0), %%mm0 \n\t" | 194 "movq (%0), %%mm0 \n\t" |
195 "movq (%0, %2), %%mm2 \n\t" | 195 "movq (%0, %2), %%mm2 \n\t" |
197 "movq %%mm2, %%mm3 \n\t" | 197 "movq %%mm2, %%mm3 \n\t" |
198 "punpcklbw %%mm7, %%mm0 \n\t" | 198 "punpcklbw %%mm7, %%mm0 \n\t" |
199 "punpckhbw %%mm7, %%mm1 \n\t" | 199 "punpckhbw %%mm7, %%mm1 \n\t" |
200 "punpcklbw %%mm7, %%mm2 \n\t" | 200 "punpcklbw %%mm7, %%mm2 \n\t" |
201 "punpckhbw %%mm7, %%mm3 \n\t" | 201 "punpckhbw %%mm7, %%mm3 \n\t" |
202 "movq %%mm0, (%1, %%eax)\n\t" | 202 "movq %%mm0, (%1, %%"REG_a")\n\t" |
203 "movq %%mm1, 8(%1, %%eax)\n\t" | 203 "movq %%mm1, 8(%1, %%"REG_a")\n\t" |
204 "movq %%mm2, 16(%1, %%eax)\n\t" | 204 "movq %%mm2, 16(%1, %%"REG_a")\n\t" |
205 "movq %%mm3, 24(%1, %%eax)\n\t" | 205 "movq %%mm3, 24(%1, %%"REG_a")\n\t" |
206 "addl %3, %0 \n\t" | 206 "add %3, %0 \n\t" |
207 "addl $32, %%eax \n\t" | 207 "add $32, %%"REG_a" \n\t" |
208 "js 1b \n\t" | 208 "js 1b \n\t" |
209 : "+r" (pixels) | 209 : "+r" (pixels) |
210 : "r" (block+64), "r" (line_size), "r" (line_size*2) | 210 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2) |
211 : "%eax" | 211 : "%"REG_a |
212 ); | 212 ); |
213 } | 213 } |
214 | 214 |
215 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) | 215 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) |
216 { | 216 { |
217 asm volatile( | 217 asm volatile( |
218 "pxor %%mm7, %%mm7 \n\t" | 218 "pxor %%mm7, %%mm7 \n\t" |
219 "movl $-128, %%eax \n\t" | 219 "mov $-128, %%"REG_a" \n\t" |
220 ".balign 16 \n\t" | 220 ".balign 16 \n\t" |
221 "1: \n\t" | 221 "1: \n\t" |
222 "movq (%0), %%mm0 \n\t" | 222 "movq (%0), %%mm0 \n\t" |
223 "movq (%1), %%mm2 \n\t" | 223 "movq (%1), %%mm2 \n\t" |
224 "movq %%mm0, %%mm1 \n\t" | 224 "movq %%mm0, %%mm1 \n\t" |
227 "punpckhbw %%mm7, %%mm1 \n\t" | 227 "punpckhbw %%mm7, %%mm1 \n\t" |
228 "punpcklbw %%mm7, %%mm2 \n\t" | 228 "punpcklbw %%mm7, %%mm2 \n\t" |
229 "punpckhbw %%mm7, %%mm3 \n\t" | 229 "punpckhbw %%mm7, %%mm3 \n\t" |
230 "psubw %%mm2, %%mm0 \n\t" | 230 "psubw %%mm2, %%mm0 \n\t" |
231 "psubw %%mm3, %%mm1 \n\t" | 231 "psubw %%mm3, %%mm1 \n\t" |
232 "movq %%mm0, (%2, %%eax)\n\t" | 232 "movq %%mm0, (%2, %%"REG_a")\n\t" |
233 "movq %%mm1, 8(%2, %%eax)\n\t" | 233 "movq %%mm1, 8(%2, %%"REG_a")\n\t" |
234 "addl %3, %0 \n\t" | 234 "add %3, %0 \n\t" |
235 "addl %3, %1 \n\t" | 235 "add %3, %1 \n\t" |
236 "addl $16, %%eax \n\t" | 236 "add $16, %%"REG_a" \n\t" |
237 "jnz 1b \n\t" | 237 "jnz 1b \n\t" |
238 : "+r" (s1), "+r" (s2) | 238 : "+r" (s1), "+r" (s2) |
239 : "r" (block+64), "r" (stride) | 239 : "r" (block+64), "r" ((long)stride) |
240 : "%eax" | 240 : "%"REG_a |
241 ); | 241 ); |
242 } | 242 } |
243 #endif //CONFIG_ENCODERS | 243 #endif //CONFIG_ENCODERS |
244 | 244 |
245 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) | 245 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
266 "packuswb %%mm7, %%mm6\n\t" | 266 "packuswb %%mm7, %%mm6\n\t" |
267 "movq %%mm0, (%0)\n\t" | 267 "movq %%mm0, (%0)\n\t" |
268 "movq %%mm2, (%0, %1)\n\t" | 268 "movq %%mm2, (%0, %1)\n\t" |
269 "movq %%mm4, (%0, %1, 2)\n\t" | 269 "movq %%mm4, (%0, %1, 2)\n\t" |
270 "movq %%mm6, (%0, %2)\n\t" | 270 "movq %%mm6, (%0, %2)\n\t" |
271 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) | 271 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p) |
272 :"memory"); | 272 :"memory"); |
273 pix += line_size*4; | 273 pix += line_size*4; |
274 p += 32; | 274 p += 32; |
275 | 275 |
276 // if here would be an exact copy of the code above | 276 // if here would be an exact copy of the code above |
291 "packuswb %%mm7, %%mm6\n\t" | 291 "packuswb %%mm7, %%mm6\n\t" |
292 "movq %%mm0, (%0)\n\t" | 292 "movq %%mm0, (%0)\n\t" |
293 "movq %%mm2, (%0, %1)\n\t" | 293 "movq %%mm2, (%0, %1)\n\t" |
294 "movq %%mm4, (%0, %1, 2)\n\t" | 294 "movq %%mm4, (%0, %1, 2)\n\t" |
295 "movq %%mm6, (%0, %2)\n\t" | 295 "movq %%mm6, (%0, %2)\n\t" |
296 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) | 296 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p) |
297 :"memory"); | 297 :"memory"); |
298 } | 298 } |
299 | 299 |
300 static unsigned char __align8 vector128[8] = | 300 static unsigned char __align8 vector128[8] = |
301 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; | 301 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; |
357 } | 357 } |
358 | 358 |
359 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 359 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
360 { | 360 { |
361 __asm __volatile( | 361 __asm __volatile( |
362 "lea (%3, %3), %%eax \n\t" | 362 "lea (%3, %3), %%"REG_a" \n\t" |
363 ".balign 8 \n\t" | 363 ".balign 8 \n\t" |
364 "1: \n\t" | 364 "1: \n\t" |
365 "movd (%1), %%mm0 \n\t" | 365 "movd (%1), %%mm0 \n\t" |
366 "movd (%1, %3), %%mm1 \n\t" | 366 "movd (%1, %3), %%mm1 \n\t" |
367 "movd %%mm0, (%2) \n\t" | 367 "movd %%mm0, (%2) \n\t" |
368 "movd %%mm1, (%2, %3) \n\t" | 368 "movd %%mm1, (%2, %3) \n\t" |
369 "addl %%eax, %1 \n\t" | 369 "add %%"REG_a", %1 \n\t" |
370 "addl %%eax, %2 \n\t" | 370 "add %%"REG_a", %2 \n\t" |
371 "movd (%1), %%mm0 \n\t" | 371 "movd (%1), %%mm0 \n\t" |
372 "movd (%1, %3), %%mm1 \n\t" | 372 "movd (%1, %3), %%mm1 \n\t" |
373 "movd %%mm0, (%2) \n\t" | 373 "movd %%mm0, (%2) \n\t" |
374 "movd %%mm1, (%2, %3) \n\t" | 374 "movd %%mm1, (%2, %3) \n\t" |
375 "addl %%eax, %1 \n\t" | 375 "add %%"REG_a", %1 \n\t" |
376 "addl %%eax, %2 \n\t" | 376 "add %%"REG_a", %2 \n\t" |
377 "subl $4, %0 \n\t" | 377 "subl $4, %0 \n\t" |
378 "jnz 1b \n\t" | 378 "jnz 1b \n\t" |
379 : "+g"(h), "+r" (pixels), "+r" (block) | 379 : "+g"(h), "+r" (pixels), "+r" (block) |
380 : "r"(line_size) | 380 : "r"((long)line_size) |
381 : "%eax", "memory" | 381 : "%"REG_a, "memory" |
382 ); | 382 ); |
383 } | 383 } |
384 | 384 |
385 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 385 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
386 { | 386 { |
387 __asm __volatile( | 387 __asm __volatile( |
388 "lea (%3, %3), %%eax \n\t" | 388 "lea (%3, %3), %%"REG_a" \n\t" |
389 ".balign 8 \n\t" | 389 ".balign 8 \n\t" |
390 "1: \n\t" | 390 "1: \n\t" |
391 "movq (%1), %%mm0 \n\t" | 391 "movq (%1), %%mm0 \n\t" |
392 "movq (%1, %3), %%mm1 \n\t" | 392 "movq (%1, %3), %%mm1 \n\t" |
393 "movq %%mm0, (%2) \n\t" | 393 "movq %%mm0, (%2) \n\t" |
394 "movq %%mm1, (%2, %3) \n\t" | 394 "movq %%mm1, (%2, %3) \n\t" |
395 "addl %%eax, %1 \n\t" | 395 "add %%"REG_a", %1 \n\t" |
396 "addl %%eax, %2 \n\t" | 396 "add %%"REG_a", %2 \n\t" |
397 "movq (%1), %%mm0 \n\t" | 397 "movq (%1), %%mm0 \n\t" |
398 "movq (%1, %3), %%mm1 \n\t" | 398 "movq (%1, %3), %%mm1 \n\t" |
399 "movq %%mm0, (%2) \n\t" | 399 "movq %%mm0, (%2) \n\t" |
400 "movq %%mm1, (%2, %3) \n\t" | 400 "movq %%mm1, (%2, %3) \n\t" |
401 "addl %%eax, %1 \n\t" | 401 "add %%"REG_a", %1 \n\t" |
402 "addl %%eax, %2 \n\t" | 402 "add %%"REG_a", %2 \n\t" |
403 "subl $4, %0 \n\t" | 403 "subl $4, %0 \n\t" |
404 "jnz 1b \n\t" | 404 "jnz 1b \n\t" |
405 : "+g"(h), "+r" (pixels), "+r" (block) | 405 : "+g"(h), "+r" (pixels), "+r" (block) |
406 : "r"(line_size) | 406 : "r"((long)line_size) |
407 : "%eax", "memory" | 407 : "%"REG_a, "memory" |
408 ); | 408 ); |
409 } | 409 } |
410 | 410 |
411 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 411 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
412 { | 412 { |
413 __asm __volatile( | 413 __asm __volatile( |
414 "lea (%3, %3), %%eax \n\t" | 414 "lea (%3, %3), %%"REG_a" \n\t" |
415 ".balign 8 \n\t" | 415 ".balign 8 \n\t" |
416 "1: \n\t" | 416 "1: \n\t" |
417 "movq (%1), %%mm0 \n\t" | 417 "movq (%1), %%mm0 \n\t" |
418 "movq 8(%1), %%mm4 \n\t" | 418 "movq 8(%1), %%mm4 \n\t" |
419 "movq (%1, %3), %%mm1 \n\t" | 419 "movq (%1, %3), %%mm1 \n\t" |
420 "movq 8(%1, %3), %%mm5 \n\t" | 420 "movq 8(%1, %3), %%mm5 \n\t" |
421 "movq %%mm0, (%2) \n\t" | 421 "movq %%mm0, (%2) \n\t" |
422 "movq %%mm4, 8(%2) \n\t" | 422 "movq %%mm4, 8(%2) \n\t" |
423 "movq %%mm1, (%2, %3) \n\t" | 423 "movq %%mm1, (%2, %3) \n\t" |
424 "movq %%mm5, 8(%2, %3) \n\t" | 424 "movq %%mm5, 8(%2, %3) \n\t" |
425 "addl %%eax, %1 \n\t" | 425 "add %%"REG_a", %1 \n\t" |
426 "addl %%eax, %2 \n\t" | 426 "add %%"REG_a", %2 \n\t" |
427 "movq (%1), %%mm0 \n\t" | 427 "movq (%1), %%mm0 \n\t" |
428 "movq 8(%1), %%mm4 \n\t" | 428 "movq 8(%1), %%mm4 \n\t" |
429 "movq (%1, %3), %%mm1 \n\t" | 429 "movq (%1, %3), %%mm1 \n\t" |
430 "movq 8(%1, %3), %%mm5 \n\t" | 430 "movq 8(%1, %3), %%mm5 \n\t" |
431 "movq %%mm0, (%2) \n\t" | 431 "movq %%mm0, (%2) \n\t" |
432 "movq %%mm4, 8(%2) \n\t" | 432 "movq %%mm4, 8(%2) \n\t" |
433 "movq %%mm1, (%2, %3) \n\t" | 433 "movq %%mm1, (%2, %3) \n\t" |
434 "movq %%mm5, 8(%2, %3) \n\t" | 434 "movq %%mm5, 8(%2, %3) \n\t" |
435 "addl %%eax, %1 \n\t" | 435 "add %%"REG_a", %1 \n\t" |
436 "addl %%eax, %2 \n\t" | 436 "add %%"REG_a", %2 \n\t" |
437 "subl $4, %0 \n\t" | 437 "subl $4, %0 \n\t" |
438 "jnz 1b \n\t" | 438 "jnz 1b \n\t" |
439 : "+g"(h), "+r" (pixels), "+r" (block) | 439 : "+g"(h), "+r" (pixels), "+r" (block) |
440 : "r"(line_size) | 440 : "r"((long)line_size) |
441 : "%eax", "memory" | 441 : "%"REG_a, "memory" |
442 ); | 442 ); |
443 } | 443 } |
444 | 444 |
445 static void clear_blocks_mmx(DCTELEM *blocks) | 445 static void clear_blocks_mmx(DCTELEM *blocks) |
446 { | 446 { |
447 __asm __volatile( | 447 __asm __volatile( |
448 "pxor %%mm7, %%mm7 \n\t" | 448 "pxor %%mm7, %%mm7 \n\t" |
449 "movl $-128*6, %%eax \n\t" | 449 "mov $-128*6, %%"REG_a" \n\t" |
450 "1: \n\t" | 450 "1: \n\t" |
451 "movq %%mm7, (%0, %%eax) \n\t" | 451 "movq %%mm7, (%0, %%"REG_a") \n\t" |
452 "movq %%mm7, 8(%0, %%eax) \n\t" | 452 "movq %%mm7, 8(%0, %%"REG_a") \n\t" |
453 "movq %%mm7, 16(%0, %%eax) \n\t" | 453 "movq %%mm7, 16(%0, %%"REG_a") \n\t" |
454 "movq %%mm7, 24(%0, %%eax) \n\t" | 454 "movq %%mm7, 24(%0, %%"REG_a") \n\t" |
455 "addl $32, %%eax \n\t" | 455 "add $32, %%"REG_a" \n\t" |
456 " js 1b \n\t" | 456 " js 1b \n\t" |
457 : : "r" (((int)blocks)+128*6) | 457 : : "r" (((uint8_t *)blocks)+128*6) |
458 : "%eax" | 458 : "%"REG_a |
459 ); | 459 ); |
460 } | 460 } |
461 | 461 |
462 #ifdef CONFIG_ENCODERS | 462 #ifdef CONFIG_ENCODERS |
463 static int pix_sum16_mmx(uint8_t * pix, int line_size){ | 463 static int pix_sum16_mmx(uint8_t * pix, int line_size){ |
464 const int h=16; | 464 const int h=16; |
465 int sum; | 465 int sum; |
466 int index= -line_size*h; | 466 long index= -line_size*h; |
467 | 467 |
468 __asm __volatile( | 468 __asm __volatile( |
469 "pxor %%mm7, %%mm7 \n\t" | 469 "pxor %%mm7, %%mm7 \n\t" |
470 "pxor %%mm6, %%mm6 \n\t" | 470 "pxor %%mm6, %%mm6 \n\t" |
471 "1: \n\t" | 471 "1: \n\t" |
479 "punpckhbw %%mm7, %%mm3 \n\t" | 479 "punpckhbw %%mm7, %%mm3 \n\t" |
480 "paddw %%mm0, %%mm1 \n\t" | 480 "paddw %%mm0, %%mm1 \n\t" |
481 "paddw %%mm2, %%mm3 \n\t" | 481 "paddw %%mm2, %%mm3 \n\t" |
482 "paddw %%mm1, %%mm3 \n\t" | 482 "paddw %%mm1, %%mm3 \n\t" |
483 "paddw %%mm3, %%mm6 \n\t" | 483 "paddw %%mm3, %%mm6 \n\t" |
484 "addl %3, %1 \n\t" | 484 "add %3, %1 \n\t" |
485 " js 1b \n\t" | 485 " js 1b \n\t" |
486 "movq %%mm6, %%mm5 \n\t" | 486 "movq %%mm6, %%mm5 \n\t" |
487 "psrlq $32, %%mm6 \n\t" | 487 "psrlq $32, %%mm6 \n\t" |
488 "paddw %%mm5, %%mm6 \n\t" | 488 "paddw %%mm5, %%mm6 \n\t" |
489 "movq %%mm6, %%mm5 \n\t" | 489 "movq %%mm6, %%mm5 \n\t" |
490 "psrlq $16, %%mm6 \n\t" | 490 "psrlq $16, %%mm6 \n\t" |
491 "paddw %%mm5, %%mm6 \n\t" | 491 "paddw %%mm5, %%mm6 \n\t" |
492 "movd %%mm6, %0 \n\t" | 492 "movd %%mm6, %0 \n\t" |
493 "andl $0xFFFF, %0 \n\t" | 493 "andl $0xFFFF, %0 \n\t" |
494 : "=&r" (sum), "+r" (index) | 494 : "=&r" (sum), "+r" (index) |
495 : "r" (pix - index), "r" (line_size) | 495 : "r" (pix - index), "r" ((long)line_size) |
496 ); | 496 ); |
497 | 497 |
498 return sum; | 498 return sum; |
499 } | 499 } |
500 #endif //CONFIG_ENCODERS | 500 #endif //CONFIG_ENCODERS |
501 | 501 |
502 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ | 502 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ |
503 int i=0; | 503 long i=0; |
504 asm volatile( | 504 asm volatile( |
505 "1: \n\t" | 505 "1: \n\t" |
506 "movq (%1, %0), %%mm0 \n\t" | 506 "movq (%1, %0), %%mm0 \n\t" |
507 "movq (%2, %0), %%mm1 \n\t" | 507 "movq (%2, %0), %%mm1 \n\t" |
508 "paddb %%mm0, %%mm1 \n\t" | 508 "paddb %%mm0, %%mm1 \n\t" |
509 "movq %%mm1, (%2, %0) \n\t" | 509 "movq %%mm1, (%2, %0) \n\t" |
510 "movq 8(%1, %0), %%mm0 \n\t" | 510 "movq 8(%1, %0), %%mm0 \n\t" |
511 "movq 8(%2, %0), %%mm1 \n\t" | 511 "movq 8(%2, %0), %%mm1 \n\t" |
512 "paddb %%mm0, %%mm1 \n\t" | 512 "paddb %%mm0, %%mm1 \n\t" |
513 "movq %%mm1, 8(%2, %0) \n\t" | 513 "movq %%mm1, 8(%2, %0) \n\t" |
514 "addl $16, %0 \n\t" | 514 "add $16, %0 \n\t" |
515 "cmpl %3, %0 \n\t" | 515 "cmp %3, %0 \n\t" |
516 " jb 1b \n\t" | 516 " jb 1b \n\t" |
517 : "+r" (i) | 517 : "+r" (i) |
518 : "r"(src), "r"(dst), "r"(w-15) | 518 : "r"(src), "r"(dst), "r"((long)w-15) |
519 ); | 519 ); |
520 for(; i<w; i++) | 520 for(; i<w; i++) |
521 dst[i+0] += src[i+0]; | 521 dst[i+0] += src[i+0]; |
522 } | 522 } |
523 | 523 |
724 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, | 724 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, |
725 pix2^2+pix3^2+pix6^2+pix7^2) */ | 725 pix2^2+pix3^2+pix6^2+pix7^2) */ |
726 "paddd %%mm3,%%mm4\n" | 726 "paddd %%mm3,%%mm4\n" |
727 "paddd %%mm2,%%mm7\n" | 727 "paddd %%mm2,%%mm7\n" |
728 | 728 |
729 "addl %2, %0\n" | 729 "add %2, %0\n" |
730 "paddd %%mm4,%%mm7\n" | 730 "paddd %%mm4,%%mm7\n" |
731 "dec %%ecx\n" | 731 "dec %%ecx\n" |
732 "jnz 1b\n" | 732 "jnz 1b\n" |
733 | 733 |
734 "movq %%mm7,%%mm1\n" | 734 "movq %%mm7,%%mm1\n" |
735 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ | 735 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ |
736 "paddd %%mm7,%%mm1\n" | 736 "paddd %%mm7,%%mm1\n" |
737 "movd %%mm1,%1\n" | 737 "movd %%mm1,%1\n" |
738 : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" ); | 738 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" ); |
739 return tmp; | 739 return tmp; |
740 } | 740 } |
741 | 741 |
742 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | 742 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
743 int tmp; | 743 int tmp; |
761 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ | 761 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ |
762 | 762 |
763 "pmaddwd %%mm2,%%mm2\n" | 763 "pmaddwd %%mm2,%%mm2\n" |
764 "pmaddwd %%mm1,%%mm1\n" | 764 "pmaddwd %%mm1,%%mm1\n" |
765 | 765 |
766 "addl %3,%0\n" | 766 "add %3,%0\n" |
767 "addl %3,%1\n" | 767 "add %3,%1\n" |
768 | 768 |
769 "paddd %%mm2,%%mm1\n" | 769 "paddd %%mm2,%%mm1\n" |
770 "paddd %%mm1,%%mm7\n" | 770 "paddd %%mm1,%%mm7\n" |
771 | 771 |
772 "decl %%ecx\n" | 772 "decl %%ecx\n" |
775 "movq %%mm7,%%mm1\n" | 775 "movq %%mm7,%%mm1\n" |
776 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ | 776 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ |
777 "paddd %%mm7,%%mm1\n" | 777 "paddd %%mm7,%%mm1\n" |
778 "movd %%mm1,%2\n" | 778 "movd %%mm1,%2\n" |
779 : "+r" (pix1), "+r" (pix2), "=r"(tmp) | 779 : "+r" (pix1), "+r" (pix2), "=r"(tmp) |
780 : "r" (line_size) , "m" (h) | 780 : "r" ((long)line_size) , "m" (h) |
781 : "%ecx"); | 781 : "%ecx"); |
782 return tmp; | 782 return tmp; |
783 } | 783 } |
784 | 784 |
785 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | 785 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
819 "pmaddwd %%mm2,%%mm2\n" | 819 "pmaddwd %%mm2,%%mm2\n" |
820 "pmaddwd %%mm4,%%mm4\n" | 820 "pmaddwd %%mm4,%%mm4\n" |
821 "pmaddwd %%mm1,%%mm1\n" | 821 "pmaddwd %%mm1,%%mm1\n" |
822 "pmaddwd %%mm3,%%mm3\n" | 822 "pmaddwd %%mm3,%%mm3\n" |
823 | 823 |
824 "addl %3,%0\n" | 824 "add %3,%0\n" |
825 "addl %3,%1\n" | 825 "add %3,%1\n" |
826 | 826 |
827 "paddd %%mm2,%%mm1\n" | 827 "paddd %%mm2,%%mm1\n" |
828 "paddd %%mm4,%%mm3\n" | 828 "paddd %%mm4,%%mm3\n" |
829 "paddd %%mm1,%%mm7\n" | 829 "paddd %%mm1,%%mm7\n" |
830 "paddd %%mm3,%%mm7\n" | 830 "paddd %%mm3,%%mm7\n" |
835 "movq %%mm7,%%mm1\n" | 835 "movq %%mm7,%%mm1\n" |
836 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ | 836 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ |
837 "paddd %%mm7,%%mm1\n" | 837 "paddd %%mm7,%%mm1\n" |
838 "movd %%mm1,%2\n" | 838 "movd %%mm1,%2\n" |
839 : "+r" (pix1), "+r" (pix2), "=r"(tmp) | 839 : "+r" (pix1), "+r" (pix2), "=r"(tmp) |
840 : "r" (line_size) , "m" (h) | 840 : "r" ((long)line_size) , "m" (h) |
841 : "%ecx"); | 841 : "%ecx"); |
842 return tmp; | 842 return tmp; |
843 } | 843 } |
844 | 844 |
845 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { | 845 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { |
861 "punpckhbw %%mm7,%%mm2\n" | 861 "punpckhbw %%mm7,%%mm2\n" |
862 "punpckhbw %%mm7,%%mm3\n" | 862 "punpckhbw %%mm7,%%mm3\n" |
863 "psubw %%mm1, %%mm0\n" | 863 "psubw %%mm1, %%mm0\n" |
864 "psubw %%mm3, %%mm2\n" | 864 "psubw %%mm3, %%mm2\n" |
865 | 865 |
866 "addl %2,%0\n" | 866 "add %2,%0\n" |
867 | 867 |
868 "movq (%0),%%mm4\n" | 868 "movq (%0),%%mm4\n" |
869 "movq %%mm4, %%mm1\n" | 869 "movq %%mm4, %%mm1\n" |
870 "psllq $8, %%mm4\n" | 870 "psllq $8, %%mm4\n" |
871 "psrlq $8, %%mm1\n" | 871 "psrlq $8, %%mm1\n" |
889 "psubw %%mm3, %%mm0\n" | 889 "psubw %%mm3, %%mm0\n" |
890 "psubw %%mm1, %%mm2\n" | 890 "psubw %%mm1, %%mm2\n" |
891 "paddw %%mm0, %%mm2\n" | 891 "paddw %%mm0, %%mm2\n" |
892 "paddw %%mm2, %%mm6\n" | 892 "paddw %%mm2, %%mm6\n" |
893 | 893 |
894 "addl %2,%0\n" | 894 "add %2,%0\n" |
895 "1:\n" | 895 "1:\n" |
896 | 896 |
897 "movq (%0),%%mm0\n" | 897 "movq (%0),%%mm0\n" |
898 "movq %%mm0, %%mm1\n" | 898 "movq %%mm0, %%mm1\n" |
899 "psllq $8, %%mm0\n" | 899 "psllq $8, %%mm0\n" |
918 "psubw %%mm3, %%mm4\n" | 918 "psubw %%mm3, %%mm4\n" |
919 "psubw %%mm1, %%mm5\n" | 919 "psubw %%mm1, %%mm5\n" |
920 "paddw %%mm4, %%mm5\n" | 920 "paddw %%mm4, %%mm5\n" |
921 "paddw %%mm5, %%mm6\n" | 921 "paddw %%mm5, %%mm6\n" |
922 | 922 |
923 "addl %2,%0\n" | 923 "add %2,%0\n" |
924 | 924 |
925 "movq (%0),%%mm4\n" | 925 "movq (%0),%%mm4\n" |
926 "movq %%mm4, %%mm1\n" | 926 "movq %%mm4, %%mm1\n" |
927 "psllq $8, %%mm4\n" | 927 "psllq $8, %%mm4\n" |
928 "psrlq $8, %%mm1\n" | 928 "psrlq $8, %%mm1\n" |
946 "psubw %%mm3, %%mm0\n" | 946 "psubw %%mm3, %%mm0\n" |
947 "psubw %%mm1, %%mm2\n" | 947 "psubw %%mm1, %%mm2\n" |
948 "paddw %%mm0, %%mm2\n" | 948 "paddw %%mm0, %%mm2\n" |
949 "paddw %%mm2, %%mm6\n" | 949 "paddw %%mm2, %%mm6\n" |
950 | 950 |
951 "addl %2,%0\n" | 951 "add %2,%0\n" |
952 "subl $2, %%ecx\n" | 952 "subl $2, %%ecx\n" |
953 " jnz 1b\n" | 953 " jnz 1b\n" |
954 | 954 |
955 "movq %%mm6, %%mm0\n" | 955 "movq %%mm6, %%mm0\n" |
956 "punpcklwd %%mm7,%%mm0\n" | 956 "punpcklwd %%mm7,%%mm0\n" |
960 "movq %%mm6,%%mm0\n" | 960 "movq %%mm6,%%mm0\n" |
961 "psrlq $32, %%mm6\n" | 961 "psrlq $32, %%mm6\n" |
962 "paddd %%mm6,%%mm0\n" | 962 "paddd %%mm6,%%mm0\n" |
963 "movd %%mm0,%1\n" | 963 "movd %%mm0,%1\n" |
964 : "+r" (pix1), "=r"(tmp) | 964 : "+r" (pix1), "=r"(tmp) |
965 : "r" (line_size) , "g" (h-2) | 965 : "r" ((long)line_size) , "g" (h-2) |
966 : "%ecx"); | 966 : "%ecx"); |
967 return tmp; | 967 return tmp; |
968 } | 968 } |
969 | 969 |
970 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { | 970 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { |
984 "punpckhbw %%mm7,%%mm2\n" | 984 "punpckhbw %%mm7,%%mm2\n" |
985 "punpckhbw %%mm7,%%mm3\n" | 985 "punpckhbw %%mm7,%%mm3\n" |
986 "psubw %%mm1, %%mm0\n" | 986 "psubw %%mm1, %%mm0\n" |
987 "psubw %%mm3, %%mm2\n" | 987 "psubw %%mm3, %%mm2\n" |
988 | 988 |
989 "addl %2,%0\n" | 989 "add %2,%0\n" |
990 | 990 |
991 "movq (%0),%%mm4\n" | 991 "movq (%0),%%mm4\n" |
992 "movq 1(%0),%%mm1\n" | 992 "movq 1(%0),%%mm1\n" |
993 "movq %%mm4, %%mm5\n" | 993 "movq %%mm4, %%mm5\n" |
994 "movq %%mm1, %%mm3\n" | 994 "movq %%mm1, %%mm3\n" |
1009 "psubw %%mm3, %%mm0\n" | 1009 "psubw %%mm3, %%mm0\n" |
1010 "psubw %%mm1, %%mm2\n" | 1010 "psubw %%mm1, %%mm2\n" |
1011 "paddw %%mm0, %%mm2\n" | 1011 "paddw %%mm0, %%mm2\n" |
1012 "paddw %%mm2, %%mm6\n" | 1012 "paddw %%mm2, %%mm6\n" |
1013 | 1013 |
1014 "addl %2,%0\n" | 1014 "add %2,%0\n" |
1015 "1:\n" | 1015 "1:\n" |
1016 | 1016 |
1017 "movq (%0),%%mm0\n" | 1017 "movq (%0),%%mm0\n" |
1018 "movq 1(%0),%%mm1\n" | 1018 "movq 1(%0),%%mm1\n" |
1019 "movq %%mm0, %%mm2\n" | 1019 "movq %%mm0, %%mm2\n" |
1035 "psubw %%mm3, %%mm4\n" | 1035 "psubw %%mm3, %%mm4\n" |
1036 "psubw %%mm1, %%mm5\n" | 1036 "psubw %%mm1, %%mm5\n" |
1037 "paddw %%mm4, %%mm5\n" | 1037 "paddw %%mm4, %%mm5\n" |
1038 "paddw %%mm5, %%mm6\n" | 1038 "paddw %%mm5, %%mm6\n" |
1039 | 1039 |
1040 "addl %2,%0\n" | 1040 "add %2,%0\n" |
1041 | 1041 |
1042 "movq (%0),%%mm4\n" | 1042 "movq (%0),%%mm4\n" |
1043 "movq 1(%0),%%mm1\n" | 1043 "movq 1(%0),%%mm1\n" |
1044 "movq %%mm4, %%mm5\n" | 1044 "movq %%mm4, %%mm5\n" |
1045 "movq %%mm1, %%mm3\n" | 1045 "movq %%mm1, %%mm3\n" |
1060 "psubw %%mm3, %%mm0\n" | 1060 "psubw %%mm3, %%mm0\n" |
1061 "psubw %%mm1, %%mm2\n" | 1061 "psubw %%mm1, %%mm2\n" |
1062 "paddw %%mm0, %%mm2\n" | 1062 "paddw %%mm0, %%mm2\n" |
1063 "paddw %%mm2, %%mm6\n" | 1063 "paddw %%mm2, %%mm6\n" |
1064 | 1064 |
1065 "addl %2,%0\n" | 1065 "add %2,%0\n" |
1066 "subl $2, %%ecx\n" | 1066 "subl $2, %%ecx\n" |
1067 " jnz 1b\n" | 1067 " jnz 1b\n" |
1068 | 1068 |
1069 "movq %%mm6, %%mm0\n" | 1069 "movq %%mm6, %%mm0\n" |
1070 "punpcklwd %%mm7,%%mm0\n" | 1070 "punpcklwd %%mm7,%%mm0\n" |
1074 "movq %%mm6,%%mm0\n" | 1074 "movq %%mm6,%%mm0\n" |
1075 "psrlq $32, %%mm6\n" | 1075 "psrlq $32, %%mm6\n" |
1076 "paddd %%mm6,%%mm0\n" | 1076 "paddd %%mm6,%%mm0\n" |
1077 "movd %%mm0,%1\n" | 1077 "movd %%mm0,%1\n" |
1078 : "+r" (pix1), "=r"(tmp) | 1078 : "+r" (pix1), "=r"(tmp) |
1079 : "r" (line_size) , "g" (h-2) | 1079 : "r" ((long)line_size) , "g" (h-2) |
1080 : "%ecx"); | 1080 : "%ecx"); |
1081 return tmp + hf_noise8_mmx(pix+8, line_size, h); | 1081 return tmp + hf_noise8_mmx(pix+8, line_size, h); |
1082 } | 1082 } |
1083 | 1083 |
1084 static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | 1084 static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
1104 assert((line_size &7) ==0); | 1104 assert((line_size &7) ==0); |
1105 | 1105 |
1106 #define SUM(in0, in1, out0, out1) \ | 1106 #define SUM(in0, in1, out0, out1) \ |
1107 "movq (%0), %%mm2\n"\ | 1107 "movq (%0), %%mm2\n"\ |
1108 "movq 8(%0), %%mm3\n"\ | 1108 "movq 8(%0), %%mm3\n"\ |
1109 "addl %2,%0\n"\ | 1109 "add %2,%0\n"\ |
1110 "movq %%mm2, " #out0 "\n"\ | 1110 "movq %%mm2, " #out0 "\n"\ |
1111 "movq %%mm3, " #out1 "\n"\ | 1111 "movq %%mm3, " #out1 "\n"\ |
1112 "psubusb " #in0 ", %%mm2\n"\ | 1112 "psubusb " #in0 ", %%mm2\n"\ |
1113 "psubusb " #in1 ", %%mm3\n"\ | 1113 "psubusb " #in1 ", %%mm3\n"\ |
1114 "psubusb " #out0 ", " #in0 "\n"\ | 1114 "psubusb " #out0 ", " #in0 "\n"\ |
1131 "movl %3,%%ecx\n" | 1131 "movl %3,%%ecx\n" |
1132 "pxor %%mm6,%%mm6\n" | 1132 "pxor %%mm6,%%mm6\n" |
1133 "pxor %%mm7,%%mm7\n" | 1133 "pxor %%mm7,%%mm7\n" |
1134 "movq (%0),%%mm0\n" | 1134 "movq (%0),%%mm0\n" |
1135 "movq 8(%0),%%mm1\n" | 1135 "movq 8(%0),%%mm1\n" |
1136 "addl %2,%0\n" | 1136 "add %2,%0\n" |
1137 "subl $2, %%ecx\n" | 1137 "subl $2, %%ecx\n" |
1138 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | 1138 SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
1139 "1:\n" | 1139 "1:\n" |
1140 | 1140 |
1141 SUM(%%mm4, %%mm5, %%mm0, %%mm1) | 1141 SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
1151 "movq %%mm0,%%mm6\n" | 1151 "movq %%mm0,%%mm6\n" |
1152 "psrlq $16, %%mm0\n" | 1152 "psrlq $16, %%mm0\n" |
1153 "paddw %%mm6,%%mm0\n" | 1153 "paddw %%mm6,%%mm0\n" |
1154 "movd %%mm0,%1\n" | 1154 "movd %%mm0,%1\n" |
1155 : "+r" (pix), "=r"(tmp) | 1155 : "+r" (pix), "=r"(tmp) |
1156 : "r" (line_size) , "m" (h) | 1156 : "r" ((long)line_size) , "m" (h) |
1157 : "%ecx"); | 1157 : "%ecx"); |
1158 return tmp & 0xFFFF; | 1158 return tmp & 0xFFFF; |
1159 } | 1159 } |
1160 #undef SUM | 1160 #undef SUM |
1161 | 1161 |
1166 assert((line_size &7) ==0); | 1166 assert((line_size &7) ==0); |
1167 | 1167 |
1168 #define SUM(in0, in1, out0, out1) \ | 1168 #define SUM(in0, in1, out0, out1) \ |
1169 "movq (%0), " #out0 "\n"\ | 1169 "movq (%0), " #out0 "\n"\ |
1170 "movq 8(%0), " #out1 "\n"\ | 1170 "movq 8(%0), " #out1 "\n"\ |
1171 "addl %2,%0\n"\ | 1171 "add %2,%0\n"\ |
1172 "psadbw " #out0 ", " #in0 "\n"\ | 1172 "psadbw " #out0 ", " #in0 "\n"\ |
1173 "psadbw " #out1 ", " #in1 "\n"\ | 1173 "psadbw " #out1 ", " #in1 "\n"\ |
1174 "paddw " #in1 ", " #in0 "\n"\ | 1174 "paddw " #in1 ", " #in0 "\n"\ |
1175 "paddw " #in0 ", %%mm6\n" | 1175 "paddw " #in0 ", %%mm6\n" |
1176 | 1176 |
1178 "movl %3,%%ecx\n" | 1178 "movl %3,%%ecx\n" |
1179 "pxor %%mm6,%%mm6\n" | 1179 "pxor %%mm6,%%mm6\n" |
1180 "pxor %%mm7,%%mm7\n" | 1180 "pxor %%mm7,%%mm7\n" |
1181 "movq (%0),%%mm0\n" | 1181 "movq (%0),%%mm0\n" |
1182 "movq 8(%0),%%mm1\n" | 1182 "movq 8(%0),%%mm1\n" |
1183 "addl %2,%0\n" | 1183 "add %2,%0\n" |
1184 "subl $2, %%ecx\n" | 1184 "subl $2, %%ecx\n" |
1185 SUM(%%mm0, %%mm1, %%mm4, %%mm5) | 1185 SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
1186 "1:\n" | 1186 "1:\n" |
1187 | 1187 |
1188 SUM(%%mm4, %%mm5, %%mm0, %%mm1) | 1188 SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
1192 "subl $2, %%ecx\n" | 1192 "subl $2, %%ecx\n" |
1193 "jnz 1b\n" | 1193 "jnz 1b\n" |
1194 | 1194 |
1195 "movd %%mm6,%1\n" | 1195 "movd %%mm6,%1\n" |
1196 : "+r" (pix), "=r"(tmp) | 1196 : "+r" (pix), "=r"(tmp) |
1197 : "r" (line_size) , "m" (h) | 1197 : "r" ((long)line_size) , "m" (h) |
1198 : "%ecx"); | 1198 : "%ecx"); |
1199 return tmp; | 1199 return tmp; |
1200 } | 1200 } |
1201 #undef SUM | 1201 #undef SUM |
1202 | 1202 |
1210 #define SUM(in0, in1, out0, out1) \ | 1210 #define SUM(in0, in1, out0, out1) \ |
1211 "movq (%0),%%mm2\n"\ | 1211 "movq (%0),%%mm2\n"\ |
1212 "movq (%1)," #out0 "\n"\ | 1212 "movq (%1)," #out0 "\n"\ |
1213 "movq 8(%0),%%mm3\n"\ | 1213 "movq 8(%0),%%mm3\n"\ |
1214 "movq 8(%1)," #out1 "\n"\ | 1214 "movq 8(%1)," #out1 "\n"\ |
1215 "addl %3,%0\n"\ | 1215 "add %3,%0\n"\ |
1216 "addl %3,%1\n"\ | 1216 "add %3,%1\n"\ |
1217 "psubb " #out0 ", %%mm2\n"\ | 1217 "psubb " #out0 ", %%mm2\n"\ |
1218 "psubb " #out1 ", %%mm3\n"\ | 1218 "psubb " #out1 ", %%mm3\n"\ |
1219 "pxor %%mm7, %%mm2\n"\ | 1219 "pxor %%mm7, %%mm2\n"\ |
1220 "pxor %%mm7, %%mm3\n"\ | 1220 "pxor %%mm7, %%mm3\n"\ |
1221 "movq %%mm2, " #out0 "\n"\ | 1221 "movq %%mm2, " #out0 "\n"\ |
1246 "packsswb %%mm7, %%mm7\n" | 1246 "packsswb %%mm7, %%mm7\n" |
1247 "movq (%0),%%mm0\n" | 1247 "movq (%0),%%mm0\n" |
1248 "movq (%1),%%mm2\n" | 1248 "movq (%1),%%mm2\n" |
1249 "movq 8(%0),%%mm1\n" | 1249 "movq 8(%0),%%mm1\n" |
1250 "movq 8(%1),%%mm3\n" | 1250 "movq 8(%1),%%mm3\n" |
1251 "addl %3,%0\n" | 1251 "add %3,%0\n" |
1252 "addl %3,%1\n" | 1252 "add %3,%1\n" |
1253 "subl $2, %%ecx\n" | 1253 "subl $2, %%ecx\n" |
1254 "psubb %%mm2, %%mm0\n" | 1254 "psubb %%mm2, %%mm0\n" |
1255 "psubb %%mm3, %%mm1\n" | 1255 "psubb %%mm3, %%mm1\n" |
1256 "pxor %%mm7, %%mm0\n" | 1256 "pxor %%mm7, %%mm0\n" |
1257 "pxor %%mm7, %%mm1\n" | 1257 "pxor %%mm7, %%mm1\n" |
1271 "movq %%mm0,%%mm6\n" | 1271 "movq %%mm0,%%mm6\n" |
1272 "psrlq $16, %%mm0\n" | 1272 "psrlq $16, %%mm0\n" |
1273 "paddw %%mm6,%%mm0\n" | 1273 "paddw %%mm6,%%mm0\n" |
1274 "movd %%mm0,%2\n" | 1274 "movd %%mm0,%2\n" |
1275 : "+r" (pix1), "+r" (pix2), "=r"(tmp) | 1275 : "+r" (pix1), "+r" (pix2), "=r"(tmp) |
1276 : "r" (line_size) , "m" (h) | 1276 : "r" ((long)line_size) , "m" (h) |
1277 : "%ecx"); | 1277 : "%ecx"); |
1278 return tmp & 0x7FFF; | 1278 return tmp & 0x7FFF; |
1279 } | 1279 } |
1280 #undef SUM | 1280 #undef SUM |
1281 | 1281 |
1289 #define SUM(in0, in1, out0, out1) \ | 1289 #define SUM(in0, in1, out0, out1) \ |
1290 "movq (%0)," #out0 "\n"\ | 1290 "movq (%0)," #out0 "\n"\ |
1291 "movq (%1),%%mm2\n"\ | 1291 "movq (%1),%%mm2\n"\ |
1292 "movq 8(%0)," #out1 "\n"\ | 1292 "movq 8(%0)," #out1 "\n"\ |
1293 "movq 8(%1),%%mm3\n"\ | 1293 "movq 8(%1),%%mm3\n"\ |
1294 "addl %3,%0\n"\ | 1294 "add %3,%0\n"\ |
1295 "addl %3,%1\n"\ | 1295 "add %3,%1\n"\ |
1296 "psubb %%mm2, " #out0 "\n"\ | 1296 "psubb %%mm2, " #out0 "\n"\ |
1297 "psubb %%mm3, " #out1 "\n"\ | 1297 "psubb %%mm3, " #out1 "\n"\ |
1298 "pxor %%mm7, " #out0 "\n"\ | 1298 "pxor %%mm7, " #out0 "\n"\ |
1299 "pxor %%mm7, " #out1 "\n"\ | 1299 "pxor %%mm7, " #out1 "\n"\ |
1300 "psadbw " #out0 ", " #in0 "\n"\ | 1300 "psadbw " #out0 ", " #in0 "\n"\ |
1310 "packsswb %%mm7, %%mm7\n" | 1310 "packsswb %%mm7, %%mm7\n" |
1311 "movq (%0),%%mm0\n" | 1311 "movq (%0),%%mm0\n" |
1312 "movq (%1),%%mm2\n" | 1312 "movq (%1),%%mm2\n" |
1313 "movq 8(%0),%%mm1\n" | 1313 "movq 8(%0),%%mm1\n" |
1314 "movq 8(%1),%%mm3\n" | 1314 "movq 8(%1),%%mm3\n" |
1315 "addl %3,%0\n" | 1315 "add %3,%0\n" |
1316 "addl %3,%1\n" | 1316 "add %3,%1\n" |
1317 "subl $2, %%ecx\n" | 1317 "subl $2, %%ecx\n" |
1318 "psubb %%mm2, %%mm0\n" | 1318 "psubb %%mm2, %%mm0\n" |
1319 "psubb %%mm3, %%mm1\n" | 1319 "psubb %%mm3, %%mm1\n" |
1320 "pxor %%mm7, %%mm0\n" | 1320 "pxor %%mm7, %%mm0\n" |
1321 "pxor %%mm7, %%mm1\n" | 1321 "pxor %%mm7, %%mm1\n" |
1329 "subl $2, %%ecx\n" | 1329 "subl $2, %%ecx\n" |
1330 "jnz 1b\n" | 1330 "jnz 1b\n" |
1331 | 1331 |
1332 "movd %%mm6,%2\n" | 1332 "movd %%mm6,%2\n" |
1333 : "+r" (pix1), "+r" (pix2), "=r"(tmp) | 1333 : "+r" (pix1), "+r" (pix2), "=r"(tmp) |
1334 : "r" (line_size) , "m" (h) | 1334 : "r" ((long)line_size) , "m" (h) |
1335 : "%ecx"); | 1335 : "%ecx"); |
1336 return tmp; | 1336 return tmp; |
1337 } | 1337 } |
1338 #undef SUM | 1338 #undef SUM |
1339 | 1339 |
1340 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ | 1340 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
1341 int i=0; | 1341 long i=0; |
1342 asm volatile( | 1342 asm volatile( |
1343 "1: \n\t" | 1343 "1: \n\t" |
1344 "movq (%2, %0), %%mm0 \n\t" | 1344 "movq (%2, %0), %%mm0 \n\t" |
1345 "movq (%1, %0), %%mm1 \n\t" | 1345 "movq (%1, %0), %%mm1 \n\t" |
1346 "psubb %%mm0, %%mm1 \n\t" | 1346 "psubb %%mm0, %%mm1 \n\t" |
1347 "movq %%mm1, (%3, %0) \n\t" | 1347 "movq %%mm1, (%3, %0) \n\t" |
1348 "movq 8(%2, %0), %%mm0 \n\t" | 1348 "movq 8(%2, %0), %%mm0 \n\t" |
1349 "movq 8(%1, %0), %%mm1 \n\t" | 1349 "movq 8(%1, %0), %%mm1 \n\t" |
1350 "psubb %%mm0, %%mm1 \n\t" | 1350 "psubb %%mm0, %%mm1 \n\t" |
1351 "movq %%mm1, 8(%3, %0) \n\t" | 1351 "movq %%mm1, 8(%3, %0) \n\t" |
1352 "addl $16, %0 \n\t" | 1352 "add $16, %0 \n\t" |
1353 "cmpl %4, %0 \n\t" | 1353 "cmp %4, %0 \n\t" |
1354 " jb 1b \n\t" | 1354 " jb 1b \n\t" |
1355 : "+r" (i) | 1355 : "+r" (i) |
1356 : "r"(src1), "r"(src2), "r"(dst), "r"(w-15) | 1356 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15) |
1357 ); | 1357 ); |
1358 for(; i<w; i++) | 1358 for(; i<w; i++) |
1359 dst[i+0] = src1[i+0]-src2[i+0]; | 1359 dst[i+0] = src1[i+0]-src2[i+0]; |
1360 } | 1360 } |
1361 | 1361 |
1362 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ | 1362 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ |
1363 int i=0; | 1363 long i=0; |
1364 uint8_t l, lt; | 1364 uint8_t l, lt; |
1365 | 1365 |
1366 asm volatile( | 1366 asm volatile( |
1367 "1: \n\t" | 1367 "1: \n\t" |
1368 "movq -1(%1, %0), %%mm0 \n\t" // LT | 1368 "movq -1(%1, %0), %%mm0 \n\t" // LT |
1377 "pminub %%mm5, %%mm1 \n\t" // min(T, L) | 1377 "pminub %%mm5, %%mm1 \n\t" // min(T, L) |
1378 "pminub %%mm2, %%mm4 \n\t" | 1378 "pminub %%mm2, %%mm4 \n\t" |
1379 "pmaxub %%mm1, %%mm4 \n\t" | 1379 "pmaxub %%mm1, %%mm4 \n\t" |
1380 "psubb %%mm4, %%mm3 \n\t" // dst - pred | 1380 "psubb %%mm4, %%mm3 \n\t" // dst - pred |
1381 "movq %%mm3, (%3, %0) \n\t" | 1381 "movq %%mm3, (%3, %0) \n\t" |
1382 "addl $8, %0 \n\t" | 1382 "add $8, %0 \n\t" |
1383 "cmpl %4, %0 \n\t" | 1383 "cmp %4, %0 \n\t" |
1384 " jb 1b \n\t" | 1384 " jb 1b \n\t" |
1385 : "+r" (i) | 1385 : "+r" (i) |
1386 : "r"(src1), "r"(src2), "r"(dst), "r"(w) | 1386 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w) |
1387 ); | 1387 ); |
1388 | 1388 |
1389 l= *left; | 1389 l= *left; |
1390 lt= *left_top; | 1390 lt= *left_top; |
1391 | 1391 |
1770 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ | 1770 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
1771 "psraw $5, %%mm4 \n\t"\ | 1771 "psraw $5, %%mm4 \n\t"\ |
1772 "packuswb %%mm4, %%mm0 \n\t"\ | 1772 "packuswb %%mm4, %%mm0 \n\t"\ |
1773 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ | 1773 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ |
1774 \ | 1774 \ |
1775 "addl %3, %0 \n\t"\ | 1775 "add %3, %0 \n\t"\ |
1776 "addl %4, %1 \n\t"\ | 1776 "add %4, %1 \n\t"\ |
1777 "decl %2 \n\t"\ | 1777 "decl %2 \n\t"\ |
1778 " jnz 1b \n\t"\ | 1778 " jnz 1b \n\t"\ |
1779 : "+a"(src), "+c"(dst), "+m"(h)\ | 1779 : "+a"(src), "+c"(dst), "+m"(h)\ |
1780 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ | 1780 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
1781 : "memory"\ | 1781 : "memory"\ |
1782 );\ | 1782 );\ |
1783 }\ | 1783 }\ |
1784 \ | 1784 \ |
1785 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | 1785 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
1883 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ | 1883 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ |
1884 "psraw $5, %%mm3 \n\t"\ | 1884 "psraw $5, %%mm3 \n\t"\ |
1885 "packuswb %%mm3, %%mm0 \n\t"\ | 1885 "packuswb %%mm3, %%mm0 \n\t"\ |
1886 OP_MMX2(%%mm0, (%1), %%mm4, q)\ | 1886 OP_MMX2(%%mm0, (%1), %%mm4, q)\ |
1887 \ | 1887 \ |
1888 "addl %3, %0 \n\t"\ | 1888 "add %3, %0 \n\t"\ |
1889 "addl %4, %1 \n\t"\ | 1889 "add %4, %1 \n\t"\ |
1890 "decl %2 \n\t"\ | 1890 "decl %2 \n\t"\ |
1891 " jnz 1b \n\t"\ | 1891 " jnz 1b \n\t"\ |
1892 : "+a"(src), "+c"(dst), "+m"(h)\ | 1892 : "+a"(src), "+c"(dst), "+m"(h)\ |
1893 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ | 1893 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
1894 : "memory"\ | 1894 : "memory"\ |
1895 );\ | 1895 );\ |
1896 }\ | 1896 }\ |
1897 \ | 1897 \ |
1898 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | 1898 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
1947 "punpckhbw %%mm7, %%mm3 \n\t"\ | 1947 "punpckhbw %%mm7, %%mm3 \n\t"\ |
1948 "movq %%mm0, (%1) \n\t"\ | 1948 "movq %%mm0, (%1) \n\t"\ |
1949 "movq %%mm1, 17*8(%1) \n\t"\ | 1949 "movq %%mm1, 17*8(%1) \n\t"\ |
1950 "movq %%mm2, 2*17*8(%1) \n\t"\ | 1950 "movq %%mm2, 2*17*8(%1) \n\t"\ |
1951 "movq %%mm3, 3*17*8(%1) \n\t"\ | 1951 "movq %%mm3, 3*17*8(%1) \n\t"\ |
1952 "addl $8, %1 \n\t"\ | 1952 "add $8, %1 \n\t"\ |
1953 "addl %3, %0 \n\t"\ | 1953 "add %3, %0 \n\t"\ |
1954 "decl %2 \n\t"\ | 1954 "decl %2 \n\t"\ |
1955 " jnz 1b \n\t"\ | 1955 " jnz 1b \n\t"\ |
1956 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | 1956 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ |
1957 : "r" (srcStride)\ | 1957 : "r" ((long)srcStride)\ |
1958 : "memory"\ | 1958 : "memory"\ |
1959 );\ | 1959 );\ |
1960 \ | 1960 \ |
1961 temp_ptr= temp;\ | 1961 temp_ptr= temp;\ |
1962 count=4;\ | 1962 count=4;\ |
1969 "movq 8(%0), %%mm1 \n\t"\ | 1969 "movq 8(%0), %%mm1 \n\t"\ |
1970 "movq 16(%0), %%mm2 \n\t"\ | 1970 "movq 16(%0), %%mm2 \n\t"\ |
1971 "movq 24(%0), %%mm3 \n\t"\ | 1971 "movq 24(%0), %%mm3 \n\t"\ |
1972 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ | 1972 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
1973 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | 1973 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
1974 "addl %4, %1 \n\t"\ | 1974 "add %4, %1 \n\t"\ |
1975 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ | 1975 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
1976 \ | 1976 \ |
1977 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ | 1977 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
1978 "addl %4, %1 \n\t"\ | 1978 "add %4, %1 \n\t"\ |
1979 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ | 1979 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
1980 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ | 1980 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ |
1981 "addl %4, %1 \n\t"\ | 1981 "add %4, %1 \n\t"\ |
1982 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ | 1982 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
1983 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ | 1983 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ |
1984 "addl %4, %1 \n\t"\ | 1984 "add %4, %1 \n\t"\ |
1985 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ | 1985 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
1986 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ | 1986 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ |
1987 "addl %4, %1 \n\t"\ | 1987 "add %4, %1 \n\t"\ |
1988 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ | 1988 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
1989 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ | 1989 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ |
1990 "addl %4, %1 \n\t"\ | 1990 "add %4, %1 \n\t"\ |
1991 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ | 1991 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
1992 \ | 1992 \ |
1993 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ | 1993 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
1994 "addl %4, %1 \n\t" \ | 1994 "add %4, %1 \n\t" \ |
1995 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ | 1995 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
1996 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ | 1996 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ |
1997 \ | 1997 \ |
1998 "addl $136, %0 \n\t"\ | 1998 "add $136, %0 \n\t"\ |
1999 "addl %6, %1 \n\t"\ | 1999 "add %6, %1 \n\t"\ |
2000 "decl %2 \n\t"\ | 2000 "decl %2 \n\t"\ |
2001 " jnz 1b \n\t"\ | 2001 " jnz 1b \n\t"\ |
2002 \ | 2002 \ |
2003 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ | 2003 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2004 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\ | 2004 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\ |
2005 :"memory"\ | 2005 :"memory"\ |
2006 );\ | 2006 );\ |
2007 }\ | 2007 }\ |
2008 \ | 2008 \ |
2009 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2009 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2019 "movq (%0), %%mm1 \n\t"\ | 2019 "movq (%0), %%mm1 \n\t"\ |
2020 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2020 "punpcklbw %%mm7, %%mm0 \n\t"\ |
2021 "punpckhbw %%mm7, %%mm1 \n\t"\ | 2021 "punpckhbw %%mm7, %%mm1 \n\t"\ |
2022 "movq %%mm0, (%1) \n\t"\ | 2022 "movq %%mm0, (%1) \n\t"\ |
2023 "movq %%mm1, 9*8(%1) \n\t"\ | 2023 "movq %%mm1, 9*8(%1) \n\t"\ |
2024 "addl $8, %1 \n\t"\ | 2024 "add $8, %1 \n\t"\ |
2025 "addl %3, %0 \n\t"\ | 2025 "add %3, %0 \n\t"\ |
2026 "decl %2 \n\t"\ | 2026 "decl %2 \n\t"\ |
2027 " jnz 1b \n\t"\ | 2027 " jnz 1b \n\t"\ |
2028 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | 2028 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ |
2029 : "r" (srcStride)\ | 2029 : "r" ((long)srcStride)\ |
2030 : "memory"\ | 2030 : "memory"\ |
2031 );\ | 2031 );\ |
2032 \ | 2032 \ |
2033 temp_ptr= temp;\ | 2033 temp_ptr= temp;\ |
2034 count=2;\ | 2034 count=2;\ |
2041 "movq 8(%0), %%mm1 \n\t"\ | 2041 "movq 8(%0), %%mm1 \n\t"\ |
2042 "movq 16(%0), %%mm2 \n\t"\ | 2042 "movq 16(%0), %%mm2 \n\t"\ |
2043 "movq 24(%0), %%mm3 \n\t"\ | 2043 "movq 24(%0), %%mm3 \n\t"\ |
2044 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ | 2044 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2045 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | 2045 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
2046 "addl %4, %1 \n\t"\ | 2046 "add %4, %1 \n\t"\ |
2047 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ | 2047 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
2048 \ | 2048 \ |
2049 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ | 2049 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2050 "addl %4, %1 \n\t"\ | 2050 "add %4, %1 \n\t"\ |
2051 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ | 2051 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
2052 \ | 2052 \ |
2053 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ | 2053 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
2054 "addl %4, %1 \n\t"\ | 2054 "add %4, %1 \n\t"\ |
2055 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ | 2055 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
2056 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ | 2056 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ |
2057 \ | 2057 \ |
2058 "addl $72, %0 \n\t"\ | 2058 "add $72, %0 \n\t"\ |
2059 "addl %6, %1 \n\t"\ | 2059 "add %6, %1 \n\t"\ |
2060 "decl %2 \n\t"\ | 2060 "decl %2 \n\t"\ |
2061 " jnz 1b \n\t"\ | 2061 " jnz 1b \n\t"\ |
2062 \ | 2062 \ |
2063 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ | 2063 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2064 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\ | 2064 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\ |
2065 : "memory"\ | 2065 : "memory"\ |
2066 );\ | 2066 );\ |
2067 }\ | 2067 }\ |
2068 \ | 2068 \ |
2069 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | 2069 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
2295 "paddw "#D", %%mm6 \n\t"\ | 2295 "paddw "#D", %%mm6 \n\t"\ |
2296 "psllw $2, %%mm6 \n\t"\ | 2296 "psllw $2, %%mm6 \n\t"\ |
2297 "psubw "#B", %%mm6 \n\t"\ | 2297 "psubw "#B", %%mm6 \n\t"\ |
2298 "psubw "#E", %%mm6 \n\t"\ | 2298 "psubw "#E", %%mm6 \n\t"\ |
2299 "pmullw %4, %%mm6 \n\t"\ | 2299 "pmullw %4, %%mm6 \n\t"\ |
2300 "addl %2, %0 \n\t"\ | 2300 "add %2, %0 \n\t"\ |
2301 "punpcklbw %%mm7, "#F" \n\t"\ | 2301 "punpcklbw %%mm7, "#F" \n\t"\ |
2302 "paddw %5, "#A" \n\t"\ | 2302 "paddw %5, "#A" \n\t"\ |
2303 "paddw "#F", "#A" \n\t"\ | 2303 "paddw "#F", "#A" \n\t"\ |
2304 "paddw "#A", %%mm6 \n\t"\ | 2304 "paddw "#A", %%mm6 \n\t"\ |
2305 "psraw $5, %%mm6 \n\t"\ | 2305 "psraw $5, %%mm6 \n\t"\ |
2306 "packuswb %%mm6, %%mm6 \n\t"\ | 2306 "packuswb %%mm6, %%mm6 \n\t"\ |
2307 OP(%%mm6, (%1), A, d)\ | 2307 OP(%%mm6, (%1), A, d)\ |
2308 "addl %3, %1 \n\t" | 2308 "add %3, %1 \n\t" |
2309 | 2309 |
/*
 * QPEL_H264HV(A,B,C,D,E,F,OF): one step of an inline-asm filter, emitted as
 * string fragments to be pasted inside an asm() statement.
 *
 * A..F are MMX register names, OF is a byte displacement.  The step:
 *   - loads 4 bytes from (%0) into F and then advances %0 by %2;
 *   - computes mm6 = (((C + D) << 2) - B - E) * %3   (16-bit lanes);
 *   - interleaves the bytes of F with mm7 (punpcklbw), which widens them to
 *     words provided mm7 is zero -- the visible users execute
 *     "pxor %%mm7, %%mm7" before invoking the macro;
 *   - adds F into A (A is modified), adds A into mm6;
 *   - stores the 8-byte mm6 to OF(%1).
 * So the stored value is (4*(C+D) - B - E) * %3 + A + F, where A is the value
 * it had on entry.  The visible users pass ff_pw_5 as operand %3; presumably
 * that is four words of 5, giving 20*(C+D) - 5*(B+E) + A + F -- confirm
 * against the definition of ff_pw_5.
 *
 * mm6 is used as scratch.  Operand numbering (%0 source pointer, %1
 * destination base, %2 source step, %3 multiplier) is fixed by this macro, so
 * every asm() that uses it must declare its operands in that order.
 */
2310 #define QPEL_H264HV(A,B,C,D,E,F,OF)\ | 2310 #define QPEL_H264HV(A,B,C,D,E,F,OF)\ |
2311 "movd (%0), "#F" \n\t"\ | 2311 "movd (%0), "#F" \n\t"\ |
2312 "movq "#C", %%mm6 \n\t"\ | 2312 "movq "#C", %%mm6 \n\t"\ |
2313 "paddw "#D", %%mm6 \n\t"\ | 2313 "paddw "#D", %%mm6 \n\t"\ |
2314 "psllw $2, %%mm6 \n\t"\ | 2314 "psllw $2, %%mm6 \n\t"\ |
2315 "psubw "#B", %%mm6 \n\t"\ | 2315 "psubw "#B", %%mm6 \n\t"\ |
2316 "psubw "#E", %%mm6 \n\t"\ | 2316 "psubw "#E", %%mm6 \n\t"\ |
2317 "pmullw %3, %%mm6 \n\t"\ | 2317 "pmullw %3, %%mm6 \n\t"\ |
2318 "addl %2, %0 \n\t"\ | 2318 "add %2, %0 \n\t"\ |
2319 "punpcklbw %%mm7, "#F" \n\t"\ | 2319 "punpcklbw %%mm7, "#F" \n\t"\ |
2320 "paddw "#F", "#A" \n\t"\ | 2320 "paddw "#F", "#A" \n\t"\ |
2321 "paddw "#A", %%mm6 \n\t"\ | 2321 "paddw "#A", %%mm6 \n\t"\ |
2322 "movq %%mm6, "#OF"(%1) \n\t" | 2322 "movq %%mm6, "#OF"(%1) \n\t" |
2323 | 2323 |
2351 "paddw %%mm5, %%mm0 \n\t"\ | 2351 "paddw %%mm5, %%mm0 \n\t"\ |
2352 "paddw %%mm2, %%mm0 \n\t"\ | 2352 "paddw %%mm2, %%mm0 \n\t"\ |
2353 "psraw $5, %%mm0 \n\t"\ | 2353 "psraw $5, %%mm0 \n\t"\ |
2354 "packuswb %%mm0, %%mm0 \n\t"\ | 2354 "packuswb %%mm0, %%mm0 \n\t"\ |
2355 OP(%%mm0, (%1),%%mm6, d)\ | 2355 OP(%%mm0, (%1),%%mm6, d)\ |
2356 "addl %3, %0 \n\t"\ | 2356 "add %3, %0 \n\t"\ |
2357 "addl %4, %1 \n\t"\ | 2357 "add %4, %1 \n\t"\ |
2358 "decl %2 \n\t"\ | 2358 "decl %2 \n\t"\ |
2359 " jnz 1b \n\t"\ | 2359 " jnz 1b \n\t"\ |
2360 : "+a"(src), "+c"(dst), "+m"(h)\ | 2360 : "+a"(src), "+c"(dst), "+m"(h)\ |
2361 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | 2361 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
2362 : "memory"\ | 2362 : "memory"\ |
2363 );\ | 2363 );\ |
2364 }\ | 2364 }\ |
2365 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2365 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2366 src -= 2*srcStride;\ | 2366 src -= 2*srcStride;\ |
2367 asm volatile(\ | 2367 asm volatile(\ |
2368 "pxor %%mm7, %%mm7 \n\t"\ | 2368 "pxor %%mm7, %%mm7 \n\t"\ |
2369 "movd (%0), %%mm0 \n\t"\ | 2369 "movd (%0), %%mm0 \n\t"\ |
2370 "addl %2, %0 \n\t"\ | 2370 "add %2, %0 \n\t"\ |
2371 "movd (%0), %%mm1 \n\t"\ | 2371 "movd (%0), %%mm1 \n\t"\ |
2372 "addl %2, %0 \n\t"\ | 2372 "add %2, %0 \n\t"\ |
2373 "movd (%0), %%mm2 \n\t"\ | 2373 "movd (%0), %%mm2 \n\t"\ |
2374 "addl %2, %0 \n\t"\ | 2374 "add %2, %0 \n\t"\ |
2375 "movd (%0), %%mm3 \n\t"\ | 2375 "movd (%0), %%mm3 \n\t"\ |
2376 "addl %2, %0 \n\t"\ | 2376 "add %2, %0 \n\t"\ |
2377 "movd (%0), %%mm4 \n\t"\ | 2377 "movd (%0), %%mm4 \n\t"\ |
2378 "addl %2, %0 \n\t"\ | 2378 "add %2, %0 \n\t"\ |
2379 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2379 "punpcklbw %%mm7, %%mm0 \n\t"\ |
2380 "punpcklbw %%mm7, %%mm1 \n\t"\ | 2380 "punpcklbw %%mm7, %%mm1 \n\t"\ |
2381 "punpcklbw %%mm7, %%mm2 \n\t"\ | 2381 "punpcklbw %%mm7, %%mm2 \n\t"\ |
2382 "punpcklbw %%mm7, %%mm3 \n\t"\ | 2382 "punpcklbw %%mm7, %%mm3 \n\t"\ |
2383 "punpcklbw %%mm7, %%mm4 \n\t"\ | 2383 "punpcklbw %%mm7, %%mm4 \n\t"\ |
2385 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | 2385 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ |
2386 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | 2386 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ |
2387 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | 2387 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ |
2388 \ | 2388 \ |
2389 : "+a"(src), "+c"(dst)\ | 2389 : "+a"(src), "+c"(dst)\ |
2390 : "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | 2390 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
2391 : "memory"\ | 2391 : "memory"\ |
2392 );\ | 2392 );\ |
2393 }\ | 2393 }\ |
2394 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | 2394 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
2395 int h=4;\ | 2395 int h=4;\ |
2397 src -= 2*srcStride+2;\ | 2397 src -= 2*srcStride+2;\ |
2398 while(w--){\ | 2398 while(w--){\ |
2399 asm volatile(\ | 2399 asm volatile(\ |
2400 "pxor %%mm7, %%mm7 \n\t"\ | 2400 "pxor %%mm7, %%mm7 \n\t"\ |
2401 "movd (%0), %%mm0 \n\t"\ | 2401 "movd (%0), %%mm0 \n\t"\ |
2402 "addl %2, %0 \n\t"\ | 2402 "add %2, %0 \n\t"\ |
2403 "movd (%0), %%mm1 \n\t"\ | 2403 "movd (%0), %%mm1 \n\t"\ |
2404 "addl %2, %0 \n\t"\ | 2404 "add %2, %0 \n\t"\ |
2405 "movd (%0), %%mm2 \n\t"\ | 2405 "movd (%0), %%mm2 \n\t"\ |
2406 "addl %2, %0 \n\t"\ | 2406 "add %2, %0 \n\t"\ |
2407 "movd (%0), %%mm3 \n\t"\ | 2407 "movd (%0), %%mm3 \n\t"\ |
2408 "addl %2, %0 \n\t"\ | 2408 "add %2, %0 \n\t"\ |
2409 "movd (%0), %%mm4 \n\t"\ | 2409 "movd (%0), %%mm4 \n\t"\ |
2410 "addl %2, %0 \n\t"\ | 2410 "add %2, %0 \n\t"\ |
2411 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2411 "punpcklbw %%mm7, %%mm0 \n\t"\ |
2412 "punpcklbw %%mm7, %%mm1 \n\t"\ | 2412 "punpcklbw %%mm7, %%mm1 \n\t"\ |
2413 "punpcklbw %%mm7, %%mm2 \n\t"\ | 2413 "punpcklbw %%mm7, %%mm2 \n\t"\ |
2414 "punpcklbw %%mm7, %%mm3 \n\t"\ | 2414 "punpcklbw %%mm7, %%mm3 \n\t"\ |
2415 "punpcklbw %%mm7, %%mm4 \n\t"\ | 2415 "punpcklbw %%mm7, %%mm4 \n\t"\ |
2417 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ | 2417 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ |
2418 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ | 2418 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ |
2419 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ | 2419 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ |
2420 \ | 2420 \ |
2421 : "+a"(src)\ | 2421 : "+a"(src)\ |
2422 : "c"(tmp), "S"(srcStride), "m"(ff_pw_5)\ | 2422 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\ |
2423 : "memory"\ | 2423 : "memory"\ |
2424 );\ | 2424 );\ |
2425 tmp += 4;\ | 2425 tmp += 4;\ |
2426 src += 4 - 9*srcStride;\ | 2426 src += 4 - 9*srcStride;\ |
2427 }\ | 2427 }\ |
2443 "paddw %%mm6, %%mm2 \n\t"\ | 2443 "paddw %%mm6, %%mm2 \n\t"\ |
2444 "paddw %%mm2, %%mm0 \n\t"\ | 2444 "paddw %%mm2, %%mm0 \n\t"\ |
2445 "psraw $6, %%mm0 \n\t"\ | 2445 "psraw $6, %%mm0 \n\t"\ |
2446 "packuswb %%mm0, %%mm0 \n\t"\ | 2446 "packuswb %%mm0, %%mm0 \n\t"\ |
2447 OP(%%mm0, (%1),%%mm7, d)\ | 2447 OP(%%mm0, (%1),%%mm7, d)\ |
2448 "addl $24, %0 \n\t"\ | 2448 "add $24, %0 \n\t"\ |
2449 "addl %3, %1 \n\t"\ | 2449 "add %3, %1 \n\t"\ |
2450 "decl %2 \n\t"\ | 2450 "decl %2 \n\t"\ |
2451 " jnz 1b \n\t"\ | 2451 " jnz 1b \n\t"\ |
2452 : "+a"(tmp), "+c"(dst), "+m"(h)\ | 2452 : "+a"(tmp), "+c"(dst), "+m"(h)\ |
2453 : "S"(dstStride), "m"(ff_pw_32)\ | 2453 : "S"((long)dstStride), "m"(ff_pw_32)\ |
2454 : "memory"\ | 2454 : "memory"\ |
2455 );\ | 2455 );\ |
2456 }\ | 2456 }\ |
2457 \ | 2457 \ |
2458 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2458 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2500 "paddw %%mm4, %%mm1 \n\t"\ | 2500 "paddw %%mm4, %%mm1 \n\t"\ |
2501 "psraw $5, %%mm0 \n\t"\ | 2501 "psraw $5, %%mm0 \n\t"\ |
2502 "psraw $5, %%mm1 \n\t"\ | 2502 "psraw $5, %%mm1 \n\t"\ |
2503 "packuswb %%mm1, %%mm0 \n\t"\ | 2503 "packuswb %%mm1, %%mm0 \n\t"\ |
2504 OP(%%mm0, (%1),%%mm5, q)\ | 2504 OP(%%mm0, (%1),%%mm5, q)\ |
2505 "addl %3, %0 \n\t"\ | 2505 "add %3, %0 \n\t"\ |
2506 "addl %4, %1 \n\t"\ | 2506 "add %4, %1 \n\t"\ |
2507 "decl %2 \n\t"\ | 2507 "decl %2 \n\t"\ |
2508 " jnz 1b \n\t"\ | 2508 " jnz 1b \n\t"\ |
2509 : "+a"(src), "+c"(dst), "+m"(h)\ | 2509 : "+a"(src), "+c"(dst), "+m"(h)\ |
2510 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | 2510 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
2511 : "memory"\ | 2511 : "memory"\ |
2512 );\ | 2512 );\ |
2513 }\ | 2513 }\ |
2514 \ | 2514 \ |
2515 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2515 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2518 \ | 2518 \ |
2519 while(h--){\ | 2519 while(h--){\ |
2520 asm volatile(\ | 2520 asm volatile(\ |
2521 "pxor %%mm7, %%mm7 \n\t"\ | 2521 "pxor %%mm7, %%mm7 \n\t"\ |
2522 "movd (%0), %%mm0 \n\t"\ | 2522 "movd (%0), %%mm0 \n\t"\ |
2523 "addl %2, %0 \n\t"\ | 2523 "add %2, %0 \n\t"\ |
2524 "movd (%0), %%mm1 \n\t"\ | 2524 "movd (%0), %%mm1 \n\t"\ |
2525 "addl %2, %0 \n\t"\ | 2525 "add %2, %0 \n\t"\ |
2526 "movd (%0), %%mm2 \n\t"\ | 2526 "movd (%0), %%mm2 \n\t"\ |
2527 "addl %2, %0 \n\t"\ | 2527 "add %2, %0 \n\t"\ |
2528 "movd (%0), %%mm3 \n\t"\ | 2528 "movd (%0), %%mm3 \n\t"\ |
2529 "addl %2, %0 \n\t"\ | 2529 "add %2, %0 \n\t"\ |
2530 "movd (%0), %%mm4 \n\t"\ | 2530 "movd (%0), %%mm4 \n\t"\ |
2531 "addl %2, %0 \n\t"\ | 2531 "add %2, %0 \n\t"\ |
2532 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2532 "punpcklbw %%mm7, %%mm0 \n\t"\ |
2533 "punpcklbw %%mm7, %%mm1 \n\t"\ | 2533 "punpcklbw %%mm7, %%mm1 \n\t"\ |
2534 "punpcklbw %%mm7, %%mm2 \n\t"\ | 2534 "punpcklbw %%mm7, %%mm2 \n\t"\ |
2535 "punpcklbw %%mm7, %%mm3 \n\t"\ | 2535 "punpcklbw %%mm7, %%mm3 \n\t"\ |
2536 "punpcklbw %%mm7, %%mm4 \n\t"\ | 2536 "punpcklbw %%mm7, %%mm4 \n\t"\ |
2542 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ | 2542 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ |
2543 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | 2543 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ |
2544 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | 2544 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ |
2545 \ | 2545 \ |
2546 : "+a"(src), "+c"(dst)\ | 2546 : "+a"(src), "+c"(dst)\ |
2547 : "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | 2547 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
2548 : "memory"\ | 2548 : "memory"\ |
2549 );\ | 2549 );\ |
2550 src += 4-13*srcStride;\ | 2550 src += 4-13*srcStride;\ |
2551 dst += 4-8*dstStride;\ | 2551 dst += 4-8*dstStride;\ |
2552 }\ | 2552 }\ |
2557 src -= 2*srcStride+2;\ | 2557 src -= 2*srcStride+2;\ |
2558 while(w--){\ | 2558 while(w--){\ |
2559 asm volatile(\ | 2559 asm volatile(\ |
2560 "pxor %%mm7, %%mm7 \n\t"\ | 2560 "pxor %%mm7, %%mm7 \n\t"\ |
2561 "movd (%0), %%mm0 \n\t"\ | 2561 "movd (%0), %%mm0 \n\t"\ |
2562 "addl %2, %0 \n\t"\ | 2562 "add %2, %0 \n\t"\ |
2563 "movd (%0), %%mm1 \n\t"\ | 2563 "movd (%0), %%mm1 \n\t"\ |
2564 "addl %2, %0 \n\t"\ | 2564 "add %2, %0 \n\t"\ |
2565 "movd (%0), %%mm2 \n\t"\ | 2565 "movd (%0), %%mm2 \n\t"\ |
2566 "addl %2, %0 \n\t"\ | 2566 "add %2, %0 \n\t"\ |
2567 "movd (%0), %%mm3 \n\t"\ | 2567 "movd (%0), %%mm3 \n\t"\ |
2568 "addl %2, %0 \n\t"\ | 2568 "add %2, %0 \n\t"\ |
2569 "movd (%0), %%mm4 \n\t"\ | 2569 "movd (%0), %%mm4 \n\t"\ |
2570 "addl %2, %0 \n\t"\ | 2570 "add %2, %0 \n\t"\ |
2571 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2571 "punpcklbw %%mm7, %%mm0 \n\t"\ |
2572 "punpcklbw %%mm7, %%mm1 \n\t"\ | 2572 "punpcklbw %%mm7, %%mm1 \n\t"\ |
2573 "punpcklbw %%mm7, %%mm2 \n\t"\ | 2573 "punpcklbw %%mm7, %%mm2 \n\t"\ |
2574 "punpcklbw %%mm7, %%mm3 \n\t"\ | 2574 "punpcklbw %%mm7, %%mm3 \n\t"\ |
2575 "punpcklbw %%mm7, %%mm4 \n\t"\ | 2575 "punpcklbw %%mm7, %%mm4 \n\t"\ |
2581 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\ | 2581 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\ |
2582 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\ | 2582 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\ |
2583 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\ | 2583 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\ |
2584 \ | 2584 \ |
2585 : "+a"(src)\ | 2585 : "+a"(src)\ |
2586 : "c"(tmp), "S"(srcStride), "m"(ff_pw_5)\ | 2586 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\ |
2587 : "memory"\ | 2587 : "memory"\ |
2588 );\ | 2588 );\ |
2589 tmp += 4;\ | 2589 tmp += 4;\ |
2590 src += 4 - 13*srcStride;\ | 2590 src += 4 - 13*srcStride;\ |
2591 }\ | 2591 }\ |
2621 "paddw %%mm5, %%mm3 \n\t"\ | 2621 "paddw %%mm5, %%mm3 \n\t"\ |
2622 "psraw $6, %%mm0 \n\t"\ | 2622 "psraw $6, %%mm0 \n\t"\ |
2623 "psraw $6, %%mm3 \n\t"\ | 2623 "psraw $6, %%mm3 \n\t"\ |
2624 "packuswb %%mm3, %%mm0 \n\t"\ | 2624 "packuswb %%mm3, %%mm0 \n\t"\ |
2625 OP(%%mm0, (%1),%%mm7, q)\ | 2625 OP(%%mm0, (%1),%%mm7, q)\ |
2626 "addl $32, %0 \n\t"\ | 2626 "add $32, %0 \n\t"\ |
2627 "addl %3, %1 \n\t"\ | 2627 "add %3, %1 \n\t"\ |
2628 "decl %2 \n\t"\ | 2628 "decl %2 \n\t"\ |
2629 " jnz 1b \n\t"\ | 2629 " jnz 1b \n\t"\ |
2630 : "+a"(tmp), "+c"(dst), "+m"(h)\ | 2630 : "+a"(tmp), "+c"(dst), "+m"(h)\ |
2631 : "S"(dstStride), "m"(ff_pw_32)\ | 2631 : "S"((long)dstStride), "m"(ff_pw_32)\ |
2632 : "memory"\ | 2632 : "memory"\ |
2633 );\ | 2633 );\ |
2634 }\ | 2634 }\ |
2635 static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2635 static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2636 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | 2636 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ |
2829 c->put_ ## postfix1 = put_ ## postfix2;\ | 2829 c->put_ ## postfix1 = put_ ## postfix2;\ |
2830 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ | 2830 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ |
2831 c->avg_ ## postfix1 = avg_ ## postfix2; | 2831 c->avg_ ## postfix1 = avg_ ## postfix2; |
2832 | 2832 |
2833 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ | 2833 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ |
2834 int i=0; | 2834 long i=0; |
2835 | 2835 |
2836 assert(ABS(scale) < 256); | 2836 assert(ABS(scale) < 256); |
2837 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; | 2837 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; |
2838 | 2838 |
2839 asm volatile( | 2839 asm volatile( |
2861 "pmaddwd %%mm0, %%mm0 \n\t" | 2861 "pmaddwd %%mm0, %%mm0 \n\t" |
2862 "pmaddwd %%mm1, %%mm1 \n\t" | 2862 "pmaddwd %%mm1, %%mm1 \n\t" |
2863 "paddd %%mm1, %%mm0 \n\t" | 2863 "paddd %%mm1, %%mm0 \n\t" |
2864 "psrld $4, %%mm0 \n\t" | 2864 "psrld $4, %%mm0 \n\t" |
2865 "paddd %%mm0, %%mm7 \n\t" | 2865 "paddd %%mm0, %%mm7 \n\t" |
2866 "addl $16, %0 \n\t" | 2866 "add $16, %0 \n\t" |
2867 "cmpl $128, %0 \n\t" //FIXME optimize & bench | 2867 "cmp $128, %0 \n\t" //FIXME optimize & bench |
2868 " jb 1b \n\t" | 2868 " jb 1b \n\t" |
2869 "movq %%mm7, %%mm6 \n\t" | 2869 "movq %%mm7, %%mm6 \n\t" |
2870 "psrlq $32, %%mm7 \n\t" | 2870 "psrlq $32, %%mm7 \n\t" |
2871 "paddd %%mm6, %%mm7 \n\t" | 2871 "paddd %%mm6, %%mm7 \n\t" |
2872 "psrld $2, %%mm7 \n\t" | 2872 "psrld $2, %%mm7 \n\t" |
2877 ); | 2877 ); |
2878 return i; | 2878 return i; |
2879 } | 2879 } |
2880 | 2880 |
2881 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ | 2881 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ |
2882 int i=0; | 2882 long i=0; |
2883 | 2883 |
2884 if(ABS(scale) < 256){ | 2884 if(ABS(scale) < 256){ |
2885 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; | 2885 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; |
2886 asm volatile( | 2886 asm volatile( |
2887 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w | 2887 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w |
2900 "psraw $1, %%mm1 \n\t" | 2900 "psraw $1, %%mm1 \n\t" |
2901 "paddw (%2, %0), %%mm0 \n\t" | 2901 "paddw (%2, %0), %%mm0 \n\t" |
2902 "paddw 8(%2, %0), %%mm1 \n\t" | 2902 "paddw 8(%2, %0), %%mm1 \n\t" |
2903 "movq %%mm0, (%2, %0) \n\t" | 2903 "movq %%mm0, (%2, %0) \n\t" |
2904 "movq %%mm1, 8(%2, %0) \n\t" | 2904 "movq %%mm1, 8(%2, %0) \n\t" |
2905 "addl $16, %0 \n\t" | 2905 "add $16, %0 \n\t" |
2906 "cmpl $128, %0 \n\t" //FIXME optimize & bench | 2906 "cmp $128, %0 \n\t" //FIXME optimize & bench |
2907 " jb 1b \n\t" | 2907 " jb 1b \n\t" |
2908 | 2908 |
2909 : "+r" (i) | 2909 : "+r" (i) |
2910 : "r"(basis), "r"(rem), "g"(scale) | 2910 : "r"(basis), "r"(rem), "g"(scale) |
2911 ); | 2911 ); |