comparison i386/h264dsp_mmx.c @ 2979:bfabfdf9ce55 libavcodec

COSMETICS: tabs --> spaces, some prettyprinting
author diego
date Thu, 22 Dec 2005 01:10:11 +0000
parents ef2149182f1c
children b52d8ee430f6
comparison
equal deleted inserted replaced
2978:403183bbb505 2979:bfabfdf9ce55
45 SUMSUB_BA ( s02, d02 )\ 45 SUMSUB_BA ( s02, d02 )\
46 SUMSUBD2_AB( s13, d13, t )\ 46 SUMSUBD2_AB( s13, d13, t )\
47 SUMSUB_BADC( d13, s02, s13, d02 ) 47 SUMSUB_BADC( d13, s02, s13, d02 )
48 48
49 #define SBUTTERFLY(a,b,t,n)\ 49 #define SBUTTERFLY(a,b,t,n)\
50 "movq " #a ", " #t " \n\t" /* abcd */\ 50 "movq " #a ", " #t " \n\t" /* abcd */\
51 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ 51 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
52 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ 52 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
53 53
54 #define TRANSPOSE4(a,b,c,d,t)\ 54 #define TRANSPOSE4(a,b,c,d,t)\
55 SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\ 55 SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
56 SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\ 56 SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
57 SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\ 57 SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
367 367
368 /***********************************/ 368 /***********************************/
369 /* motion compensation */ 369 /* motion compensation */
370 370
371 #define QPEL_H264V(A,B,C,D,E,F,OP)\ 371 #define QPEL_H264V(A,B,C,D,E,F,OP)\
372 "movd (%0), "#F" \n\t"\ 372 "movd (%0), "#F" \n\t"\
373 "movq "#C", %%mm6 \n\t"\ 373 "movq "#C", %%mm6 \n\t"\
374 "paddw "#D", %%mm6 \n\t"\ 374 "paddw "#D", %%mm6 \n\t"\
375 "psllw $2, %%mm6 \n\t"\ 375 "psllw $2, %%mm6 \n\t"\
376 "psubw "#B", %%mm6 \n\t"\ 376 "psubw "#B", %%mm6 \n\t"\
377 "psubw "#E", %%mm6 \n\t"\ 377 "psubw "#E", %%mm6 \n\t"\
378 "pmullw %4, %%mm6 \n\t"\ 378 "pmullw %4, %%mm6 \n\t"\
379 "add %2, %0 \n\t"\ 379 "add %2, %0 \n\t"\
380 "punpcklbw %%mm7, "#F" \n\t"\ 380 "punpcklbw %%mm7, "#F" \n\t"\
381 "paddw %5, "#A" \n\t"\ 381 "paddw %5, "#A" \n\t"\
382 "paddw "#F", "#A" \n\t"\ 382 "paddw "#F", "#A" \n\t"\
383 "paddw "#A", %%mm6 \n\t"\ 383 "paddw "#A", %%mm6 \n\t"\
384 "psraw $5, %%mm6 \n\t"\ 384 "psraw $5, %%mm6 \n\t"\
385 "packuswb %%mm6, %%mm6 \n\t"\ 385 "packuswb %%mm6, %%mm6 \n\t"\
386 OP(%%mm6, (%1), A, d)\ 386 OP(%%mm6, (%1), A, d)\
387 "add %3, %1 \n\t" 387 "add %3, %1 \n\t"
388 388
389 #define QPEL_H264HV(A,B,C,D,E,F,OF)\ 389 #define QPEL_H264HV(A,B,C,D,E,F,OF)\
390 "movd (%0), "#F" \n\t"\ 390 "movd (%0), "#F" \n\t"\
391 "movq "#C", %%mm6 \n\t"\ 391 "movq "#C", %%mm6 \n\t"\
392 "paddw "#D", %%mm6 \n\t"\ 392 "paddw "#D", %%mm6 \n\t"\
393 "psllw $2, %%mm6 \n\t"\ 393 "psllw $2, %%mm6 \n\t"\
394 "psubw "#B", %%mm6 \n\t"\ 394 "psubw "#B", %%mm6 \n\t"\
395 "psubw "#E", %%mm6 \n\t"\ 395 "psubw "#E", %%mm6 \n\t"\
396 "pmullw %3, %%mm6 \n\t"\ 396 "pmullw %3, %%mm6 \n\t"\
397 "add %2, %0 \n\t"\ 397 "add %2, %0 \n\t"\
398 "punpcklbw %%mm7, "#F" \n\t"\ 398 "punpcklbw %%mm7, "#F" \n\t"\
399 "paddw "#F", "#A" \n\t"\ 399 "paddw "#F", "#A" \n\t"\
400 "paddw "#A", %%mm6 \n\t"\ 400 "paddw "#A", %%mm6 \n\t"\
401 "movq %%mm6, "#OF"(%1) \n\t" 401 "movq %%mm6, "#OF"(%1) \n\t"
402 402
403 #define QPEL_H264(OPNAME, OP, MMX)\ 403 #define QPEL_H264(OPNAME, OP, MMX)\
404 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 404 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
405 int h=4;\ 405 int h=4;\
406 \ 406 \
407 asm volatile(\ 407 asm volatile(\
408 "pxor %%mm7, %%mm7 \n\t"\ 408 "pxor %%mm7, %%mm7 \n\t"\
409 "movq %5, %%mm4 \n\t"\ 409 "movq %5, %%mm4 \n\t"\
410 "movq %6, %%mm5 \n\t"\ 410 "movq %6, %%mm5 \n\t"\
411 "1: \n\t"\ 411 "1: \n\t"\
412 "movd -1(%0), %%mm1 \n\t"\ 412 "movd -1(%0), %%mm1 \n\t"\
413 "movd (%0), %%mm2 \n\t"\ 413 "movd (%0), %%mm2 \n\t"\
414 "movd 1(%0), %%mm3 \n\t"\ 414 "movd 1(%0), %%mm3 \n\t"\
415 "movd 2(%0), %%mm0 \n\t"\ 415 "movd 2(%0), %%mm0 \n\t"\
416 "punpcklbw %%mm7, %%mm1 \n\t"\ 416 "punpcklbw %%mm7, %%mm1 \n\t"\
417 "punpcklbw %%mm7, %%mm2 \n\t"\ 417 "punpcklbw %%mm7, %%mm2 \n\t"\
418 "punpcklbw %%mm7, %%mm3 \n\t"\ 418 "punpcklbw %%mm7, %%mm3 \n\t"\
419 "punpcklbw %%mm7, %%mm0 \n\t"\ 419 "punpcklbw %%mm7, %%mm0 \n\t"\
420 "paddw %%mm0, %%mm1 \n\t"\ 420 "paddw %%mm0, %%mm1 \n\t"\
421 "paddw %%mm3, %%mm2 \n\t"\ 421 "paddw %%mm3, %%mm2 \n\t"\
422 "movd -2(%0), %%mm0 \n\t"\ 422 "movd -2(%0), %%mm0 \n\t"\
423 "movd 3(%0), %%mm3 \n\t"\ 423 "movd 3(%0), %%mm3 \n\t"\
424 "punpcklbw %%mm7, %%mm0 \n\t"\ 424 "punpcklbw %%mm7, %%mm0 \n\t"\
425 "punpcklbw %%mm7, %%mm3 \n\t"\ 425 "punpcklbw %%mm7, %%mm3 \n\t"\
426 "paddw %%mm3, %%mm0 \n\t"\ 426 "paddw %%mm3, %%mm0 \n\t"\
427 "psllw $2, %%mm2 \n\t"\ 427 "psllw $2, %%mm2 \n\t"\
428 "psubw %%mm1, %%mm2 \n\t"\ 428 "psubw %%mm1, %%mm2 \n\t"\
429 "pmullw %%mm4, %%mm2 \n\t"\ 429 "pmullw %%mm4, %%mm2 \n\t"\
430 "paddw %%mm5, %%mm0 \n\t"\ 430 "paddw %%mm5, %%mm0 \n\t"\
431 "paddw %%mm2, %%mm0 \n\t"\ 431 "paddw %%mm2, %%mm0 \n\t"\
432 "psraw $5, %%mm0 \n\t"\ 432 "psraw $5, %%mm0 \n\t"\
433 "packuswb %%mm0, %%mm0 \n\t"\ 433 "packuswb %%mm0, %%mm0 \n\t"\
434 OP(%%mm0, (%1),%%mm6, d)\ 434 OP(%%mm0, (%1),%%mm6, d)\
435 "add %3, %0 \n\t"\ 435 "add %3, %0 \n\t"\
436 "add %4, %1 \n\t"\ 436 "add %4, %1 \n\t"\
437 "decl %2 \n\t"\ 437 "decl %2 \n\t"\
438 " jnz 1b \n\t"\ 438 " jnz 1b \n\t"\
439 : "+a"(src), "+c"(dst), "+m"(h)\ 439 : "+a"(src), "+c"(dst), "+m"(h)\
440 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 440 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
441 : "memory"\ 441 : "memory"\
442 );\ 442 );\
443 }\ 443 }\
444 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 444 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
445 src -= 2*srcStride;\ 445 src -= 2*srcStride;\
446 asm volatile(\ 446 asm volatile(\
447 "pxor %%mm7, %%mm7 \n\t"\ 447 "pxor %%mm7, %%mm7 \n\t"\
448 "movd (%0), %%mm0 \n\t"\ 448 "movd (%0), %%mm0 \n\t"\
449 "add %2, %0 \n\t"\ 449 "add %2, %0 \n\t"\
450 "movd (%0), %%mm1 \n\t"\ 450 "movd (%0), %%mm1 \n\t"\
451 "add %2, %0 \n\t"\ 451 "add %2, %0 \n\t"\
452 "movd (%0), %%mm2 \n\t"\ 452 "movd (%0), %%mm2 \n\t"\
453 "add %2, %0 \n\t"\ 453 "add %2, %0 \n\t"\
454 "movd (%0), %%mm3 \n\t"\ 454 "movd (%0), %%mm3 \n\t"\
455 "add %2, %0 \n\t"\ 455 "add %2, %0 \n\t"\
456 "movd (%0), %%mm4 \n\t"\ 456 "movd (%0), %%mm4 \n\t"\
457 "add %2, %0 \n\t"\ 457 "add %2, %0 \n\t"\
458 "punpcklbw %%mm7, %%mm0 \n\t"\ 458 "punpcklbw %%mm7, %%mm0 \n\t"\
459 "punpcklbw %%mm7, %%mm1 \n\t"\ 459 "punpcklbw %%mm7, %%mm1 \n\t"\
460 "punpcklbw %%mm7, %%mm2 \n\t"\ 460 "punpcklbw %%mm7, %%mm2 \n\t"\
461 "punpcklbw %%mm7, %%mm3 \n\t"\ 461 "punpcklbw %%mm7, %%mm3 \n\t"\
462 "punpcklbw %%mm7, %%mm4 \n\t"\ 462 "punpcklbw %%mm7, %%mm4 \n\t"\
463 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 463 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
464 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 464 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
465 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 465 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
466 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 466 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
467 \ 467 \
474 int h=4;\ 474 int h=4;\
475 int w=3;\ 475 int w=3;\
476 src -= 2*srcStride+2;\ 476 src -= 2*srcStride+2;\
477 while(w--){\ 477 while(w--){\
478 asm volatile(\ 478 asm volatile(\
479 "pxor %%mm7, %%mm7 \n\t"\ 479 "pxor %%mm7, %%mm7 \n\t"\
480 "movd (%0), %%mm0 \n\t"\ 480 "movd (%0), %%mm0 \n\t"\
481 "add %2, %0 \n\t"\ 481 "add %2, %0 \n\t"\
482 "movd (%0), %%mm1 \n\t"\ 482 "movd (%0), %%mm1 \n\t"\
483 "add %2, %0 \n\t"\ 483 "add %2, %0 \n\t"\
484 "movd (%0), %%mm2 \n\t"\ 484 "movd (%0), %%mm2 \n\t"\
485 "add %2, %0 \n\t"\ 485 "add %2, %0 \n\t"\
486 "movd (%0), %%mm3 \n\t"\ 486 "movd (%0), %%mm3 \n\t"\
487 "add %2, %0 \n\t"\ 487 "add %2, %0 \n\t"\
488 "movd (%0), %%mm4 \n\t"\ 488 "movd (%0), %%mm4 \n\t"\
489 "add %2, %0 \n\t"\ 489 "add %2, %0 \n\t"\
490 "punpcklbw %%mm7, %%mm0 \n\t"\ 490 "punpcklbw %%mm7, %%mm0 \n\t"\
491 "punpcklbw %%mm7, %%mm1 \n\t"\ 491 "punpcklbw %%mm7, %%mm1 \n\t"\
492 "punpcklbw %%mm7, %%mm2 \n\t"\ 492 "punpcklbw %%mm7, %%mm2 \n\t"\
493 "punpcklbw %%mm7, %%mm3 \n\t"\ 493 "punpcklbw %%mm7, %%mm3 \n\t"\
494 "punpcklbw %%mm7, %%mm4 \n\t"\ 494 "punpcklbw %%mm7, %%mm4 \n\t"\
495 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ 495 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
496 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ 496 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
497 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ 497 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
498 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ 498 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
499 \ 499 \
504 tmp += 4;\ 504 tmp += 4;\
505 src += 4 - 9*srcStride;\ 505 src += 4 - 9*srcStride;\
506 }\ 506 }\
507 tmp -= 3*4;\ 507 tmp -= 3*4;\
508 asm volatile(\ 508 asm volatile(\
509 "movq %4, %%mm6 \n\t"\ 509 "movq %4, %%mm6 \n\t"\
510 "1: \n\t"\ 510 "1: \n\t"\
511 "movq (%0), %%mm0 \n\t"\ 511 "movq (%0), %%mm0 \n\t"\
512 "paddw 10(%0), %%mm0 \n\t"\ 512 "paddw 10(%0), %%mm0 \n\t"\
513 "movq 2(%0), %%mm1 \n\t"\ 513 "movq 2(%0), %%mm1 \n\t"\
514 "paddw 8(%0), %%mm1 \n\t"\ 514 "paddw 8(%0), %%mm1 \n\t"\
515 "movq 4(%0), %%mm2 \n\t"\ 515 "movq 4(%0), %%mm2 \n\t"\
516 "paddw 6(%0), %%mm2 \n\t"\ 516 "paddw 6(%0), %%mm2 \n\t"\
517 "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ 517 "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
518 "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ 518 "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
519 "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ 519 "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
520 "paddsw %%mm2, %%mm0 \n\t"\ 520 "paddsw %%mm2, %%mm0 \n\t"\
521 "psraw $2, %%mm0 \n\t"/*((a-b)/4-b)/4 */\ 521 "psraw $2, %%mm0 \n\t"/*((a-b)/4-b)/4 */\
522 "paddw %%mm6, %%mm2 \n\t"\ 522 "paddw %%mm6, %%mm2 \n\t"\
523 "paddw %%mm2, %%mm0 \n\t"\ 523 "paddw %%mm2, %%mm0 \n\t"\
524 "psraw $6, %%mm0 \n\t"\ 524 "psraw $6, %%mm0 \n\t"\
525 "packuswb %%mm0, %%mm0 \n\t"\ 525 "packuswb %%mm0, %%mm0 \n\t"\
526 OP(%%mm0, (%1),%%mm7, d)\ 526 OP(%%mm0, (%1),%%mm7, d)\
527 "add $24, %0 \n\t"\ 527 "add $24, %0 \n\t"\
528 "add %3, %1 \n\t"\ 528 "add %3, %1 \n\t"\
529 "decl %2 \n\t"\ 529 "decl %2 \n\t"\
530 " jnz 1b \n\t"\ 530 " jnz 1b \n\t"\
531 : "+a"(tmp), "+c"(dst), "+m"(h)\ 531 : "+a"(tmp), "+c"(dst), "+m"(h)\
532 : "S"((long)dstStride), "m"(ff_pw_32)\ 532 : "S"((long)dstStride), "m"(ff_pw_32)\
533 : "memory"\ 533 : "memory"\
534 );\ 534 );\
535 }\ 535 }\
536 \ 536 \
537 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 537 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
538 int h=8;\ 538 int h=8;\
539 asm volatile(\ 539 asm volatile(\
540 "pxor %%mm7, %%mm7 \n\t"\ 540 "pxor %%mm7, %%mm7 \n\t"\
541 "movq %5, %%mm6 \n\t"\ 541 "movq %5, %%mm6 \n\t"\
542 "1: \n\t"\ 542 "1: \n\t"\
543 "movq (%0), %%mm0 \n\t"\ 543 "movq (%0), %%mm0 \n\t"\
544 "movq 1(%0), %%mm2 \n\t"\ 544 "movq 1(%0), %%mm2 \n\t"\
545 "movq %%mm0, %%mm1 \n\t"\ 545 "movq %%mm0, %%mm1 \n\t"\
546 "movq %%mm2, %%mm3 \n\t"\ 546 "movq %%mm2, %%mm3 \n\t"\
547 "punpcklbw %%mm7, %%mm0 \n\t"\ 547 "punpcklbw %%mm7, %%mm0 \n\t"\
548 "punpckhbw %%mm7, %%mm1 \n\t"\ 548 "punpckhbw %%mm7, %%mm1 \n\t"\
549 "punpcklbw %%mm7, %%mm2 \n\t"\ 549 "punpcklbw %%mm7, %%mm2 \n\t"\
550 "punpckhbw %%mm7, %%mm3 \n\t"\ 550 "punpckhbw %%mm7, %%mm3 \n\t"\
551 "paddw %%mm2, %%mm0 \n\t"\ 551 "paddw %%mm2, %%mm0 \n\t"\
552 "paddw %%mm3, %%mm1 \n\t"\ 552 "paddw %%mm3, %%mm1 \n\t"\
553 "psllw $2, %%mm0 \n\t"\ 553 "psllw $2, %%mm0 \n\t"\
554 "psllw $2, %%mm1 \n\t"\ 554 "psllw $2, %%mm1 \n\t"\
555 "movq -1(%0), %%mm2 \n\t"\ 555 "movq -1(%0), %%mm2 \n\t"\
556 "movq 2(%0), %%mm4 \n\t"\ 556 "movq 2(%0), %%mm4 \n\t"\
557 "movq %%mm2, %%mm3 \n\t"\ 557 "movq %%mm2, %%mm3 \n\t"\
558 "movq %%mm4, %%mm5 \n\t"\ 558 "movq %%mm4, %%mm5 \n\t"\
559 "punpcklbw %%mm7, %%mm2 \n\t"\ 559 "punpcklbw %%mm7, %%mm2 \n\t"\
560 "punpckhbw %%mm7, %%mm3 \n\t"\ 560 "punpckhbw %%mm7, %%mm3 \n\t"\
561 "punpcklbw %%mm7, %%mm4 \n\t"\ 561 "punpcklbw %%mm7, %%mm4 \n\t"\
562 "punpckhbw %%mm7, %%mm5 \n\t"\ 562 "punpckhbw %%mm7, %%mm5 \n\t"\
563 "paddw %%mm4, %%mm2 \n\t"\ 563 "paddw %%mm4, %%mm2 \n\t"\
564 "paddw %%mm3, %%mm5 \n\t"\ 564 "paddw %%mm3, %%mm5 \n\t"\
565 "psubw %%mm2, %%mm0 \n\t"\ 565 "psubw %%mm2, %%mm0 \n\t"\
566 "psubw %%mm5, %%mm1 \n\t"\ 566 "psubw %%mm5, %%mm1 \n\t"\
567 "pmullw %%mm6, %%mm0 \n\t"\ 567 "pmullw %%mm6, %%mm0 \n\t"\
568 "pmullw %%mm6, %%mm1 \n\t"\ 568 "pmullw %%mm6, %%mm1 \n\t"\
569 "movd -2(%0), %%mm2 \n\t"\ 569 "movd -2(%0), %%mm2 \n\t"\
570 "movd 7(%0), %%mm5 \n\t"\ 570 "movd 7(%0), %%mm5 \n\t"\
571 "punpcklbw %%mm7, %%mm2 \n\t"\ 571 "punpcklbw %%mm7, %%mm2 \n\t"\
572 "punpcklbw %%mm7, %%mm5 \n\t"\ 572 "punpcklbw %%mm7, %%mm5 \n\t"\
573 "paddw %%mm3, %%mm2 \n\t"\ 573 "paddw %%mm3, %%mm2 \n\t"\
574 "paddw %%mm5, %%mm4 \n\t"\ 574 "paddw %%mm5, %%mm4 \n\t"\
575 "movq %6, %%mm5 \n\t"\ 575 "movq %6, %%mm5 \n\t"\
576 "paddw %%mm5, %%mm2 \n\t"\ 576 "paddw %%mm5, %%mm2 \n\t"\
577 "paddw %%mm5, %%mm4 \n\t"\ 577 "paddw %%mm5, %%mm4 \n\t"\
578 "paddw %%mm2, %%mm0 \n\t"\ 578 "paddw %%mm2, %%mm0 \n\t"\
579 "paddw %%mm4, %%mm1 \n\t"\ 579 "paddw %%mm4, %%mm1 \n\t"\
580 "psraw $5, %%mm0 \n\t"\ 580 "psraw $5, %%mm0 \n\t"\
581 "psraw $5, %%mm1 \n\t"\ 581 "psraw $5, %%mm1 \n\t"\
582 "packuswb %%mm1, %%mm0 \n\t"\ 582 "packuswb %%mm1, %%mm0 \n\t"\
583 OP(%%mm0, (%1),%%mm5, q)\ 583 OP(%%mm0, (%1),%%mm5, q)\
584 "add %3, %0 \n\t"\ 584 "add %3, %0 \n\t"\
585 "add %4, %1 \n\t"\ 585 "add %4, %1 \n\t"\
586 "decl %2 \n\t"\ 586 "decl %2 \n\t"\
587 " jnz 1b \n\t"\ 587 " jnz 1b \n\t"\
588 : "+a"(src), "+c"(dst), "+m"(h)\ 588 : "+a"(src), "+c"(dst), "+m"(h)\
589 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 589 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
590 : "memory"\ 590 : "memory"\
591 );\ 591 );\
592 }\ 592 }\
595 int h= 2;\ 595 int h= 2;\
596 src -= 2*srcStride;\ 596 src -= 2*srcStride;\
597 \ 597 \
598 while(h--){\ 598 while(h--){\
599 asm volatile(\ 599 asm volatile(\
600 "pxor %%mm7, %%mm7 \n\t"\ 600 "pxor %%mm7, %%mm7 \n\t"\
601 "movd (%0), %%mm0 \n\t"\ 601 "movd (%0), %%mm0 \n\t"\
602 "add %2, %0 \n\t"\ 602 "add %2, %0 \n\t"\
603 "movd (%0), %%mm1 \n\t"\ 603 "movd (%0), %%mm1 \n\t"\
604 "add %2, %0 \n\t"\ 604 "add %2, %0 \n\t"\
605 "movd (%0), %%mm2 \n\t"\ 605 "movd (%0), %%mm2 \n\t"\
606 "add %2, %0 \n\t"\ 606 "add %2, %0 \n\t"\
607 "movd (%0), %%mm3 \n\t"\ 607 "movd (%0), %%mm3 \n\t"\
608 "add %2, %0 \n\t"\ 608 "add %2, %0 \n\t"\
609 "movd (%0), %%mm4 \n\t"\ 609 "movd (%0), %%mm4 \n\t"\
610 "add %2, %0 \n\t"\ 610 "add %2, %0 \n\t"\
611 "punpcklbw %%mm7, %%mm0 \n\t"\ 611 "punpcklbw %%mm7, %%mm0 \n\t"\
612 "punpcklbw %%mm7, %%mm1 \n\t"\ 612 "punpcklbw %%mm7, %%mm1 \n\t"\
613 "punpcklbw %%mm7, %%mm2 \n\t"\ 613 "punpcklbw %%mm7, %%mm2 \n\t"\
614 "punpcklbw %%mm7, %%mm3 \n\t"\ 614 "punpcklbw %%mm7, %%mm3 \n\t"\
615 "punpcklbw %%mm7, %%mm4 \n\t"\ 615 "punpcklbw %%mm7, %%mm4 \n\t"\
616 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 616 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
617 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 617 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
618 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 618 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
619 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 619 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
620 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 620 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
634 int h=8;\ 634 int h=8;\
635 int w=4;\ 635 int w=4;\
636 src -= 2*srcStride+2;\ 636 src -= 2*srcStride+2;\
637 while(w--){\ 637 while(w--){\
638 asm volatile(\ 638 asm volatile(\
639 "pxor %%mm7, %%mm7 \n\t"\ 639 "pxor %%mm7, %%mm7 \n\t"\
640 "movd (%0), %%mm0 \n\t"\ 640 "movd (%0), %%mm0 \n\t"\
641 "add %2, %0 \n\t"\ 641 "add %2, %0 \n\t"\
642 "movd (%0), %%mm1 \n\t"\ 642 "movd (%0), %%mm1 \n\t"\
643 "add %2, %0 \n\t"\ 643 "add %2, %0 \n\t"\
644 "movd (%0), %%mm2 \n\t"\ 644 "movd (%0), %%mm2 \n\t"\
645 "add %2, %0 \n\t"\ 645 "add %2, %0 \n\t"\
646 "movd (%0), %%mm3 \n\t"\ 646 "movd (%0), %%mm3 \n\t"\
647 "add %2, %0 \n\t"\ 647 "add %2, %0 \n\t"\
648 "movd (%0), %%mm4 \n\t"\ 648 "movd (%0), %%mm4 \n\t"\
649 "add %2, %0 \n\t"\ 649 "add %2, %0 \n\t"\
650 "punpcklbw %%mm7, %%mm0 \n\t"\ 650 "punpcklbw %%mm7, %%mm0 \n\t"\
651 "punpcklbw %%mm7, %%mm1 \n\t"\ 651 "punpcklbw %%mm7, %%mm1 \n\t"\
652 "punpcklbw %%mm7, %%mm2 \n\t"\ 652 "punpcklbw %%mm7, %%mm2 \n\t"\
653 "punpcklbw %%mm7, %%mm3 \n\t"\ 653 "punpcklbw %%mm7, %%mm3 \n\t"\
654 "punpcklbw %%mm7, %%mm4 \n\t"\ 654 "punpcklbw %%mm7, %%mm4 \n\t"\
655 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\ 655 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\
656 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\ 656 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
657 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\ 657 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
658 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\ 658 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
659 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\ 659 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
668 tmp += 4;\ 668 tmp += 4;\
669 src += 4 - 13*srcStride;\ 669 src += 4 - 13*srcStride;\
670 }\ 670 }\
671 tmp -= 4*4;\ 671 tmp -= 4*4;\
672 asm volatile(\ 672 asm volatile(\
673 "movq %4, %%mm6 \n\t"\ 673 "movq %4, %%mm6 \n\t"\
674 "1: \n\t"\ 674 "1: \n\t"\
675 "movq (%0), %%mm0 \n\t"\ 675 "movq (%0), %%mm0 \n\t"\
676 "movq 8(%0), %%mm3 \n\t"\ 676 "movq 8(%0), %%mm3 \n\t"\
677 "movq 2(%0), %%mm1 \n\t"\ 677 "movq 2(%0), %%mm1 \n\t"\
678 "movq 10(%0), %%mm4 \n\t"\ 678 "movq 10(%0), %%mm4 \n\t"\
679 "paddw %%mm4, %%mm0 \n\t"\ 679 "paddw %%mm4, %%mm0 \n\t"\
680 "paddw %%mm3, %%mm1 \n\t"\ 680 "paddw %%mm3, %%mm1 \n\t"\
681 "paddw 18(%0), %%mm3 \n\t"\ 681 "paddw 18(%0), %%mm3 \n\t"\
682 "paddw 16(%0), %%mm4 \n\t"\ 682 "paddw 16(%0), %%mm4 \n\t"\
683 "movq 4(%0), %%mm2 \n\t"\ 683 "movq 4(%0), %%mm2 \n\t"\
684 "movq 12(%0), %%mm5 \n\t"\ 684 "movq 12(%0), %%mm5 \n\t"\
685 "paddw 6(%0), %%mm2 \n\t"\ 685 "paddw 6(%0), %%mm2 \n\t"\
686 "paddw 14(%0), %%mm5 \n\t"\ 686 "paddw 14(%0), %%mm5 \n\t"\
687 "psubw %%mm1, %%mm0 \n\t"\ 687 "psubw %%mm1, %%mm0 \n\t"\
688 "psubw %%mm4, %%mm3 \n\t"\ 688 "psubw %%mm4, %%mm3 \n\t"\
689 "psraw $2, %%mm0 \n\t"\ 689 "psraw $2, %%mm0 \n\t"\
690 "psraw $2, %%mm3 \n\t"\ 690 "psraw $2, %%mm3 \n\t"\
691 "psubw %%mm1, %%mm0 \n\t"\ 691 "psubw %%mm1, %%mm0 \n\t"\
692 "psubw %%mm4, %%mm3 \n\t"\ 692 "psubw %%mm4, %%mm3 \n\t"\
693 "paddsw %%mm2, %%mm0 \n\t"\ 693 "paddsw %%mm2, %%mm0 \n\t"\
694 "paddsw %%mm5, %%mm3 \n\t"\ 694 "paddsw %%mm5, %%mm3 \n\t"\
695 "psraw $2, %%mm0 \n\t"\ 695 "psraw $2, %%mm0 \n\t"\
696 "psraw $2, %%mm3 \n\t"\ 696 "psraw $2, %%mm3 \n\t"\
697 "paddw %%mm6, %%mm2 \n\t"\ 697 "paddw %%mm6, %%mm2 \n\t"\
698 "paddw %%mm6, %%mm5 \n\t"\ 698 "paddw %%mm6, %%mm5 \n\t"\
699 "paddw %%mm2, %%mm0 \n\t"\ 699 "paddw %%mm2, %%mm0 \n\t"\
700 "paddw %%mm5, %%mm3 \n\t"\ 700 "paddw %%mm5, %%mm3 \n\t"\
701 "psraw $6, %%mm0 \n\t"\ 701 "psraw $6, %%mm0 \n\t"\
702 "psraw $6, %%mm3 \n\t"\ 702 "psraw $6, %%mm3 \n\t"\
703 "packuswb %%mm3, %%mm0 \n\t"\ 703 "packuswb %%mm3, %%mm0 \n\t"\
704 OP(%%mm0, (%1),%%mm7, q)\ 704 OP(%%mm0, (%1),%%mm7, q)\
705 "add $32, %0 \n\t"\ 705 "add $32, %0 \n\t"\
706 "add %3, %1 \n\t"\ 706 "add %3, %1 \n\t"\
707 "decl %2 \n\t"\ 707 "decl %2 \n\t"\
708 " jnz 1b \n\t"\ 708 " jnz 1b \n\t"\
709 : "+a"(tmp), "+c"(dst), "+m"(h)\ 709 : "+a"(tmp), "+c"(dst), "+m"(h)\
710 : "S"((long)dstStride), "m"(ff_pw_32)\ 710 : "S"((long)dstStride), "m"(ff_pw_32)\
711 : "memory"\ 711 : "memory"\
712 );\ 712 );\
713 }\ 713 }\
860 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ 860 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
861 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ 861 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
862 }\ 862 }\
863 863
864 864
865 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" 865 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
866 #define AVG_3DNOW_OP(a,b,temp, size) \ 866 #define AVG_3DNOW_OP(a,b,temp, size) \
867 "mov" #size " " #b ", " #temp " \n\t"\ 867 "mov" #size " " #b ", " #temp " \n\t"\
868 "pavgusb " #temp ", " #a " \n\t"\ 868 "pavgusb " #temp ", " #a " \n\t"\
869 "mov" #size " " #a ", " #b " \n\t" 869 "mov" #size " " #a ", " #b " \n\t"
870 #define AVG_MMX2_OP(a,b,temp, size) \ 870 #define AVG_MMX2_OP(a,b,temp, size) \
871 "mov" #size " " #b ", " #temp " \n\t"\ 871 "mov" #size " " #b ", " #temp " \n\t"\
872 "pavgb " #temp ", " #a " \n\t"\ 872 "pavgb " #temp ", " #a " \n\t"\
873 "mov" #size " " #a ", " #b " \n\t" 873 "mov" #size " " #a ", " #b " \n\t"
874 874
875 QPEL_H264(put_, PUT_OP, 3dnow) 875 QPEL_H264(put_, PUT_OP, 3dnow)
876 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) 876 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
877 QPEL_H264(put_, PUT_OP, mmx2) 877 QPEL_H264(put_, PUT_OP, mmx2)
878 QPEL_H264(avg_, AVG_MMX2_OP, mmx2) 878 QPEL_H264(avg_, AVG_MMX2_OP, mmx2)