Mercurial > libavcodec.hg
comparison i386/h264dsp_mmx.c @ 2979:bfabfdf9ce55 libavcodec
COSMETICS: tabs --> spaces, some prettyprinting
author | diego |
---|---|
date | Thu, 22 Dec 2005 01:10:11 +0000 |
parents | ef2149182f1c |
children | b52d8ee430f6 |
comparison
equal
deleted
inserted
replaced
2978:403183bbb505 | 2979:bfabfdf9ce55 |
---|---|
45 SUMSUB_BA ( s02, d02 )\ | 45 SUMSUB_BA ( s02, d02 )\ |
46 SUMSUBD2_AB( s13, d13, t )\ | 46 SUMSUBD2_AB( s13, d13, t )\ |
47 SUMSUB_BADC( d13, s02, s13, d02 ) | 47 SUMSUB_BADC( d13, s02, s13, d02 ) |
48 | 48 |
49 #define SBUTTERFLY(a,b,t,n)\ | 49 #define SBUTTERFLY(a,b,t,n)\ |
50 "movq " #a ", " #t " \n\t" /* abcd */\ | 50 "movq " #a ", " #t " \n\t" /* abcd */\ |
51 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ | 51 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ |
52 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ | 52 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ |
53 | 53 |
54 #define TRANSPOSE4(a,b,c,d,t)\ | 54 #define TRANSPOSE4(a,b,c,d,t)\ |
55 SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\ | 55 SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\ |
56 SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\ | 56 SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\ |
57 SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\ | 57 SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\ |
367 | 367 |
368 /***********************************/ | 368 /***********************************/ |
369 /* motion compensation */ | 369 /* motion compensation */ |
370 | 370 |
371 #define QPEL_H264V(A,B,C,D,E,F,OP)\ | 371 #define QPEL_H264V(A,B,C,D,E,F,OP)\ |
372 "movd (%0), "#F" \n\t"\ | 372 "movd (%0), "#F" \n\t"\ |
373 "movq "#C", %%mm6 \n\t"\ | 373 "movq "#C", %%mm6 \n\t"\ |
374 "paddw "#D", %%mm6 \n\t"\ | 374 "paddw "#D", %%mm6 \n\t"\ |
375 "psllw $2, %%mm6 \n\t"\ | 375 "psllw $2, %%mm6 \n\t"\ |
376 "psubw "#B", %%mm6 \n\t"\ | 376 "psubw "#B", %%mm6 \n\t"\ |
377 "psubw "#E", %%mm6 \n\t"\ | 377 "psubw "#E", %%mm6 \n\t"\ |
378 "pmullw %4, %%mm6 \n\t"\ | 378 "pmullw %4, %%mm6 \n\t"\ |
379 "add %2, %0 \n\t"\ | 379 "add %2, %0 \n\t"\ |
380 "punpcklbw %%mm7, "#F" \n\t"\ | 380 "punpcklbw %%mm7, "#F" \n\t"\ |
381 "paddw %5, "#A" \n\t"\ | 381 "paddw %5, "#A" \n\t"\ |
382 "paddw "#F", "#A" \n\t"\ | 382 "paddw "#F", "#A" \n\t"\ |
383 "paddw "#A", %%mm6 \n\t"\ | 383 "paddw "#A", %%mm6 \n\t"\ |
384 "psraw $5, %%mm6 \n\t"\ | 384 "psraw $5, %%mm6 \n\t"\ |
385 "packuswb %%mm6, %%mm6 \n\t"\ | 385 "packuswb %%mm6, %%mm6 \n\t"\ |
386 OP(%%mm6, (%1), A, d)\ | 386 OP(%%mm6, (%1), A, d)\ |
387 "add %3, %1 \n\t" | 387 "add %3, %1 \n\t" |
388 | 388 |
389 #define QPEL_H264HV(A,B,C,D,E,F,OF)\ | 389 #define QPEL_H264HV(A,B,C,D,E,F,OF)\ |
390 "movd (%0), "#F" \n\t"\ | 390 "movd (%0), "#F" \n\t"\ |
391 "movq "#C", %%mm6 \n\t"\ | 391 "movq "#C", %%mm6 \n\t"\ |
392 "paddw "#D", %%mm6 \n\t"\ | 392 "paddw "#D", %%mm6 \n\t"\ |
393 "psllw $2, %%mm6 \n\t"\ | 393 "psllw $2, %%mm6 \n\t"\ |
394 "psubw "#B", %%mm6 \n\t"\ | 394 "psubw "#B", %%mm6 \n\t"\ |
395 "psubw "#E", %%mm6 \n\t"\ | 395 "psubw "#E", %%mm6 \n\t"\ |
396 "pmullw %3, %%mm6 \n\t"\ | 396 "pmullw %3, %%mm6 \n\t"\ |
397 "add %2, %0 \n\t"\ | 397 "add %2, %0 \n\t"\ |
398 "punpcklbw %%mm7, "#F" \n\t"\ | 398 "punpcklbw %%mm7, "#F" \n\t"\ |
399 "paddw "#F", "#A" \n\t"\ | 399 "paddw "#F", "#A" \n\t"\ |
400 "paddw "#A", %%mm6 \n\t"\ | 400 "paddw "#A", %%mm6 \n\t"\ |
401 "movq %%mm6, "#OF"(%1) \n\t" | 401 "movq %%mm6, "#OF"(%1) \n\t" |
402 | 402 |
403 #define QPEL_H264(OPNAME, OP, MMX)\ | 403 #define QPEL_H264(OPNAME, OP, MMX)\ |
404 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 404 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
405 int h=4;\ | 405 int h=4;\ |
406 \ | 406 \ |
407 asm volatile(\ | 407 asm volatile(\ |
408 "pxor %%mm7, %%mm7 \n\t"\ | 408 "pxor %%mm7, %%mm7 \n\t"\ |
409 "movq %5, %%mm4 \n\t"\ | 409 "movq %5, %%mm4 \n\t"\ |
410 "movq %6, %%mm5 \n\t"\ | 410 "movq %6, %%mm5 \n\t"\ |
411 "1: \n\t"\ | 411 "1: \n\t"\ |
412 "movd -1(%0), %%mm1 \n\t"\ | 412 "movd -1(%0), %%mm1 \n\t"\ |
413 "movd (%0), %%mm2 \n\t"\ | 413 "movd (%0), %%mm2 \n\t"\ |
414 "movd 1(%0), %%mm3 \n\t"\ | 414 "movd 1(%0), %%mm3 \n\t"\ |
415 "movd 2(%0), %%mm0 \n\t"\ | 415 "movd 2(%0), %%mm0 \n\t"\ |
416 "punpcklbw %%mm7, %%mm1 \n\t"\ | 416 "punpcklbw %%mm7, %%mm1 \n\t"\ |
417 "punpcklbw %%mm7, %%mm2 \n\t"\ | 417 "punpcklbw %%mm7, %%mm2 \n\t"\ |
418 "punpcklbw %%mm7, %%mm3 \n\t"\ | 418 "punpcklbw %%mm7, %%mm3 \n\t"\ |
419 "punpcklbw %%mm7, %%mm0 \n\t"\ | 419 "punpcklbw %%mm7, %%mm0 \n\t"\ |
420 "paddw %%mm0, %%mm1 \n\t"\ | 420 "paddw %%mm0, %%mm1 \n\t"\ |
421 "paddw %%mm3, %%mm2 \n\t"\ | 421 "paddw %%mm3, %%mm2 \n\t"\ |
422 "movd -2(%0), %%mm0 \n\t"\ | 422 "movd -2(%0), %%mm0 \n\t"\ |
423 "movd 3(%0), %%mm3 \n\t"\ | 423 "movd 3(%0), %%mm3 \n\t"\ |
424 "punpcklbw %%mm7, %%mm0 \n\t"\ | 424 "punpcklbw %%mm7, %%mm0 \n\t"\ |
425 "punpcklbw %%mm7, %%mm3 \n\t"\ | 425 "punpcklbw %%mm7, %%mm3 \n\t"\ |
426 "paddw %%mm3, %%mm0 \n\t"\ | 426 "paddw %%mm3, %%mm0 \n\t"\ |
427 "psllw $2, %%mm2 \n\t"\ | 427 "psllw $2, %%mm2 \n\t"\ |
428 "psubw %%mm1, %%mm2 \n\t"\ | 428 "psubw %%mm1, %%mm2 \n\t"\ |
429 "pmullw %%mm4, %%mm2 \n\t"\ | 429 "pmullw %%mm4, %%mm2 \n\t"\ |
430 "paddw %%mm5, %%mm0 \n\t"\ | 430 "paddw %%mm5, %%mm0 \n\t"\ |
431 "paddw %%mm2, %%mm0 \n\t"\ | 431 "paddw %%mm2, %%mm0 \n\t"\ |
432 "psraw $5, %%mm0 \n\t"\ | 432 "psraw $5, %%mm0 \n\t"\ |
433 "packuswb %%mm0, %%mm0 \n\t"\ | 433 "packuswb %%mm0, %%mm0 \n\t"\ |
434 OP(%%mm0, (%1),%%mm6, d)\ | 434 OP(%%mm0, (%1),%%mm6, d)\ |
435 "add %3, %0 \n\t"\ | 435 "add %3, %0 \n\t"\ |
436 "add %4, %1 \n\t"\ | 436 "add %4, %1 \n\t"\ |
437 "decl %2 \n\t"\ | 437 "decl %2 \n\t"\ |
438 " jnz 1b \n\t"\ | 438 " jnz 1b \n\t"\ |
439 : "+a"(src), "+c"(dst), "+m"(h)\ | 439 : "+a"(src), "+c"(dst), "+m"(h)\ |
440 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | 440 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
441 : "memory"\ | 441 : "memory"\ |
442 );\ | 442 );\ |
443 }\ | 443 }\ |
444 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 444 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
445 src -= 2*srcStride;\ | 445 src -= 2*srcStride;\ |
446 asm volatile(\ | 446 asm volatile(\ |
447 "pxor %%mm7, %%mm7 \n\t"\ | 447 "pxor %%mm7, %%mm7 \n\t"\ |
448 "movd (%0), %%mm0 \n\t"\ | 448 "movd (%0), %%mm0 \n\t"\ |
449 "add %2, %0 \n\t"\ | 449 "add %2, %0 \n\t"\ |
450 "movd (%0), %%mm1 \n\t"\ | 450 "movd (%0), %%mm1 \n\t"\ |
451 "add %2, %0 \n\t"\ | 451 "add %2, %0 \n\t"\ |
452 "movd (%0), %%mm2 \n\t"\ | 452 "movd (%0), %%mm2 \n\t"\ |
453 "add %2, %0 \n\t"\ | 453 "add %2, %0 \n\t"\ |
454 "movd (%0), %%mm3 \n\t"\ | 454 "movd (%0), %%mm3 \n\t"\ |
455 "add %2, %0 \n\t"\ | 455 "add %2, %0 \n\t"\ |
456 "movd (%0), %%mm4 \n\t"\ | 456 "movd (%0), %%mm4 \n\t"\ |
457 "add %2, %0 \n\t"\ | 457 "add %2, %0 \n\t"\ |
458 "punpcklbw %%mm7, %%mm0 \n\t"\ | 458 "punpcklbw %%mm7, %%mm0 \n\t"\ |
459 "punpcklbw %%mm7, %%mm1 \n\t"\ | 459 "punpcklbw %%mm7, %%mm1 \n\t"\ |
460 "punpcklbw %%mm7, %%mm2 \n\t"\ | 460 "punpcklbw %%mm7, %%mm2 \n\t"\ |
461 "punpcklbw %%mm7, %%mm3 \n\t"\ | 461 "punpcklbw %%mm7, %%mm3 \n\t"\ |
462 "punpcklbw %%mm7, %%mm4 \n\t"\ | 462 "punpcklbw %%mm7, %%mm4 \n\t"\ |
463 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | 463 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ |
464 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | 464 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ |
465 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | 465 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ |
466 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | 466 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ |
467 \ | 467 \ |
474 int h=4;\ | 474 int h=4;\ |
475 int w=3;\ | 475 int w=3;\ |
476 src -= 2*srcStride+2;\ | 476 src -= 2*srcStride+2;\ |
477 while(w--){\ | 477 while(w--){\ |
478 asm volatile(\ | 478 asm volatile(\ |
479 "pxor %%mm7, %%mm7 \n\t"\ | 479 "pxor %%mm7, %%mm7 \n\t"\ |
480 "movd (%0), %%mm0 \n\t"\ | 480 "movd (%0), %%mm0 \n\t"\ |
481 "add %2, %0 \n\t"\ | 481 "add %2, %0 \n\t"\ |
482 "movd (%0), %%mm1 \n\t"\ | 482 "movd (%0), %%mm1 \n\t"\ |
483 "add %2, %0 \n\t"\ | 483 "add %2, %0 \n\t"\ |
484 "movd (%0), %%mm2 \n\t"\ | 484 "movd (%0), %%mm2 \n\t"\ |
485 "add %2, %0 \n\t"\ | 485 "add %2, %0 \n\t"\ |
486 "movd (%0), %%mm3 \n\t"\ | 486 "movd (%0), %%mm3 \n\t"\ |
487 "add %2, %0 \n\t"\ | 487 "add %2, %0 \n\t"\ |
488 "movd (%0), %%mm4 \n\t"\ | 488 "movd (%0), %%mm4 \n\t"\ |
489 "add %2, %0 \n\t"\ | 489 "add %2, %0 \n\t"\ |
490 "punpcklbw %%mm7, %%mm0 \n\t"\ | 490 "punpcklbw %%mm7, %%mm0 \n\t"\ |
491 "punpcklbw %%mm7, %%mm1 \n\t"\ | 491 "punpcklbw %%mm7, %%mm1 \n\t"\ |
492 "punpcklbw %%mm7, %%mm2 \n\t"\ | 492 "punpcklbw %%mm7, %%mm2 \n\t"\ |
493 "punpcklbw %%mm7, %%mm3 \n\t"\ | 493 "punpcklbw %%mm7, %%mm3 \n\t"\ |
494 "punpcklbw %%mm7, %%mm4 \n\t"\ | 494 "punpcklbw %%mm7, %%mm4 \n\t"\ |
495 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ | 495 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ |
496 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ | 496 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ |
497 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ | 497 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ |
498 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ | 498 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ |
499 \ | 499 \ |
504 tmp += 4;\ | 504 tmp += 4;\ |
505 src += 4 - 9*srcStride;\ | 505 src += 4 - 9*srcStride;\ |
506 }\ | 506 }\ |
507 tmp -= 3*4;\ | 507 tmp -= 3*4;\ |
508 asm volatile(\ | 508 asm volatile(\ |
509 "movq %4, %%mm6 \n\t"\ | 509 "movq %4, %%mm6 \n\t"\ |
510 "1: \n\t"\ | 510 "1: \n\t"\ |
511 "movq (%0), %%mm0 \n\t"\ | 511 "movq (%0), %%mm0 \n\t"\ |
512 "paddw 10(%0), %%mm0 \n\t"\ | 512 "paddw 10(%0), %%mm0 \n\t"\ |
513 "movq 2(%0), %%mm1 \n\t"\ | 513 "movq 2(%0), %%mm1 \n\t"\ |
514 "paddw 8(%0), %%mm1 \n\t"\ | 514 "paddw 8(%0), %%mm1 \n\t"\ |
515 "movq 4(%0), %%mm2 \n\t"\ | 515 "movq 4(%0), %%mm2 \n\t"\ |
516 "paddw 6(%0), %%mm2 \n\t"\ | 516 "paddw 6(%0), %%mm2 \n\t"\ |
517 "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ | 517 "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ |
518 "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ | 518 "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ |
519 "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ | 519 "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ |
520 "paddsw %%mm2, %%mm0 \n\t"\ | 520 "paddsw %%mm2, %%mm0 \n\t"\ |
521 "psraw $2, %%mm0 \n\t"/*((a-b)/4-b)/4 */\ | 521 "psraw $2, %%mm0 \n\t"/*((a-b)/4-b)/4 */\ |
522 "paddw %%mm6, %%mm2 \n\t"\ | 522 "paddw %%mm6, %%mm2 \n\t"\ |
523 "paddw %%mm2, %%mm0 \n\t"\ | 523 "paddw %%mm2, %%mm0 \n\t"\ |
524 "psraw $6, %%mm0 \n\t"\ | 524 "psraw $6, %%mm0 \n\t"\ |
525 "packuswb %%mm0, %%mm0 \n\t"\ | 525 "packuswb %%mm0, %%mm0 \n\t"\ |
526 OP(%%mm0, (%1),%%mm7, d)\ | 526 OP(%%mm0, (%1),%%mm7, d)\ |
527 "add $24, %0 \n\t"\ | 527 "add $24, %0 \n\t"\ |
528 "add %3, %1 \n\t"\ | 528 "add %3, %1 \n\t"\ |
529 "decl %2 \n\t"\ | 529 "decl %2 \n\t"\ |
530 " jnz 1b \n\t"\ | 530 " jnz 1b \n\t"\ |
531 : "+a"(tmp), "+c"(dst), "+m"(h)\ | 531 : "+a"(tmp), "+c"(dst), "+m"(h)\ |
532 : "S"((long)dstStride), "m"(ff_pw_32)\ | 532 : "S"((long)dstStride), "m"(ff_pw_32)\ |
533 : "memory"\ | 533 : "memory"\ |
534 );\ | 534 );\ |
535 }\ | 535 }\ |
536 \ | 536 \ |
537 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 537 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
538 int h=8;\ | 538 int h=8;\ |
539 asm volatile(\ | 539 asm volatile(\ |
540 "pxor %%mm7, %%mm7 \n\t"\ | 540 "pxor %%mm7, %%mm7 \n\t"\ |
541 "movq %5, %%mm6 \n\t"\ | 541 "movq %5, %%mm6 \n\t"\ |
542 "1: \n\t"\ | 542 "1: \n\t"\ |
543 "movq (%0), %%mm0 \n\t"\ | 543 "movq (%0), %%mm0 \n\t"\ |
544 "movq 1(%0), %%mm2 \n\t"\ | 544 "movq 1(%0), %%mm2 \n\t"\ |
545 "movq %%mm0, %%mm1 \n\t"\ | 545 "movq %%mm0, %%mm1 \n\t"\ |
546 "movq %%mm2, %%mm3 \n\t"\ | 546 "movq %%mm2, %%mm3 \n\t"\ |
547 "punpcklbw %%mm7, %%mm0 \n\t"\ | 547 "punpcklbw %%mm7, %%mm0 \n\t"\ |
548 "punpckhbw %%mm7, %%mm1 \n\t"\ | 548 "punpckhbw %%mm7, %%mm1 \n\t"\ |
549 "punpcklbw %%mm7, %%mm2 \n\t"\ | 549 "punpcklbw %%mm7, %%mm2 \n\t"\ |
550 "punpckhbw %%mm7, %%mm3 \n\t"\ | 550 "punpckhbw %%mm7, %%mm3 \n\t"\ |
551 "paddw %%mm2, %%mm0 \n\t"\ | 551 "paddw %%mm2, %%mm0 \n\t"\ |
552 "paddw %%mm3, %%mm1 \n\t"\ | 552 "paddw %%mm3, %%mm1 \n\t"\ |
553 "psllw $2, %%mm0 \n\t"\ | 553 "psllw $2, %%mm0 \n\t"\ |
554 "psllw $2, %%mm1 \n\t"\ | 554 "psllw $2, %%mm1 \n\t"\ |
555 "movq -1(%0), %%mm2 \n\t"\ | 555 "movq -1(%0), %%mm2 \n\t"\ |
556 "movq 2(%0), %%mm4 \n\t"\ | 556 "movq 2(%0), %%mm4 \n\t"\ |
557 "movq %%mm2, %%mm3 \n\t"\ | 557 "movq %%mm2, %%mm3 \n\t"\ |
558 "movq %%mm4, %%mm5 \n\t"\ | 558 "movq %%mm4, %%mm5 \n\t"\ |
559 "punpcklbw %%mm7, %%mm2 \n\t"\ | 559 "punpcklbw %%mm7, %%mm2 \n\t"\ |
560 "punpckhbw %%mm7, %%mm3 \n\t"\ | 560 "punpckhbw %%mm7, %%mm3 \n\t"\ |
561 "punpcklbw %%mm7, %%mm4 \n\t"\ | 561 "punpcklbw %%mm7, %%mm4 \n\t"\ |
562 "punpckhbw %%mm7, %%mm5 \n\t"\ | 562 "punpckhbw %%mm7, %%mm5 \n\t"\ |
563 "paddw %%mm4, %%mm2 \n\t"\ | 563 "paddw %%mm4, %%mm2 \n\t"\ |
564 "paddw %%mm3, %%mm5 \n\t"\ | 564 "paddw %%mm3, %%mm5 \n\t"\ |
565 "psubw %%mm2, %%mm0 \n\t"\ | 565 "psubw %%mm2, %%mm0 \n\t"\ |
566 "psubw %%mm5, %%mm1 \n\t"\ | 566 "psubw %%mm5, %%mm1 \n\t"\ |
567 "pmullw %%mm6, %%mm0 \n\t"\ | 567 "pmullw %%mm6, %%mm0 \n\t"\ |
568 "pmullw %%mm6, %%mm1 \n\t"\ | 568 "pmullw %%mm6, %%mm1 \n\t"\ |
569 "movd -2(%0), %%mm2 \n\t"\ | 569 "movd -2(%0), %%mm2 \n\t"\ |
570 "movd 7(%0), %%mm5 \n\t"\ | 570 "movd 7(%0), %%mm5 \n\t"\ |
571 "punpcklbw %%mm7, %%mm2 \n\t"\ | 571 "punpcklbw %%mm7, %%mm2 \n\t"\ |
572 "punpcklbw %%mm7, %%mm5 \n\t"\ | 572 "punpcklbw %%mm7, %%mm5 \n\t"\ |
573 "paddw %%mm3, %%mm2 \n\t"\ | 573 "paddw %%mm3, %%mm2 \n\t"\ |
574 "paddw %%mm5, %%mm4 \n\t"\ | 574 "paddw %%mm5, %%mm4 \n\t"\ |
575 "movq %6, %%mm5 \n\t"\ | 575 "movq %6, %%mm5 \n\t"\ |
576 "paddw %%mm5, %%mm2 \n\t"\ | 576 "paddw %%mm5, %%mm2 \n\t"\ |
577 "paddw %%mm5, %%mm4 \n\t"\ | 577 "paddw %%mm5, %%mm4 \n\t"\ |
578 "paddw %%mm2, %%mm0 \n\t"\ | 578 "paddw %%mm2, %%mm0 \n\t"\ |
579 "paddw %%mm4, %%mm1 \n\t"\ | 579 "paddw %%mm4, %%mm1 \n\t"\ |
580 "psraw $5, %%mm0 \n\t"\ | 580 "psraw $5, %%mm0 \n\t"\ |
581 "psraw $5, %%mm1 \n\t"\ | 581 "psraw $5, %%mm1 \n\t"\ |
582 "packuswb %%mm1, %%mm0 \n\t"\ | 582 "packuswb %%mm1, %%mm0 \n\t"\ |
583 OP(%%mm0, (%1),%%mm5, q)\ | 583 OP(%%mm0, (%1),%%mm5, q)\ |
584 "add %3, %0 \n\t"\ | 584 "add %3, %0 \n\t"\ |
585 "add %4, %1 \n\t"\ | 585 "add %4, %1 \n\t"\ |
586 "decl %2 \n\t"\ | 586 "decl %2 \n\t"\ |
587 " jnz 1b \n\t"\ | 587 " jnz 1b \n\t"\ |
588 : "+a"(src), "+c"(dst), "+m"(h)\ | 588 : "+a"(src), "+c"(dst), "+m"(h)\ |
589 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | 589 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
590 : "memory"\ | 590 : "memory"\ |
591 );\ | 591 );\ |
592 }\ | 592 }\ |
595 int h= 2;\ | 595 int h= 2;\ |
596 src -= 2*srcStride;\ | 596 src -= 2*srcStride;\ |
597 \ | 597 \ |
598 while(h--){\ | 598 while(h--){\ |
599 asm volatile(\ | 599 asm volatile(\ |
600 "pxor %%mm7, %%mm7 \n\t"\ | 600 "pxor %%mm7, %%mm7 \n\t"\ |
601 "movd (%0), %%mm0 \n\t"\ | 601 "movd (%0), %%mm0 \n\t"\ |
602 "add %2, %0 \n\t"\ | 602 "add %2, %0 \n\t"\ |
603 "movd (%0), %%mm1 \n\t"\ | 603 "movd (%0), %%mm1 \n\t"\ |
604 "add %2, %0 \n\t"\ | 604 "add %2, %0 \n\t"\ |
605 "movd (%0), %%mm2 \n\t"\ | 605 "movd (%0), %%mm2 \n\t"\ |
606 "add %2, %0 \n\t"\ | 606 "add %2, %0 \n\t"\ |
607 "movd (%0), %%mm3 \n\t"\ | 607 "movd (%0), %%mm3 \n\t"\ |
608 "add %2, %0 \n\t"\ | 608 "add %2, %0 \n\t"\ |
609 "movd (%0), %%mm4 \n\t"\ | 609 "movd (%0), %%mm4 \n\t"\ |
610 "add %2, %0 \n\t"\ | 610 "add %2, %0 \n\t"\ |
611 "punpcklbw %%mm7, %%mm0 \n\t"\ | 611 "punpcklbw %%mm7, %%mm0 \n\t"\ |
612 "punpcklbw %%mm7, %%mm1 \n\t"\ | 612 "punpcklbw %%mm7, %%mm1 \n\t"\ |
613 "punpcklbw %%mm7, %%mm2 \n\t"\ | 613 "punpcklbw %%mm7, %%mm2 \n\t"\ |
614 "punpcklbw %%mm7, %%mm3 \n\t"\ | 614 "punpcklbw %%mm7, %%mm3 \n\t"\ |
615 "punpcklbw %%mm7, %%mm4 \n\t"\ | 615 "punpcklbw %%mm7, %%mm4 \n\t"\ |
616 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | 616 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ |
617 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | 617 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ |
618 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | 618 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ |
619 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | 619 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ |
620 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ | 620 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ |
634 int h=8;\ | 634 int h=8;\ |
635 int w=4;\ | 635 int w=4;\ |
636 src -= 2*srcStride+2;\ | 636 src -= 2*srcStride+2;\ |
637 while(w--){\ | 637 while(w--){\ |
638 asm volatile(\ | 638 asm volatile(\ |
639 "pxor %%mm7, %%mm7 \n\t"\ | 639 "pxor %%mm7, %%mm7 \n\t"\ |
640 "movd (%0), %%mm0 \n\t"\ | 640 "movd (%0), %%mm0 \n\t"\ |
641 "add %2, %0 \n\t"\ | 641 "add %2, %0 \n\t"\ |
642 "movd (%0), %%mm1 \n\t"\ | 642 "movd (%0), %%mm1 \n\t"\ |
643 "add %2, %0 \n\t"\ | 643 "add %2, %0 \n\t"\ |
644 "movd (%0), %%mm2 \n\t"\ | 644 "movd (%0), %%mm2 \n\t"\ |
645 "add %2, %0 \n\t"\ | 645 "add %2, %0 \n\t"\ |
646 "movd (%0), %%mm3 \n\t"\ | 646 "movd (%0), %%mm3 \n\t"\ |
647 "add %2, %0 \n\t"\ | 647 "add %2, %0 \n\t"\ |
648 "movd (%0), %%mm4 \n\t"\ | 648 "movd (%0), %%mm4 \n\t"\ |
649 "add %2, %0 \n\t"\ | 649 "add %2, %0 \n\t"\ |
650 "punpcklbw %%mm7, %%mm0 \n\t"\ | 650 "punpcklbw %%mm7, %%mm0 \n\t"\ |
651 "punpcklbw %%mm7, %%mm1 \n\t"\ | 651 "punpcklbw %%mm7, %%mm1 \n\t"\ |
652 "punpcklbw %%mm7, %%mm2 \n\t"\ | 652 "punpcklbw %%mm7, %%mm2 \n\t"\ |
653 "punpcklbw %%mm7, %%mm3 \n\t"\ | 653 "punpcklbw %%mm7, %%mm3 \n\t"\ |
654 "punpcklbw %%mm7, %%mm4 \n\t"\ | 654 "punpcklbw %%mm7, %%mm4 \n\t"\ |
655 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\ | 655 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\ |
656 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\ | 656 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\ |
657 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\ | 657 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\ |
658 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\ | 658 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\ |
659 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\ | 659 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\ |
668 tmp += 4;\ | 668 tmp += 4;\ |
669 src += 4 - 13*srcStride;\ | 669 src += 4 - 13*srcStride;\ |
670 }\ | 670 }\ |
671 tmp -= 4*4;\ | 671 tmp -= 4*4;\ |
672 asm volatile(\ | 672 asm volatile(\ |
673 "movq %4, %%mm6 \n\t"\ | 673 "movq %4, %%mm6 \n\t"\ |
674 "1: \n\t"\ | 674 "1: \n\t"\ |
675 "movq (%0), %%mm0 \n\t"\ | 675 "movq (%0), %%mm0 \n\t"\ |
676 "movq 8(%0), %%mm3 \n\t"\ | 676 "movq 8(%0), %%mm3 \n\t"\ |
677 "movq 2(%0), %%mm1 \n\t"\ | 677 "movq 2(%0), %%mm1 \n\t"\ |
678 "movq 10(%0), %%mm4 \n\t"\ | 678 "movq 10(%0), %%mm4 \n\t"\ |
679 "paddw %%mm4, %%mm0 \n\t"\ | 679 "paddw %%mm4, %%mm0 \n\t"\ |
680 "paddw %%mm3, %%mm1 \n\t"\ | 680 "paddw %%mm3, %%mm1 \n\t"\ |
681 "paddw 18(%0), %%mm3 \n\t"\ | 681 "paddw 18(%0), %%mm3 \n\t"\ |
682 "paddw 16(%0), %%mm4 \n\t"\ | 682 "paddw 16(%0), %%mm4 \n\t"\ |
683 "movq 4(%0), %%mm2 \n\t"\ | 683 "movq 4(%0), %%mm2 \n\t"\ |
684 "movq 12(%0), %%mm5 \n\t"\ | 684 "movq 12(%0), %%mm5 \n\t"\ |
685 "paddw 6(%0), %%mm2 \n\t"\ | 685 "paddw 6(%0), %%mm2 \n\t"\ |
686 "paddw 14(%0), %%mm5 \n\t"\ | 686 "paddw 14(%0), %%mm5 \n\t"\ |
687 "psubw %%mm1, %%mm0 \n\t"\ | 687 "psubw %%mm1, %%mm0 \n\t"\ |
688 "psubw %%mm4, %%mm3 \n\t"\ | 688 "psubw %%mm4, %%mm3 \n\t"\ |
689 "psraw $2, %%mm0 \n\t"\ | 689 "psraw $2, %%mm0 \n\t"\ |
690 "psraw $2, %%mm3 \n\t"\ | 690 "psraw $2, %%mm3 \n\t"\ |
691 "psubw %%mm1, %%mm0 \n\t"\ | 691 "psubw %%mm1, %%mm0 \n\t"\ |
692 "psubw %%mm4, %%mm3 \n\t"\ | 692 "psubw %%mm4, %%mm3 \n\t"\ |
693 "paddsw %%mm2, %%mm0 \n\t"\ | 693 "paddsw %%mm2, %%mm0 \n\t"\ |
694 "paddsw %%mm5, %%mm3 \n\t"\ | 694 "paddsw %%mm5, %%mm3 \n\t"\ |
695 "psraw $2, %%mm0 \n\t"\ | 695 "psraw $2, %%mm0 \n\t"\ |
696 "psraw $2, %%mm3 \n\t"\ | 696 "psraw $2, %%mm3 \n\t"\ |
697 "paddw %%mm6, %%mm2 \n\t"\ | 697 "paddw %%mm6, %%mm2 \n\t"\ |
698 "paddw %%mm6, %%mm5 \n\t"\ | 698 "paddw %%mm6, %%mm5 \n\t"\ |
699 "paddw %%mm2, %%mm0 \n\t"\ | 699 "paddw %%mm2, %%mm0 \n\t"\ |
700 "paddw %%mm5, %%mm3 \n\t"\ | 700 "paddw %%mm5, %%mm3 \n\t"\ |
701 "psraw $6, %%mm0 \n\t"\ | 701 "psraw $6, %%mm0 \n\t"\ |
702 "psraw $6, %%mm3 \n\t"\ | 702 "psraw $6, %%mm3 \n\t"\ |
703 "packuswb %%mm3, %%mm0 \n\t"\ | 703 "packuswb %%mm3, %%mm0 \n\t"\ |
704 OP(%%mm0, (%1),%%mm7, q)\ | 704 OP(%%mm0, (%1),%%mm7, q)\ |
705 "add $32, %0 \n\t"\ | 705 "add $32, %0 \n\t"\ |
706 "add %3, %1 \n\t"\ | 706 "add %3, %1 \n\t"\ |
707 "decl %2 \n\t"\ | 707 "decl %2 \n\t"\ |
708 " jnz 1b \n\t"\ | 708 " jnz 1b \n\t"\ |
709 : "+a"(tmp), "+c"(dst), "+m"(h)\ | 709 : "+a"(tmp), "+c"(dst), "+m"(h)\ |
710 : "S"((long)dstStride), "m"(ff_pw_32)\ | 710 : "S"((long)dstStride), "m"(ff_pw_32)\ |
711 : "memory"\ | 711 : "memory"\ |
712 );\ | 712 );\ |
713 }\ | 713 }\ |
860 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | 860 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ |
861 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ | 861 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ |
862 }\ | 862 }\ |
863 | 863 |
864 | 864 |
865 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" | 865 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" |
866 #define AVG_3DNOW_OP(a,b,temp, size) \ | 866 #define AVG_3DNOW_OP(a,b,temp, size) \ |
867 "mov" #size " " #b ", " #temp " \n\t"\ | 867 "mov" #size " " #b ", " #temp " \n\t"\ |
868 "pavgusb " #temp ", " #a " \n\t"\ | 868 "pavgusb " #temp ", " #a " \n\t"\ |
869 "mov" #size " " #a ", " #b " \n\t" | 869 "mov" #size " " #a ", " #b " \n\t" |
870 #define AVG_MMX2_OP(a,b,temp, size) \ | 870 #define AVG_MMX2_OP(a,b,temp, size) \ |
871 "mov" #size " " #b ", " #temp " \n\t"\ | 871 "mov" #size " " #b ", " #temp " \n\t"\ |
872 "pavgb " #temp ", " #a " \n\t"\ | 872 "pavgb " #temp ", " #a " \n\t"\ |
873 "mov" #size " " #a ", " #b " \n\t" | 873 "mov" #size " " #a ", " #b " \n\t" |
874 | 874 |
875 QPEL_H264(put_, PUT_OP, 3dnow) | 875 QPEL_H264(put_, PUT_OP, 3dnow) |
876 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) | 876 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) |
877 QPEL_H264(put_, PUT_OP, mmx2) | 877 QPEL_H264(put_, PUT_OP, mmx2) |
878 QPEL_H264(avg_, AVG_MMX2_OP, mmx2) | 878 QPEL_H264(avg_, AVG_MMX2_OP, mmx2) |