Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 2210:e1c60876a0ae libavcodec
optimization
author | michael |
---|---|
date | Tue, 07 Sep 2004 17:22:37 +0000 |
parents | c4a476971abc |
children | ee8e91ec869a |
comparison legend: equal | deleted | inserted | replaced
2209:c4a476971abc | 2210:e1c60876a0ae |
---|---|
2487 }\ | 2487 }\ |
2488 \ | 2488 \ |
2489 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2489 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2490 uint64_t temp[(8+5)*2];\ | 2490 uint64_t temp[(8+5)*2];\ |
2491 uint64_t *temp_ptr= temp;\ | 2491 uint64_t *temp_ptr= temp;\ |
2492 int h= 8+5;\ | 2492 int h= 4;\ |
2493 \ | 2493 \ |
2494 src -= 2*srcStride;\ | 2494 src -= 2*srcStride;\ |
2495 /*FIXME unroll */\ | 2495 \ |
2496 asm volatile(\ | 2496 asm volatile(\ |
2497 "pxor %%mm7, %%mm7 \n\t"\ | 2497 "pxor %%mm7, %%mm7 \n\t"\ |
2498 "1: \n\t"\ | 2498 "1: \n\t"\ |
2499 "movq (%0), %%mm0 \n\t"\ | 2499 "movq (%0), %%mm0 \n\t"\ |
2500 "movq (%0), %%mm1 \n\t"\ | 2500 "movq %%mm0, %%mm1 \n\t"\ |
2501 "addl %3, %0 \n\t"\ | |
2501 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2502 "punpcklbw %%mm7, %%mm0 \n\t"\ |
2503 "movq %%mm0, (%1) \n\t"\ | |
2502 "punpckhbw %%mm7, %%mm1 \n\t"\ | 2504 "punpckhbw %%mm7, %%mm1 \n\t"\ |
2503 "movq %%mm0, (%1) \n\t"\ | 2505 "movq (%0), %%mm0 \n\t"\ |
2504 "movq %%mm1, 8(%1) \n\t"\ | 2506 "movq %%mm1, 8(%1) \n\t"\ |
2505 "addl $16, %1 \n\t"\ | 2507 "movq %%mm0, %%mm1 \n\t"\ |
2506 "addl %3, %0 \n\t"\ | 2508 "addl %3, %0 \n\t"\ |
2509 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2510 "movq %%mm0, 16(%1) \n\t"\ | |
2511 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2512 "movq (%0), %%mm0 \n\t"\ | |
2513 "movq %%mm1, 24(%1) \n\t"\ | |
2514 "movq %%mm0, %%mm1 \n\t"\ | |
2515 "addl %3, %0 \n\t"\ | |
2516 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2517 "movq %%mm0, 32(%1) \n\t"\ | |
2518 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2519 "movq %%mm1, 40(%1) \n\t"\ | |
2520 "addl $48, %1 \n\t"\ | |
2507 "decl %2 \n\t"\ | 2521 "decl %2 \n\t"\ |
2508 " jnz 1b \n\t"\ | 2522 " jnz 1b \n\t"\ |
2523 "movq (%0), %%mm0 \n\t"\ | |
2524 "movq %%mm0, %%mm1 \n\t"\ | |
2525 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2526 "movq %%mm0, (%1) \n\t"\ | |
2527 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2528 "movq %%mm1, 8(%1) \n\t"\ | |
2509 : "+a" (src), "+c" (temp_ptr), "+d"(h)\ | 2529 : "+a" (src), "+c" (temp_ptr), "+d"(h)\ |
2510 : "S" (srcStride)\ | 2530 : "S" (srcStride)\ |
2511 : "memory"\ | 2531 : "memory"\ |
2512 );\ | 2532 );\ |
2513 \ | 2533 \ |
2518 "movq %4, %%mm6 \n\t"\ | 2538 "movq %4, %%mm6 \n\t"\ |
2519 "movq %5, %%mm7 \n\t"\ | 2539 "movq %5, %%mm7 \n\t"\ |
2520 "1: \n\t"\ | 2540 "1: \n\t"\ |
2521 "movq 2*16+0(%0), %%mm0 \n\t"\ | 2541 "movq 2*16+0(%0), %%mm0 \n\t"\ |
2522 "movq 2*16+8(%0), %%mm1 \n\t"\ | 2542 "movq 2*16+8(%0), %%mm1 \n\t"\ |
2523 "movq 3*16+0(%0), %%mm2 \n\t"\ | 2543 "paddw 3*16+0(%0), %%mm0 \n\t"\ |
2524 "movq 3*16+8(%0), %%mm3 \n\t"\ | 2544 "paddw 3*16+8(%0), %%mm1 \n\t"\ |
2525 "paddw %%mm2, %%mm0 \n\t"\ | |
2526 "paddw %%mm3, %%mm1 \n\t"\ | |
2527 "psllw $2, %%mm0 \n\t"\ | 2545 "psllw $2, %%mm0 \n\t"\ |
2528 "psllw $2, %%mm1 \n\t"\ | 2546 "psllw $2, %%mm1 \n\t"\ |
2529 "movq 1*16+0(%0), %%mm2 \n\t"\ | 2547 "movq 1*16+0(%0), %%mm2 \n\t"\ |
2530 "movq 1*16+8(%0), %%mm3 \n\t"\ | 2548 "movq 1*16+8(%0), %%mm3 \n\t"\ |
2531 "movq 4*16+0(%0), %%mm4 \n\t"\ | 2549 "paddw 4*16+0(%0), %%mm2 \n\t"\ |
2532 "movq 4*16+8(%0), %%mm5 \n\t"\ | 2550 "paddw 4*16+8(%0), %%mm3 \n\t"\ |
2533 "paddw %%mm4, %%mm2 \n\t"\ | |
2534 "paddw %%mm5, %%mm3 \n\t"\ | |
2535 "psubw %%mm2, %%mm0 \n\t"\ | 2551 "psubw %%mm2, %%mm0 \n\t"\ |
2536 "psubw %%mm3, %%mm1 \n\t"\ | 2552 "psubw %%mm3, %%mm1 \n\t"\ |
2537 "pmullw %%mm6, %%mm0 \n\t"\ | 2553 "pmullw %%mm6, %%mm0 \n\t"\ |
2538 "pmullw %%mm6, %%mm1 \n\t"\ | 2554 "pmullw %%mm6, %%mm1 \n\t"\ |
2539 "movq 0*16+0(%0), %%mm2 \n\t"\ | 2555 "movq 0*16+0(%0), %%mm2 \n\t"\ |
2540 "movq 0*16+8(%0), %%mm3 \n\t"\ | 2556 "movq 0*16+8(%0), %%mm3 \n\t"\ |
2541 "movq 5*16+0(%0), %%mm4 \n\t"\ | 2557 "paddw 5*16+0(%0), %%mm2 \n\t"\ |
2542 "movq 5*16+8(%0), %%mm5 \n\t"\ | 2558 "paddw 5*16+8(%0), %%mm3 \n\t"\ |
2543 "paddw %%mm4, %%mm2 \n\t"\ | |
2544 "paddw %%mm5, %%mm3 \n\t"\ | |
2545 "paddw %%mm2, %%mm0 \n\t"\ | 2559 "paddw %%mm2, %%mm0 \n\t"\ |
2546 "paddw %%mm3, %%mm1 \n\t"\ | 2560 "paddw %%mm3, %%mm1 \n\t"\ |
2547 "paddw %%mm7, %%mm0 \n\t"\ | 2561 "paddw %%mm7, %%mm0 \n\t"\ |
2548 "paddw %%mm7, %%mm1 \n\t"\ | 2562 "paddw %%mm7, %%mm1 \n\t"\ |
2549 "psraw $5, %%mm0 \n\t"\ | 2563 "psraw $5, %%mm0 \n\t"\ |