Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 2211:ee8e91ec869a libavcodec
optimization
author | michael |
---|---|
date | Tue, 07 Sep 2004 22:56:50 +0000 |
parents | e1c60876a0ae |
children | 93091141f75e |
comparison
equal
deleted
inserted
replaced
2210:e1c60876a0ae | 2211:ee8e91ec869a |
---|---|
2287 uint8_t * const halfH= ((uint8_t*)half);\ | 2287 uint8_t * const halfH= ((uint8_t*)half);\ |
2288 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 2288 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2289 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | 2289 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
2290 } | 2290 } |
2291 | 2291 |
2292 #define QPEL_H264V(A,B,C,D,E,F,OP)\ | |
2293 "movd (%0), "#F" \n\t"\ | |
2294 "movq "#C", %%mm6 \n\t"\ | |
2295 "paddw "#D", %%mm6 \n\t"\ | |
2296 "psllw $2, %%mm6 \n\t"\ | |
2297 "psubw "#B", %%mm6 \n\t"\ | |
2298 "psubw "#E", %%mm6 \n\t"\ | |
2299 "pmullw %4, %%mm6 \n\t"\ | |
2300 "addl %2, %0 \n\t"\ | |
2301 "punpcklbw %%mm7, "#F" \n\t"\ | |
2302 "paddw %5, "#A" \n\t"\ | |
2303 "paddw "#F", "#A" \n\t"\ | |
2304 "paddw "#A", %%mm6 \n\t"\ | |
2305 "psraw $5, %%mm6 \n\t"\ | |
2306 "packuswb %%mm6, %%mm6 \n\t"\ | |
2307 OP(%%mm6, (%1), A, d)\ | |
2308 "addl %3, %1 \n\t" | |
2309 | |
2292 #define QPEL_H264(OPNAME, OP, MMX)\ | 2310 #define QPEL_H264(OPNAME, OP, MMX)\ |
2293 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2311 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2294 int h=4;\ | 2312 int h=4;\ |
2295 \ | 2313 \ |
2296 asm volatile(\ | 2314 asm volatile(\ |
2485 : "memory"\ | 2503 : "memory"\ |
2486 );\ | 2504 );\ |
2487 }\ | 2505 }\ |
2488 \ | 2506 \ |
2489 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2507 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2490 uint64_t temp[(8+5)*2];\ | 2508 int h= 2;\ |
2491 uint64_t *temp_ptr= temp;\ | |
2492 int h= 4;\ | |
2493 \ | |
2494 src -= 2*srcStride;\ | 2509 src -= 2*srcStride;\ |
2495 \ | 2510 \ |
2496 asm volatile(\ | 2511 while(h--){\ |
2512 asm volatile(\ | |
2497 "pxor %%mm7, %%mm7 \n\t"\ | 2513 "pxor %%mm7, %%mm7 \n\t"\ |
2498 "1: \n\t"\ | 2514 "movd (%0), %%mm0 \n\t"\ |
2499 "movq (%0), %%mm0 \n\t"\ | 2515 "addl %2, %0 \n\t"\ |
2500 "movq %%mm0, %%mm1 \n\t"\ | 2516 "movd (%0), %%mm1 \n\t"\ |
2501 "addl %3, %0 \n\t"\ | 2517 "addl %2, %0 \n\t"\ |
2518 "movd (%0), %%mm2 \n\t"\ | |
2519 "addl %2, %0 \n\t"\ | |
2520 "movd (%0), %%mm3 \n\t"\ | |
2521 "addl %2, %0 \n\t"\ | |
2522 "movd (%0), %%mm4 \n\t"\ | |
2523 "addl %2, %0 \n\t"\ | |
2502 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2524 "punpcklbw %%mm7, %%mm0 \n\t"\ |
2503 "movq %%mm0, (%1) \n\t"\ | 2525 "punpcklbw %%mm7, %%mm1 \n\t"\ |
2504 "punpckhbw %%mm7, %%mm1 \n\t"\ | 2526 "punpcklbw %%mm7, %%mm2 \n\t"\ |
2505 "movq (%0), %%mm0 \n\t"\ | 2527 "punpcklbw %%mm7, %%mm3 \n\t"\ |
2506 "movq %%mm1, 8(%1) \n\t"\ | 2528 "punpcklbw %%mm7, %%mm4 \n\t"\ |
2507 "movq %%mm0, %%mm1 \n\t"\ | 2529 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ |
2508 "addl %3, %0 \n\t"\ | 2530 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ |
2509 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2531 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ |
2510 "movq %%mm0, 16(%1) \n\t"\ | 2532 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ |
2511 "punpckhbw %%mm7, %%mm1 \n\t"\ | 2533 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ |
2512 "movq (%0), %%mm0 \n\t"\ | 2534 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ |
2513 "movq %%mm1, 24(%1) \n\t"\ | 2535 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ |
2514 "movq %%mm0, %%mm1 \n\t"\ | 2536 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ |
2515 "addl %3, %0 \n\t"\ | 2537 \ |
2516 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2538 : "+a"(src), "+c"(dst)\ |
2517 "movq %%mm0, 32(%1) \n\t"\ | 2539 : "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
2518 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2519 "movq %%mm1, 40(%1) \n\t"\ | |
2520 "addl $48, %1 \n\t"\ | |
2521 "decl %2 \n\t"\ | |
2522 " jnz 1b \n\t"\ | |
2523 "movq (%0), %%mm0 \n\t"\ | |
2524 "movq %%mm0, %%mm1 \n\t"\ | |
2525 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2526 "movq %%mm0, (%1) \n\t"\ | |
2527 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2528 "movq %%mm1, 8(%1) \n\t"\ | |
2529 : "+a" (src), "+c" (temp_ptr), "+d"(h)\ | |
2530 : "S" (srcStride)\ | |
2531 : "memory"\ | 2540 : "memory"\ |
2532 );\ | 2541 );\ |
2533 \ | 2542 src += 4-13*srcStride;\ |
2534 temp_ptr= temp;\ | 2543 dst += 4-8*dstStride;\ |
2535 h= 8;\ | 2544 }\ |
2536 \ | |
2537 asm volatile(\ | |
2538 "movq %4, %%mm6 \n\t"\ | |
2539 "movq %5, %%mm7 \n\t"\ | |
2540 "1: \n\t"\ | |
2541 "movq 2*16+0(%0), %%mm0 \n\t"\ | |
2542 "movq 2*16+8(%0), %%mm1 \n\t"\ | |
2543 "paddw 3*16+0(%0), %%mm0 \n\t"\ | |
2544 "paddw 3*16+8(%0), %%mm1 \n\t"\ | |
2545 "psllw $2, %%mm0 \n\t"\ | |
2546 "psllw $2, %%mm1 \n\t"\ | |
2547 "movq 1*16+0(%0), %%mm2 \n\t"\ | |
2548 "movq 1*16+8(%0), %%mm3 \n\t"\ | |
2549 "paddw 4*16+0(%0), %%mm2 \n\t"\ | |
2550 "paddw 4*16+8(%0), %%mm3 \n\t"\ | |
2551 "psubw %%mm2, %%mm0 \n\t"\ | |
2552 "psubw %%mm3, %%mm1 \n\t"\ | |
2553 "pmullw %%mm6, %%mm0 \n\t"\ | |
2554 "pmullw %%mm6, %%mm1 \n\t"\ | |
2555 "movq 0*16+0(%0), %%mm2 \n\t"\ | |
2556 "movq 0*16+8(%0), %%mm3 \n\t"\ | |
2557 "paddw 5*16+0(%0), %%mm2 \n\t"\ | |
2558 "paddw 5*16+8(%0), %%mm3 \n\t"\ | |
2559 "paddw %%mm2, %%mm0 \n\t"\ | |
2560 "paddw %%mm3, %%mm1 \n\t"\ | |
2561 "paddw %%mm7, %%mm0 \n\t"\ | |
2562 "paddw %%mm7, %%mm1 \n\t"\ | |
2563 "psraw $5, %%mm0 \n\t"\ | |
2564 "psraw $5, %%mm1 \n\t"\ | |
2565 "packuswb %%mm1, %%mm0 \n\t"\ | |
2566 OP(%%mm0, (%1),%%mm5, q)\ | |
2567 "addl %3, %1 \n\t"\ | |
2568 "addl $16, %0 \n\t"\ | |
2569 "decl %2 \n\t"\ | |
2570 " jnz 1b \n\t"\ | |
2571 \ | |
2572 : "+a"(temp_ptr), "+c"(dst), "+d"(h)\ | |
2573 : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
2574 : "memory"\ | |
2575 );\ | |
2576 }\ | 2545 }\ |
2577 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | 2546 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
2578 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ | 2547 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ |
2579 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\ | 2548 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\ |
2580 src += 4*srcStride;\ | 2549 src += 4*srcStride;\ |