comparison i386/dsputil_mmx.c @ 2211:ee8e91ec869a libavcodec

optimization
author michael
date Tue, 07 Sep 2004 22:56:50 +0000
parents e1c60876a0ae
children 93091141f75e
comparison
legend: equal | deleted | inserted | replaced
left column: 2210:e1c60876a0ae (parent) | right column: 2211:ee8e91ec869a (this changeset)
2287 uint8_t * const halfH= ((uint8_t*)half);\ 2287 uint8_t * const halfH= ((uint8_t*)half);\
2288 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 2288 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2289 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 2289 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2290 } 2290 }
2291 2291
2292 #define QPEL_H264V(A,B,C,D,E,F,OP)\
2293 "movd (%0), "#F" \n\t"\
2294 "movq "#C", %%mm6 \n\t"\
2295 "paddw "#D", %%mm6 \n\t"\
2296 "psllw $2, %%mm6 \n\t"\
2297 "psubw "#B", %%mm6 \n\t"\
2298 "psubw "#E", %%mm6 \n\t"\
2299 "pmullw %4, %%mm6 \n\t"\
2300 "addl %2, %0 \n\t"\
2301 "punpcklbw %%mm7, "#F" \n\t"\
2302 "paddw %5, "#A" \n\t"\
2303 "paddw "#F", "#A" \n\t"\
2304 "paddw "#A", %%mm6 \n\t"\
2305 "psraw $5, %%mm6 \n\t"\
2306 "packuswb %%mm6, %%mm6 \n\t"\
2307 OP(%%mm6, (%1), A, d)\
2308 "addl %3, %1 \n\t"
2309
2292 #define QPEL_H264(OPNAME, OP, MMX)\ 2310 #define QPEL_H264(OPNAME, OP, MMX)\
2293 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2311 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2294 int h=4;\ 2312 int h=4;\
2295 \ 2313 \
2296 asm volatile(\ 2314 asm volatile(\
2485 : "memory"\ 2503 : "memory"\
2486 );\ 2504 );\
2487 }\ 2505 }\
2488 \ 2506 \
2489 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2507 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2490 uint64_t temp[(8+5)*2];\ 2508 int h= 2;\
2491 uint64_t *temp_ptr= temp;\
2492 int h= 4;\
2493 \
2494 src -= 2*srcStride;\ 2509 src -= 2*srcStride;\
2495 \ 2510 \
2496 asm volatile(\ 2511 while(h--){\
2512 asm volatile(\
2497 "pxor %%mm7, %%mm7 \n\t"\ 2513 "pxor %%mm7, %%mm7 \n\t"\
2498 "1: \n\t"\ 2514 "movd (%0), %%mm0 \n\t"\
2499 "movq (%0), %%mm0 \n\t"\ 2515 "addl %2, %0 \n\t"\
2500 "movq %%mm0, %%mm1 \n\t"\ 2516 "movd (%0), %%mm1 \n\t"\
2501 "addl %3, %0 \n\t"\ 2517 "addl %2, %0 \n\t"\
2518 "movd (%0), %%mm2 \n\t"\
2519 "addl %2, %0 \n\t"\
2520 "movd (%0), %%mm3 \n\t"\
2521 "addl %2, %0 \n\t"\
2522 "movd (%0), %%mm4 \n\t"\
2523 "addl %2, %0 \n\t"\
2502 "punpcklbw %%mm7, %%mm0 \n\t"\ 2524 "punpcklbw %%mm7, %%mm0 \n\t"\
2503 "movq %%mm0, (%1) \n\t"\ 2525 "punpcklbw %%mm7, %%mm1 \n\t"\
2504 "punpckhbw %%mm7, %%mm1 \n\t"\ 2526 "punpcklbw %%mm7, %%mm2 \n\t"\
2505 "movq (%0), %%mm0 \n\t"\ 2527 "punpcklbw %%mm7, %%mm3 \n\t"\
2506 "movq %%mm1, 8(%1) \n\t"\ 2528 "punpcklbw %%mm7, %%mm4 \n\t"\
2507 "movq %%mm0, %%mm1 \n\t"\ 2529 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
2508 "addl %3, %0 \n\t"\ 2530 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
2509 "punpcklbw %%mm7, %%mm0 \n\t"\ 2531 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
2510 "movq %%mm0, 16(%1) \n\t"\ 2532 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
2511 "punpckhbw %%mm7, %%mm1 \n\t"\ 2533 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
2512 "movq (%0), %%mm0 \n\t"\ 2534 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
2513 "movq %%mm1, 24(%1) \n\t"\ 2535 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
2514 "movq %%mm0, %%mm1 \n\t"\ 2536 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
2515 "addl %3, %0 \n\t"\ 2537 \
2516 "punpcklbw %%mm7, %%mm0 \n\t"\ 2538 : "+a"(src), "+c"(dst)\
2517 "movq %%mm0, 32(%1) \n\t"\ 2539 : "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2518 "punpckhbw %%mm7, %%mm1 \n\t"\
2519 "movq %%mm1, 40(%1) \n\t"\
2520 "addl $48, %1 \n\t"\
2521 "decl %2 \n\t"\
2522 " jnz 1b \n\t"\
2523 "movq (%0), %%mm0 \n\t"\
2524 "movq %%mm0, %%mm1 \n\t"\
2525 "punpcklbw %%mm7, %%mm0 \n\t"\
2526 "movq %%mm0, (%1) \n\t"\
2527 "punpckhbw %%mm7, %%mm1 \n\t"\
2528 "movq %%mm1, 8(%1) \n\t"\
2529 : "+a" (src), "+c" (temp_ptr), "+d"(h)\
2530 : "S" (srcStride)\
2531 : "memory"\ 2540 : "memory"\
2532 );\ 2541 );\
2533 \ 2542 src += 4-13*srcStride;\
2534 temp_ptr= temp;\ 2543 dst += 4-8*dstStride;\
2535 h= 8;\ 2544 }\
2536 \
2537 asm volatile(\
2538 "movq %4, %%mm6 \n\t"\
2539 "movq %5, %%mm7 \n\t"\
2540 "1: \n\t"\
2541 "movq 2*16+0(%0), %%mm0 \n\t"\
2542 "movq 2*16+8(%0), %%mm1 \n\t"\
2543 "paddw 3*16+0(%0), %%mm0 \n\t"\
2544 "paddw 3*16+8(%0), %%mm1 \n\t"\
2545 "psllw $2, %%mm0 \n\t"\
2546 "psllw $2, %%mm1 \n\t"\
2547 "movq 1*16+0(%0), %%mm2 \n\t"\
2548 "movq 1*16+8(%0), %%mm3 \n\t"\
2549 "paddw 4*16+0(%0), %%mm2 \n\t"\
2550 "paddw 4*16+8(%0), %%mm3 \n\t"\
2551 "psubw %%mm2, %%mm0 \n\t"\
2552 "psubw %%mm3, %%mm1 \n\t"\
2553 "pmullw %%mm6, %%mm0 \n\t"\
2554 "pmullw %%mm6, %%mm1 \n\t"\
2555 "movq 0*16+0(%0), %%mm2 \n\t"\
2556 "movq 0*16+8(%0), %%mm3 \n\t"\
2557 "paddw 5*16+0(%0), %%mm2 \n\t"\
2558 "paddw 5*16+8(%0), %%mm3 \n\t"\
2559 "paddw %%mm2, %%mm0 \n\t"\
2560 "paddw %%mm3, %%mm1 \n\t"\
2561 "paddw %%mm7, %%mm0 \n\t"\
2562 "paddw %%mm7, %%mm1 \n\t"\
2563 "psraw $5, %%mm0 \n\t"\
2564 "psraw $5, %%mm1 \n\t"\
2565 "packuswb %%mm1, %%mm0 \n\t"\
2566 OP(%%mm0, (%1),%%mm5, q)\
2567 "addl %3, %1 \n\t"\
2568 "addl $16, %0 \n\t"\
2569 "decl %2 \n\t"\
2570 " jnz 1b \n\t"\
2571 \
2572 : "+a"(temp_ptr), "+c"(dst), "+d"(h)\
2573 : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2574 : "memory"\
2575 );\
2576 }\ 2545 }\
2577 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 2546 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2578 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ 2547 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
2579 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\ 2548 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\
2580 src += 4*srcStride;\ 2549 src += 4*srcStride;\