comparison i386/dsputil_mmx.c @ 2212:93091141f75e libavcodec

optimization
author michael
date Wed, 08 Sep 2004 00:52:49 +0000
parents ee8e91ec869a
children 1d6835102c51
comparison
equal deleted inserted replaced
2211:ee8e91ec869a 2212:93091141f75e
2347 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 2347 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2348 : "memory"\ 2348 : "memory"\
2349 );\ 2349 );\
2350 }\ 2350 }\
2351 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2351 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2352 uint64_t temp[4+5];\
2353 uint64_t *temp_ptr= temp;\
2354 int h= 3;\
2355 src -= 2*srcStride;\ 2352 src -= 2*srcStride;\
2356 /*FIXME unroll */\
2357 asm volatile(\ 2353 asm volatile(\
2358 "pxor %%mm7, %%mm7 \n\t"\ 2354 "pxor %%mm7, %%mm7 \n\t"\
2359 "1: \n\t"\
2360 "movd (%0), %%mm0 \n\t"\ 2355 "movd (%0), %%mm0 \n\t"\
2356 "addl %2, %0 \n\t"\
2357 "movd (%0), %%mm1 \n\t"\
2358 "addl %2, %0 \n\t"\
2359 "movd (%0), %%mm2 \n\t"\
2360 "addl %2, %0 \n\t"\
2361 "movd (%0), %%mm3 \n\t"\
2362 "addl %2, %0 \n\t"\
2363 "movd (%0), %%mm4 \n\t"\
2364 "addl %2, %0 \n\t"\
2361 "punpcklbw %%mm7, %%mm0 \n\t"\ 2365 "punpcklbw %%mm7, %%mm0 \n\t"\
2362 "movq %%mm0, (%1) \n\t"\ 2366 "punpcklbw %%mm7, %%mm1 \n\t"\
2363 "addl %3, %0 \n\t"\ 2367 "punpcklbw %%mm7, %%mm2 \n\t"\
2364 "movd (%0), %%mm0 \n\t"\ 2368 "punpcklbw %%mm7, %%mm3 \n\t"\
2365 "punpcklbw %%mm7, %%mm0 \n\t"\ 2369 "punpcklbw %%mm7, %%mm4 \n\t"\
2366 "movq %%mm0, 8(%1) \n\t"\ 2370 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
2367 "addl %3, %0 \n\t"\ 2371 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
2368 "movd (%0), %%mm0 \n\t"\ 2372 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
2369 "punpcklbw %%mm7, %%mm0 \n\t"\ 2373 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
2370 "movq %%mm0, 16(%1) \n\t"\ 2374 \
2371 "addl %3, %0 \n\t"\ 2375 : "+a"(src), "+c"(dst)\
2372 "addl $24, %1 \n\t"\ 2376 : "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2373 "decl %2 \n\t"\
2374 " jnz 1b \n\t"\
2375 : "+a" (src), "+c" (temp_ptr), "+d"(h)\
2376 : "S" (srcStride)\
2377 : "memory"\ 2377 : "memory"\
2378 );\ 2378 );\
2379 \
2380 temp_ptr= temp;\
2381 h= 4;\
2382 \
2383 asm volatile(\
2384 "movq %4, %%mm6 \n\t"\
2385 "movq %5, %%mm7 \n\t"\
2386 "1: \n\t"\
2387 "movq 2*8(%0), %%mm0 \n\t"\
2388 "movq 3*8(%0), %%mm1 \n\t"\
2389 "paddw %%mm1, %%mm0 \n\t"\
2390 "psllw $2, %%mm0 \n\t"\
2391 "movq 1*8(%0), %%mm2 \n\t"\
2392 "movq 4*8(%0), %%mm3 \n\t"\
2393 "paddw %%mm3, %%mm2 \n\t"\
2394 "psubw %%mm2, %%mm0 \n\t"\
2395 "pmullw %%mm6, %%mm0 \n\t"\
2396 "movq 0*8(%0), %%mm4 \n\t"\
2397 "movq 5*8(%0), %%mm5 \n\t"\
2398 "paddw %%mm5, %%mm4 \n\t"\
2399 "paddw %%mm7, %%mm4 \n\t"\
2400 "paddw %%mm4, %%mm0 \n\t"\
2401 "psraw $5, %%mm0 \n\t"\
2402 "packuswb %%mm0, %%mm0 \n\t"\
2403 OP(%%mm0, (%1),%%mm5, d)\
2404 "addl %3, %1 \n\t"\
2405 "addl $8, %0 \n\t"\
2406 "decl %2 \n\t"\
2407 " jnz 1b \n\t"\
2408 \
2409 : "+a"(temp_ptr), "+c"(dst), "+d"(h)\
2410 : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2411 : "memory"\
2412 );\
2413 }\ 2379 }\
2414 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 2380 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2415 const int h=4;\ 2381 const int h=4;\
2416 const int w=4;\ 2382 const int w=4;\
2417 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ 2383 uint8_t *cm = cropTbl + MAX_NEG_CROP;\