Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 2212:93091141f75e libavcodec
optimization
author | michael |
---|---|
date | Wed, 08 Sep 2004 00:52:49 +0000 |
parents | ee8e91ec869a |
children | 1d6835102c51 |
comparison legend: equal, deleted, inserted, replaced
2211:ee8e91ec869a | 2212:93091141f75e |
---|---|
2347 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | 2347 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
2348 : "memory"\ | 2348 : "memory"\ |
2349 );\ | 2349 );\ |
2350 }\ | 2350 }\ |
2351 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2351 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2352 uint64_t temp[4+5];\ | |
2353 uint64_t *temp_ptr= temp;\ | |
2354 int h= 3;\ | |
2355 src -= 2*srcStride;\ | 2352 src -= 2*srcStride;\ |
2356 /*FIXME unroll */\ | |
2357 asm volatile(\ | 2353 asm volatile(\ |
2358 "pxor %%mm7, %%mm7 \n\t"\ | 2354 "pxor %%mm7, %%mm7 \n\t"\ |
2359 "1: \n\t"\ | |
2360 "movd (%0), %%mm0 \n\t"\ | 2355 "movd (%0), %%mm0 \n\t"\ |
2356 "addl %2, %0 \n\t"\ | |
2357 "movd (%0), %%mm1 \n\t"\ | |
2358 "addl %2, %0 \n\t"\ | |
2359 "movd (%0), %%mm2 \n\t"\ | |
2360 "addl %2, %0 \n\t"\ | |
2361 "movd (%0), %%mm3 \n\t"\ | |
2362 "addl %2, %0 \n\t"\ | |
2363 "movd (%0), %%mm4 \n\t"\ | |
2364 "addl %2, %0 \n\t"\ | |
2361 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2365 "punpcklbw %%mm7, %%mm0 \n\t"\ |
2362 "movq %%mm0, (%1) \n\t"\ | 2366 "punpcklbw %%mm7, %%mm1 \n\t"\ |
2363 "addl %3, %0 \n\t"\ | 2367 "punpcklbw %%mm7, %%mm2 \n\t"\ |
2364 "movd (%0), %%mm0 \n\t"\ | 2368 "punpcklbw %%mm7, %%mm3 \n\t"\ |
2365 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2369 "punpcklbw %%mm7, %%mm4 \n\t"\ |
2366 "movq %%mm0, 8(%1) \n\t"\ | 2370 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ |
2367 "addl %3, %0 \n\t"\ | 2371 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ |
2368 "movd (%0), %%mm0 \n\t"\ | 2372 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ |
2369 "punpcklbw %%mm7, %%mm0 \n\t"\ | 2373 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ |
2370 "movq %%mm0, 16(%1) \n\t"\ | 2374 \ |
2371 "addl %3, %0 \n\t"\ | 2375 : "+a"(src), "+c"(dst)\ |
2372 "addl $24, %1 \n\t"\ | 2376 : "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ |
2373 "decl %2 \n\t"\ | |
2374 " jnz 1b \n\t"\ | |
2375 : "+a" (src), "+c" (temp_ptr), "+d"(h)\ | |
2376 : "S" (srcStride)\ | |
2377 : "memory"\ | 2377 : "memory"\ |
2378 );\ | 2378 );\ |
2379 \ | |
2380 temp_ptr= temp;\ | |
2381 h= 4;\ | |
2382 \ | |
2383 asm volatile(\ | |
2384 "movq %4, %%mm6 \n\t"\ | |
2385 "movq %5, %%mm7 \n\t"\ | |
2386 "1: \n\t"\ | |
2387 "movq 2*8(%0), %%mm0 \n\t"\ | |
2388 "movq 3*8(%0), %%mm1 \n\t"\ | |
2389 "paddw %%mm1, %%mm0 \n\t"\ | |
2390 "psllw $2, %%mm0 \n\t"\ | |
2391 "movq 1*8(%0), %%mm2 \n\t"\ | |
2392 "movq 4*8(%0), %%mm3 \n\t"\ | |
2393 "paddw %%mm3, %%mm2 \n\t"\ | |
2394 "psubw %%mm2, %%mm0 \n\t"\ | |
2395 "pmullw %%mm6, %%mm0 \n\t"\ | |
2396 "movq 0*8(%0), %%mm4 \n\t"\ | |
2397 "movq 5*8(%0), %%mm5 \n\t"\ | |
2398 "paddw %%mm5, %%mm4 \n\t"\ | |
2399 "paddw %%mm7, %%mm4 \n\t"\ | |
2400 "paddw %%mm4, %%mm0 \n\t"\ | |
2401 "psraw $5, %%mm0 \n\t"\ | |
2402 "packuswb %%mm0, %%mm0 \n\t"\ | |
2403 OP(%%mm0, (%1),%%mm5, d)\ | |
2404 "addl %3, %1 \n\t"\ | |
2405 "addl $8, %0 \n\t"\ | |
2406 "decl %2 \n\t"\ | |
2407 " jnz 1b \n\t"\ | |
2408 \ | |
2409 : "+a"(temp_ptr), "+c"(dst), "+d"(h)\ | |
2410 : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
2411 : "memory"\ | |
2412 );\ | |
2413 }\ | 2379 }\ |
2414 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | 2380 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
2415 const int h=4;\ | 2381 const int h=4;\ |
2416 const int w=4;\ | 2382 const int w=4;\ |
2417 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ | 2383 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ |