comparison i386/dsputil_mmx.c @ 2209:c4a476971abc libavcodec

h264 luma motion compensation in mmx2/3dnow
author michael
date Tue, 07 Sep 2004 01:48:45 +0000
parents 22b768f1261a
children e1c60876a0ae
comparison
equal deleted inserted replaced
2208:9ca8a88a8a70 2209:c4a476971abc
37 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; 37 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
38 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; 38 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
39 39
40 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; 40 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
41 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; 41 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
42 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
42 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; 43 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
44 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
43 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; 45 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
44 46
45 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; 47 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
46 48
47 #define JUMPALIGN() __asm __volatile (".balign 8"::) 49 #define JUMPALIGN() __asm __volatile (".balign 8"::)
350 :"r"(p) 352 :"r"(p)
351 :"memory"); 353 :"memory");
352 pix += line_size*2; 354 pix += line_size*2;
353 p += 16; 355 p += 16;
354 } while (--i); 356 } while (--i);
357 }
358
/**
 * Copy a 4-byte-wide column of h rows from pixels to block using MMX movd.
 * Source and destination both advance by line_size bytes per row.
 * Each loop pass copies four rows and subtracts 4 from h; the loop only
 * terminates when h reaches exactly zero, so h must be a positive multiple
 * of 4. The asm uses 32-bit addl on the pointers, clobbers %eax and does
 * not execute emms.
 */
359 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
360 {
361 __asm __volatile(
/* eax = 2*line_size: pointer step for one pair of rows */
362 "lea (%3, %3), %%eax \n\t"
363 ".balign 8 \n\t"
364 "1: \n\t"
/* rows 0 and 1: load both, then store both */
365 "movd (%1), %%mm0 \n\t"
366 "movd (%1, %3), %%mm1 \n\t"
367 "movd %%mm0, (%2) \n\t"
368 "movd %%mm1, (%2, %3) \n\t"
369 "addl %%eax, %1 \n\t"
370 "addl %%eax, %2 \n\t"
/* rows 2 and 3: same pattern (loop body is unrolled twice) */
371 "movd (%1), %%mm0 \n\t"
372 "movd (%1, %3), %%mm1 \n\t"
373 "movd %%mm0, (%2) \n\t"
374 "movd %%mm1, (%2, %3) \n\t"
375 "addl %%eax, %1 \n\t"
376 "addl %%eax, %2 \n\t"
/* four rows done */
377 "subl $4, %0 \n\t"
378 "jnz 1b \n\t"
/* %0 = h, %1 = pixels, %2 = block (all updated), %3 = line_size */
379 : "+g"(h), "+r" (pixels), "+r" (block)
380 : "r"(line_size)
381 : "%eax", "memory"
382 );
355 } 383 }
356 384
357 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 385 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
358 { 386 {
359 __asm __volatile( 387 __asm __volatile(
1977 :"memory"\ 2005 :"memory"\
1978 );\ 2006 );\
1979 }\ 2007 }\
1980 \ 2008 \
1981 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2009 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1982 uint64_t temp[9*4];\ 2010 uint64_t temp[9*2];\
1983 uint64_t *temp_ptr= temp;\ 2011 uint64_t *temp_ptr= temp;\
1984 int count= 9;\ 2012 int count= 9;\
1985 \ 2013 \
1986 /*FIXME unroll */\ 2014 /*FIXME unroll */\
1987 asm volatile(\ 2015 asm volatile(\
2259 uint8_t * const halfH= ((uint8_t*)half);\ 2287 uint8_t * const halfH= ((uint8_t*)half);\
2260 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 2288 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2261 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 2289 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
2262 } 2290 }
2263 2291
/*
 * QPEL_H264(OPNAME, OP, MMX): generates the H.264 luma lowpass helpers
 * OPNAME##h264_qpel{4,8,16}_{h,v,hv}_lowpass_##MMX.
 * OP(reg, mem, tmpreg, size) is an asm-string macro that emits the final
 * store of reg to mem (tmpreg is scratch for it, size is the mov suffix).
 */
2292 #define QPEL_H264(OPNAME, OP, MMX)\
/* Horizontal 6-tap filter over 4 rows of 4 pixels. For each output x: */\
/*   (20*(s[x]+s[x+1]) - 5*(s[x-1]+s[x+2]) + (s[x-2]+s[x+3]) + 16) >> 5 */\
/* packed with unsigned saturation and written through OP. */\
/* Reads src[-2] .. src[6] of every row. */\
2293 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2294 int h=4;\
2295 \
2296 asm volatile(\
/* mm7 = 0 (for byte->word unpack), mm4 = ff_pw_5, mm5 = ff_pw_16 */\
2297 "pxor %%mm7, %%mm7 \n\t"\
2298 "movq %5, %%mm4 \n\t"\
2299 "movq %6, %%mm5 \n\t"\
2300 "1: \n\t"\
2301 "movd -1(%0), %%mm1 \n\t"\
2302 "movd (%0), %%mm2 \n\t"\
2303 "movd 1(%0), %%mm3 \n\t"\
2304 "movd 2(%0), %%mm0 \n\t"\
2305 "punpcklbw %%mm7, %%mm1 \n\t"\
2306 "punpcklbw %%mm7, %%mm2 \n\t"\
2307 "punpcklbw %%mm7, %%mm3 \n\t"\
2308 "punpcklbw %%mm7, %%mm0 \n\t"\
/* mm1 = s[x-1]+s[x+2], mm2 = s[x]+s[x+1] */\
2309 "paddw %%mm0, %%mm1 \n\t"\
2310 "paddw %%mm3, %%mm2 \n\t"\
2311 "movd -2(%0), %%mm0 \n\t"\
2312 "movd 3(%0), %%mm3 \n\t"\
2313 "punpcklbw %%mm7, %%mm0 \n\t"\
2314 "punpcklbw %%mm7, %%mm3 \n\t"\
/* mm0 = s[x-2]+s[x+3] */\
2315 "paddw %%mm3, %%mm0 \n\t"\
/* mm2 = (4*mm2 - mm1) * 5 = 20*(inner pair) - 5*(middle pair) */\
2316 "psllw $2, %%mm2 \n\t"\
2317 "psubw %%mm1, %%mm2 \n\t"\
2318 "pmullw %%mm4, %%mm2 \n\t"\
/* add rounding constant 16 and the outer pair, then >>5 */\
2319 "paddw %%mm5, %%mm0 \n\t"\
2320 "paddw %%mm2, %%mm0 \n\t"\
2321 "psraw $5, %%mm0 \n\t"\
2322 "packuswb %%mm0, %%mm0 \n\t"\
/* store 4 result bytes to the current dst row; mm6 is OP's scratch */\
2323 OP(%%mm0, (%1),%%mm6, d)\
2324 "addl %3, %0 \n\t"\
2325 "addl %4, %1 \n\t"\
2326 "decl %2 \n\t"\
2327 " jnz 1b \n\t"\
/* %0 = src (eax), %1 = dst (ecx), %2 = h (row counter kept in memory) */\
2328 : "+a"(src), "+c"(dst), "+m"(h)\
2329 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2330 : "memory"\
2331 );\
2332 }\
/* Vertical 6-tap filter producing a 4x4 block. */\
/* Pass 1 widens 9 source rows (starting 2 rows above src) to 16-bit words */\
/* in temp, 8 bytes per row. Pass 2 filters down the columns of temp: */\
/*   (20*(r2+r3) - 5*(r1+r4) + (r0+r5) + 16) >> 5, saturated, stored via OP. */\
2333 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
/* 4 output rows + 5 extra filter rows, one uint64_t (4 words) each */\
2334 uint64_t temp[4+5];\
2335 uint64_t *temp_ptr= temp;\
/* 3 loop passes x 3 rows per pass = 9 rows */\
2336 int h= 3;\
2337 src -= 2*srcStride;\
2338 /*FIXME unroll */\
2339 asm volatile(\
2340 "pxor %%mm7, %%mm7 \n\t"\
2341 "1: \n\t"\
2342 "movd (%0), %%mm0 \n\t"\
2343 "punpcklbw %%mm7, %%mm0 \n\t"\
2344 "movq %%mm0, (%1) \n\t"\
2345 "addl %3, %0 \n\t"\
2346 "movd (%0), %%mm0 \n\t"\
2347 "punpcklbw %%mm7, %%mm0 \n\t"\
2348 "movq %%mm0, 8(%1) \n\t"\
2349 "addl %3, %0 \n\t"\
2350 "movd (%0), %%mm0 \n\t"\
2351 "punpcklbw %%mm7, %%mm0 \n\t"\
2352 "movq %%mm0, 16(%1) \n\t"\
2353 "addl %3, %0 \n\t"\
/* advance temp by the three rows just written */\
2354 "addl $24, %1 \n\t"\
2355 "decl %2 \n\t"\
2356 " jnz 1b \n\t"\
2357 : "+a" (src), "+c" (temp_ptr), "+d"(h)\
2358 : "S" (srcStride)\
2359 : "memory"\
2360 );\
2361 \
/* rewind to the first widened row; 4 output rows follow */\
2362 temp_ptr= temp;\
2363 h= 4;\
2364 \
2365 asm volatile(\
/* mm6 = ff_pw_5, mm7 = ff_pw_16 */\
2366 "movq %4, %%mm6 \n\t"\
2367 "movq %5, %%mm7 \n\t"\
2368 "1: \n\t"\
/* mm0 = 4*(r2+r3) */\
2369 "movq 2*8(%0), %%mm0 \n\t"\
2370 "movq 3*8(%0), %%mm1 \n\t"\
2371 "paddw %%mm1, %%mm0 \n\t"\
2372 "psllw $2, %%mm0 \n\t"\
/* mm0 = (4*(r2+r3) - (r1+r4)) * 5 */\
2373 "movq 1*8(%0), %%mm2 \n\t"\
2374 "movq 4*8(%0), %%mm3 \n\t"\
2375 "paddw %%mm3, %%mm2 \n\t"\
2376 "psubw %%mm2, %%mm0 \n\t"\
2377 "pmullw %%mm6, %%mm0 \n\t"\
/* mm4 = r0+r5+16 */\
2378 "movq 0*8(%0), %%mm4 \n\t"\
2379 "movq 5*8(%0), %%mm5 \n\t"\
2380 "paddw %%mm5, %%mm4 \n\t"\
2381 "paddw %%mm7, %%mm4 \n\t"\
2382 "paddw %%mm4, %%mm0 \n\t"\
2383 "psraw $5, %%mm0 \n\t"\
2384 "packuswb %%mm0, %%mm0 \n\t"\
/* mm5 is free again here and serves as OP's scratch register */\
2385 OP(%%mm0, (%1),%%mm5, d)\
2386 "addl %3, %1 \n\t"\
/* slide the 6-row window down by one widened row */\
2387 "addl $8, %0 \n\t"\
2388 "decl %2 \n\t"\
2389 " jnz 1b \n\t"\
2390 \
2391 : "+a"(temp_ptr), "+c"(dst), "+d"(h)\
2392 : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2393 : "memory"\
2394 );\
2395 }\
2396 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2397 const int h=4;\
2398 const int w=4;\
2399 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2400 int i;\
2401 src -= 2*srcStride;\
2402 for(i=0; i<h+5; i++)\
2403 {\
2404 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2405 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2406 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2407 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2408 tmp+=tmpStride;\
2409 src+=srcStride;\
2410 }\
2411 tmp -= tmpStride*(h+5-2);\
2412 for(i=0; i<w; i++)\
2413 {\
2414 const int tmpB= tmp[-2*tmpStride];\
2415 const int tmpA= tmp[-1*tmpStride];\
2416 const int tmp0= tmp[0 *tmpStride];\
2417 const int tmp1= tmp[1 *tmpStride];\
2418 const int tmp2= tmp[2 *tmpStride];\
2419 const int tmp3= tmp[3 *tmpStride];\
2420 const int tmp4= tmp[4 *tmpStride];\
2421 const int tmp5= tmp[5 *tmpStride];\
2422 const int tmp6= tmp[6 *tmpStride];\
2423 dst[0*dstStride]= cm[( (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3) + 512)>>10];\
2424 dst[1*dstStride]= cm[( (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4) + 512)>>10];\
2425 dst[2*dstStride]= cm[( (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5) + 512)>>10];\
2426 dst[3*dstStride]= cm[( (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6) + 512)>>10];\
2427 dst++;\
2428 tmp++;\
2429 }\
2430 }\
2431 \
/* Horizontal 6-tap filter over 8 rows of 8 pixels. Per output x: */\
/*   (20*(s[x]+s[x+1]) - 5*(s[x-1]+s[x+2]) + (s[x-2]+s[x+3]) + 16) >> 5 */\
/* Low words (x=0..3) are accumulated in mm0, high words (x=4..7) in mm1. */\
/* Reads src[-2] .. src[10] of every row. */\
2432 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2433 int h=8;\
2434 asm volatile(\
/* mm7 = 0 for unpacking, mm6 = ff_pw_5 */\
2435 "pxor %%mm7, %%mm7 \n\t"\
2436 "movq %5, %%mm6 \n\t"\
2437 "1: \n\t"\
2438 "movq (%0), %%mm0 \n\t"\
2439 "movq 1(%0), %%mm2 \n\t"\
2440 "movq %%mm0, %%mm1 \n\t"\
2441 "movq %%mm2, %%mm3 \n\t"\
2442 "punpcklbw %%mm7, %%mm0 \n\t"\
2443 "punpckhbw %%mm7, %%mm1 \n\t"\
2444 "punpcklbw %%mm7, %%mm2 \n\t"\
2445 "punpckhbw %%mm7, %%mm3 \n\t"\
/* mm0/mm1 = 4*(s[x]+s[x+1]) for the low/high four pixels */\
2446 "paddw %%mm2, %%mm0 \n\t"\
2447 "paddw %%mm3, %%mm1 \n\t"\
2448 "psllw $2, %%mm0 \n\t"\
2449 "psllw $2, %%mm1 \n\t"\
2450 "movq -1(%0), %%mm2 \n\t"\
2451 "movq 2(%0), %%mm4 \n\t"\
2452 "movq %%mm2, %%mm3 \n\t"\
2453 "movq %%mm4, %%mm5 \n\t"\
2454 "punpcklbw %%mm7, %%mm2 \n\t"\
2455 "punpckhbw %%mm7, %%mm3 \n\t"\
2456 "punpcklbw %%mm7, %%mm4 \n\t"\
2457 "punpckhbw %%mm7, %%mm5 \n\t"\
/* mm2/mm5 = s[x-1]+s[x+2] (low/high). mm3 (= s[3..6]) and mm4 (= s[2..5]) */\
/* are left unmodified on purpose; they are reused for the outer taps. */\
2458 "paddw %%mm4, %%mm2 \n\t"\
2459 "paddw %%mm3, %%mm5 \n\t"\
2460 "psubw %%mm2, %%mm0 \n\t"\
2461 "psubw %%mm5, %%mm1 \n\t"\
2462 "pmullw %%mm6, %%mm0 \n\t"\
2463 "pmullw %%mm6, %%mm1 \n\t"\
/* outer taps: only s[-2..1] and s[7..10] still need loading */\
2464 "movd -2(%0), %%mm2 \n\t"\
2465 "movd 7(%0), %%mm5 \n\t"\
2466 "punpcklbw %%mm7, %%mm2 \n\t"\
2467 "punpcklbw %%mm7, %%mm5 \n\t"\
/* mm2 = s[x-2]+s[x+3] for x=0..3, mm4 = s[x-2]+s[x+3] for x=4..7 */\
2468 "paddw %%mm3, %%mm2 \n\t"\
2469 "paddw %%mm5, %%mm4 \n\t"\
/* mm5 = ff_pw_16 rounding constant */\
2470 "movq %6, %%mm5 \n\t"\
2471 "paddw %%mm5, %%mm2 \n\t"\
2472 "paddw %%mm5, %%mm4 \n\t"\
2473 "paddw %%mm2, %%mm0 \n\t"\
2474 "paddw %%mm4, %%mm1 \n\t"\
2475 "psraw $5, %%mm0 \n\t"\
2476 "psraw $5, %%mm1 \n\t"\
2477 "packuswb %%mm1, %%mm0 \n\t"\
/* store 8 result bytes; mm5 is reloaded next pass so OP may clobber it */\
2478 OP(%%mm0, (%1),%%mm5, q)\
2479 "addl %3, %0 \n\t"\
2480 "addl %4, %1 \n\t"\
2481 "decl %2 \n\t"\
2482 " jnz 1b \n\t"\
2483 : "+a"(src), "+c"(dst), "+m"(h)\
2484 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2485 : "memory"\
2486 );\
2487 }\
2488 \
/* Vertical 6-tap filter producing an 8x8 block. */\
/* Pass 1 widens 13 source rows (starting 2 rows above src) to 16-bit words */\
/* in temp, 16 bytes per row. Pass 2 filters down the columns of temp: */\
/*   (20*(r2+r3) - 5*(r1+r4) + (r0+r5) + 16) >> 5, saturated, stored via OP. */\
2489 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
/* 8 output rows + 5 extra filter rows, two uint64_t (8 words) each */\
2490 uint64_t temp[(8+5)*2];\
2491 uint64_t *temp_ptr= temp;\
2492 int h= 8+5;\
2493 \
2494 src -= 2*srcStride;\
2495 /*FIXME unroll */\
2496 asm volatile(\
2497 "pxor %%mm7, %%mm7 \n\t"\
2498 "1: \n\t"\
/* same 8 source bytes loaded twice: low half -> mm0, high half -> mm1 */\
2499 "movq (%0), %%mm0 \n\t"\
2500 "movq (%0), %%mm1 \n\t"\
2501 "punpcklbw %%mm7, %%mm0 \n\t"\
2502 "punpckhbw %%mm7, %%mm1 \n\t"\
2503 "movq %%mm0, (%1) \n\t"\
2504 "movq %%mm1, 8(%1) \n\t"\
2505 "addl $16, %1 \n\t"\
2506 "addl %3, %0 \n\t"\
2507 "decl %2 \n\t"\
2508 " jnz 1b \n\t"\
2509 : "+a" (src), "+c" (temp_ptr), "+d"(h)\
2510 : "S" (srcStride)\
2511 : "memory"\
2512 );\
2513 \
/* rewind to the first widened row; 8 output rows follow */\
2514 temp_ptr= temp;\
2515 h= 8;\
2516 \
2517 asm volatile(\
/* mm6 = ff_pw_5, mm7 = ff_pw_16 */\
2518 "movq %4, %%mm6 \n\t"\
2519 "movq %5, %%mm7 \n\t"\
2520 "1: \n\t"\
/* mm0/mm1 = 4*(r2+r3), low/high four columns */\
2521 "movq 2*16+0(%0), %%mm0 \n\t"\
2522 "movq 2*16+8(%0), %%mm1 \n\t"\
2523 "movq 3*16+0(%0), %%mm2 \n\t"\
2524 "movq 3*16+8(%0), %%mm3 \n\t"\
2525 "paddw %%mm2, %%mm0 \n\t"\
2526 "paddw %%mm3, %%mm1 \n\t"\
2527 "psllw $2, %%mm0 \n\t"\
2528 "psllw $2, %%mm1 \n\t"\
/* subtract (r1+r4), then multiply by 5 */\
2529 "movq 1*16+0(%0), %%mm2 \n\t"\
2530 "movq 1*16+8(%0), %%mm3 \n\t"\
2531 "movq 4*16+0(%0), %%mm4 \n\t"\
2532 "movq 4*16+8(%0), %%mm5 \n\t"\
2533 "paddw %%mm4, %%mm2 \n\t"\
2534 "paddw %%mm5, %%mm3 \n\t"\
2535 "psubw %%mm2, %%mm0 \n\t"\
2536 "psubw %%mm3, %%mm1 \n\t"\
2537 "pmullw %%mm6, %%mm0 \n\t"\
2538 "pmullw %%mm6, %%mm1 \n\t"\
/* add (r0+r5) and the rounding constant */\
2539 "movq 0*16+0(%0), %%mm2 \n\t"\
2540 "movq 0*16+8(%0), %%mm3 \n\t"\
2541 "movq 5*16+0(%0), %%mm4 \n\t"\
2542 "movq 5*16+8(%0), %%mm5 \n\t"\
2543 "paddw %%mm4, %%mm2 \n\t"\
2544 "paddw %%mm5, %%mm3 \n\t"\
2545 "paddw %%mm2, %%mm0 \n\t"\
2546 "paddw %%mm3, %%mm1 \n\t"\
2547 "paddw %%mm7, %%mm0 \n\t"\
2548 "paddw %%mm7, %%mm1 \n\t"\
2549 "psraw $5, %%mm0 \n\t"\
2550 "psraw $5, %%mm1 \n\t"\
2551 "packuswb %%mm1, %%mm0 \n\t"\
/* store 8 result bytes; mm5 is reloaded next pass so OP may clobber it */\
2552 OP(%%mm0, (%1),%%mm5, q)\
2553 "addl %3, %1 \n\t"\
/* slide the 6-row window down by one widened row */\
2554 "addl $16, %0 \n\t"\
2555 "decl %2 \n\t"\
2556 " jnz 1b \n\t"\
2557 \
2558 : "+a"(temp_ptr), "+c"(dst), "+d"(h)\
2559 : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2560 : "memory"\
2561 );\
2562 }\
2563 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2564 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
2565 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\
2566 src += 4*srcStride;\
2567 dst += 4*dstStride;\
2568 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
2569 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\
2570 }\
2571 static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2572 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
2573 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
2574 src += 8*srcStride;\
2575 dst += 8*dstStride;\
2576 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
2577 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
2578 }\
2579 \
2580 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2581 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
2582 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
2583 src += 8*srcStride;\
2584 dst += 8*dstStride;\
2585 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
2586 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
2587 }\
2588 \
2589 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2590 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
2591 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2592 src += 8*srcStride;\
2593 dst += 8*dstStride;\
2594 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
2595 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2596 }\
2597
/*
 * H264_MC(OPNAME, SIZE, MMX): emits the 16 entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_##MMX(dst, src, stride), one per
 * quarter-pel position (X = horizontal, Y = vertical quarter offset).
 * Half-pel planes come from the *_lowpass_ helpers; quarter positions
 * combine two planes (or a plane and the source) with the pixels*_l2_
 * helper. Intermediate planes are stored with a row stride of SIZE.
 */
#define H264_MC(OPNAME, SIZE, MMX) \
/* (0,0): plain block copy/average */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\
}\
\
/* (1,0): source combined with the horizontal half-pel plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t buf[SIZE*SIZE/8];\
    uint8_t * const hPlane= (uint8_t*)buf;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(hPlane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, hPlane, stride, stride, SIZE);\
}\
\
/* (2,0): horizontal half-pel plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
/* (3,0): source shifted one pixel right combined with the horizontal plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t buf[SIZE*SIZE/8];\
    uint8_t * const hPlane= (uint8_t*)buf;\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(hPlane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, hPlane, stride, stride, SIZE);\
}\
\
/* (0,1): source combined with the vertical half-pel plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t buf[SIZE*SIZE/8];\
    uint8_t * const vPlane= (uint8_t*)buf;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vPlane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, vPlane, stride, stride, SIZE);\
}\
\
/* (0,2): vertical half-pel plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
/* (0,3): source shifted one row down combined with the vertical plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t buf[SIZE*SIZE/8];\
    uint8_t * const vPlane= (uint8_t*)buf;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vPlane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, vPlane, stride, stride, SIZE);\
}\
\
/* (1,1): horizontal plane at src combined with vertical plane at src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t buf[2][SIZE*SIZE/8];\
    uint8_t * const hPlane= (uint8_t*)buf[0];\
    uint8_t * const vPlane= (uint8_t*)buf[1];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(hPlane, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vPlane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, hPlane, vPlane, stride, SIZE, SIZE);\
}\
\
/* (3,1): horizontal plane at src combined with vertical plane at src+1 */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t buf[2][SIZE*SIZE/8];\
    uint8_t * const hPlane= (uint8_t*)buf[0];\
    uint8_t * const vPlane= (uint8_t*)buf[1];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(hPlane, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vPlane, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, hPlane, vPlane, stride, SIZE, SIZE);\
}\
\
/* (1,3): horizontal plane one row down combined with vertical plane at src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t buf[2][SIZE*SIZE/8];\
    uint8_t * const hPlane= (uint8_t*)buf[0];\
    uint8_t * const vPlane= (uint8_t*)buf[1];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(hPlane, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vPlane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, hPlane, vPlane, stride, SIZE, SIZE);\
}\
\
/* (3,3): horizontal plane one row down combined with vertical plane at src+1 */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t buf[2][SIZE*SIZE/8];\
    uint8_t * const hPlane= (uint8_t*)buf[0];\
    uint8_t * const vPlane= (uint8_t*)buf[1];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(hPlane, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vPlane, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, hPlane, vPlane, stride, SIZE, SIZE);\
}\
\
/* (2,2): centre plane (hv filter) written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, mid, src, stride, SIZE, stride);\
}\
\
/* (2,1): horizontal plane at src combined with the centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t hPlane[SIZE*SIZE];\
    uint8_t cPlane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(hPlane, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(cPlane, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, hPlane, cPlane, stride, SIZE, SIZE);\
}\
\
/* (2,3): horizontal plane one row down combined with the centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t hPlane[SIZE*SIZE];\
    uint8_t cPlane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(hPlane, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(cPlane, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, hPlane, cPlane, stride, SIZE, SIZE);\
}\
\
/* (1,2): vertical plane at src combined with the centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t vPlane[SIZE*SIZE];\
    uint8_t cPlane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vPlane, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(cPlane, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, vPlane, cPlane, stride, SIZE, SIZE);\
}\
\
/* (3,2): vertical plane at src+1 combined with the centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t vPlane[SIZE*SIZE];\
    uint8_t cPlane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(vPlane, src+1, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(cPlane, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, vPlane, cPlane, stride, SIZE, SIZE);\
}\
2715
2264 2716
2265 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" 2717 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
2266 #define AVG_3DNOW_OP(a,b,temp, size) \ 2718 #define AVG_3DNOW_OP(a,b,temp, size) \
2267 "mov" #size " " #b ", " #temp " \n\t"\ 2719 "mov" #size " " #b ", " #temp " \n\t"\
2268 "pavgusb " #temp ", " #a " \n\t"\ 2720 "pavgusb " #temp ", " #a " \n\t"\
2279 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) 2731 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
2280 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) 2732 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
2281 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) 2733 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
2282 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) 2734 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
2283 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) 2735 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
2736
/* Instantiate the H.264 lowpass helpers: put and avg flavours for the
 * 3dnow and mmx2 function families. The second argument is the asm store
 * macro passed as OP (plain store for put_, averaging store for avg_). */
2737 QPEL_H264(put_ , PUT_OP, 3dnow)
2738 QPEL_H264(avg_ , AVG_3DNOW_OP, 3dnow)
2739 QPEL_H264(put_ , PUT_OP, mmx2)
2740 QPEL_H264(avg_ , AVG_MMX2_OP, mmx2)
2741
/* Instantiate the 16 mcXY entry points for every combination of
 * operation (put/avg), block size (4/8/16) and family (3dnow/mmx2). */
2742 H264_MC(put_, 4, 3dnow)
2743 H264_MC(put_, 8, 3dnow)
2744 H264_MC(put_, 16,3dnow)
2745 H264_MC(avg_, 4, 3dnow)
2746 H264_MC(avg_, 8, 3dnow)
2747 H264_MC(avg_, 16,3dnow)
2748 H264_MC(put_, 4, mmx2)
2749 H264_MC(put_, 8, mmx2)
2750 H264_MC(put_, 16,mmx2)
2751 H264_MC(avg_, 4, mmx2)
2752 H264_MC(avg_, 8, mmx2)
2753 H264_MC(avg_, 16,mmx2)
2284 2754
2285 #if 0 2755 #if 0
2286 static void just_return() { return; } 2756 static void just_return() { return; }
2287 #endif 2757 #endif
2288 2758
2619 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2) 3089 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
2620 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) 3090 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
2621 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) 3091 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
2622 #endif 3092 #endif
2623 3093
3094 //FIXME 3dnow too
3095 #define dspfunc(PFX, IDX, NUM) \
3096 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
3097 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
3098 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
3099 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
3100 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
3101 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
3102 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
3103 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
3104 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
3105 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
3106 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
3107 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
3108 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
3109 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
3110 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
3111 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
3112
3113 dspfunc(put_h264_qpel, 0, 16);
3114 dspfunc(put_h264_qpel, 1, 8);
3115 dspfunc(put_h264_qpel, 2, 4);
3116 dspfunc(avg_h264_qpel, 0, 16);
3117 dspfunc(avg_h264_qpel, 1, 8);
3118 dspfunc(avg_h264_qpel, 2, 4);
3119 #undef dspfunc
3120
2624 #ifdef CONFIG_ENCODERS 3121 #ifdef CONFIG_ENCODERS
2625 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; 3122 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
2626 #endif //CONFIG_ENCODERS 3123 #endif //CONFIG_ENCODERS
2627 } else if (mm_flags & MM_3DNOW) { 3124 } else if (mm_flags & MM_3DNOW) {
2628 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; 3125 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2678 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow) 3175 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
2679 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow) 3176 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
2680 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) 3177 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
2681 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) 3178 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
2682 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) 3179 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
3180
3181 #define dspfunc(PFX, IDX, NUM) \
3182 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
3183 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
3184 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
3185 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
3186 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
3187 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
3188 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
3189 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
3190 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
3191 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
3192 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
3193 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
3194 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
3195 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
3196 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
3197 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
3198
3199 dspfunc(put_h264_qpel, 0, 16);
3200 dspfunc(put_h264_qpel, 1, 8);
3201 dspfunc(put_h264_qpel, 2, 4);
3202 dspfunc(avg_h264_qpel, 0, 16);
3203 dspfunc(avg_h264_qpel, 1, 8);
3204 dspfunc(avg_h264_qpel, 2, 4);
2683 } 3205 }
2684 } 3206 }
2685 3207
2686 #ifdef CONFIG_ENCODERS 3208 #ifdef CONFIG_ENCODERS
2687 dsputil_init_pix_mmx(c, avctx); 3209 dsputil_init_pix_mmx(c, avctx);