Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 2209:c4a476971abc libavcodec
h264 luma motion compensation in mmx2/3dnow
author | michael |
---|---|
date | Tue, 07 Sep 2004 01:48:45 +0000 |
parents | 22b768f1261a |
children | e1c60876a0ae |
comparison
equal
deleted
inserted
replaced
2208:9ca8a88a8a70 | 2209:c4a476971abc |
---|---|
/* 64-bit constants, 8-byte aligned, intended as memory operands for MMX code.
 * mm_wone / mm_wtwo / ff_pw_N hold one 16-bit word replicated four times
 * (e.g. ff_pw_5 is 4 x 0x0005, ff_pw_20 is 4 x 0x0014); ff_pb_FC holds the byte
 * 0xFC replicated eight times.
 * ff_pw_5 and ff_pw_16 are the multiplier and rounding term referenced by the
 * h264 lowpass routines visible later in this listing; ff_pw_32 is not
 * referenced anywhere in the visible hunks. */
37 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | 37 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; |
38 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | 38 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; |
39 | 39 |
40 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; | 40 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; |
41 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; | 41 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; |
42 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL; | |
42 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; | 43 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; |
44 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL; | |
43 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; | 45 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; |
44 | 46 |
45 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; | 47 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; |
46 | 48 |
/* Emits a ".balign 8" assembler directive at the point of use. */
47 #define JUMPALIGN() __asm __volatile (".balign 8"::) | 49 #define JUMPALIGN() __asm __volatile (".balign 8"::) |
350 :"r"(p) | 352 :"r"(p) |
351 :"memory"); | 353 :"memory"); |
352 pix += line_size*2; | 354 pix += line_size*2; |
353 p += 16; | 355 p += 16; |
354 } while (--i); | 356 } while (--i); |
357 } | |
358 | |
/**
 * Copy a 4-byte-wide column of rows from pixels to block.
 *
 * @param block      destination; advanced by line_size per row
 * @param pixels     source; advanced by line_size per row
 * @param line_size  byte distance between consecutive rows (same for both)
 * @param h          number of rows to copy
 *
 * Each loop pass copies 4 rows and subtracts 4 from h, looping until h is
 * exactly zero, so h is expected to be a positive multiple of 4.
 * The pointer updates use addl, i.e. 32-bit arithmetic on the pointer operands.
 */
359 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
360 { | |
361 __asm __volatile( | |
/* eax = 2*line_size: the step applied after each pair of rows */
362 "lea (%3, %3), %%eax \n\t" | |
363 ".balign 8 \n\t" | |
364 "1: \n\t" | |
/* rows 0 and 1: load 4 bytes each from pixels, store to block */
365 "movd (%1), %%mm0 \n\t" | |
366 "movd (%1, %3), %%mm1 \n\t" | |
367 "movd %%mm0, (%2) \n\t" | |
368 "movd %%mm1, (%2, %3) \n\t" | |
369 "addl %%eax, %1 \n\t" | |
370 "addl %%eax, %2 \n\t" | |
/* rows 2 and 3: same again */
371 "movd (%1), %%mm0 \n\t" | |
372 "movd (%1, %3), %%mm1 \n\t" | |
373 "movd %%mm0, (%2) \n\t" | |
374 "movd %%mm1, (%2, %3) \n\t" | |
375 "addl %%eax, %1 \n\t" | |
376 "addl %%eax, %2 \n\t" | |
377 "subl $4, %0 \n\t" | |
378 "jnz 1b \n\t" | |
/* %0 = h, %1 = pixels, %2 = block (all read-write), %3 = line_size */
379 : "+g"(h), "+r" (pixels), "+r" (block) | |
380 : "r"(line_size) | |
381 : "%eax", "memory" | |
382 ); | |
355 } | 383 } |
356 | 384 |
357 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 385 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
358 { | 386 { |
359 __asm __volatile( | 387 __asm __volatile( |
1977 :"memory"\ | 2005 :"memory"\ |
1978 );\ | 2006 );\ |
1979 }\ | 2007 }\ |
1980 \ | 2008 \ |
1981 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | 2009 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
1982 uint64_t temp[9*4];\ | 2010 uint64_t temp[9*2];\ |
1983 uint64_t *temp_ptr= temp;\ | 2011 uint64_t *temp_ptr= temp;\ |
1984 int count= 9;\ | 2012 int count= 9;\ |
1985 \ | 2013 \ |
1986 /*FIXME unroll */\ | 2014 /*FIXME unroll */\ |
1987 asm volatile(\ | 2015 asm volatile(\ |
2259 uint8_t * const halfH= ((uint8_t*)half);\ | 2287 uint8_t * const halfH= ((uint8_t*)half);\ |
2260 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | 2288 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
2261 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | 2289 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
2262 } | 2290 } |
2263 | 2291 |
/* QPEL_H264(OPNAME, OP, MMX): generates the h264 luma lowpass helpers
 * (4/8/16 wide; horizontal, vertical and combined "hv") with names prefixed
 * by OPNAME and suffixed by MMX. OP is the asm-string macro that emits the
 * final write to the destination (PUT_OP, for instance, is a plain mov). */
2292 #define QPEL_H264(OPNAME, OP, MMX)\ | |
/* Horizontal 6-tap lowpass over a 4x4 block. For each of 4 rows and each of
 * 4 output pixels x it computes
 *   ((s[x]+s[x+1])*20 - (s[x-1]+s[x+2])*5 + (s[x-2]+s[x+3]) + 16) >> 5,
 * packs the result to bytes with unsigned saturation and writes 4 bytes to
 * dst through OP. Each row reads source bytes src[-2] .. src[6].
 * Operands: %0 src, %1 dst, %2 row counter h (in memory), %3 srcStride,
 * %4 dstStride, %5 ff_pw_5, %6 ff_pw_16. */
2293 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2294 int h=4;\ | |
2295 \ | |
2296 asm volatile(\ | |
/* mm7 = 0 (for byte->word unpacking), mm4 = 5 in each word, mm5 = 16 in each word */
2297 "pxor %%mm7, %%mm7 \n\t"\ | |
2298 "movq %5, %%mm4 \n\t"\ | |
2299 "movq %6, %%mm5 \n\t"\ | |
2300 "1: \n\t"\ | |
2301 "movd -1(%0), %%mm1 \n\t"\ | |
2302 "movd (%0), %%mm2 \n\t"\ | |
2303 "movd 1(%0), %%mm3 \n\t"\ | |
2304 "movd 2(%0), %%mm0 \n\t"\ | |
2305 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
2306 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
2307 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
2308 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
/* mm1 = s[x-1]+s[x+2], mm2 = s[x]+s[x+1] */
2309 "paddw %%mm0, %%mm1 \n\t"\ | |
2310 "paddw %%mm3, %%mm2 \n\t"\ | |
2311 "movd -2(%0), %%mm0 \n\t"\ | |
2312 "movd 3(%0), %%mm3 \n\t"\ | |
2313 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2314 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
/* mm0 = s[x-2]+s[x+3] */
2315 "paddw %%mm3, %%mm0 \n\t"\ | |
/* mm2 = (4*mm2 - mm1) * 5 = 20*(s[x]+s[x+1]) - 5*(s[x-1]+s[x+2]) */
2316 "psllw $2, %%mm2 \n\t"\ | |
2317 "psubw %%mm1, %%mm2 \n\t"\ | |
2318 "pmullw %%mm4, %%mm2 \n\t"\ | |
/* add the rounding term 16 and the outer taps, then >>5 and pack to bytes */
2319 "paddw %%mm5, %%mm0 \n\t"\ | |
2320 "paddw %%mm2, %%mm0 \n\t"\ | |
2321 "psraw $5, %%mm0 \n\t"\ | |
2322 "packuswb %%mm0, %%mm0 \n\t"\ | |
2323 OP(%%mm0, (%1),%%mm6, d)\ | |
/* next source / destination row */
2324 "addl %3, %0 \n\t"\ | |
2325 "addl %4, %1 \n\t"\ | |
2326 "decl %2 \n\t"\ | |
2327 " jnz 1b \n\t"\ | |
2328 : "+a"(src), "+c"(dst), "+m"(h)\ | |
2329 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
2330 : "memory"\ | |
2331 );\ | |
2332 }\ | |
/* Vertical 6-tap lowpass over a 4x4 block, done in two passes.
 * Pass 1 widens 9 source rows (starting two rows above src) from bytes to
 * 16-bit words and stores them in temp, 8 bytes per row.
 * Pass 2 produces 4 output rows; row y combines temp rows y..y+5 as
 *   ((t[2]+t[3])*20 - (t[1]+t[4])*5 + (t[0]+t[5]) + 16) >> 5,
 * packs with unsigned saturation and writes 4 bytes to dst through OP. */
2333 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
/* 4 output rows + 5 extra filter rows, one quadword (4 words) each */
2334 uint64_t temp[4+5];\ | |
2335 uint64_t *temp_ptr= temp;\ | |
/* 3 loop passes x 3 rows per pass = 9 rows */
2336 int h= 3;\ | |
2337 src -= 2*srcStride;\ | |
2338 /*FIXME unroll */\ | |
2339 asm volatile(\ | |
2340 "pxor %%mm7, %%mm7 \n\t"\ | |
2341 "1: \n\t"\ | |
2342 "movd (%0), %%mm0 \n\t"\ | |
2343 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2344 "movq %%mm0, (%1) \n\t"\ | |
2345 "addl %3, %0 \n\t"\ | |
2346 "movd (%0), %%mm0 \n\t"\ | |
2347 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2348 "movq %%mm0, 8(%1) \n\t"\ | |
2349 "addl %3, %0 \n\t"\ | |
2350 "movd (%0), %%mm0 \n\t"\ | |
2351 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2352 "movq %%mm0, 16(%1) \n\t"\ | |
2353 "addl %3, %0 \n\t"\ | |
2354 "addl $24, %1 \n\t"\ | |
2355 "decl %2 \n\t"\ | |
2356 " jnz 1b \n\t"\ | |
2357 : "+a" (src), "+c" (temp_ptr), "+d"(h)\ | |
2358 : "S" (srcStride)\ | |
2359 : "memory"\ | |
2360 );\ | |
2361 \ | |
/* restart at the first widened row; 4 output rows follow */
2362 temp_ptr= temp;\ | |
2363 h= 4;\ | |
2364 \ | |
2365 asm volatile(\ | |
/* mm6 = 5 in each word (ff_pw_5), mm7 = 16 in each word (ff_pw_16) */
2366 "movq %4, %%mm6 \n\t"\ | |
2367 "movq %5, %%mm7 \n\t"\ | |
2368 "1: \n\t"\ | |
/* mm0 = ((t[2]+t[3])*4 - (t[1]+t[4])) * 5 */
2369 "movq 2*8(%0), %%mm0 \n\t"\ | |
2370 "movq 3*8(%0), %%mm1 \n\t"\ | |
2371 "paddw %%mm1, %%mm0 \n\t"\ | |
2372 "psllw $2, %%mm0 \n\t"\ | |
2373 "movq 1*8(%0), %%mm2 \n\t"\ | |
2374 "movq 4*8(%0), %%mm3 \n\t"\ | |
2375 "paddw %%mm3, %%mm2 \n\t"\ | |
2376 "psubw %%mm2, %%mm0 \n\t"\ | |
2377 "pmullw %%mm6, %%mm0 \n\t"\ | |
/* mm4 = t[0]+t[5]+16; sum, >>5, pack to bytes */
2378 "movq 0*8(%0), %%mm4 \n\t"\ | |
2379 "movq 5*8(%0), %%mm5 \n\t"\ | |
2380 "paddw %%mm5, %%mm4 \n\t"\ | |
2381 "paddw %%mm7, %%mm4 \n\t"\ | |
2382 "paddw %%mm4, %%mm0 \n\t"\ | |
2383 "psraw $5, %%mm0 \n\t"\ | |
2384 "packuswb %%mm0, %%mm0 \n\t"\ | |
2385 OP(%%mm0, (%1),%%mm5, d)\ | |
/* next dst row; slide the 6-row window down by one temp row (8 bytes) */
2386 "addl %3, %1 \n\t"\ | |
2387 "addl $8, %0 \n\t"\ | |
2388 "decl %2 \n\t"\ | |
2389 " jnz 1b \n\t"\ | |
2390 \ | |
2391 : "+a"(temp_ptr), "+c"(dst), "+d"(h)\ | |
2392 : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
2393 : "memory"\ | |
2394 );\ | |
2395 }\ | |
/* Combined horizontal+vertical 6-tap lowpass over a 4x4 block, in plain C.
 * Pass 1 applies the horizontal filter (no rounding, no shift) to 9 source
 * rows starting two rows above src and stores the 16-bit sums in tmp
 * (row pitch tmpStride, in int16_t elements).
 * Pass 2 walks the 4 columns, applies the same filter vertically to tmp,
 * adds 512, shifts right by 10 and maps the result through cm[] into dst.
 * cm points into cropTbl, which is defined outside this view; presumably a
 * clamp-to-byte lookup table -- TODO confirm.
 * NOTE(review): dst is written by plain assignment and OP is never used in
 * this function, so the avg_ instantiations look identical to the put_ ones
 * here; confirm whether that is intended. */
2396 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2397 const int h=4;\ | |
2398 const int w=4;\ | |
2399 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ | |
2400 int i;\ | |
2401 src -= 2*srcStride;\ | |
/* pass 1: h+5 = 9 rows of unrounded horizontal filter output */
2402 for(i=0; i<h+5; i++)\ | |
2403 {\ | |
2404 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ | |
2405 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ | |
2406 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\ | |
2407 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\ | |
2408 tmp+=tmpStride;\ | |
2409 src+=srcStride;\ | |
2410 }\ | |
/* tmp advanced 9 rows above; step back 7 so it points at the third stored
 * row, making tmp[-2*tmpStride] the first stored row */
2411 tmp -= tmpStride*(h+5-2);\ | |
/* pass 2: one column per iteration, four output rows each */
2412 for(i=0; i<w; i++)\ | |
2413 {\ | |
2414 const int tmpB= tmp[-2*tmpStride];\ | |
2415 const int tmpA= tmp[-1*tmpStride];\ | |
2416 const int tmp0= tmp[0 *tmpStride];\ | |
2417 const int tmp1= tmp[1 *tmpStride];\ | |
2418 const int tmp2= tmp[2 *tmpStride];\ | |
2419 const int tmp3= tmp[3 *tmpStride];\ | |
2420 const int tmp4= tmp[4 *tmpStride];\ | |
2421 const int tmp5= tmp[5 *tmpStride];\ | |
2422 const int tmp6= tmp[6 *tmpStride];\ | |
2423 dst[0*dstStride]= cm[( (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3) + 512)>>10];\ | |
2424 dst[1*dstStride]= cm[( (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4) + 512)>>10];\ | |
2425 dst[2*dstStride]= cm[( (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5) + 512)>>10];\ | |
2426 dst[3*dstStride]= cm[( (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6) + 512)>>10];\ | |
2427 dst++;\ | |
2428 tmp++;\ | |
2429 }\ | |
2430 }\ | |
2431 \ | |
/* Horizontal 6-tap lowpass over an 8x8 block: same formula as the 4-wide
 * version, ((s[x]+s[x+1])*20 - (s[x-1]+s[x+2])*5 + (s[x-2]+s[x+3]) + 16) >> 5,
 * computed for 8 pixels per row as a low half (x=0..3, in mm0) and a high
 * half (x=4..7, in mm1). Each row reads source bytes src[-2] .. src[10] and
 * writes 8 bytes to dst through OP.
 * Operands: %0 src, %1 dst, %2 row counter h (in memory), %3 srcStride,
 * %4 dstStride, %5 ff_pw_5, %6 ff_pw_16. */
2432 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2433 int h=8;\ | |
2434 asm volatile(\ | |
/* mm7 = 0 for unpacking, mm6 = 5 in each word */
2435 "pxor %%mm7, %%mm7 \n\t"\ | |
2436 "movq %5, %%mm6 \n\t"\ | |
2437 "1: \n\t"\ | |
/* mm0/mm1 = 4*(s[x]+s[x+1]) for the low/high four pixels */
2438 "movq (%0), %%mm0 \n\t"\ | |
2439 "movq 1(%0), %%mm2 \n\t"\ | |
2440 "movq %%mm0, %%mm1 \n\t"\ | |
2441 "movq %%mm2, %%mm3 \n\t"\ | |
2442 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2443 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2444 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
2445 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
2446 "paddw %%mm2, %%mm0 \n\t"\ | |
2447 "paddw %%mm3, %%mm1 \n\t"\ | |
2448 "psllw $2, %%mm0 \n\t"\ | |
2449 "psllw $2, %%mm1 \n\t"\ | |
/* mm2/mm3 = low/high words of s[-1..6], mm4/mm5 = low/high words of s[2..9] */
2450 "movq -1(%0), %%mm2 \n\t"\ | |
2451 "movq 2(%0), %%mm4 \n\t"\ | |
2452 "movq %%mm2, %%mm3 \n\t"\ | |
2453 "movq %%mm4, %%mm5 \n\t"\ | |
2454 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
2455 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
2456 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
2457 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
/* mm2 / mm5 = s[x-1]+s[x+2] for low / high half; mm3 (= s[3..6]) and
 * mm4 (= s[2..5]) are left intact and reused below as outer taps */
2458 "paddw %%mm4, %%mm2 \n\t"\ | |
2459 "paddw %%mm3, %%mm5 \n\t"\ | |
2460 "psubw %%mm2, %%mm0 \n\t"\ | |
2461 "psubw %%mm5, %%mm1 \n\t"\ | |
2462 "pmullw %%mm6, %%mm0 \n\t"\ | |
2463 "pmullw %%mm6, %%mm1 \n\t"\ | |
/* outer taps: low half s[-2..1]+s[3..6] -> mm2, high half s[2..5]+s[7..10] -> mm4 */
2464 "movd -2(%0), %%mm2 \n\t"\ | |
2465 "movd 7(%0), %%mm5 \n\t"\ | |
2466 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
2467 "punpcklbw %%mm7, %%mm5 \n\t"\ | |
2468 "paddw %%mm3, %%mm2 \n\t"\ | |
2469 "paddw %%mm5, %%mm4 \n\t"\ | |
/* add rounding term 16 (ff_pw_16), sum, >>5, pack both halves into 8 bytes */
2470 "movq %6, %%mm5 \n\t"\ | |
2471 "paddw %%mm5, %%mm2 \n\t"\ | |
2472 "paddw %%mm5, %%mm4 \n\t"\ | |
2473 "paddw %%mm2, %%mm0 \n\t"\ | |
2474 "paddw %%mm4, %%mm1 \n\t"\ | |
2475 "psraw $5, %%mm0 \n\t"\ | |
2476 "psraw $5, %%mm1 \n\t"\ | |
2477 "packuswb %%mm1, %%mm0 \n\t"\ | |
2478 OP(%%mm0, (%1),%%mm5, q)\ | |
2479 "addl %3, %0 \n\t"\ | |
2480 "addl %4, %1 \n\t"\ | |
2481 "decl %2 \n\t"\ | |
2482 " jnz 1b \n\t"\ | |
2483 : "+a"(src), "+c"(dst), "+m"(h)\ | |
2484 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
2485 : "memory"\ | |
2486 );\ | |
2487 }\ | |
2488 \ | |
/* Vertical 6-tap lowpass over an 8x8 block, done in two passes.
 * Pass 1 widens 13 source rows (starting two rows above src) from bytes to
 * 16-bit words and stores them in temp, 16 bytes (two quadwords) per row.
 * Pass 2 produces 8 output rows; row y combines temp rows y..y+5 as
 *   ((t[2]+t[3])*20 - (t[1]+t[4])*5 + (t[0]+t[5]) + 16) >> 5
 * for the low and high four pixels, packs with unsigned saturation and
 * writes 8 bytes to dst through OP. */
2489 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
/* 8 output rows + 5 extra filter rows, two quadwords per row */
2490 uint64_t temp[(8+5)*2];\ | |
2491 uint64_t *temp_ptr= temp;\ | |
2492 int h= 8+5;\ | |
2493 \ | |
2494 src -= 2*srcStride;\ | |
2495 /*FIXME unroll */\ | |
2496 asm volatile(\ | |
2497 "pxor %%mm7, %%mm7 \n\t"\ | |
2498 "1: \n\t"\ | |
/* load the same 8 bytes twice, widen low half into mm0 and high half into mm1 */
2499 "movq (%0), %%mm0 \n\t"\ | |
2500 "movq (%0), %%mm1 \n\t"\ | |
2501 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
2502 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2503 "movq %%mm0, (%1) \n\t"\ | |
2504 "movq %%mm1, 8(%1) \n\t"\ | |
2505 "addl $16, %1 \n\t"\ | |
2506 "addl %3, %0 \n\t"\ | |
2507 "decl %2 \n\t"\ | |
2508 " jnz 1b \n\t"\ | |
2509 : "+a" (src), "+c" (temp_ptr), "+d"(h)\ | |
2510 : "S" (srcStride)\ | |
2511 : "memory"\ | |
2512 );\ | |
2513 \ | |
/* restart at the first widened row; 8 output rows follow */
2514 temp_ptr= temp;\ | |
2515 h= 8;\ | |
2516 \ | |
2517 asm volatile(\ | |
/* mm6 = 5 in each word (ff_pw_5), mm7 = 16 in each word (ff_pw_16) */
2518 "movq %4, %%mm6 \n\t"\ | |
2519 "movq %5, %%mm7 \n\t"\ | |
2520 "1: \n\t"\ | |
/* mm0/mm1 = 4*(t[2]+t[3]) for the low/high four pixels */
2521 "movq 2*16+0(%0), %%mm0 \n\t"\ | |
2522 "movq 2*16+8(%0), %%mm1 \n\t"\ | |
2523 "movq 3*16+0(%0), %%mm2 \n\t"\ | |
2524 "movq 3*16+8(%0), %%mm3 \n\t"\ | |
2525 "paddw %%mm2, %%mm0 \n\t"\ | |
2526 "paddw %%mm3, %%mm1 \n\t"\ | |
2527 "psllw $2, %%mm0 \n\t"\ | |
2528 "psllw $2, %%mm1 \n\t"\ | |
/* subtract t[1]+t[4], then multiply by 5 */
2529 "movq 1*16+0(%0), %%mm2 \n\t"\ | |
2530 "movq 1*16+8(%0), %%mm3 \n\t"\ | |
2531 "movq 4*16+0(%0), %%mm4 \n\t"\ | |
2532 "movq 4*16+8(%0), %%mm5 \n\t"\ | |
2533 "paddw %%mm4, %%mm2 \n\t"\ | |
2534 "paddw %%mm5, %%mm3 \n\t"\ | |
2535 "psubw %%mm2, %%mm0 \n\t"\ | |
2536 "psubw %%mm3, %%mm1 \n\t"\ | |
2537 "pmullw %%mm6, %%mm0 \n\t"\ | |
2538 "pmullw %%mm6, %%mm1 \n\t"\ | |
/* add outer taps t[0]+t[5] and the rounding term, >>5, pack to 8 bytes */
2539 "movq 0*16+0(%0), %%mm2 \n\t"\ | |
2540 "movq 0*16+8(%0), %%mm3 \n\t"\ | |
2541 "movq 5*16+0(%0), %%mm4 \n\t"\ | |
2542 "movq 5*16+8(%0), %%mm5 \n\t"\ | |
2543 "paddw %%mm4, %%mm2 \n\t"\ | |
2544 "paddw %%mm5, %%mm3 \n\t"\ | |
2545 "paddw %%mm2, %%mm0 \n\t"\ | |
2546 "paddw %%mm3, %%mm1 \n\t"\ | |
2547 "paddw %%mm7, %%mm0 \n\t"\ | |
2548 "paddw %%mm7, %%mm1 \n\t"\ | |
2549 "psraw $5, %%mm0 \n\t"\ | |
2550 "psraw $5, %%mm1 \n\t"\ | |
2551 "packuswb %%mm1, %%mm0 \n\t"\ | |
2552 OP(%%mm0, (%1),%%mm5, q)\ | |
/* next dst row; slide the 6-row window down by one temp row (16 bytes) */
2553 "addl %3, %1 \n\t"\ | |
2554 "addl $16, %0 \n\t"\ | |
2555 "decl %2 \n\t"\ | |
2556 " jnz 1b \n\t"\ | |
2557 \ | |
2558 : "+a"(temp_ptr), "+c"(dst), "+d"(h)\ | |
2559 : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ | |
2560 : "memory"\ | |
2561 );\ | |
2562 }\ | |
/* 8x8 hv lowpass built from four 4x4 calls: top-left, top-right, then
 * (after advancing src and dst by 4 rows) bottom-left and bottom-right.
 * The same tmp / tmp+4 scratch areas are reused for the bottom pair. */
2563 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2564 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2565 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\ | |
2566 src += 4*srcStride;\ | |
2567 dst += 4*dstStride;\ | |
2568 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2569 OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\ | |
2570 }\ | |
/* 16x16 vertical lowpass built from four 8x8 calls: left and right halves of
 * the top 8 rows, then (after advancing src and dst by 8 rows) of the bottom
 * 8 rows. */
2571 static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2572 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | |
2573 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
2574 src += 8*srcStride;\ | |
2575 dst += 8*dstStride;\ | |
2576 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | |
2577 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
2578 }\ | |
2579 \ | |
/* 16x16 horizontal lowpass built from four 8x8 calls: left and right halves
 * of the top 8 rows, then (after advancing src and dst by 8 rows) of the
 * bottom 8 rows. */
2580 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2581 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | |
2582 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
2583 src += 8*srcStride;\ | |
2584 dst += 8*dstStride;\ | |
2585 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ | |
2586 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
2587 }\ | |
2588 \ | |
/* 16x16 hv lowpass built from four 8x8 hv calls: left and right halves of the
 * top 8 rows, then (after advancing src and dst by 8 rows) of the bottom
 * 8 rows. The same tmp / tmp+8 scratch areas are reused for the bottom pair. */
2589 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2590 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2591 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ | |
2592 src += 8*srcStride;\ | |
2593 dst += 8*dstStride;\ | |
2594 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2595 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ | |
2596 }\ | |
2597 | |
/* H264_MC(OPNAME, SIZE, MMX): generates the sixteen SIZExSIZE luma motion
 * compensation entry points OPNAME h264_qpel SIZE _mcXY_ MMX(dst, src, stride).
 * Judging by which lowpass each variant uses, X is the horizontal and Y the
 * vertical quarter-sample offset (0..3).
 * Pure half-sample positions (mc20, mc02, mc22) write through the OPNAME
 * lowpass directly. Every other fractional position builds one or two
 * intermediate SIZE-pitch planes with the put_ lowpass routines and merges
 * them into dst with OPNAME pixels SIZE _l2_ MMX, which is defined outside
 * this view (presumably averages its two sources -- TODO confirm). From the
 * calls here its arguments appear to be (dst, src1, src2, dstStride,
 * src1Stride, h), with src2 always a SIZE-pitch buffer. */
2598 #define H264_MC(OPNAME, SIZE, MMX) \ | |
/* mc00: integer position, plain OPNAME block operation */
2599 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
2600 OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\ | |
2601 }\ | |
2602 \ | |
/* mc10: merge src with the horizontal lowpass of src.
 * temp is SIZE*SIZE/8 quadwords = SIZE*SIZE bytes, one byte plane. */
2603 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2604 uint64_t temp[SIZE*SIZE/8];\ | |
2605 uint8_t * const half= (uint8_t*)temp;\ | |
2606 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\ | |
2607 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\ | |
2608 }\ | |
2609 \ | |
/* mc20: horizontal lowpass written straight to dst */
2610 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2611 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ | |
2612 }\ | |
2613 \ | |
/* mc30: like mc10 but merged with src+1 (the pixel to the right) */
2614 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2615 uint64_t temp[SIZE*SIZE/8];\ | |
2616 uint8_t * const half= (uint8_t*)temp;\ | |
2617 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\ | |
2618 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, half, stride, stride, SIZE);\ | |
2619 }\ | |
2620 \ | |
/* mc01: merge src with the vertical lowpass of src */
2621 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2622 uint64_t temp[SIZE*SIZE/8];\ | |
2623 uint8_t * const half= (uint8_t*)temp;\ | |
2624 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\ | |
2625 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\ | |
2626 }\ | |
2627 \ | |
/* mc02: vertical lowpass written straight to dst */
2628 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2629 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
2630 }\ | |
2631 \ | |
/* mc03: like mc01 but merged with src+stride (the pixel below) */
2632 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2633 uint64_t temp[SIZE*SIZE/8];\ | |
2634 uint8_t * const half= (uint8_t*)temp;\ | |
2635 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\ | |
2636 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\ | |
2637 }\ | |
2638 \ | |
/* mc11 / mc31 / mc13 / mc33: merge a horizontal-lowpass plane with a
 * vertical-lowpass plane. temp is SIZE*SIZE/4 quadwords = two byte planes;
 * halfH is the first, halfV starts SIZE*SIZE bytes in. The four variants
 * differ only in where each filter starts: the horizontal one at src or
 * src+stride, the vertical one at src or src+1. */
2639 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2640 uint64_t temp[SIZE*SIZE/4];\ | |
2641 uint8_t * const halfH= (uint8_t*)temp;\ | |
2642 uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
2643 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ | |
2644 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ | |
2645 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ | |
2646 }\ | |
2647 \ | |
2648 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2649 uint64_t temp[SIZE*SIZE/4];\ | |
2650 uint8_t * const halfH= (uint8_t*)temp;\ | |
2651 uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
2652 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ | |
2653 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ | |
2654 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ | |
2655 }\ | |
2656 \ | |
2657 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2658 uint64_t temp[SIZE*SIZE/4];\ | |
2659 uint8_t * const halfH= (uint8_t*)temp;\ | |
2660 uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
2661 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ | |
2662 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ | |
2663 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ | |
2664 }\ | |
2665 \ | |
2666 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2667 uint64_t temp[SIZE*SIZE/4];\ | |
2668 uint8_t * const halfH= (uint8_t*)temp;\ | |
2669 uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ | |
2670 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ | |
2671 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ | |
2672 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ | |
2673 }\ | |
2674 \ | |
/* mc22: combined hv lowpass written straight to dst; tmp holds SIZE+5 rows
 * of SIZE 16-bit intermediates.
 * NOTE(review): the visible 4x4 hv lowpass stores with plain assignment and
 * ignores OP, so the avg_ flavour of mc22 looks like it overwrites dst
 * instead of averaging -- confirm. */
2675 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2676 int16_t tmp[SIZE*(SIZE+5)];\ | |
2677 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\ | |
2678 }\ | |
2679 \ | |
/* mc21 / mc23: merge a horizontal-lowpass plane (taken at src or src+stride)
 * with the hv-lowpass plane */
2680 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2681 int16_t tmp[SIZE*(SIZE+5)];\ | |
2682 uint8_t halfH[SIZE*SIZE];\ | |
2683 uint8_t halfHV[SIZE*SIZE];\ | |
2684 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ | |
2685 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2686 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\ | |
2687 }\ | |
2688 \ | |
2689 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2690 int16_t tmp[SIZE*(SIZE+5)];\ | |
2691 uint8_t halfH[SIZE*SIZE];\ | |
2692 uint8_t halfHV[SIZE*SIZE];\ | |
2693 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ | |
2694 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2695 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\ | |
2696 }\ | |
2697 \ | |
/* mc12 / mc32: merge a vertical-lowpass plane (taken at src or src+1) with
 * the hv-lowpass plane */
2698 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2699 int16_t tmp[SIZE*(SIZE+5)];\ | |
2700 uint8_t halfV[SIZE*SIZE];\ | |
2701 uint8_t halfHV[SIZE*SIZE];\ | |
2702 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ | |
2703 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2704 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ | |
2705 }\ | |
2706 \ | |
2707 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
2708 int16_t tmp[SIZE*(SIZE+5)];\ | |
2709 uint8_t halfV[SIZE*SIZE];\ | |
2710 uint8_t halfHV[SIZE*SIZE];\ | |
2711 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ | |
2712 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2713 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ | |
2714 }\ | |
2715 | |
2264 | 2716 |
2265 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" | 2717 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" |
2266 #define AVG_3DNOW_OP(a,b,temp, size) \ | 2718 #define AVG_3DNOW_OP(a,b,temp, size) \ |
2267 "mov" #size " " #b ", " #temp " \n\t"\ | 2719 "mov" #size " " #b ", " #temp " \n\t"\ |
2268 "pavgusb " #temp ", " #a " \n\t"\ | 2720 "pavgusb " #temp ", " #a " \n\t"\ |
2279 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) | 2731 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) |
2280 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) | 2732 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) |
2281 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) | 2733 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) |
2282 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) | 2734 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) |
2283 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) | 2735 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) |
2736 | |
/* Instantiate the h264 lowpass helpers for both write modes (put_ / avg_)
 * and both instruction-set suffixes (3dnow / mmx2). Each avg_ flavour passes
 * the averaging OP macro matching its suffix. */
2737 QPEL_H264(put_ , PUT_OP, 3dnow) | |
2738 QPEL_H264(avg_ , AVG_3DNOW_OP, 3dnow) | |
2739 QPEL_H264(put_ , PUT_OP, mmx2) | |
2740 QPEL_H264(avg_ , AVG_MMX2_OP, mmx2) | |
2741 | |
/* Instantiate the sixteen mcXY entry points for every combination of write
 * mode, block size (4, 8, 16) and suffix. These must come after the
 * QPEL_H264 instantiations above, which define the lowpass routines they
 * call. */
2742 H264_MC(put_, 4, 3dnow) | |
2743 H264_MC(put_, 8, 3dnow) | |
2744 H264_MC(put_, 16,3dnow) | |
2745 H264_MC(avg_, 4, 3dnow) | |
2746 H264_MC(avg_, 8, 3dnow) | |
2747 H264_MC(avg_, 16,3dnow) | |
2748 H264_MC(put_, 4, mmx2) | |
2749 H264_MC(put_, 8, mmx2) | |
2750 H264_MC(put_, 16,mmx2) | |
2751 H264_MC(avg_, 4, mmx2) | |
2752 H264_MC(avg_, 8, mmx2) | |
2753 H264_MC(avg_, 16,mmx2) | |
2284 | 2754 |
2285 #if 0 | 2755 #if 0 |
2286 static void just_return() { return; } | 2756 static void just_return() { return; } |
2287 #endif | 2757 #endif |
2288 | 2758 |
2619 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2) | 3089 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2) |
2620 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) | 3090 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) |
2621 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) | 3091 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) |
2622 #endif | 3092 #endif |
2623 | 3093 |
3094 //FIXME 3dnow too | |
3095 #define dspfunc(PFX, IDX, NUM) \ | |
3096 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \ | |
3097 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \ | |
3098 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \ | |
3099 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \ | |
3100 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \ | |
3101 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \ | |
3102 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \ | |
3103 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \ | |
3104 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \ | |
3105 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \ | |
3106 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \ | |
3107 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \ | |
3108 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \ | |
3109 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \ | |
3110 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \ | |
3111 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2 | |
3112 | |
3113 dspfunc(put_h264_qpel, 0, 16); | |
3114 dspfunc(put_h264_qpel, 1, 8); | |
3115 dspfunc(put_h264_qpel, 2, 4); | |
3116 dspfunc(avg_h264_qpel, 0, 16); | |
3117 dspfunc(avg_h264_qpel, 1, 8); | |
3118 dspfunc(avg_h264_qpel, 2, 4); | |
3119 #undef dspfunc | |
3120 | |
2624 #ifdef CONFIG_ENCODERS | 3121 #ifdef CONFIG_ENCODERS |
2625 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; | 3122 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; |
2626 #endif //CONFIG_ENCODERS | 3123 #endif //CONFIG_ENCODERS |
2627 } else if (mm_flags & MM_3DNOW) { | 3124 } else if (mm_flags & MM_3DNOW) { |
2628 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; | 3125 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
2678 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow) | 3175 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow) |
2679 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow) | 3176 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow) |
2680 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) | 3177 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) |
2681 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) | 3178 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) |
2682 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) | 3179 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) |
3180 | |
3181 #define dspfunc(PFX, IDX, NUM) \ | |
3182 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \ | |
3183 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \ | |
3184 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \ | |
3185 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \ | |
3186 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \ | |
3187 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \ | |
3188 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \ | |
3189 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \ | |
3190 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \ | |
3191 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \ | |
3192 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \ | |
3193 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \ | |
3194 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \ | |
3195 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \ | |
3196 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \ | |
3197 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow | |
3198 | |
3199 dspfunc(put_h264_qpel, 0, 16); | |
3200 dspfunc(put_h264_qpel, 1, 8); | |
3201 dspfunc(put_h264_qpel, 2, 4); | |
3202 dspfunc(avg_h264_qpel, 0, 16); | |
3203 dspfunc(avg_h264_qpel, 1, 8); | |
3204 dspfunc(avg_h264_qpel, 2, 4); | |
2683 } | 3205 } |
2684 } | 3206 } |
2685 | 3207 |
2686 #ifdef CONFIG_ENCODERS | 3208 #ifdef CONFIG_ENCODERS |
2687 dsputil_init_pix_mmx(c, avctx); | 3209 dsputil_init_pix_mmx(c, avctx); |