Mercurial > mplayer.hg
comparison postproc/swscale_template.c @ 5452:eb87391a5292
Fixed an overread in the MMX2 horizontal scaler.
The MMX2 horizontal scaler is also about 2% faster.
author | michael |
---|---|
date | Mon, 01 Apr 2002 14:01:22 +0000 |
parents | 3cc0f4938be1 |
children | 4b18bf35f153 |
comparison
equal
deleted
inserted
replaced
5451:b716977c47d9 | 5452:eb87391a5292 |
---|---|
2236 } | 2236 } |
2237 // *** horizontal scale Y line to temp buffer | 2237 // *** horizontal scale Y line to temp buffer |
2238 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc, | 2238 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc, |
2239 int flags, int canMMX2BeUsed, int16_t *hLumFilter, | 2239 int flags, int canMMX2BeUsed, int16_t *hLumFilter, |
2240 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, | 2240 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, |
2241 int srcFormat, uint8_t *formatConvBuffer) | 2241 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
2242 int32_t *mmx2FilterPos) | |
2242 { | 2243 { |
2243 if(srcFormat==IMGFMT_YUY2) | 2244 if(srcFormat==IMGFMT_YUY2) |
2244 { | 2245 { |
2245 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | 2246 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); |
2246 src= formatConvBuffer; | 2247 src= formatConvBuffer; |
2292 int i; | 2293 int i; |
2293 if(canMMX2BeUsed) | 2294 if(canMMX2BeUsed) |
2294 { | 2295 { |
2295 asm volatile( | 2296 asm volatile( |
2296 "pxor %%mm7, %%mm7 \n\t" | 2297 "pxor %%mm7, %%mm7 \n\t" |
2297 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | 2298 "movl %0, %%ecx \n\t" |
2298 "movd %5, %%mm6 \n\t" // xInc&0xFFFF | 2299 "movl %1, %%edi \n\t" |
2299 "punpcklwd %%mm6, %%mm6 \n\t" | 2300 "movl %2, %%edx \n\t" |
2300 "punpcklwd %%mm6, %%mm6 \n\t" | 2301 "movl %3, %%ebx \n\t" |
2301 "movq %%mm6, %%mm2 \n\t" | |
2302 "psllq $16, %%mm2 \n\t" | |
2303 "paddw %%mm6, %%mm2 \n\t" | |
2304 "psllq $16, %%mm2 \n\t" | |
2305 "paddw %%mm6, %%mm2 \n\t" | |
2306 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF | |
2307 "movq %%mm2, %%mm4 \n\t" | |
2308 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF | |
2309 "punpcklwd %%mm6, %%mm6 \n\t" | |
2310 "punpcklwd %%mm6, %%mm6 \n\t" | |
2311 "xorl %%eax, %%eax \n\t" // i | 2302 "xorl %%eax, %%eax \n\t" // i |
2312 "movl %0, %%esi \n\t" // src | 2303 PREFETCH" (%%ecx) \n\t" |
2313 "movl %1, %%edi \n\t" // buf1 | 2304 PREFETCH" 32(%%ecx) \n\t" |
2314 "movl %3, %%edx \n\t" // (xInc*4)>>16 | 2305 PREFETCH" 64(%%ecx) \n\t" |
2315 "xorl %%ecx, %%ecx \n\t" | |
2316 "xorl %%ebx, %%ebx \n\t" | |
2317 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | |
2318 | 2306 |
2319 #define FUNNY_Y_CODE \ | 2307 #define FUNNY_Y_CODE \ |
2320 PREFETCH" 1024(%%esi) \n\t"\ | 2308 "movl (%%ebx), %%esi \n\t"\ |
2321 PREFETCH" 1056(%%esi) \n\t"\ | 2309 "call *%4 \n\t"\ |
2322 PREFETCH" 1088(%%esi) \n\t"\ | 2310 "addl (%%ebx, %%eax), %%ecx \n\t"\ |
2323 "call *%6 \n\t"\ | 2311 "addl %%eax, %%edi \n\t"\ |
2324 "movq %%mm4, %%mm2 \n\t"\ | 2312 "xorl %%eax, %%eax \n\t"\ |
2325 "xorl %%ecx, %%ecx \n\t" | |
2326 | 2313 |
2327 FUNNY_Y_CODE | 2314 FUNNY_Y_CODE |
2328 FUNNY_Y_CODE | 2315 FUNNY_Y_CODE |
2329 FUNNY_Y_CODE | 2316 FUNNY_Y_CODE |
2330 FUNNY_Y_CODE | 2317 FUNNY_Y_CODE |
2331 FUNNY_Y_CODE | 2318 FUNNY_Y_CODE |
2332 FUNNY_Y_CODE | 2319 FUNNY_Y_CODE |
2333 FUNNY_Y_CODE | 2320 FUNNY_Y_CODE |
2334 FUNNY_Y_CODE | 2321 FUNNY_Y_CODE |
2335 | 2322 |
2336 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | 2323 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2337 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode) | 2324 "m" (funnyYCode) |
2338 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | 2325 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
2339 ); | 2326 ); |
2340 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; | 2327 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; |
2341 } | 2328 } |
2342 else | 2329 else |
2400 } | 2387 } |
2401 | 2388 |
2402 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2, | 2389 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2, |
2403 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, | 2390 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, |
2404 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, | 2391 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, |
2405 int srcFormat, uint8_t *formatConvBuffer) | 2392 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, |
2393 int32_t *mmx2FilterPos) | |
2406 { | 2394 { |
2407 if(srcFormat==IMGFMT_YUY2) | 2395 if(srcFormat==IMGFMT_YUY2) |
2408 { | 2396 { |
2409 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); | 2397 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); |
2410 src1= formatConvBuffer; | 2398 src1= formatConvBuffer; |
2467 #ifdef HAVE_MMX2 | 2455 #ifdef HAVE_MMX2 |
2468 int i; | 2456 int i; |
2469 if(canMMX2BeUsed) | 2457 if(canMMX2BeUsed) |
2470 { | 2458 { |
2471 asm volatile( | 2459 asm volatile( |
2472 "pxor %%mm7, %%mm7 \n\t" | 2460 "pxor %%mm7, %%mm7 \n\t" |
2473 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | 2461 "movl %0, %%ecx \n\t" |
2474 "movd %5, %%mm6 \n\t" // xInc&0xFFFF | 2462 "movl %1, %%edi \n\t" |
2475 "punpcklwd %%mm6, %%mm6 \n\t" | 2463 "movl %2, %%edx \n\t" |
2476 "punpcklwd %%mm6, %%mm6 \n\t" | 2464 "movl %3, %%ebx \n\t" |
2477 "movq %%mm6, %%mm2 \n\t" | 2465 "xorl %%eax, %%eax \n\t" // i |
2478 "psllq $16, %%mm2 \n\t" | 2466 PREFETCH" (%%ecx) \n\t" |
2479 "paddw %%mm6, %%mm2 \n\t" | 2467 PREFETCH" 32(%%ecx) \n\t" |
2480 "psllq $16, %%mm2 \n\t" | 2468 PREFETCH" 64(%%ecx) \n\t" |
2481 "paddw %%mm6, %%mm2 \n\t" | 2469 |
2482 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF | 2470 #define FUNNY_UV_CODE \ |
2483 "movq %%mm2, %%mm4 \n\t" | 2471 "movl (%%ebx), %%esi \n\t"\ |
2484 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF | 2472 "call *%4 \n\t"\ |
2485 "punpcklwd %%mm6, %%mm6 \n\t" | 2473 "addl (%%ebx, %%eax), %%ecx \n\t"\ |
2486 "punpcklwd %%mm6, %%mm6 \n\t" | 2474 "addl %%eax, %%edi \n\t"\ |
2487 "xorl %%eax, %%eax \n\t" // i | 2475 "xorl %%eax, %%eax \n\t"\ |
2488 "movl %0, %%esi \n\t" // src | 2476 |
2489 "movl %1, %%edi \n\t" // buf1 | 2477 FUNNY_UV_CODE |
2490 "movl %3, %%edx \n\t" // (xInc*4)>>16 | 2478 FUNNY_UV_CODE |
2491 "xorl %%ecx, %%ecx \n\t" | 2479 FUNNY_UV_CODE |
2492 "xorl %%ebx, %%ebx \n\t" | 2480 FUNNY_UV_CODE |
2493 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF | 2481 "xorl %%eax, %%eax \n\t" // i |
2494 | 2482 "movl %5, %%ecx \n\t" // src |
2495 #define FUNNYUVCODE \ | 2483 "movl %1, %%edi \n\t" // buf1 |
2496 PREFETCH" 1024(%%esi) \n\t"\ | 2484 "addl $4096, %%edi \n\t" |
2497 PREFETCH" 1056(%%esi) \n\t"\ | 2485 PREFETCH" (%%ecx) \n\t" |
2498 PREFETCH" 1088(%%esi) \n\t"\ | 2486 PREFETCH" 32(%%ecx) \n\t" |
2499 "call *%7 \n\t"\ | 2487 PREFETCH" 64(%%ecx) \n\t" |
2500 "movq %%mm4, %%mm2 \n\t"\ | 2488 |
2501 "xorl %%ecx, %%ecx \n\t" | 2489 FUNNY_UV_CODE |
2502 | 2490 FUNNY_UV_CODE |
2503 FUNNYUVCODE | 2491 FUNNY_UV_CODE |
2504 FUNNYUVCODE | 2492 FUNNY_UV_CODE |
2505 FUNNYUVCODE | 2493 |
2506 FUNNYUVCODE | 2494 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), |
2507 | 2495 "m" (funnyUVCode), "m" (src2) |
2508 FUNNYUVCODE | 2496 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" |
2509 FUNNYUVCODE | 2497 ); |
2510 FUNNYUVCODE | |
2511 FUNNYUVCODE | |
2512 "xorl %%eax, %%eax \n\t" // i | |
2513 "movl %6, %%esi \n\t" // src | |
2514 "movl %1, %%edi \n\t" // buf1 | |
2515 "addl $4096, %%edi \n\t" | |
2516 | |
2517 FUNNYUVCODE | |
2518 FUNNYUVCODE | |
2519 FUNNYUVCODE | |
2520 FUNNYUVCODE | |
2521 | |
2522 FUNNYUVCODE | |
2523 FUNNYUVCODE | |
2524 FUNNYUVCODE | |
2525 FUNNYUVCODE | |
2526 | |
2527 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), | |
2528 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode) | |
2529 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |
2530 ); | |
2531 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) | 2498 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
2532 { | 2499 { |
2533 // printf("%d %d %d\n", dstWidth, i, srcW); | 2500 // printf("%d %d %d\n", dstWidth, i, srcW); |
2534 dst[i] = src1[srcW-1]*128; | 2501 dst[i] = src1[srcW-1]*128; |
2535 dst[i+2048] = src2[srcW-1]*128; | 2502 dst[i+2048] = src2[srcW-1]*128; |
2747 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | 2714 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) |
2748 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | 2715 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) |
2749 // printf("%d %d\n", lumBufIndex, vLumBufSize); | 2716 // printf("%d %d\n", lumBufIndex, vLumBufSize); |
2750 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | 2717 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2751 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | 2718 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
2752 funnyYCode, c->srcFormat, formatConvBuffer); | 2719 funnyYCode, c->srcFormat, formatConvBuffer, |
2720 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
2753 lastInLumBuf++; | 2721 lastInLumBuf++; |
2754 } | 2722 } |
2755 while(lastInChrBuf < lastChrSrcY) | 2723 while(lastInChrBuf < lastChrSrcY) |
2756 { | 2724 { |
2757 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1]; | 2725 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1]; |
2761 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1)) | 2729 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1)) |
2762 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) | 2730 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) |
2763 //FIXME replace parameters through context struct (some at least) | 2731 //FIXME replace parameters through context struct (some at least) |
2764 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, | 2732 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, |
2765 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | 2733 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
2766 funnyUVCode, c->srcFormat, formatConvBuffer); | 2734 funnyUVCode, c->srcFormat, formatConvBuffer, |
2735 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
2767 lastInChrBuf++; | 2736 lastInChrBuf++; |
2768 } | 2737 } |
2769 //wrap buf index around to stay inside the ring buffer | 2738 //wrap buf index around to stay inside the ring buffer |
2770 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | 2739 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; |
2771 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | 2740 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; |
2785 ASSERT(lumBufIndex < 2*vLumBufSize) | 2754 ASSERT(lumBufIndex < 2*vLumBufSize) |
2786 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) | 2755 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) |
2787 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) | 2756 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) |
2788 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | 2757 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, |
2789 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | 2758 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, |
2790 funnyYCode, c->srcFormat, formatConvBuffer); | 2759 funnyYCode, c->srcFormat, formatConvBuffer, |
2760 c->lumMmx2Filter, c->lumMmx2FilterPos); | |
2791 lastInLumBuf++; | 2761 lastInLumBuf++; |
2792 } | 2762 } |
2793 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1)) | 2763 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1)) |
2794 { | 2764 { |
2795 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1]; | 2765 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1]; |
2798 ASSERT(chrBufIndex < 2*vChrBufSize) | 2768 ASSERT(chrBufIndex < 2*vChrBufSize) |
2799 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1)) | 2769 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1)) |
2800 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) | 2770 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) |
2801 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, | 2771 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, |
2802 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | 2772 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, |
2803 funnyUVCode, c->srcFormat, formatConvBuffer); | 2773 funnyUVCode, c->srcFormat, formatConvBuffer, |
2774 c->chrMmx2Filter, c->chrMmx2FilterPos); | |
2804 lastInChrBuf++; | 2775 lastInChrBuf++; |
2805 } | 2776 } |
2806 //wrap buf index around to stay inside the ring buffer | 2777 //wrap buf index around to stay inside the ring buffer |
2807 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; | 2778 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; |
2808 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; | 2779 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; |