comparison postproc/swscale_template.c @ 5452:eb87391a5292

Fixed an overread in the MMX2 horizontal scaler; the MMX2 horizontal scaler is now 2% faster.
author michael
date Mon, 01 Apr 2002 14:01:22 +0000
parents 3cc0f4938be1
children 4b18bf35f153
comparison
equal deleted inserted replaced
5451:b716977c47d9 5452:eb87391a5292
2236 } 2236 }
2237 // *** horizontal scale Y line to temp buffer 2237 // *** horizontal scale Y line to temp buffer
2238 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc, 2238 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2239 int flags, int canMMX2BeUsed, int16_t *hLumFilter, 2239 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2240 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 2240 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2241 int srcFormat, uint8_t *formatConvBuffer) 2241 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2242 int32_t *mmx2FilterPos)
2242 { 2243 {
2243 if(srcFormat==IMGFMT_YUY2) 2244 if(srcFormat==IMGFMT_YUY2)
2244 { 2245 {
2245 RENAME(yuy2ToY)(formatConvBuffer, src, srcW); 2246 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2246 src= formatConvBuffer; 2247 src= formatConvBuffer;
2292 int i; 2293 int i;
2293 if(canMMX2BeUsed) 2294 if(canMMX2BeUsed)
2294 { 2295 {
2295 asm volatile( 2296 asm volatile(
2296 "pxor %%mm7, %%mm7 \n\t" 2297 "pxor %%mm7, %%mm7 \n\t"
2297 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha 2298 "movl %0, %%ecx \n\t"
2298 "movd %5, %%mm6 \n\t" // xInc&0xFFFF 2299 "movl %1, %%edi \n\t"
2299 "punpcklwd %%mm6, %%mm6 \n\t" 2300 "movl %2, %%edx \n\t"
2300 "punpcklwd %%mm6, %%mm6 \n\t" 2301 "movl %3, %%ebx \n\t"
2301 "movq %%mm6, %%mm2 \n\t"
2302 "psllq $16, %%mm2 \n\t"
2303 "paddw %%mm6, %%mm2 \n\t"
2304 "psllq $16, %%mm2 \n\t"
2305 "paddw %%mm6, %%mm2 \n\t"
2306 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
2307 "movq %%mm2, %%mm4 \n\t"
2308 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
2309 "punpcklwd %%mm6, %%mm6 \n\t"
2310 "punpcklwd %%mm6, %%mm6 \n\t"
2311 "xorl %%eax, %%eax \n\t" // i 2302 "xorl %%eax, %%eax \n\t" // i
2312 "movl %0, %%esi \n\t" // src 2303 PREFETCH" (%%ecx) \n\t"
2313 "movl %1, %%edi \n\t" // buf1 2304 PREFETCH" 32(%%ecx) \n\t"
2314 "movl %3, %%edx \n\t" // (xInc*4)>>16 2305 PREFETCH" 64(%%ecx) \n\t"
2315 "xorl %%ecx, %%ecx \n\t"
2316 "xorl %%ebx, %%ebx \n\t"
2317 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
2318 2306
2319 #define FUNNY_Y_CODE \ 2307 #define FUNNY_Y_CODE \
2320 PREFETCH" 1024(%%esi) \n\t"\ 2308 "movl (%%ebx), %%esi \n\t"\
2321 PREFETCH" 1056(%%esi) \n\t"\ 2309 "call *%4 \n\t"\
2322 PREFETCH" 1088(%%esi) \n\t"\ 2310 "addl (%%ebx, %%eax), %%ecx \n\t"\
2323 "call *%6 \n\t"\ 2311 "addl %%eax, %%edi \n\t"\
2324 "movq %%mm4, %%mm2 \n\t"\ 2312 "xorl %%eax, %%eax \n\t"\
2325 "xorl %%ecx, %%ecx \n\t"
2326 2313
2327 FUNNY_Y_CODE 2314 FUNNY_Y_CODE
2328 FUNNY_Y_CODE 2315 FUNNY_Y_CODE
2329 FUNNY_Y_CODE 2316 FUNNY_Y_CODE
2330 FUNNY_Y_CODE 2317 FUNNY_Y_CODE
2331 FUNNY_Y_CODE 2318 FUNNY_Y_CODE
2332 FUNNY_Y_CODE 2319 FUNNY_Y_CODE
2333 FUNNY_Y_CODE 2320 FUNNY_Y_CODE
2334 FUNNY_Y_CODE 2321 FUNNY_Y_CODE
2335 2322
2336 :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16), 2323 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2337 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode) 2324 "m" (funnyYCode)
2338 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" 2325 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2339 ); 2326 );
2340 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; 2327 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2341 } 2328 }
2342 else 2329 else
2400 } 2387 }
2401 2388
2402 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2, 2389 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2403 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, 2390 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2404 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, 2391 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2405 int srcFormat, uint8_t *formatConvBuffer) 2392 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2393 int32_t *mmx2FilterPos)
2406 { 2394 {
2407 if(srcFormat==IMGFMT_YUY2) 2395 if(srcFormat==IMGFMT_YUY2)
2408 { 2396 {
2409 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW); 2397 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2410 src1= formatConvBuffer; 2398 src1= formatConvBuffer;
2467 #ifdef HAVE_MMX2 2455 #ifdef HAVE_MMX2
2468 int i; 2456 int i;
2469 if(canMMX2BeUsed) 2457 if(canMMX2BeUsed)
2470 { 2458 {
2471 asm volatile( 2459 asm volatile(
2472 "pxor %%mm7, %%mm7 \n\t" 2460 "pxor %%mm7, %%mm7 \n\t"
2473 "pxor %%mm2, %%mm2 \n\t" // 2*xalpha 2461 "movl %0, %%ecx \n\t"
2474 "movd %5, %%mm6 \n\t" // xInc&0xFFFF 2462 "movl %1, %%edi \n\t"
2475 "punpcklwd %%mm6, %%mm6 \n\t" 2463 "movl %2, %%edx \n\t"
2476 "punpcklwd %%mm6, %%mm6 \n\t" 2464 "movl %3, %%ebx \n\t"
2477 "movq %%mm6, %%mm2 \n\t" 2465 "xorl %%eax, %%eax \n\t" // i
2478 "psllq $16, %%mm2 \n\t" 2466 PREFETCH" (%%ecx) \n\t"
2479 "paddw %%mm6, %%mm2 \n\t" 2467 PREFETCH" 32(%%ecx) \n\t"
2480 "psllq $16, %%mm2 \n\t" 2468 PREFETCH" 64(%%ecx) \n\t"
2481 "paddw %%mm6, %%mm2 \n\t" 2469
2482 "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF 2470 #define FUNNY_UV_CODE \
2483 "movq %%mm2, %%mm4 \n\t" 2471 "movl (%%ebx), %%esi \n\t"\
2484 "movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF 2472 "call *%4 \n\t"\
2485 "punpcklwd %%mm6, %%mm6 \n\t" 2473 "addl (%%ebx, %%eax), %%ecx \n\t"\
2486 "punpcklwd %%mm6, %%mm6 \n\t" 2474 "addl %%eax, %%edi \n\t"\
2487 "xorl %%eax, %%eax \n\t" // i 2475 "xorl %%eax, %%eax \n\t"\
2488 "movl %0, %%esi \n\t" // src 2476
2489 "movl %1, %%edi \n\t" // buf1 2477 FUNNY_UV_CODE
2490 "movl %3, %%edx \n\t" // (xInc*4)>>16 2478 FUNNY_UV_CODE
2491 "xorl %%ecx, %%ecx \n\t" 2479 FUNNY_UV_CODE
2492 "xorl %%ebx, %%ebx \n\t" 2480 FUNNY_UV_CODE
2493 "movw %4, %%bx \n\t" // (xInc*4)&0xFFFF 2481 "xorl %%eax, %%eax \n\t" // i
2494 2482 "movl %5, %%ecx \n\t" // src
2495 #define FUNNYUVCODE \ 2483 "movl %1, %%edi \n\t" // buf1
2496 PREFETCH" 1024(%%esi) \n\t"\ 2484 "addl $4096, %%edi \n\t"
2497 PREFETCH" 1056(%%esi) \n\t"\ 2485 PREFETCH" (%%ecx) \n\t"
2498 PREFETCH" 1088(%%esi) \n\t"\ 2486 PREFETCH" 32(%%ecx) \n\t"
2499 "call *%7 \n\t"\ 2487 PREFETCH" 64(%%ecx) \n\t"
2500 "movq %%mm4, %%mm2 \n\t"\ 2488
2501 "xorl %%ecx, %%ecx \n\t" 2489 FUNNY_UV_CODE
2502 2490 FUNNY_UV_CODE
2503 FUNNYUVCODE 2491 FUNNY_UV_CODE
2504 FUNNYUVCODE 2492 FUNNY_UV_CODE
2505 FUNNYUVCODE 2493
2506 FUNNYUVCODE 2494 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2507 2495 "m" (funnyUVCode), "m" (src2)
2508 FUNNYUVCODE 2496 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2509 FUNNYUVCODE 2497 );
2510 FUNNYUVCODE
2511 FUNNYUVCODE
2512 "xorl %%eax, %%eax \n\t" // i
2513 "movl %6, %%esi \n\t" // src
2514 "movl %1, %%edi \n\t" // buf1
2515 "addl $4096, %%edi \n\t"
2516
2517 FUNNYUVCODE
2518 FUNNYUVCODE
2519 FUNNYUVCODE
2520 FUNNYUVCODE
2521
2522 FUNNYUVCODE
2523 FUNNYUVCODE
2524 FUNNYUVCODE
2525 FUNNYUVCODE
2526
2527 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
2528 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
2529 : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2530 );
2531 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) 2498 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2532 { 2499 {
2533 // printf("%d %d %d\n", dstWidth, i, srcW); 2500 // printf("%d %d %d\n", dstWidth, i, srcW);
2534 dst[i] = src1[srcW-1]*128; 2501 dst[i] = src1[srcW-1]*128;
2535 dst[i+2048] = src2[srcW-1]*128; 2502 dst[i+2048] = src2[srcW-1]*128;
2747 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) 2714 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2748 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) 2715 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2749 // printf("%d %d\n", lumBufIndex, vLumBufSize); 2716 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2750 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, 2717 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2751 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, 2718 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2752 funnyYCode, c->srcFormat, formatConvBuffer); 2719 funnyYCode, c->srcFormat, formatConvBuffer,
2720 c->lumMmx2Filter, c->lumMmx2FilterPos);
2753 lastInLumBuf++; 2721 lastInLumBuf++;
2754 } 2722 }
2755 while(lastInChrBuf < lastChrSrcY) 2723 while(lastInChrBuf < lastChrSrcY)
2756 { 2724 {
2757 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1]; 2725 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2761 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1)) 2729 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
2762 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) 2730 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2763 //FIXME replace parameters through context struct (some at least) 2731 //FIXME replace parameters through context struct (some at least)
2764 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, 2732 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2765 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, 2733 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2766 funnyUVCode, c->srcFormat, formatConvBuffer); 2734 funnyUVCode, c->srcFormat, formatConvBuffer,
2735 c->chrMmx2Filter, c->chrMmx2FilterPos);
2767 lastInChrBuf++; 2736 lastInChrBuf++;
2768 } 2737 }
2769 //wrap buf index around to stay inside the ring buffer 2738 //wrap buf index around to stay inside the ring buffer
2770 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; 2739 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2771 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; 2740 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2785 ASSERT(lumBufIndex < 2*vLumBufSize) 2754 ASSERT(lumBufIndex < 2*vLumBufSize)
2786 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH) 2755 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2787 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0) 2756 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2788 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, 2757 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2789 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, 2758 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2790 funnyYCode, c->srcFormat, formatConvBuffer); 2759 funnyYCode, c->srcFormat, formatConvBuffer,
2760 c->lumMmx2Filter, c->lumMmx2FilterPos);
2791 lastInLumBuf++; 2761 lastInLumBuf++;
2792 } 2762 }
2793 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1)) 2763 while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
2794 { 2764 {
2795 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1]; 2765 uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
2798 ASSERT(chrBufIndex < 2*vChrBufSize) 2768 ASSERT(chrBufIndex < 2*vChrBufSize)
2799 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1)) 2769 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
2800 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0) 2770 ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
2801 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc, 2771 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
2802 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, 2772 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2803 funnyUVCode, c->srcFormat, formatConvBuffer); 2773 funnyUVCode, c->srcFormat, formatConvBuffer,
2774 c->chrMmx2Filter, c->chrMmx2FilterPos);
2804 lastInChrBuf++; 2775 lastInChrBuf++;
2805 } 2776 }
2806 //wrap buf index around to stay inside the ring buffer 2777 //wrap buf index around to stay inside the ring buffer
2807 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize; 2778 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2808 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize; 2779 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;