Mercurial > mplayer.hg
changeset 27158:65b8334df960
spelling/grammar/wording overhaul
author | diego |
---|---|
date | Fri, 04 Jul 2008 13:49:45 +0000 |
parents | e2797c291ba9 |
children | dc5144ad6560 |
files | libswscale/internal_bfin.S libswscale/rgb2rgb.c libswscale/rgb2rgb.h libswscale/rgb2rgb_template.c libswscale/swscale_altivec_template.c libswscale/swscale_bfin.c libswscale/swscale_internal.h libswscale/swscale_template.c libswscale/yuv2rgb.c libswscale/yuv2rgb_altivec.c libswscale/yuv2rgb_bfin.c libswscale/yuv2rgb_mlib.c libswscale/yuv2rgb_template.c |
diffstat | 13 files changed, 181 insertions(+), 179 deletions(-) [+] |
line wrap: on
line diff
--- a/libswscale/internal_bfin.S Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/internal_bfin.S Fri Jul 04 13:49:45 2008 +0000 @@ -2,8 +2,8 @@ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> * April 20, 2007 * - * Blackfin Video Color Space Converters Operations - * convert I420 YV12 to RGB in various formats, + * Blackfin video color space converter operations + * convert I420 YV12 to RGB in various formats * * This file is part of FFmpeg. * @@ -24,8 +24,8 @@ /* -YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock -and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts +YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock +and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts. The following calculation is used for the conversion: @@ -34,36 +34,36 @@ g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) b = clipz((y-oy)*cy + cbu*(u-128)) -y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision. +y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision. New factorization to eliminate the truncation error which was -occuring due to the byteop3p. +occurring due to the byteop3p. -1) use the bytop16m to subtract quad bytes we use this in U8 this +1) Use the byteop16m to subtract quad bytes we use this in U8 this then so the offsets need to be renormalized to 8bits. -2) scale operands up by a factor of 4 not 8 because Blackfin +2) Scale operands up by a factor of 4 not 8 because Blackfin multiplies include a shift. -3) compute into the accumulators cy*yx0, cy*yx1 +3) Compute into the accumulators cy*yx0, cy*yx1. 
-4) compute each of the linear equations +4) Compute each of the linear equations: r = clipz((y - oy) * cy + crv * (v - 128)) g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) b = clipz((y - oy) * cy + cbu * (u - 128)) - reuse of the accumulators requires that we actually multiply - twice once with addition and the second time with a subtaction. + Reuse of the accumulators requires that we actually multiply + twice once with addition and the second time with a subtraction. - because of this we need to compute the equations in the order R B + Because of this we need to compute the equations in the order R B then G saving the writes for B in the case of 24/32 bit color formats. - api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, + API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs); A B @@ -77,13 +77,13 @@ coeffs is a pointer to oy. -the {rgb} masks are only utilized by the 565 packing algorithm. Note the data -replication is used to simplify the internal algorithms for the dual mac architecture -of BlackFin. +The {rgb} masks are only utilized by the 565 packing algorithm. Note the data +replication is used to simplify the internal algorithms for the dual MAC +architecture of BlackFin. -All routines are exported with _ff_bfin_ as a symbol prefix +All routines are exported with _ff_bfin_ as a symbol prefix. -rough performance gain compared against -O3: +Rough performance gain compared against -O3: 2779809/1484290 187.28%
--- a/libswscale/rgb2rgb.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/rgb2rgb.c Fri Jul 04 13:49:45 2008 +0000 @@ -1,10 +1,10 @@ /* - * rgb2rgb.c, Software RGB to RGB convertor - * pluralize by Software PAL8 to RGB convertor - * Software YUV to YUV convertor - * Software YUV to RGB convertor - * Written by Nick Kurshev. - * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) + * software RGB to RGB converter + * pluralize by software PAL8 to RGB converter + * software YUV to YUV converter + * software YUV to RGB converter + * Written by Nick Kurshev. + * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * * This file is part of FFmpeg. * @@ -22,8 +22,8 @@ * along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * - * the C code (not assembly, mmx, ...) of this file can be used - * under the LGPL license too + * The C code (not assembly, MMX, ...) of this file can be used + * under the LGPL license. */ #include <inttypes.h> #include "config.h" @@ -33,7 +33,7 @@ #include "swscale.h" #include "swscale_internal.h" -#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit +#define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size); void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size); @@ -149,8 +149,8 @@ #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) -//Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one -//Plain C versions +//Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one. 
+//plain C versions #undef HAVE_MMX #undef HAVE_MMX2 #undef HAVE_3DNOW @@ -190,10 +190,10 @@ #endif //ARCH_X86 || ARCH_X86_64 /* - rgb15->rgb16 Original by Strepto/Astral + RGB15->RGB16 original by Strepto/Astral ported to gcc & bugfixed : A'rpi MMX2, 3DNOW optimization by Nick Kurshev - 32bit c version, and and&add trick by Michael Niedermayer + 32-bit C version, and and&add trick by Michael Niedermayer */ void sws_rgb2rgb_init(int flags){ @@ -266,7 +266,7 @@ { long i; /* - writes 1 byte o much and might cause alignment issues on some architectures? + Writes 1 byte too much and might cause alignment issues on some architectures? for (i=0; i<num_pixels; i++) ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]]; */ @@ -284,7 +284,7 @@ { long i; /* - writes 1 byte o much and might cause alignment issues on some architectures? + Writes 1 byte too much and might cause alignment issues on some architectures? for (i=0; i<num_pixels; i++) ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]]; */ @@ -299,7 +299,7 @@ } /** - * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette + * Palette is assumed to contain BGR16, see rgb32to16 to convert the palette. */ void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) {
--- a/libswscale/rgb2rgb.h Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/rgb2rgb.h Fri Jul 04 13:49:45 2008 +0000 @@ -1,8 +1,8 @@ /* - * rgb2rgb.h, Software RGB to RGB convertor - * pluralize by Software PAL8 to RGB convertor - * Software YUV to YUV convertor - * Software YUV to RGB convertor + * software RGB to RGB converter + * pluralize by Software PAL8 to RGB converter + * Software YUV to YUV converter + * Software YUV to RGB converter * Written by Nick Kurshev. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * @@ -28,7 +28,7 @@ #include <inttypes.h> -/* A full collection of rgb to rgb(bgr) convertors */ +/* A full collection of RGB to RGB(BGR) converters */ extern void (*rgb24to32) (const uint8_t *src, uint8_t *dst, long src_size); extern void (*rgb24to16) (const uint8_t *src, uint8_t *dst, long src_size); extern void (*rgb24to15) (const uint8_t *src, uint8_t *dst, long src_size); @@ -71,53 +71,49 @@ extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); /** - * - * height should be a multiple of 2 and width should be a multiple of 16 (if this is a - * problem for anyone then tell me, and ill fix it) - * chrominance data is only taken from every secound line others are ignored FIXME write HQ version + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) + * Chrominance data is only taken from every second line, others are ignored. + * FIXME: Write HQ version. */ //void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, /** - * - * height should be a multiple of 2 and width should be a multiple of 16 (if this is a - * problem for anyone then tell me, and ill fix it) + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) 
*/ extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, long width, long height, long lumStride, long chromStride, long dstStride); /** - * - * width should be a multiple of 16 + * Width should be a multiple of 16. */ extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, long width, long height, long lumStride, long chromStride, long dstStride); /** - * - * height should be a multiple of 2 and width should be a multiple of 16 (if this is a - * problem for anyone then tell me, and ill fix it) + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) */ extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, long width, long height, long lumStride, long chromStride, long srcStride); /** - * - * height should be a multiple of 2 and width should be a multiple of 16 (if this is a - * problem for anyone then tell me, and ill fix it) + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) */ extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, long width, long height, long lumStride, long chromStride, long dstStride); /** - * - * height should be a multiple of 2 and width should be a multiple of 2 (if this is a - * problem for anyone then tell me, and ill fix it) - * chrominance data is only taken from every secound line others are ignored FIXME write HQ version + * Height should be a multiple of 2 and width should be a multiple of 2. + * (If this is a problem for anyone then tell me, and I will fix it.) + * Chrominance data is only taken from every second line, others are ignored. + * FIXME: Write HQ version. 
*/ extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, long width, long height,
--- a/libswscale/rgb2rgb_template.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/rgb2rgb_template.c Fri Jul 04 13:49:45 2008 +0000 @@ -1,11 +1,11 @@ /* - * rgb2rgb.c, Software RGB to RGB convertor - * pluralize by Software PAL8 to RGB convertor - * Software YUV to YUV convertor - * Software YUV to RGB convertor - * Written by Nick Kurshev. - * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) - * lot of big-endian byteorder fixes by Alex Beregszaszi + * software RGB to RGB converter + * pluralize by software PAL8 to RGB converter + * software YUV to YUV converter + * software YUV to RGB converter + * Written by Nick Kurshev. + * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) + * lot of big-endian byte order fixes by Alex Beregszaszi * * This file is part of FFmpeg. * @@ -23,7 +23,7 @@ * along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * - * The C code (not assembly, mmx, ...) of this file can be used + * The C code (not assembly, MMX, ...) of this file can be used * under the LGPL license. 
*/ @@ -229,10 +229,10 @@ } /* - Original by Strepto/Astral - ported to gcc & bugfixed : A'rpi + original by Strepto/Astral + ported to gcc & bugfixed: A'rpi MMX2, 3DNOW optimization by Nick Kurshev - 32 bit C version, and and&add trick by Michael Niedermayer + 32-bit C version, and and&add trick by Michael Niedermayer */ static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size) { @@ -926,9 +926,9 @@ ---------------- 1 1 0 1 1 1 1 0 |=======| |===| - | Leftmost Bits Repeated to Fill Open Bits + | leftmost bits repeated to fill open bits | - Original Bits + original bits */ static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size) { @@ -1006,7 +1006,7 @@ :"=m"(*d) :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) :"memory"); - /* Borrowed 32 to 24 */ + /* borrowed 32 to 24 */ asm volatile( "movq %%mm0, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" @@ -1147,7 +1147,7 @@ :"=m"(*d) :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) :"memory"); - /* Borrowed 32 to 24 */ + /* borrowed 32 to 24 */ asm volatile( "movq %%mm0, %%mm4 \n\t" "movq %%mm3, %%mm5 \n\t" @@ -1479,7 +1479,7 @@ asm volatile(SFENCE:::"memory"); asm volatile(EMMS:::"memory"); - if (mmx_size==23) return; //finihsed, was multiple of 8 + if (mmx_size==23) return; //finished, was multiple of 8 src+= src_size; dst+= src_size; @@ -1638,8 +1638,8 @@ } /** - * Height should be a multiple of 2 and width should be a multiple of 16 (if - * this is a problem for anyone then tell me, and I will fix it). + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) 
*/ static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, long width, long height, @@ -1720,7 +1720,7 @@ (vc[0] << 8) + (yc[1] << 0); #else *idst++ = uc[0] + (yc[0] << 8) + - (vc[0] << 16) + (yc[1] << 24); + (vc[0] << 16) + (yc[1] << 24); #endif yc += 2; uc++; @@ -1744,8 +1744,8 @@ } /** - * Height should be a multiple of 2 and width should be a multiple of 16 (if - * this is a problem for anyone then tell me, and I will fix it). + * Height should be a multiple of 2 and width should be a multiple of 16 + * (If this is a problem for anyone then tell me, and I will fix it.) */ static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, long width, long height, @@ -1766,8 +1766,8 @@ } /** - * Height should be a multiple of 2 and width should be a multiple of 16 (if - * this is a problem for anyone then tell me, and I will fix it). + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) */ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, long width, long height, @@ -2002,9 +2002,9 @@ } /** - * Height should be a multiple of 2 and width should be a multiple of 16 (if - * this is a problem for anyone then tell me, and I will fix it). - * Chrominance data is only taken from every secound line, others are ignored. + * Height should be a multiple of 2 and width should be a multiple of 16. + * (If this is a problem for anyone then tell me, and I will fix it.) + * Chrominance data is only taken from every second line, others are ignored. * FIXME: Write HQ version. 
*/ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, @@ -2128,9 +2128,9 @@ } /** - * Height should be a multiple of 2 and width should be a multiple of 2 (if - * this is a problem for anyone then tell me, and I will fix it). - * Chrominance data is only taken from every secound line, + * Height should be a multiple of 2 and width should be a multiple of 2. + * (If this is a problem for anyone then tell me, and I will fix it.) + * Chrominance data is only taken from every second line, * others are ignored in the C version. * FIXME: Write HQ version. */
--- a/libswscale/swscale_altivec_template.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/swscale_altivec_template.c Fri Jul 04 13:49:45 2008 +0000 @@ -245,12 +245,12 @@ src_v = vec_mergeh(src_v, (vector signed short)vzero); filter_v = vec_ld(i << 3, filter); - // the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2) + // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2). - // the neat trick : we only care for half the elements, + // The neat trick: We only care for half the elements, // high or low depending on (i<<3)%16 (it's 0 or 8 here), - // and we're going to use vec_mule, so we chose - // carefully how to "unpack" the elements into the even slots + // and we're going to use vec_mule, so we choose + // carefully how to "unpack" the elements into the even slots. if ((i << 3) % 16) filter_v = vec_mergel(filter_v, (vector signed short)vzero); else @@ -405,12 +405,12 @@ return srcSliceH; } - /* this code assume: + /* This code assumes: 1) dst is 16 bytes-aligned 2) dstStride is a multiple of 16 3) width is a multiple of 16 - 4) lum&chrom stride are multiple of 8 + 4) lum & chrom stride are multiples of 8 */ for (y=0; y<height; y++) { @@ -482,12 +482,12 @@ return srcSliceH; } - /* this code assume: + /* This code assumes: 1) dst is 16 bytes-aligned 2) dstStride is a multiple of 16 3) width is a multiple of 16 - 4) lum&chrom stride are multiple of 8 + 4) lum & chrom stride are multiples of 8 */ for (y=0; y<height; y++) {
--- a/libswscale/swscale_bfin.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/swscale_bfin.c Fri Jul 04 13:49:45 2008 +0000 @@ -1,7 +1,7 @@ /* * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> * - * Blackfin Software Video SCALER Operations + * Blackfin software video scaler operations * * This file is part of FFmpeg. *
--- a/libswscale/swscale_internal.h Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/swscale_internal.h Fri Jul 04 13:49:45 2008 +0000 @@ -37,7 +37,7 @@ typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[]); -/* this struct should be aligned on at least 32-byte boundary */ +/* This struct should be aligned on at least a 32-byte boundary. */ typedef struct SwsContext{ /** * info on struct for av_log @@ -73,7 +73,7 @@ int16_t *vChrFilter; int16_t *vChrFilterPos; - uint8_t formatConvBuffer[VOF]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful + uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful int hLumFilterSize; int hChrFilterSize; @@ -122,7 +122,7 @@ #define V_OFFSET "10*8" #define LUM_MMX_FILTER_OFFSET "11*8" #define CHR_MMX_FILTER_OFFSET "11*8+4*4*256" -#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the asm +#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM #define ESP_OFFSET "11*8+4*4*256*2+8" #define VROUNDER_OFFSET "11*8+4*4*256*2+16" #define U_TEMP "11*8+4*4*256*2+24"
--- a/libswscale/swscale_template.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/swscale_template.c Fri Jul 04 13:49:45 2008 +0000 @@ -17,8 +17,8 @@ * along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * - * the C code (not assembly, mmx, ...) of this file can be used - * under the LGPL license too + * The C code (not assembly, MMX, ...) of this file can be used + * under the LGPL license. */ #undef REAL_MOVNTQ @@ -30,7 +30,7 @@ #undef SFENCE #ifdef HAVE_3DNOW -/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ +/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */ #define EMMS "femms" #else #define EMMS "emms" @@ -1503,7 +1503,7 @@ const int yalpha1=0; int i; - uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1 + uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 const int yalpha= 4096; //FIXME ... if (flags&SWS_FULL_CHR_H_INT) @@ -1700,7 +1700,7 @@ } } -//FIXME yuy2* can read upto 7 samples to much +//FIXME yuy2* can read up to 7 samples too much static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) { @@ -2297,7 +2297,7 @@ } } -// Bilinear / Bicubic scaling +// bilinear / bicubic scaling static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, long filterSize) { @@ -2544,7 +2544,7 @@ } #ifdef HAVE_MMX - // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one) + // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). 
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) #else if (!(flags&SWS_FAST_BILINEAR)) @@ -2552,7 +2552,7 @@ { RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); } - else // Fast Bilinear upscale / crap downscale + else // fast bilinear upscale / crap downscale { #if defined(ARCH_X86) #ifdef HAVE_MMX2 @@ -2761,7 +2761,7 @@ } #ifdef HAVE_MMX - // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one) + // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) #else if (!(flags&SWS_FAST_BILINEAR)) @@ -2770,7 +2770,7 @@ RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); } - else // Fast Bilinear upscale / crap downscale + else // fast bilinear upscale / crap downscale { #if defined(ARCH_X86) #ifdef HAVE_MMX2 @@ -2890,8 +2890,8 @@ "cmp %2, %%"REG_a" \n\t" " jb 1b \n\t" -/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, - which is needed to support GCC-4.0 */ +/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, + which is needed to support GCC 4.0. 
*/ #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), #else @@ -2963,7 +2963,7 @@ int lastDstY; uint8_t *pal=NULL; - /* vars whch will change and which we need to storw back in the context */ + /* vars which will change and which we need to store back in the context */ int dstY= c->dstY; int lumBufIndex= c->lumBufIndex; int chrBufIndex= c->chrBufIndex; @@ -3004,13 +3004,14 @@ if (flags & SWS_PRINT_INFO && firstTime) { av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" - " ->cannot do aligned memory acesses anymore\n"); + " ->cannot do aligned memory accesses anymore\n"); firstTime=0; } } - /* Note the user might start scaling the picture in the middle so this will not get executed - this is not really intended but works currently, so ppl might do it */ + /* Note the user might start scaling the picture in the middle so this + will not get executed. This is not really intended but works + currently, so people might do it. 
*/ if (srcSliceY ==0){ lumBufIndex=0; chrBufIndex=0; @@ -3182,7 +3183,7 @@ { const int chrSkipMask= (1<<c->chrDstVSubSample)-1; if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi - if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 + if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12 { int16_t *lumBuf = lumPixBuf[0]; int16_t *chrBuf= chrPixBuf[0]; @@ -3200,13 +3201,13 @@ { ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); - if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB + if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB { int chrAlpha= vChrFilter[2*dstY+1]; RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), dest, dstW, chrAlpha, dstFormat, flags, dstY); } - else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB + else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB { int lumAlpha= vLumFilter[2*dstY+1]; int chrAlpha= vChrFilter[2*dstY+1]; @@ -3217,7 +3218,7 @@ RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), dest, dstW, lumAlpha, chrAlpha, dstY); } - else //General RGB + else //general RGB { RENAME(yuv2packedX)(c, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
--- a/libswscale/yuv2rgb.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/yuv2rgb.c Fri Jul 04 13:49:45 2008 +0000 @@ -39,7 +39,7 @@ #include "swscale.h" #include "swscale_internal.h" -#define DITHER1XBPP // only for mmx +#define DITHER1XBPP // only for MMX const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={ { 1, 3, 1, 3, 1, 3, 1, 3, }, @@ -155,8 +155,8 @@ DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL; DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL; -// the volatile is required because gcc otherwise optimizes some writes away not knowing that these -// are read in the asm block +// The volatile is required because gcc otherwise optimizes some writes away +// not knowing that these are read in the ASM block. static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither; static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither; static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither; @@ -641,7 +641,7 @@ } #endif - av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n"); + av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n"); switch(c->dstFormat){ case PIX_FMT_BGR32:
--- a/libswscale/yuv2rgb_altivec.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/yuv2rgb_altivec.c Fri Jul 04 13:49:45 2008 +0000 @@ -21,63 +21,68 @@ */ /* -convert I420 YV12 to RGB in various formats, - it rejects images that are not in 420 formats - it rejects images that don't have widths of multiples of 16 - it rejects images that don't have heights of multiples of 2 -reject defers to C simulation codes. +Convert I420 YV12 to RGB in various formats, + it rejects images that are not in 420 formats, + it rejects images that don't have widths of multiples of 16, + it rejects images that don't have heights of multiples of 2. +Reject defers to C simulation code. + +Lots of optimizations to be done here. -lots of optimizations to be done here +1. Need to fix saturation code. I just couldn't get it to fly with packs + and adds, so we currently use max/min to clip. -1. need to fix saturation code, I just couldn't get it to fly with packs and adds. - so we currently use max min to clip +2. The inefficient use of chroma loading needs a bit of brushing up. -2. the inefficient use of chroma loading needs a bit of brushing up - -3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls +3. Analysis of pipeline stalls needs to be done. Use shark to identify + pipeline stalls. MODIFIED to calculate coeffs from currently selected color space. -MODIFIED core to be a macro which you spec the output format. -ADDED UYVY conversion which is never called due to some thing in SWSCALE. +MODIFIED core to be a macro where you specify the output format. +ADDED UYVY conversion which is never called due to something in swscale. CORRECTED algorithm selection to be strict on input formats. -ADDED runtime detection of altivec. +ADDED runtime detection of AltiVec. 
ADDED altivec_yuv2packedX vertical scl + RGB converter March 27,2004 PERFORMANCE ANALYSIS -The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test -The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence +The C version uses 25% of the processor or ~250Mips for D1 video rawvideo +used as test. +The AltiVec version uses 10% of the processor or ~100Mips for D1 video +same sequence. -720*480*30 ~10MPS +720 * 480 * 30 ~10MPS -so we have roughly 10clocks per pixel this is too high something has to be wrong. - -OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min. +so we have roughly 10 clocks per pixel. This is too high, something has +to be wrong. -OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much -guaranteed to have the input video frame it was just decompressed so -it probably resides in L1 caches. However we are creating the -output video stream this needs to use the DSTST instruction to -optimize for the cache. We couple this with the fact that we are -not going to be visiting the input buffer again so we mark it Least -Recently Used. This shaves 25% of the processor cycles off. +OPTIMIZED clip codes to utilize vec_max and vec_packs removing the +need for vec_min. -Now MEMCPY is the largest mips consumer in the system, probably due +OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have +the input video frame, it was just decompressed so it probably resides in L1 +caches. However, we are creating the output video stream. This needs to use the +DSTST instruction to optimize for the cache. We couple this with the fact that +we are not going to be visiting the input buffer again so we mark it Least +Recently Used. This shaves 25% of the processor cycles off. + +Now memcpy is the largest mips consumer in the system, probably due to the inefficient X11 stuff. 
GL libraries seem to be very slow on this machine 1.33Ghz PB running Jaguar, this is not the case for my 1Ghz PB. I thought it might be -a versioning issues, however I have libGL.1.2.dylib for both -machines. ((We need to figure this out now)) +a versioning issue, however I have libGL.1.2.dylib for both +machines. (We need to figure this out now.) + +GL2 libraries work now with patch for RGB32. -GL2 libraries work now with patch for RGB32 +NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor. -NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor - -Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. +Integrated luma prescaling adjustment for saturation/contrast/brightness +adjustment. */ #include <stdio.h>
--- a/libswscale/yuv2rgb_bfin.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/yuv2rgb_bfin.c Fri Jul 04 13:49:45 2008 +0000 @@ -1,9 +1,8 @@ /* * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> - * April 20, 2007 * - * Blackfin Video Color Space Converters Operations - * convert I420 YV12 to RGB in various formats, + * Blackfin video color space converter operations + * convert I420 YV12 to RGB in various formats * * This file is part of FFmpeg. * @@ -200,7 +199,7 @@ return 0; } - av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n", + av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n", sws_format_name (c->dstFormat)); return f;
--- a/libswscale/yuv2rgb_mlib.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/yuv2rgb_mlib.c Fri Jul 04 13:49:45 2008 +0000 @@ -1,5 +1,6 @@ /* - * yuv2rgb_mlib.c, Software YUV to RGB converter using mediaLib + * software YUV to RGB converter using mediaLib + * * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at> * * This file is part of FFmpeg.
--- a/libswscale/yuv2rgb_template.c Fri Jul 04 13:42:19 2008 +0000 +++ b/libswscale/yuv2rgb_template.c Fri Jul 04 13:49:45 2008 +0000 @@ -1,5 +1,5 @@ /* - * yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology" + * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology" * * Copyright (C) 2000, Silicon Integrated System Corp. * @@ -31,7 +31,7 @@ #undef SFENCE #ifdef HAVE_3DNOW -/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ +/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */ #define EMMS "femms" #else #define EMMS "emms" @@ -147,8 +147,8 @@ g6Dither= ff_dither4[y&1]; g5Dither= ff_dither8[y&1]; r5Dither= ff_dither8[(y+1)&1]; - /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 - pixels in each iteration */ + /* This MMX assembly code deals with a SINGLE scan line at a time, + * it converts 8 pixels in each iteration. */ asm volatile ( /* load data for start of next scan line */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ @@ -156,8 +156,8 @@ "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ //".balign 16 \n\t" "1: \n\t" - /* no speed diference on my p3@500 with prefetch, - * if it is faster for anyone with -benchmark then tell me + /* No speed difference on my p3@500 with prefetch, + * if it is faster for anyone with -benchmark then tell me. 
PREFETCH" 64(%0) \n\t" PREFETCH" 64(%1) \n\t" PREFETCH" 64(%2) \n\t" @@ -180,7 +180,7 @@ "movq %%mm0, %%mm5;" /* Copy B7-B0 */ "movq %%mm2, %%mm7;" /* Copy G7-G0 */ - /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ + /* convert RGB24 plane to RGB16 pack for pixel 0-3 */ "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ @@ -190,7 +190,7 @@ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ - /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ + /* convert RGB24 plane to RGB16 pack for pixel 0-3 */ "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ @@ -242,8 +242,8 @@ g6Dither= ff_dither4[y&1]; g5Dither= ff_dither8[y&1]; r5Dither= ff_dither8[(y+1)&1]; - /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 - pixels in each iteration */ + /* This MMX assembly code deals with a SINGLE scan line at a time, + * it converts 8 pixels in each iteration. 
*/ asm volatile ( /* load data for start of next scan line */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ @@ -271,7 +271,7 @@ "movq %%mm0, %%mm5;" /* Copy B7-B0 */ "movq %%mm2, %%mm7;" /* Copy G7-G0 */ - /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ + /* convert RGB24 plane to RGB16 pack for pixel 0-3 */ "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ @@ -281,7 +281,7 @@ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ - /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ + /* convert RGB24 plane to RGB16 pack for pixel 0-3 */ "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */ "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ @@ -326,8 +326,8 @@ uint8_t *pv = src[2] + (y>>1)*srcStride[2]; long index= -h_size/2; - /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 - pixels in each iteration */ + /* This MMX assembly code deals with a SINGLE scan line at a time, + * it converts 8 pixels in each iteration. */ asm volatile ( /* load data for start of next scan line */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ @@ -472,8 +472,8 @@ uint8_t *pv = src[2] + (y>>1)*srcStride[2]; long index= -h_size/2; - /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 - pixels in each iteration */ + /* This MMX assembly code deals with a SINGLE scan line at a time, + * it converts 8 pixels in each iteration. */ asm volatile ( /* load data for start of next scan line */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */