libavcodec.hg: i386/dsputil

annotate i386/dsputil_mmx.c @ 6408:7af599600f2d libavcodec

One variable less. No real change to the actual code.

author	michael
date	Tue, 26 Feb 2008 15:19:37 +0000
parents	9a736918fd90
children	5154ab444372

rev	line source
0 986e461dc072 Initial revision glantau parents: diff changeset	1 /*
986e461dc072 Initial revision glantau parents: diff changeset	2 * MMX optimized DSP utils
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739 07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise michael parents: 1729 diff changeset	4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
0 986e461dc072 Initial revision glantau parents: diff changeset	5 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	8 * FFmpeg is free software; you can redistribute it and/or
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	9 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 422 diff changeset	10 * License as published by the Free Software Foundation; either
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	11 * version 2.1 of the License, or (at your option) any later version.
0 986e461dc072 Initial revision glantau parents: diff changeset	12 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	13 * FFmpeg is distributed in the hope that it will be useful,
0 986e461dc072 Initial revision glantau parents: diff changeset	14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change glantau parents: 422 diff changeset	16 * Lesser General Public License for more details.
0 986e461dc072 Initial revision glantau parents: diff changeset	17 *
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	18 * You should have received a copy of the GNU Lesser General Public
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	19 * License along with FFmpeg; if not, write to the Free Software
3036 0b546eab515d Update licensing information: The FSF changed postal address. diego parents: 2979 diff changeset	20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0 986e461dc072 Initial revision glantau parents: diff changeset	21 *
986e461dc072 Initial revision glantau parents: diff changeset	22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision glantau parents: diff changeset	23 */
986e461dc072 Initial revision glantau parents: diff changeset	24
5010 d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header diego parents: 5007 diff changeset	25 #include "dsputil.h"
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	26 #include "dsputil_mmx.h"
5010 d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header diego parents: 5007 diff changeset	27 #include "simple_idct.h"
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header diego parents: 5007 diff changeset	28 #include "mpegvideo.h"
3398 e0927bc44a10 Move REG_* macros from libavcodec/i386/mmx.h to libavutil/x86_cpu.h lucabe parents: 3250 diff changeset	29 #include "x86_cpu.h"
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	30 #include "mmx.h"
5014 42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c aurel parents: 5010 diff changeset	31 #include "vp3dsp_mmx.h"
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c aurel parents: 5010 diff changeset	32 #include "vp3dsp_sse2.h"
5277 7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs aurel parents: 5255 diff changeset	33 #include "h263.h"
0 986e461dc072 Initial revision glantau parents: diff changeset	34
1729 a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	35 //#undef NDEBUG
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	36 //#include <assert.h>
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	37
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	38 extern void ff_idct_xvid_mmx(short *block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	39 extern void ff_idct_xvid_mmx2(short *block);
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	40
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	41 int mm_flags; /* multimedia extension flags */
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	42
0 986e461dc072 Initial revision glantau parents: diff changeset	43 /* pixel operations: ff_bone and ff_wtwo are the 64-bit constants that MOVQ_BONE / MOVQ_WTWO load from memory when PIC is not defined */
5947 37a03989871b use ff_ prefix for extern vars aurel parents: 5946 diff changeset	44 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
37a03989871b use ff_ prefix for extern vars aurel parents: 5946 diff changeset	45 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	46 // ff_pdw_80000000: the value 0x80000000 repeated in each of the four 32-bit lanes of a 128-bit constant
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	47 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	48 {0x8000000080000000ULL, 0x8000000080000000ULL};
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	49 // ff_pw_N: the 16-bit value N (decimal) repeated in every word; the xmm_t entries hold the same pattern in 128 bits
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	50 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	51 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	52 DECLARE_ALIGNED_16(const xmm_t, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	53 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	55 DECLARE_ALIGNED_16(const xmm_t, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	56 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
6329 5969caa9190d clean up an ugliness introduced in r11826. this syntax will require fewer changes when adding future sse2 code. lorenm parents: 6327 diff changeset	57 DECLARE_ALIGNED_16(const xmm_t, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
6333 beb52d4a5efe constant was excessively aligned lorenm parents: 6331 diff changeset	61 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	62 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	63 // ff_pb_N: the byte N repeated in all eight bytes (suffix is decimal for 1/3/7 and hexadecimal for 3F/A1/FC)
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	69 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	70 // ff_pd_N: the double value N stored twice (two packed doubles)
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	71 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	72 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
5737 efa3c1f9259a sse2 version of compute_autocorr(). lorenm parents: 5602 diff changeset	73 // JUMPALIGN emits ASMALIGN(3) as a standalone asm statement; MOVQ_ZERO clears regd by XORing it with itself
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	74 #define JUMPALIGN() asm volatile (ASMALIGN(3)::)
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	75 #define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::)
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	76 // MOVQ_BFE: set every byte of regd to 0xFE (pcmpeqd sets all bits, paddb then doubles each 0xFF byte, which wraps to 0xFE)
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	77 #define MOVQ_BFE(regd) \
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	78 asm volatile ( \
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	79 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	80 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	81 // MOVQ_BONE / MOVQ_WTWO: put 0x01 in every byte / 0x0002 in every word of regd; without PIC they are loaded from ff_bone / ff_wtwo
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	82 #ifndef PIC
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	83 #define MOVQ_BONE(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	84 #define MOVQ_WTWO(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	85 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	86 // for a shared library it is better to build the constants in the register than to access them in memory
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	87 // pcmpeqd reg, reg -> all bits set (-1); psrlw $15 leaves 0x0001 in every word; packuswb narrows that to 0x01 in every byte
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	88 #define MOVQ_BONE(regd) \
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	89 asm volatile ( \
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	90 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	91 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	92 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	93 // 0x0001 in every word, then shifted left by one bit -> 0x0002 in every word
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	94 #define MOVQ_WTWO(regd) \
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	95 asm volatile ( \
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	96 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	97 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	98 "psllw $1, %%" #regd " \n\t"::)
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	99
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	100 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	101
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	102 // regr is used as temporary and receives the result: per byte (a & b) + (((a ^ b) & 0xfe) >> 1), i.e. the average rounded down
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	103 // first argument is unmodified and second is trashed
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	104 // regfe is supposed to contain 0xfefefefefefefefe, so the 64-bit shift cannot move a bit into the neighbouring byte
d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	105 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	106 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	107 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	108 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	109 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	110 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	111 "paddb " #regb ", " #regr " \n\t"
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	112
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	113 #define PAVGB_MMX(rega, regb, regr, regfe) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	114 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	115 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	116 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	117 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	118 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	119 "psubb " #regb ", " #regr " \n\t"
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	120
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	121 // two interleaved averages: regr = avg(rega, regb) and regp = avg(regc, regd); this variant rounds down, PAVGBP_MMX below rounds up; regb and regd are trashed; mm6 is supposed to contain 0xfefefefefefefefe
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	122 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	123 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	124 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	125 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	126 "pand " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	127 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	128 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	129 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	130 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	131 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	132 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	133 "paddb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	134 "paddb " #regd ", " #regp " \n\t"
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	135
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	136 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	137 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	138 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	139 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	140 "por " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	141 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	142 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	143 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	144 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	145 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	146 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	147 "psubb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	148 "psubb " #regd ", " #regp " \n\t"
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	149
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	150 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	151 /* MMX no rounding: for this inclusion of dsputil_mmx_rnd.h, DEF(x, y) expands to x_no_rnd_y_mmx and PAVGB / PAVGBP map to the round-down macros */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	152 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	153 #define SET_RND MOVQ_WONE
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	154 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	155 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	156 // NOTE(review): MOVQ_WONE is not defined in the visible part of this file - confirm it exists elsewhere or that the header never expands SET_RND here
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	157 #include "dsputil_mmx_rnd.h"
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	158
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	159 #undef DEF
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	160 #undef SET_RND
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	161 #undef PAVGBP
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	162 #undef PAVGB
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	163 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	164 /* MMX rounding: the same header is included a second time, now with DEF(x, y) expanding to x_y_mmx, SET_RND to MOVQ_WTWO and the rounding PAVGB / PAVGBP macros */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	165
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	166 #define DEF(x, y) x ## _ ## y ##_mmx
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	167 #define SET_RND MOVQ_WTWO
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	168 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	169 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	170
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	171 #include "dsputil_mmx_rnd.h"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	172
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	173 #undef DEF
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	174 #undef SET_RND
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	175 #undef PAVGBP
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	176 #undef PAVGB
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	177
0 986e461dc072 Initial revision glantau parents: diff changeset	178 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	179 /* 3Dnow specific: dsputil_mmx_avg.h is included with DEF(x) expanding to x_3dnow and PAVGB set to the pavgusb mnemonic string */
986e461dc072 Initial revision glantau parents: diff changeset	180
986e461dc072 Initial revision glantau parents: diff changeset	181 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision glantau parents: diff changeset	182 #define PAVGB "pavgusb"
986e461dc072 Initial revision glantau parents: diff changeset	183
986e461dc072 Initial revision glantau parents: diff changeset	184 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision glantau parents: diff changeset	185
986e461dc072 Initial revision glantau parents: diff changeset	186 #undef DEF
986e461dc072 Initial revision glantau parents: diff changeset	187 #undef PAVGB
986e461dc072 Initial revision glantau parents: diff changeset	188
986e461dc072 Initial revision glantau parents: diff changeset	189 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	190 /* MMX2 specific: the same header is included again, with DEF(x) expanding to x_mmx2 */
986e461dc072 Initial revision glantau parents: diff changeset	191
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 342 diff changeset	192 #define DEF(x) x ## _mmx2
0 986e461dc072 Initial revision glantau parents: diff changeset	193
986e461dc072 Initial revision glantau parents: diff changeset	194 /* The pavgb instruction was introduced only in the MMX2 set */
986e461dc072 Initial revision glantau parents: diff changeset	195 #define PAVGB "pavgb"
986e461dc072 Initial revision glantau parents: diff changeset	196
986e461dc072 Initial revision glantau parents: diff changeset	197 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision glantau parents: diff changeset	198
986e461dc072 Initial revision glantau parents: diff changeset	199 #undef DEF
986e461dc072 Initial revision glantau parents: diff changeset	200 #undef PAVGB
986e461dc072 Initial revision glantau parents: diff changeset	201 // name aliases: each name on the left is mapped by the preprocessor to the existing MMX put function on the right
6327 5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	202 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	203 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
6321 57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	204 #define put_pixels16_mmx2 put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	205 #define put_pixels8_mmx2 put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	206 #define put_pixels4_mmx2 put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	207 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	208 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	209 #define put_pixels16_3dnow put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	210 #define put_pixels8_3dnow put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	211 #define put_pixels4_3dnow put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	212 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	213 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	214
0 986e461dc072 Initial revision glantau parents: diff changeset	215 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	216 /* standard MMX */
986e461dc072 Initial revision glantau parents: diff changeset	217
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	218 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0 986e461dc072 Initial revision glantau parents: diff changeset	219 {
986e461dc072 Initial revision glantau parents: diff changeset	220 const DCTELEM *p;
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	221 uint8_t *pix;
0 986e461dc072 Initial revision glantau parents: diff changeset	222
986e461dc072 Initial revision glantau parents: diff changeset	223 /* read the pixels */
986e461dc072 Initial revision glantau parents: diff changeset	224 p = block;
986e461dc072 Initial revision glantau parents: diff changeset	225 pix = pixels;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	226 /* unrolled loop */
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	227 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	228 "movq %3, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	229 "movq 8%3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	230 "movq 16%3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	231 "movq 24%3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	232 "movq 32%3, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	233 "movq 40%3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	234 "movq 48%3, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	235 "movq 56%3, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	236 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	237 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	238 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	239 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	240 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	241 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	242 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	243 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	244 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size3), "m"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	245 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	246 pix += line_size*4;
986e461dc072 Initial revision glantau parents: diff changeset	247 p += 32;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	248
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	249 // if here would be an exact copy of the code above
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	250 // compiler would generate some very strange code
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	251 // thus using "r"
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	252 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	253 "movq (%3), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	254 "movq 8(%3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	255 "movq 16(%3), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	256 "movq 24(%3), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	257 "movq 32(%3), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	258 "movq 40(%3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	259 "movq 48(%3), %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	260 "movq 56(%3), %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	261 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	262 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	263 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	264 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	265 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	266 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	267 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	268 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	269 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	270 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	271 }
986e461dc072 Initial revision glantau parents: diff changeset	272
3089 072dbc669253 MSVC-compatible __align8/__align16 declaration diego parents: 3036 diff changeset	273 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
1985 b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler melanson parents: 1984 diff changeset	274 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler melanson parents: 1984 diff changeset	275
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	276 void put_signed_pixels_clamped_mmx(const DCTELEM block, uint8_t pixels, int line_size)
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	277 {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	278 int i;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	279
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	280 movq_m2r(*vector128, mm1);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	281 for (i = 0; i < 8; i++) {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	282 movq_m2r(*(block), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	283 packsswb_m2r(*(block + 4), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	284 block += 8;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	285 paddb_r2r(mm1, mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	286 movq_r2m(mm0, *pixels);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	287 pixels += line_size;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	288 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	289 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	290
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	291 void add_pixels_clamped_mmx(const DCTELEM block, uint8_t pixels, int line_size)
0 986e461dc072 Initial revision glantau parents: diff changeset	292 {
986e461dc072 Initial revision glantau parents: diff changeset	293 const DCTELEM *p;
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	294 uint8_t *pix;
0 986e461dc072 Initial revision glantau parents: diff changeset	295 int i;
986e461dc072 Initial revision glantau parents: diff changeset	296
986e461dc072 Initial revision glantau parents: diff changeset	297 /* read the pixels */
986e461dc072 Initial revision glantau parents: diff changeset	298 p = block;
986e461dc072 Initial revision glantau parents: diff changeset	299 pix = pixels;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	300 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	301 i = 4;
342 8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler kabi parents: 324 diff changeset	302 do {
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	303 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	304 "movq (%2), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	305 "movq 8(%2), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	306 "movq 16(%2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	307 "movq 24(%2), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	308 "movq %0, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	309 "movq %1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	310 "movq %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	311 "punpcklbw %%mm7, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	312 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	313 "paddsw %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	314 "paddsw %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	315 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	316 "punpcklbw %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	317 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	318 "paddsw %%mm6, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	319 "paddsw %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	320 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	321 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	322 "movq %%mm0, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	323 "movq %%mm2, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	324 :"+m"(pix), "+m"((pix+line_size))
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	325 :"r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	326 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	327 pix += line_size*2;
986e461dc072 Initial revision glantau parents: diff changeset	328 p += 16;
342 8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler kabi parents: 324 diff changeset	329 } while (--i);
0 986e461dc072 Initial revision glantau parents: diff changeset	330 }
986e461dc072 Initial revision glantau parents: diff changeset	331
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	332 static void put_pixels4_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	333 {
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	334 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	335 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	336 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	337 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	338 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	339 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	340 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	341 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	342 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	343 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	344 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	345 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	346 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	347 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	348 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	349 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	350 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	351 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	352 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	353 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	354 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	355 );
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	356 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	357
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	358 static void put_pixels8_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	359 {
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	360 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	361 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	362 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	363 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	364 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	365 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	366 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	367 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	368 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	369 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	370 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	371 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	372 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	373 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	374 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	375 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	376 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	377 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	378 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	379 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	380 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	381 );
0 986e461dc072 Initial revision glantau parents: diff changeset	382 }
986e461dc072 Initial revision glantau parents: diff changeset	383
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	384 static void put_pixels16_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	385 {
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	386 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	387 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	388 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	389 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	390 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	391 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	392 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	393 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	394 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	395 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	396 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	397 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	398 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	399 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	400 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	401 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	402 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	403 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	404 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	405 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	406 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	407 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	408 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	409 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	410 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	411 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	412 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	413 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	414 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	415 );
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	416 }
45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	417
/**
 * Copy a 16-byte-wide column of h rows from pixels to block, four rows per
 * loop iteration, one SSE2 register per row.
 *
 * Rows are loaded with movdqu, so the source may be unaligned; they are
 * stored with movdqa, so block must be 16-byte aligned and line_size a
 * multiple of 16. The loop leaves only when the row counter reaches exactly
 * zero, so h must be a positive multiple of 4.
 *
 * @param block     destination of the first row (16-byte aligned)
 * @param pixels    source of the first row
 * @param line_size distance in bytes between rows, used for both buffers
 * @param h         number of rows to copy (positive multiple of 4)
 */
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    /* spelled __asm__ so the block also builds in strict ISO C modes,
     * where plain `asm` is not a keyword */
    __asm__ volatile(
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1,%3), %%xmm1         \n\t"
        "movdqu (%1,%3,2), %%xmm2       \n\t"
        "movdqu (%1,%4), %%xmm3         \n\t"
        "movdqa %%xmm0, (%2)            \n\t"
        "movdqa %%xmm1, (%2,%3)         \n\t"
        "movdqa %%xmm2, (%2,%3,2)       \n\t"
        "movdqa %%xmm3, (%2,%4)         \n\t"
        /* lea leaves the flags alone, so jnz still tests the subl result */
        "subl $4, %0                    \n\t"
        "lea (%1,%3,4), %1              \n\t"
        "lea (%2,%3,4), %2              \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size), "r"(3L*line_size)
        : "memory"
    );
}
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	439
/**
 * Average a 16-byte-wide column of h rows from pixels into block:
 * block[x] = (block[x] + pixels[x] + 1) >> 1 for every byte (pavgb).
 *
 * Source rows are loaded with movdqu, so pixels may be unaligned. The
 * destination is read as a pavgb memory operand and written with movdqa,
 * so block must be 16-byte aligned and line_size a multiple of 16. The loop
 * handles four rows per iteration and leaves only when the row counter
 * reaches exactly zero, so h must be a positive multiple of 4.
 *
 * @param block     first row of the destination, updated in place
 * @param pixels    source of the first row
 * @param line_size distance in bytes between rows, used for both buffers
 * @param h         number of rows to process (positive multiple of 4)
 */
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    /* spelled __asm__ so the block also builds in strict ISO C modes,
     * where plain `asm` is not a keyword */
    __asm__ volatile(
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1,%3), %%xmm1         \n\t"
        "movdqu (%1,%3,2), %%xmm2       \n\t"
        "movdqu (%1,%4), %%xmm3         \n\t"
        "pavgb  (%2), %%xmm0            \n\t"
        "pavgb  (%2,%3), %%xmm1         \n\t"
        "pavgb  (%2,%3,2), %%xmm2       \n\t"
        "pavgb  (%2,%4), %%xmm3         \n\t"
        "movdqa %%xmm0, (%2)            \n\t"
        "movdqa %%xmm1, (%2,%3)         \n\t"
        "movdqa %%xmm2, (%2,%3,2)       \n\t"
        "movdqa %%xmm3, (%2,%4)         \n\t"
        /* lea leaves the flags alone, so jnz still tests the subl result */
        "subl $4, %0                    \n\t"
        "lea (%1,%3,4), %1              \n\t"
        "lea (%2,%3,4), %2              \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size), "r"(3L*line_size)
        : "memory"
    );
}
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	465
296 c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	466 static void clear_blocks_mmx(DCTELEM *blocks)
c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	467 {
6391 3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works reimar parents: 6384 diff changeset	468 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	469 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	470 "mov $-128*6, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	471 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	472 "movq %%mm7, (%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	473 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	474 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	475 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	476 "add $32, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	477 " js 1b \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	478 : : "r" (((uint8_t )blocks)+1286)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	479 : "%"REG_a
296 c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	480 );
c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	481 }
c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	482
/**
 * dst[i] += src[i] for i in [0, w), with byte wrap-around.
 *
 * The MMX loop handles 16 bytes per iteration while a full 16-byte chunk
 * remains; the scalar loop finishes the tail. The loop condition is tested
 * before the first iteration and as a signed comparison, so w < 16
 * (including w <= 0) never touches memory beyond the first w bytes.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    /* spelled __asm__ so the block also builds in strict ISO C modes,
     * where plain `asm` is not a keyword */
    __asm__ volatile(
        /* check the bound first: w may be smaller than one chunk */
        "jmp 2f                         \n\t"
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "2:                             \n\t"
        /* signed i < w-15: w-15 is negative for short inputs */
        "cmp %3, %0                     \n\t"
        " jl 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
        /* dst is written through an input-only pointer */
        : "memory"
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	504
/**
 * dst[i] = src1[i] + src2[i] for i in [0, w), with byte wrap-around.
 *
 * The MMX loop handles 16 bytes per iteration while a full 16-byte chunk
 * remains; the scalar loop finishes the tail. The loop condition is tested
 * before the first iteration and as a signed comparison, so w < 16
 * (including w <= 0) never touches memory beyond the first w bytes.
 *
 * @param dst  destination
 * @param src1 first addend
 * @param src2 second addend
 * @param w    number of bytes to process
 */
static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    /* spelled __asm__ so the block also builds in strict ISO C modes,
     * where plain `asm` is not a keyword */
    __asm__ volatile(
        /* check the bound first: w may be smaller than one chunk */
        "jmp 2f                         \n\t"
        "1:                             \n\t"
        "movq   (%2, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb  (%3, %0), %%mm0         \n\t"
        "paddb 8(%3, %0), %%mm1         \n\t"
        "movq %%mm0,  (%1, %0)          \n\t"
        "movq %%mm1, 8(%1, %0)          \n\t"
        "add $16, %0                    \n\t"
        "2:                             \n\t"
        /* signed i < w-15: w-15 is negative for short inputs */
        "cmp %4, %0                     \n\t"
        " jl 1b                         \n\t"
        : "+r" (i)
        : "r"(dst), "r"(src1), "r"(src2), "r"((long)w-15)
        /* dst is written through an input-only pointer */
        : "memory"
    );
    for(; i<w; i++)
        dst[i] = src1[i] + src2[i];
}
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	524
/*
 * Arithmetic core of the H.263 loop filter, applied to 8 pixels at once.
 *
 * Expands to a run of asm string literals; paste it at the start of an
 * extended asm statement whose operands are:
 *   %0, %1, %2, %3  four 64-bit memory operands holding 8 pixels each
 *                   (called A, B, C, D below; B and C are the two lines
 *                   that receive the strongest correction)
 *   %4              2*strength, loaded with movd (only the low byte is used)
 *   %5              64-bit memory constant ANDed in before the ">> 2"
 *                   (callers in this file pass ff_pb_FC)
 *
 * Nothing is stored. On exit the filtered values are left in registers:
 *   mm5 = filtered %0 (A), mm3 = filtered %1 (B),
 *   mm4 = filtered %2 (C), mm6 = filtered %3 (D)
 * All of mm0-mm7 are overwritten.
 */
#define H263_LOOP_FILTER \
    /* mm0/mm1 = A - D as 16-bit words (low/high 4 pixels) */\
    "pxor %%mm7, %%mm7              \n\t"\
    "movq  %0, %%mm0                \n\t"\
    "movq  %0, %%mm1                \n\t"\
    "movq  %3, %%mm2                \n\t"\
    "movq  %3, %%mm3                \n\t"\
    "punpcklbw %%mm7, %%mm0         \n\t"\
    "punpckhbw %%mm7, %%mm1         \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "psubw %%mm2, %%mm0             \n\t"\
    "psubw %%mm3, %%mm1             \n\t"\
    /* mm4/mm5 = 4*(C - B) + (A - D) */\
    "movq  %1, %%mm2                \n\t"\
    "movq  %1, %%mm3                \n\t"\
    "movq  %2, %%mm4                \n\t"\
    "movq  %2, %%mm5                \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm7, %%mm5         \n\t"\
    "psubw %%mm2, %%mm4             \n\t"\
    "psubw %%mm3, %%mm5             \n\t"\
    "psllw $2, %%mm4                \n\t"\
    "psllw $2, %%mm5                \n\t"\
    "paddw %%mm0, %%mm4             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    /* mm6/mm7 = sign masks, mm4/mm5 = absolute value, then >> 3 */\
    "pxor %%mm6, %%mm6              \n\t"\
    "pcmpgtw %%mm4, %%mm6           \n\t"\
    "pcmpgtw %%mm5, %%mm7           \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "pxor %%mm7, %%mm5              \n\t"\
    "psubw %%mm6, %%mm4             \n\t"\
    "psubw %%mm7, %%mm5             \n\t"\
    "psrlw $3, %%mm4                \n\t"\
    "psrlw $3, %%mm5                \n\t"\
    /* pack to bytes: mm4 = magnitude |d|, mm6 = per-byte sign mask of d */\
    "packuswb %%mm5, %%mm4          \n\t"\
    "packsswb %%mm7, %%mm6          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"\
    /* broadcast the low byte of %4 (2*strength) to all 8 bytes of mm2 */\
    "movd %4, %%mm2                 \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    /* mm2 = x - sat(x - |d|) with x = sat(2*strength - |d|): magnitude of d1 */\
    "psubusb %%mm4, %%mm2           \n\t"\
    "movq %%mm2, %%mm3              \n\t"\
    "psubusb %%mm4, %%mm3           \n\t"\
    "psubb %%mm3, %%mm2             \n\t"\
    /* mm3 = B + d1, mm4 = C - d1, saturating; the XOR with the sign mask \
     * before and after turns the add/sub into sub/add where d is negative */\
    "movq %1, %%mm3                 \n\t"\
    "movq %2, %%mm4                 \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm3           \n\t"\
    "psubusb %%mm2, %%mm4           \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    /* mm2 = 2*|d1| (saturated); mm0 = |A - D| as bytes, mm7 = its sign mask */\
    "paddusb %%mm2, %%mm2           \n\t"\
    "packsswb %%mm1, %%mm0          \n\t"\
    "pcmpgtb %%mm0, %%mm7           \n\t"\
    "pxor %%mm7, %%mm0              \n\t"\
    "psubb %%mm7, %%mm0             \n\t"\
    /* mm1 = min(|A - D|, 2*|d1|) */\
    "movq %%mm0, %%mm1              \n\t"\
    "psubusb %%mm2, %%mm0           \n\t"\
    "psubb %%mm0, %%mm1             \n\t"\
    /* per-byte >> 2: mask with %5 first so the word shift cannot leak bits \
     * between neighbouring bytes, then re-apply the sign of (A - D) */\
    "pand %5, %%mm1                 \n\t"\
    "psrlw $2, %%mm1                \n\t"\
    "pxor %%mm7, %%mm1              \n\t"\
    "psubb %%mm7, %%mm1             \n\t"\
    /* mm5 = A - d2, mm6 = D + d2 */\
    "movq %0, %%mm5                 \n\t"\
    "movq %3, %%mm6                 \n\t"\
    "psubb %%mm1, %%mm5             \n\t"\
    "paddb %%mm1, %%mm6             \n\t"
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	595
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	596 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	597 if(ENABLE_ANY_H263) {
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	598 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	599
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	600 asm volatile(
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	601
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	602 H263_LOOP_FILTER
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	603
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	604 "movq %%mm3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	605 "movq %%mm4, %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	606 "movq %%mm5, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	607 "movq %%mm6, %3 \n\t"
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	608 : "+m" ((uint64_t)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	609 "+m" ((uint64_t)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	610 "+m" ((uint64_t)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	611 "+m" ((uint64_t)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	612 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	613 );
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	614 }
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	615 }
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	616
/**
 * Transpose a 4x4 block of bytes:
 * dst[x*dst_stride + y] = src[y*src_stride + x] for 0 <= x, y < 4.
 *
 * Each row is accessed as one 32-bit memory operand, so exactly 4 bytes per
 * row are read from src and written to dst.
 *
 * @param dst        destination block (4 rows of 4 bytes)
 * @param src        source block (4 rows of 4 bytes)
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        /* interleave rows 0/1 and 2/3 byte-wise, then the pairs word-wise */
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        /* mm0 = columns 0|1, mm1 = columns 2|3 of the source */
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	645
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	646 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	647 if(ENABLE_ANY_H263) {
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	648 const int strength= ff_h263_loop_filter_strength[qscale];
6181 f3da7b2592aa Use DECLARE_ALIGNED reimar parents: 6135 diff changeset	649 DECLARE_ALIGNED(8, uint64_t, temp[4]);
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	650 uint8_t btemp= (uint8_t)temp;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	651
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	652 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	653
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	654 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	655 transpose4x4(btemp+4, src + 4*stride, 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	656 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	657 H263_LOOP_FILTER // 5 3 4 6
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	658
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	659 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	660 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	661 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	662 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	663 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	664 );
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	665
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	666 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	667 "movq %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	668 "movq %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	669 "punpcklbw %%mm3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	670 "punpcklbw %%mm6, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	671 "punpckhbw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	672 "punpckhbw %%mm6, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	673 "movq %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	674 "movq %%mm1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	675 "punpcklwd %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	676 "punpcklwd %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	677 "punpckhwd %%mm4, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	678 "punpckhwd %%mm0, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	679 "movd %%mm5, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	680 "punpckhdq %%mm5, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	681 "movd %%mm5, (%0,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	682 "movd %%mm3, (%0,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	683 "punpckhdq %%mm3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	684 "movd %%mm3, (%0,%3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	685 "movd %%mm1, (%1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	686 "punpckhdq %%mm1, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	687 "movd %%mm1, (%1,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	688 "movd %%mm6, (%1,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	689 "punpckhdq %%mm6, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	690 "movd %%mm6, (%1,%3) \n\t"
2505 86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	691 :: "r" (src),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	692 "r" (src + 4*stride),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	693 "r" ((long) stride ),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	694 "r" ((long)(3*stride))
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	695 );
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	696 }
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	697 }
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	698
/*
 * Generates add_png_paeth_prediction_<cpu>(): for each position, pick the
 * Paeth predictor among left (a), top (b) and top-left (c), add it to the
 * src byte and store the low 8 bits of the sum to dst.
 *
 * cpu   suffix of the generated function name
 * abs3  asm string literal(s) that replace the words in mm3, mm4 and mm5 by
 *       their absolute values, may use mm6 as scratch, and must leave
 *       mm7 == 0 (see ABS3_MMX2 / ABS3_SSSE3)
 *
 * Generated function:
 *   dst  output row; dst[-bpp .. -1] is read to seed "left"
 *   src  input row added to the prediction
 *   top  previous row; top[-bpp .. -1] is read to seed "top-left"
 *   w    the loop runs while the byte offset is <= w-3
 *   bpp  byte step between iterations
 *
 * Every iteration loads and stores 4 bytes with movd but advances by bpp,
 * and at least one iteration is always executed.
 * NOTE(review): this looks designed for bpp <= 4 with w-3 >= 0; confirm
 * against the caller.
 */
#define PAETH(cpu, abs3)\
void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
{\
    long i = -bpp;\
    long end = w-3;\
    __asm__ volatile(\
        "pxor      %%mm7, %%mm7 \n"\
        "movd    (%1,%0), %%mm0 \n" /* a = dst[-bpp] (left)             */\
        "movd    (%2,%0), %%mm1 \n" /* top[-bpp], becomes c in the loop */\
        "punpcklbw %%mm7, %%mm0 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "add       %4, %0       \n"\
        "1:                     \n"\
        "movq      %%mm1, %%mm2 \n" /* c = previous top                 */\
        "movd    (%2,%0), %%mm1 \n" /* b = top[i]                       */\
        "movq      %%mm2, %%mm3 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "movq      %%mm2, %%mm4 \n"\
        "psubw     %%mm1, %%mm3 \n" /* c - b                            */\
        "psubw     %%mm0, %%mm4 \n" /* c - a                            */\
        "movq      %%mm3, %%mm5 \n"\
        "paddw     %%mm4, %%mm5 \n" /* 2c - a - b                       */\
        abs3 /* mm3 = pa, mm4 = pb, mm5 = pc */\
        "movq      %%mm4, %%mm6 \n"\
        "pminsw    %%mm5, %%mm6 \n" /* min(pb, pc)                      */\
        "pcmpgtw   %%mm6, %%mm3 \n" /* pa > min(pb, pc): do not use a   */\
        "pcmpgtw   %%mm5, %%mm4 \n" /* pb > pc                          */\
        "movq      %%mm4, %%mm6 \n"\
        "pand      %%mm3, %%mm4 \n" /* mask: use c                      */\
        "pandn     %%mm3, %%mm6 \n" /* mask: use b                      */\
        "pandn     %%mm0, %%mm3 \n" /* a where a is selected            */\
        "movd    (%3,%0), %%mm0 \n" /* src[i]                           */\
        "pand      %%mm1, %%mm6 \n" /* b where b is selected            */\
        "pand      %%mm4, %%mm2 \n" /* c where c is selected            */\
        "punpcklbw %%mm7, %%mm0 \n"\
        "movq      %6,    %%mm5 \n"\
        "paddw     %%mm6, %%mm0 \n"\
        "paddw     %%mm2, %%mm3 \n"\
        "paddw     %%mm3, %%mm0 \n" /* src + predictor                  */\
        "pand      %%mm5, %%mm0 \n" /* & %6 (ff_pw_255): keep low byte  */\
        "movq      %%mm0, %%mm3 \n" /* mm0 stays as next iteration's a  */\
        "packuswb  %%mm3, %%mm3 \n"\
        "movd      %%mm3, (%1,%0) \n"\
        "add       %4, %0       \n"\
        "cmp       %5, %0       \n"\
        "jle 1b                 \n"\
        :"+r"(i)\
        :"r"(dst), "r"(top), "r"(src), "r"((long)bpp), "g"(end),\
         "m"(ff_pw_255)\
        :"memory"\
    );\
}
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	751
/*
 * abs3 argument for PAETH() using pmaxsw: replaces each signed word x in
 * mm3, mm4 and mm5 by max(x, -x).
 * Requires mm7 == 0 on entry; leaves mm7 == 0 on exit; overwrites mm6.
 */
#define ABS3_MMX2\
        "psubw     %%mm5, %%mm7 \n" /* mm7 = -mm5 (mm7 was 0) */\
        "pmaxsw    %%mm7, %%mm5 \n"\
        "pxor      %%mm6, %%mm6 \n"\
        "pxor      %%mm7, %%mm7 \n"\
        "psubw     %%mm3, %%mm6 \n" /* mm6 = -mm3 */\
        "psubw     %%mm4, %%mm7 \n" /* mm7 = -mm4 */\
        "pmaxsw    %%mm6, %%mm3 \n"\
        "pmaxsw    %%mm7, %%mm4 \n"\
        "pxor      %%mm7, %%mm7 \n" /* restore the zero register */
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	762
/*
 * abs3 argument for PAETH() using the SSSE3 pabsw instruction: replaces each
 * signed word in mm3, mm4 and mm5 by its absolute value. Touches no other
 * register, so mm7 keeps whatever value it had on entry.
 */
#define ABS3_SSSE3\
        "pabsw     %%mm3, %%mm3 \n"\
        "pabsw     %%mm4, %%mm4 \n"\
        "pabsw     %%mm5, %%mm5 \n"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	767
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	768 PAETH(mmx2, ABS3_MMX2)
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	769 #ifdef HAVE_SSSE3
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	770 PAETH(ssse3, ABS3_SSSE3)
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	771 #endif
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	772
/*
 * One output step of the vertical qpel lowpass filter, as asm string literals.
 * Computes (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 on 16-bit words, packs the
 * result to unsigned bytes and hands it to OP for storing to 'out'.
 *
 *   m3..m6         registers holding taps; m3 += m4 forms x1, and m3 is then
 *                  reloaded from in7
 *   pw_20, pw_3    unused by the expansion: the constants are referenced
 *                  directly as MANGLE(ff_pw_20) / MANGLE(ff_pw_3)
 *   rnd            operand added as the rounding term
 *   in0, in1, in2, in7  memory operands supplying the remaining taps
 *   out            destination operand passed to OP
 *   OP             store macro, invoked as OP(%%mm5, out, %%mm7, d)
 *
 * Overwrites mm4, mm5 and mm6 in addition to m3.
 */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
        "movq " MANGLE(ff_pw_20) ", %%mm4 \n\t" /* 20 */\
        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
        "movq " #in7 ", " #m3 "           \n\t" /* d */\
        "movq " #in0 ", %%mm5             \n\t" /* D */\
        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
        "movq " #in1 ", %%mm5             \n\t" /* C */\
        "movq " #in2 ", %%mm6             \n\t" /* B */\
        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
        "pmullw " MANGLE(ff_pw_3) ", %%mm5 \n\t" /* -6x2 + 3x3 */\
        "paddw " #rnd ", %%mm4            \n\t" /* 20x1 - x4 + rnd */\
        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 + rnd */\
        "psraw $5, %%mm5                  \n\t"\
        "packuswb %%mm5, %%mm5            \n\t"\
        OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	793
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	794 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
/* OPNAME##mpeg4_qpel16_h_lowpass_mmx2: horizontal qpel lowpass, 16 output */\
/* pixels per row for h rows. Each output is                               */\
/*     (20a - 6b + 3c - d + ROUNDER) >> 5, packed with unsigned saturation, */\
/* where a..d are sums of sample pairs at increasing distance from the     */\
/* output position; near the row ends the pshufw patterns reuse in-row     */\
/* samples instead of reading outside the row.                             */\
/* Reads 17 bytes per source row (the last load is movq 9(%0)).            */\
/* dst/src are advanced by dstStride/srcStride after each row.             */\
/* Asm operands: %0=src %1=dst %2=h %3=srcStride %4=dstStride %5=temp      */\
/* %6=ROUNDER. temp is listed as an "m" input but is stored to; the        */\
/* "memory" clobber is what makes that visible to the compiler.            */\
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	795 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	796 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	797 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	798 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	799 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	800 "1: \n\t"\
/* ---- output pixels 0..3, kept as words in temp (%5) ---- */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	801 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	802 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	803 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	804 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	805 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	806 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	807 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	808 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	809 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	810 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	811 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	812 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	813 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	814 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	815 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	816 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	817 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	818 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	819 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	820 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	821 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	822 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	823 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	824 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	825 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	826 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	827 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	828 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	829 "movq %%mm0, %5 \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	830 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	831 \
/* ---- output pixels 4..7, packed with temp and written to (%1) ---- */\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	832 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	833 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	834 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	835 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	836 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	837 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	838 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	839 "paddw %%mm0, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	840 "paddw %%mm5, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	841 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	842 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	843 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	844 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	845 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	846 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	847 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	848 "paddw %%mm2, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	849 "paddw %%mm6, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	850 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	851 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	852 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	853 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	854 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	855 "movq %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	856 "packuswb %%mm3, %%mm1 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	857 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	858 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	859 \
/* ---- output pixels 8..11, left as words in mm0 ---- */\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	860 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	861 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	862 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	863 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	864 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	865 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	866 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	867 "paddw %%mm1, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	868 "paddw %%mm4, %%mm0 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	869 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	870 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	871 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	872 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	873 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	874 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	875 "paddw %%mm3, %%mm2 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	876 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	877 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	878 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	879 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	880 "paddw %%mm2, %%mm6 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	881 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	882 "paddw %6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	883 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	884 "psraw $5, %%mm0 \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	885 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	886 \
/* ---- output pixels 12..15, packed with mm0 and written to 8(%1) ---- */\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	887 "paddw %%mm5, %%mm3 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	888 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	889 "paddw %%mm4, %%mm6 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	890 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	891 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	892 "paddw %%mm1, %%mm4 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	893 "paddw %%mm2, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	894 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	895 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	896 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	897 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	898 "psubw %%mm5, %%mm3 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	899 "paddw %6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	900 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	901 "psraw $5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	902 "packuswb %%mm4, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	903 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	904 \
/* advance to the next row and loop while --h != 0 */\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	905 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	906 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	907 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	908 " jnz 1b \n\t"\
6335 950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster. lorenm parents: 6333 diff changeset	909 : "+a"(src), "+c"(dst), "+g"(h)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	910 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	911 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	912 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	913 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	914 \
/* OPNAME##mpeg4_qpel16_h_lowpass_3dnow: horizontal qpel lowpass, 16 output */\
/* pixels per row for h rows. The filter sums                               */\
/*     20*(pair a) - 6*(pair b) + 3*(pair c) - (pair d)                     */\
/* are computed in plain C into temp[]; the asm then adds ROUNDER, shifts   */\
/* right by 5, packs with unsigned saturation and passes 8 pixels at a time */\
/* to OP_3DNOW for (%1) and 8(%1).                                          */\
/* Reads src[0..16] per row; taps that would fall outside that range reuse  */\
/* in-range samples (see the index patterns of temp[0..2] and temp[13..15]).*/\
/* dst/src are advanced by dstStride/srcStride after each row.              */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	915 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	916 int i;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	917 int16_t temp[16];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	918 /* quick HACK, XXX FIXME MUST be optimized */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	919 for(i=0; i<h; i++)\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	920 {\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	921 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	922 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	923 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	924 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	925 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	926 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	927 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	928 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	929 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	930 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	931 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	932 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	933 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	934 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	935 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	936 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
/* Asm operands: %0=temp %1=dst %2=ROUNDER. Round, shift, pack, store. */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	937 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	938 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	939 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	940 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	941 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	942 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	943 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	944 "packuswb %%mm1, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	945 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	946 "movq 16(%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	947 "movq 24(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	948 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	949 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	950 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	951 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	952 "packuswb %%mm1, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	953 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	954 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	955 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	956 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	957 dst+=dstStride;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	958 src+=srcStride;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	959 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	960 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	961 \
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	962 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t dst, uint8_t src, int dstStride, int srcStride, int h){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	963 uint64_t temp;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	964 \
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	965 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	966 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	967 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	968 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	969 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	970 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	971 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	972 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	973 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	974 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	975 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	976 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	977 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	978 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	979 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	980 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	981 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	982 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	983 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	984 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	985 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	986 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	987 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	988 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	989 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	990 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	991 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	992 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	993 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	994 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	995 "psraw $5, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	996 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	997 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	998 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	999 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1000 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1001 "paddw %%mm5, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1002 "paddw %%mm6, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1003 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1004 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1005 "paddw %%mm6, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1006 "paddw %%mm5, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1007 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1008 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1009 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1010 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1011 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1012 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1013 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1014 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1015 "packuswb %%mm3, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1016 OP_MMX2(%%mm0, (%1), %%mm4, q)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1017 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1018 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1019 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1020 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1021 " jnz 1b \n\t"\
6335 950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster. lorenm parents: 6333 diff changeset	1022 : "+a"(src), "+c"(dst), "+g"(h)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1023 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1024 : "memory"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1025 );\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1026 }\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1027 \
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1028 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1029 int i;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1030 int16_t temp[8];\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1031 /* quick HACK, XXX FIXME MUST be optimized */\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1032 for(i=0; i<h; i++)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1033 {\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1034 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1035 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1036 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1037 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1038 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1039 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1040 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1041 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1042 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1043 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1044 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1045 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1046 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1047 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1048 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1049 "packuswb %%mm1, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1050 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1051 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1052 :"memory"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1053 );\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1054 dst+=dstStride;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1055 src+=srcStride;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1056 }\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1057 }
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1058
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1059 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1060 \
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1061 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1062 uint64_t temp[17*4];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1063 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1064 int count= 17;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1065 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1066 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1067 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1068 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1069 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1070 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1071 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1072 "movq 8(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1073 "movq 8(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1074 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1075 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1076 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1077 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1078 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1079 "movq %%mm1, 17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1080 "movq %%mm2, 2*17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1081 "movq %%mm3, 3*17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1082 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1083 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1084 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1085 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1086 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1087 : "r" ((long)srcStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1088 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1089 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1090 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1091 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1092 count=4;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1093 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1094 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1095 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1096 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1097 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1098 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1099 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1100 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1101 "movq 24(%0), %%mm3 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1102 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1103 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1104 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1105 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1106 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1107 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1108 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1109 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1110 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1111 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1112 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1113 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1114 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1115 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1116 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1117 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1118 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1119 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1120 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1121 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1122 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1123 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1124 "add %4, %1 \n\t" \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1125 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1126 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1127 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1128 "add $136, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1129 "add %6, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1130 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1131 " jnz 1b \n\t"\
958 9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped) michaelni parents: 954 diff changeset	1132 \
967 274b518c4ecb PIC / ebx fix michaelni parents: 966 diff changeset	1133 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1134 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1135 :"memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1136 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1137 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1138 \
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	1139 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	1140 uint64_t temp[9*2];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1141 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1142 int count= 9;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1143 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1144 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1145 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1146 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1147 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1148 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1149 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1150 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1151 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1152 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1153 "movq %%mm1, 9*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1154 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1155 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1156 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1157 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1158 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1159 : "r" ((long)srcStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1160 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1161 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1162 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1163 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1164 count=2;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1165 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1166 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1167 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1168 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1169 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1170 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1171 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1172 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1173 "movq 24(%0), %%mm3 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1174 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1175 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1176 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1177 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1178 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1179 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1180 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1181 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1182 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1183 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1184 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1185 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1186 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1187 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1188 "add $72, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1189 "add %6, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1190 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1191 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1192 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1193 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1194 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1195 : "memory"\
7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1196 );\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1197 }\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1198 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1199 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6321 57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	1200 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1201 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1202 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1203 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1204 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1205 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1206 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1207 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1208 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1209 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1210 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1211 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1212 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1213 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1214 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1215 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1216 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1217 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1218 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1219 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1220 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1221 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1222 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1223 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1224 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1225 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1226 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1227 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1228 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1229 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1230 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1231 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1232 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1233 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1234 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1235 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1236 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1237 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1238 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1239 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1240 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1241 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1242 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1243 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1244 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1245 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1246 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1247 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1248 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1249 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1250 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1251 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1252 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1253 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1254 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1255 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1256 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1257 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1258 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1259 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1260 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1261 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1262 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1263 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1264 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1265 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1266 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1267 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1268 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1269 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1270 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1271 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1272 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1273 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1274 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1275 uint64_t half[8 + 9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1276 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1277 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1278 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1279 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1280 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1281 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1282 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1283 uint64_t half[8 + 9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1284 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1285 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1286 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1287 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1288 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1289 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1290 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1291 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1292 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1293 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1294 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1295 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1296 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1297 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1298 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1299 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1300 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1301 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1302 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1303 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1304 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1305 uint64_t half[9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1306 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1307 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1308 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1309 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1310 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t dst, uint8_t src, int stride){\
6321 57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	1311 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1312 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1313 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1314 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1315 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1316 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1317 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1318 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1319 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1320 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1321 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1322 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1323 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1324 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1325 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1326 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1327 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1328 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1329 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1330 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1331 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1332 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1333 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1334 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1335 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1336 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1337 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1338 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1339 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t dst, uint8_t src, int stride){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1340 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1341 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1342 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1343 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1344 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1345 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1346 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1347 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1348 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1349 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1350 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1351 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1352 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1353 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1354 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1355 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1356 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1357 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1358 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1359 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1360 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1361 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1362 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1363 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1364 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1365 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1366 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1367 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1368 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1369 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1370 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1371 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1372 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1373 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1374 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1375 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1376 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1377 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1378 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1379 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1380 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1381 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1382 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1383 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1384 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1385 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1386 uint64_t half[162 + 172];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1387 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1388 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1389 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1390 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1391 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1392 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1393 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1394 uint64_t half[162 + 172];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1395 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1396 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1397 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1398 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1399 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1400 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1401 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1402 uint64_t half[17*2];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1403 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1404 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1405 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1406 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1407 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1408 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1409 uint64_t half[17*2];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1410 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1411 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1412 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1413 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1414 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1415 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1416 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1417 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1418 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1419 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1420 }
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1421
/*
 * PUT_OP(a, b, temp, size): expands to one inline-asm source line,
 * "mov<size> a, b", terminated by "\n\t". The size argument is stringized
 * straight onto "mov" to pick the move width suffix. The temp argument is
 * accepted but never referenced; it is there so that PUT_OP has the same
 * parameter list as the AVG_*_OP macros.
 */
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1422 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/*
 * AVG_3DNOW_OP(a, b, temp, size): expands to three inline-asm source lines:
 *   mov<size> b, temp   - copy b into the scratch operand temp
 *   pavgusb   temp, a   - 3DNow! packed byte average of temp and a
 *   mov<size> a, b      - copy a into b
 * Each line is terminated by "\n\t". Same parameter list as PUT_OP, but here
 * temp is actually used as scratch.
 * NOTE(review): operand order is presumably AT&T (source, destination), i.e.
 * the averaged result ends up in b - confirm against the asm blocks that use
 * this macro.
 */
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1423 #define AVG_3DNOW_OP(a,b,temp, size) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1424 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1425 "pavgusb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1426 "mov" #size " " #a ", " #b " \n\t"
/*
 * AVG_MMX2_OP(a, b, temp, size): identical in shape to AVG_3DNOW_OP, except
 * that the averaging instruction is pavgb instead of pavgusb:
 *   mov<size> b, temp   - copy b into the scratch operand temp
 *   pavgb     temp, a   - packed byte average of temp and a
 *   mov<size> a, b      - copy a into b
 * Each line is terminated by "\n\t".
 * NOTE(review): operand order is presumably AT&T (source, destination) -
 * confirm against the asm blocks that use this macro.
 */
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1427 #define AVG_MMX2_OP(a,b,temp, size) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1428 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1429 "pavgb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1430 "mov" #size " " #a ", " #b " \n\t"
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1431
/*
 * Instantiate the qpel code generators for the three operation flavours
 * put_, avg_ and put_no_rnd_.
 *
 * - QPEL_BASE is expanded once per flavour and receives two store macros.
 *   For avg_ these are AVG_MMX2_OP and AVG_3DNOW_OP; for the put flavours
 *   both are PUT_OP.
 * - QPEL_OP is expanded once per flavour and per instruction-set suffix
 *   (3dnow, mmx2). The avg_ expansions use the AVG_*_OP macro matching the
 *   suffix.
 * - The second argument is ff_pw_16 for put_ and avg_, and ff_pw_15 for
 *   put_no_rnd_.
 *
 * NOTE(review): QPEL_BASE and the head of QPEL_OP are defined earlier in the
 * file; ff_pw_16 / ff_pw_15 are presumably the rounding constants for the
 * rounding and no-rounding filters - verify against those definitions.
 */
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1432 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1433 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1434 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1435 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1436 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1437 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1438 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1439 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1440 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1441
3807 6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1442 /***********************************/
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1443 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1444
/*
 * QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL): defines the static function
 *   <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX>(dst, src, stride)
 * as a thin wrapper that forwards to
 *   <OPNAME>pixels<SIZE><HPEL>(dst, src, stride, SIZE)
 * so SIZE is used both in the generated names and as the last call argument.
 * HPEL is the full name suffix of the routine to call (it is pasted directly
 * after SIZE).
 * NOTE(review): dst and src appear here as plain uint8_t, yet they are passed
 * straight through as the destination/source arguments; the pointer
 * declarators were presumably lost when this listing was extracted - verify
 * against the upstream file.
 */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1445 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1446 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1447 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1448 }
/*
 * QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2): defines the static function
 *   <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX>(dst, src, stride)
 * as a thin wrapper that forwards to
 *   <OPNAME>2tap_qpel<SIZE>_l3_<MMX>(dst, src+S0, stride, SIZE, S1, S2)
 * S0 is added to src before the call; S1 and S2 are passed through unchanged.
 * Because the arguments are substituted textually, they may refer to the
 * generated function's own stride parameter.
 * NOTE(review): S1 and S2 are presumably offsets of two further source
 * positions relative to src+S0 - confirm against the _l3_ helper, which is
 * defined elsewhere in the file.
 * NOTE(review): dst and src appear here as plain uint8_t although src takes
 * part in address arithmetic; the pointer declarators were presumably lost
 * when this listing was extracted - verify against the upstream file.
 */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1449 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1450 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1451 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1452 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1453
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1454 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1455 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1456 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1457 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1458 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1459 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1460 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1461 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1462 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1463 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1464 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1465 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1466 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1467 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1468 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1469 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1470 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1471 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1472 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1473 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1474 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1475 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1476 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1477 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1478
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1479 QPEL_2TAP(put_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1480 QPEL_2TAP(avg_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1481 QPEL_2TAP(put_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1482 QPEL_2TAP(avg_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1483 QPEL_2TAP(put_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1484 QPEL_2TAP(avg_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1485 QPEL_2TAP(put_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1486 QPEL_2TAP(avg_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1487
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1488
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	1489 #if 0
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	1490 static void just_return() { return; }
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	1491 #endif
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	1492
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1493 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1494 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1495 const int w = 8;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1496 const int ix = ox>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1497 const int iy = oy>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1498 const int oxs = ox>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1499 const int oys = oy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1500 const int dxxs = dxx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1501 const int dxys = dxy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1502 const int dyxs = dyx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1503 const int dyys = dyy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1504 const uint16_t r4[4] = {r,r,r,r};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1505 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1506 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1507 const uint64_t shift2 = 2*shift;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1508 uint8_t edge_buf[(h+1)*stride];
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1509 int x, y;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1510
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1511 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1512 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1513 const int dxh = dxy*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1514 const int dyw = dyx*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1515 if( // non-constant fullpel offset (3% of blocks)
6196 166bef5cad01 add parenthesis, fix warning: i386/dsputil_mmx.c:2618: warning: suggest parentheses around arithmetic in operand of | bcoudurier parents: 6195 diff changeset	1516 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
166bef5cad01 add parenthesis, fix warning: i386/dsputil_mmx.c:2618: warning: suggest parentheses around arithmetic in operand of | bcoudurier parents: 6195 diff changeset	1517 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1518 // uses more than 16 bits of subpel mv (only at huge resolution)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1519 || (dxx|dxy|dyx|dyy)&15 )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1520 {
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1521 //FIXME could still use mmx for some of the rows
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1522 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1523 return;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1524 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1525
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1526 src += ix + iy*stride;
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1527 if( (unsigned)ix >= width-w ||
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1528 (unsigned)iy >= height-h )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1529 {
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1530 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1531 src = edge_buf;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1532 }
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1533
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1534 asm volatile(
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1535 "movd %0, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1536 "pxor %%mm7, %%mm7 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1537 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1538 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1539 :: "r"(1<<shift)
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1540 );
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1541
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1542 for(x=0; x<w; x+=4){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1543 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1544 oxs - dxys + dxxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1545 oxs - dxys + dxxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1546 oxs - dxys + dxxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1547 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1548 oys - dyys + dyxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1549 oys - dyys + dyxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1550 oys - dyys + dyxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1551
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1552 for(y=0; y<h; y++){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1553 asm volatile(
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1554 "movq %0, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1555 "movq %1, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1556 "paddw %2, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1557 "paddw %3, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1558 "movq %%mm4, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1559 "movq %%mm5, %1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1560 "psrlw $12, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1561 "psrlw $12, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1562 : "+m"(dx4), "+m"(dy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1563 : "m"(dxy4), "m"(dyy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1564 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1565
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1566 asm volatile(
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1567 "movq %%mm6, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1568 "movq %%mm6, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1569 "psubw %%mm4, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1570 "psubw %%mm5, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1571 "movq %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1572 "movq %%mm4, %%mm3 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1573 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1574 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1575 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1576 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1577
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1578 "movd %4, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1579 "movd %3, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1580 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1581 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1582 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1583 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1584
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1585 "movd %2, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1586 "movd %1, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1587 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1588 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1589 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1590 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1591 "paddw %5, %%mm1 \n\t"
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1592 "paddw %%mm3, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1593 "paddw %%mm1, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1594 "paddw %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1595
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1596 "psrlw %6, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1597 "packuswb %%mm0, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1598 "movd %%mm0, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1599
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1600 : "=m"(dst[x+y*stride])
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1601 : "m"(src[0]), "m"(src[1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1602 "m"(src[stride]), "m"(src[stride+1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1603 "m"(*r4), "m"(shift2)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1604 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1605 src += stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1606 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1607 src += 4-h*stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1608 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1609 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1610
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1611 #define PREFETCH(name, op) \
4172 608e2dfcb86e adding more static keywords mru parents: 4127 diff changeset	1612 static void name(void *mem, int stride, int h){\
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1613 const uint8_t *p= mem;\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1614 do{\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1615 asm volatile(#op" %0" :: "m"(*p));\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1616 p+= stride;\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1617 }while(--h);\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1618 }
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1619 PREFETCH(prefetch_mmx2, prefetcht0)
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1620 PREFETCH(prefetch_3dnow, prefetch)
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1621 #undef PREFETCH
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1622
2754 a49f140179e9 sort H.264 mmx dsp functions into their own file lorenm parents: 2753 diff changeset	1623 #include "h264dsp_mmx.c"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1624
6009 ecfdc0bfb233 typo/clarification diego parents: 5963 diff changeset	1625 /* CAVS specific */
3524 419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1626 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1627
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1628 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1629 put_pixels8_mmx(dst, src, stride, 8);
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1630 }
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1631 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1632 avg_pixels8_mmx(dst, src, stride, 8);
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1633 }
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1634 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1635 put_pixels16_mmx(dst, src, stride, 16);
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1636 }
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1637 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1638 avg_pixels16_mmx(dst, src, stride, 16);
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1639 }
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1640
5948 db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1641 /* VC1 specific */
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1642 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1643
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1644 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1645 put_pixels8_mmx(dst, src, stride, 8);
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1646 }
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1647
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1648 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1649 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1650 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1651
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1652 /* XXX: those functions should be suppressed ASAP when all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1653 converted */
4020 723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure diego parents: 4001 diff changeset	1654 #ifdef CONFIG_GPL
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1655 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1656 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1657 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1658 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1659 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1660 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1661 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1662 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1663 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1664 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1665 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1666 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1667 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1668 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1669 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1670 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1671 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1672 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1673 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1674 }
4020 723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure diego parents: 4001 diff changeset	1675 #endif
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1676 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1677 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1678 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1679 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1680 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1681 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1682 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1683 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1684 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1685 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1686 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1687 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1688 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1689 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1690 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1691 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1692 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1693 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1694 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1695 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1696
3541 3fbddeb13686 10l, vorbis_inverse_coupling_sse() was really 3dnow lorenm parents: 3536 diff changeset	1697 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1698 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1699 int i;
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1700 asm volatile("pxor %%mm7, %%mm7":);
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1701 for(i=0; i<blocksize; i+=2) {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1702 asm volatile(
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1703 "movq %0, %%mm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1704 "movq %1, %%mm1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1705 "movq %%mm0, %%mm2 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1706 "movq %%mm1, %%mm3 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1707 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1708 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1709 "pslld $31, %%mm2 \n\t" // keep only the sign bit
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1710 "pxor %%mm2, %%mm1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1711 "movq %%mm3, %%mm4 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1712 "pand %%mm1, %%mm3 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1713 "pandn %%mm1, %%mm4 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1714 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1715 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1716 "movq %%mm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1717 "movq %%mm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1718 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1719 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1720 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1721 }
3561 97325fecd35a emms -> femms lorenm parents: 3557 diff changeset	1722 asm volatile("femms");
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1723 }
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1724 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1725 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1726 int i;
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1727
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1728 asm volatile(
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1729 "movaps %0, %%xmm5 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1730 ::"m"(ff_pdw_80000000[0])
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1731 );
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1732 for(i=0; i<blocksize; i+=4) {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1733 asm volatile(
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1734 "movaps %0, %%xmm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1735 "movaps %1, %%xmm1 \n\t"
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1736 "xorps %%xmm2, %%xmm2 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1737 "xorps %%xmm3, %%xmm3 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1738 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1739 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1740 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1741 "xorps %%xmm2, %%xmm1 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1742 "movaps %%xmm3, %%xmm4 \n\t"
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1743 "andps %%xmm1, %%xmm3 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1744 "andnps %%xmm1, %%xmm4 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1745 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1746 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1747 "movaps %%xmm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1748 "movaps %%xmm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1749 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1750 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1751 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1752 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1753 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1754
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1755 static void vector_fmul_3dnow(float *dst, const float *src, int len){
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1756 long i = (len-4)*4;
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1757 asm volatile(
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1758 "1: \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1759 "movq (%1,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1760 "movq 8(%1,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1761 "pfmul (%2,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1762 "pfmul 8(%2,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1763 "movq %%mm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1764 "movq %%mm1, 8(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1765 "sub $16, %0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1766 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1767 "femms \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1768 :"+r"(i)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1769 :"r"(dst), "r"(src)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1770 :"memory"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1771 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1772 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1773 static void vector_fmul_sse(float *dst, const float *src, int len){
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1774 long i = (len-8)*4;
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1775 asm volatile(
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1776 "1: \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1777 "movaps (%1,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1778 "movaps 16(%1,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1779 "mulps (%2,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1780 "mulps 16(%2,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1781 "movaps %%xmm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1782 "movaps %%xmm1, 16(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1783 "sub $32, %0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1784 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1785 :"+r"(i)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1786 :"r"(dst), "r"(src)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1787 :"memory"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1788 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1789 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1790
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1791 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1792 long i = len*4-16;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1793 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1794 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1795 "pswapd 8(%1), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1796 "pswapd (%1), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1797 "pfmul (%3,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1798 "pfmul 8(%3,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1799 "movq %%mm0, (%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1800 "movq %%mm1, 8(%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1801 "add $16, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1802 "sub $16, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1803 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1804 :"+r"(i), "+r"(src1)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1805 :"r"(dst), "r"(src0)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1806 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1807 asm volatile("femms");
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1808 }
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1809 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1810 long i = len*4-32;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1811 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1812 "1: \n\t"
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1813 "movaps 16(%1), %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1814 "movaps (%1), %%xmm1 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1815 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1816 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1817 "mulps (%3,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1818 "mulps 16(%3,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1819 "movaps %%xmm0, (%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1820 "movaps %%xmm1, 16(%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1821 "add $32, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1822 "sub $32, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1823 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1824 :"+r"(i), "+r"(src1)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1825 :"r"(dst), "r"(src0)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1826 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1827 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1828
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1829 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1830 const float *src2, int src3, int len, int step){
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1831 long i = (len-4)*4;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1832 if(step == 2 && src3 == 0){
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1833 dst += (len-4)*2;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1834 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1835 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1836 "movq (%2,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1837 "movq 8(%2,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1838 "pfmul (%3,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1839 "pfmul 8(%3,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1840 "pfadd (%4,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1841 "pfadd 8(%4,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1842 "movd %%mm0, (%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1843 "movd %%mm1, 16(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1844 "psrlq $32, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1845 "psrlq $32, %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1846 "movd %%mm0, 8(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1847 "movd %%mm1, 24(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1848 "sub $32, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1849 "sub $16, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1850 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1851 :"+r"(i), "+r"(dst)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1852 :"r"(src0), "r"(src1), "r"(src2)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1853 :"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1854 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1855 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1856 else if(step == 1 && src3 == 0){
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1857 asm volatile(
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1858 "1: \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1859 "movq (%2,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1860 "movq 8(%2,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1861 "pfmul (%3,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1862 "pfmul 8(%3,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1863 "pfadd (%4,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1864 "pfadd 8(%4,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1865 "movq %%mm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1866 "movq %%mm1, 8(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1867 "sub $16, %0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1868 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1869 :"+r"(i)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1870 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1871 :"memory"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1872 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1873 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1874 else
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1875 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1876 asm volatile("femms");
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1877 }
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1878 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1879 const float *src2, int src3, int len, int step){
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1880 long i = (len-8)*4;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1881 if(step == 2 && src3 == 0){
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1882 dst += (len-8)*2;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1883 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1884 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1885 "movaps (%2,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1886 "movaps 16(%2,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1887 "mulps (%3,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1888 "mulps 16(%3,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1889 "addps (%4,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1890 "addps 16(%4,%0), %%xmm1 \n\t"
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1891 "movss %%xmm0, (%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1892 "movss %%xmm1, 32(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1893 "movhlps %%xmm0, %%xmm2 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1894 "movhlps %%xmm1, %%xmm3 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1895 "movss %%xmm2, 16(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1896 "movss %%xmm3, 48(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1897 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1898 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1899 "movss %%xmm0, 8(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1900 "movss %%xmm1, 40(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1901 "movhlps %%xmm0, %%xmm2 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1902 "movhlps %%xmm1, %%xmm3 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1903 "movss %%xmm2, 24(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	1904 "movss %%xmm3, 56(%1) \n\t"
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1905 "sub $64, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1906 "sub $32, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1907 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1908 :"+r"(i), "+r"(dst)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1909 :"r"(src0), "r"(src1), "r"(src2)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1910 :"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1911 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1912 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1913 else if(step == 1 && src3 == 0){
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1914 asm volatile(
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1915 "1: \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1916 "movaps (%2,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1917 "movaps 16(%2,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1918 "mulps (%3,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1919 "mulps 16(%3,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1920 "addps (%4,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1921 "addps 16(%4,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1922 "movaps %%xmm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1923 "movaps %%xmm1, 16(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1924 "sub $32, %0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1925 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1926 :"+r"(i)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1927 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1928 :"memory"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1929 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1930 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1931 else
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1932 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1933 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1934
4172 608e2dfcb86e adding more static keywords mru parents: 4127 diff changeset	1935 static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1936 // not bit-exact: pf2id uses different rounding than C and SSE
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1937 int i;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1938 for(i=0; i<len; i+=4) {
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1939 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1940 "pf2id %1, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1941 "pf2id %2, %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1942 "packssdw %%mm1, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1943 "movq %%mm0, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1944 :"=m"(dst[i])
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1945 :"m"(src[i]), "m"(src[i+2])
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1946 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1947 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1948 asm volatile("femms");
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1949 }
4172 608e2dfcb86e adding more static keywords mru parents: 4127 diff changeset	1950 static void float_to_int16_sse(int16_t *dst, const float *src, int len){
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1951 int i;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1952 for(i=0; i<len; i+=4) {
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1953 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1954 "cvtps2pi %1, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1955 "cvtps2pi %2, %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1956 "packssdw %%mm1, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1957 "movq %%mm0, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1958 :"=m"(dst[i])
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1959 :"m"(src[i]), "m"(src[i+2])
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1960 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1961 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1962 asm volatile("emms");
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1963 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1964
6195 5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type bcoudurier parents: 6181 diff changeset	1965 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type bcoudurier parents: 6181 diff changeset	1966 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type bcoudurier parents: 6181 diff changeset	1967 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type bcoudurier parents: 6181 diff changeset	1968 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
4436 d3e389536b0a Add the const specifier as needed to reduce the number of warnings. takis parents: 4197 diff changeset	1969 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3211 b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock gpoirier parents: 3210 diff changeset	1970 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
4436 d3e389536b0a Add the const specifier as needed to reduce the number of warnings. takis parents: 4197 diff changeset	1971 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3211 b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock gpoirier parents: 3210 diff changeset	1972 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	1973
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1974 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0 986e461dc072 Initial revision glantau parents: diff changeset	1975 {
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	1976 mm_flags = mm_support();
1115 74a46d77e061 * support FF_MM_FORCE kabi parents: 1092 diff changeset	1977
1122 ddc3b0140b8f * oooooops - sorry for this one - wrong logic kabi parents: 1115 diff changeset	1978 if (avctx->dsp_mask) {
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1979 if (avctx->dsp_mask & FF_MM_FORCE)
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	1980 mm_flags |= (avctx->dsp_mask & 0xffff);
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1981 else
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	1982 mm_flags &= ~(avctx->dsp_mask & 0xffff);
1122 ddc3b0140b8f * oooooops - sorry for this one - wrong logic kabi parents: 1115 diff changeset	1983 }
1115 74a46d77e061 * support FF_MM_FORCE kabi parents: 1092 diff changeset	1984
631 47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>) michaelni parents: 629 diff changeset	1985 #if 0
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	1986 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	1987 if (mm_flags & MM_MMX)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	1988 av_log(avctx, AV_LOG_INFO, " mmx");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	1989 if (mm_flags & MM_MMXEXT)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	1990 av_log(avctx, AV_LOG_INFO, " mmxext");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	1991 if (mm_flags & MM_3DNOW)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	1992 av_log(avctx, AV_LOG_INFO, " 3dnow");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	1993 if (mm_flags & MM_SSE)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	1994 av_log(avctx, AV_LOG_INFO, " sse");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	1995 if (mm_flags & MM_SSE2)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	1996 av_log(avctx, AV_LOG_INFO, " sse2");
771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	1997 av_log(avctx, AV_LOG_INFO, "\n");
0 986e461dc072 Initial revision glantau parents: diff changeset	1998 #endif
986e461dc072 Initial revision glantau parents: diff changeset	1999
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2000 if (mm_flags & MM_MMX) {
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2001 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2002
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2003 if(avctx->lowres==0){
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2004 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2005 c->idct_put= ff_simple_idct_put_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2006 c->idct_add= ff_simple_idct_add_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2007 c->idct = ff_simple_idct_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2008 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3717 ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel diego parents: 3712 diff changeset	2009 #ifdef CONFIG_GPL
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2010 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2011 if(mm_flags & MM_MMXEXT){
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2012 c->idct_put= ff_libmpeg2mmx2_idct_put;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2013 c->idct_add= ff_libmpeg2mmx2_idct_add;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2014 c->idct = ff_mmxext_idct;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2015 }else{
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2016 c->idct_put= ff_libmpeg2mmx_idct_put;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2017 c->idct_add= ff_libmpeg2mmx_idct_add;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2018 c->idct = ff_mmx_idct;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2019 }
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2020 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3717 ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel diego parents: 3712 diff changeset	2021 #endif
5007 f7edc4fe94db Make vp3dsp.c compilation optional. takis* parents: 4988 diff changeset	2022 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
f7edc4fe94db Make vp3dsp.c compilation optional. takis* parents: 4988 diff changeset	2023 idct_algo==FF_IDCT_VP3 &&
3721 2000e401593d disable vp3 mmx idct for theora files to avoid artifacts aurel parents: 3717 diff changeset	2024 avctx->codec->id!=CODEC_ID_THEORA &&
3712 f7f75f718efb Enables back the mmx/sse optimized version of the vp3 idct. aurel parents: 3666 diff changeset	2025 !(avctx->flags & CODEC_FLAG_BITEXACT)){
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2026 if(mm_flags & MM_SSE2){
2696 9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2027 c->idct_put= ff_vp3_idct_put_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2028 c->idct_add= ff_vp3_idct_add_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2029 c->idct = ff_vp3_idct_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2030 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2031 }else{
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2032 ff_vp3_dsp_init_mmx();
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2033 c->idct_put= ff_vp3_idct_put_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2034 c->idct_add= ff_vp3_idct_add_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2035 c->idct = ff_vp3_idct_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2036 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2037 }
3524 419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	2038 }else if(idct_algo==FF_IDCT_CAVS){
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	2039 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2040 }else if(idct_algo==FF_IDCT_XVIDMMX){
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2041 if(mm_flags & MM_MMXEXT){
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2042 c->idct_put= ff_idct_xvid_mmx2_put;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2043 c->idct_add= ff_idct_xvid_mmx2_add;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2044 c->idct = ff_idct_xvid_mmx2;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2045 }else{
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2046 c->idct_put= ff_idct_xvid_mmx_put;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2047 c->idct_add= ff_idct_xvid_mmx_add;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2048 c->idct = ff_idct_xvid_mmx;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2049 }
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2050 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2051 }
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2052
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2053 c->put_pixels_clamped = put_pixels_clamped_mmx;
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	2054 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2055 c->add_pixels_clamped = add_pixels_clamped_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2056 c->clear_blocks = clear_blocks_mmx;
415 1c3f42442fba * added simple test main - see comments about how to kabi parents: 402 diff changeset	2057
6327 5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2058 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2059 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2060 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2061 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2062 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2063
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2064 SET_HPEL_FUNCS(put, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2065 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2066 SET_HPEL_FUNCS(avg, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2067 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2068 SET_HPEL_FUNCS(put, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2069 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2070 SET_HPEL_FUNCS(avg, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2071 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	2072
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2073 c->gmc= gmc_mmx;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2074
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	2075 c->add_bytes= add_bytes_mmx;
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	2076 c->add_bytes_l2= add_bytes_l2_mmx;
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	2077
5277 7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs aurel parents: 5255 diff changeset	2078 if (ENABLE_ANY_H263) {
5278 ef85411bb7e8 cosmetics: indentation aurel parents: 5277 diff changeset	2079 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
ef85411bb7e8 cosmetics: indentation aurel parents: 5277 diff changeset	2080 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
5277 7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs aurel parents: 5255 diff changeset	2081 }
6057 03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding. gpoirier parents: 6056 diff changeset	2082 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	2083 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
6057 03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding. gpoirier parents: 6056 diff changeset	2084 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	2085
3173 9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	2086 c->h264_idct_dc_add=
9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	2087 c->h264_idct_add= ff_h264_idct_add_mmx;
3174 b65cbae9d940 h264_idct8_add_mmx lorenm parents: 3173 diff changeset	2088 c->h264_idct8_dc_add=
b65cbae9d940 h264_idct8_add_mmx lorenm parents: 3173 diff changeset	2089 c->h264_idct8_add= ff_h264_idct8_add_mmx;
6320 ffb2a7b80d6d ff_h264_idct8_add_sse2. lorenm parents: 6196 diff changeset	2090 if (mm_flags & MM_SSE2)
ffb2a7b80d6d ff_h264_idct8_add_sse2. lorenm parents: 6196 diff changeset	2091 c->h264_idct8_add= ff_h264_idct8_add_sse2;
3173 9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	2092
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2093 if (mm_flags & MM_MMXEXT) {
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2094 c->prefetch = prefetch_mmx2;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2095
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2096 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2097 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	2098
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2099 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2100 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2101 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415 1c3f42442fba * added simple test main - see comments about how to kabi parents: 402 diff changeset	2102
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2103 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2104 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	2105
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2106 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2107 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2108 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2109
3105 2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall lorenm parents: 3089 diff changeset	2110 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall lorenm parents: 3089 diff changeset	2111 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
2745 42d3e9068e32 MMX for H.264 iDCT (adapted from x264) lorenm parents: 2732 diff changeset	2112
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2113 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2114 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2115 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2116 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2117 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2118 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2119 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2120 }
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2121
6327 5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2122 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2123 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2124 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2125 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2126 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2127 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2128 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2129 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2130 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2131 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2132 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2133 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2134 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2135 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2136 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2137 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2138 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2139
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2140 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2141 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2142 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2143 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2144 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2145 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2146
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2147 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2148 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2149 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2150 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2151 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2152 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2153
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2154 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2155 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2156 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2157 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	2158
6057 03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding. gpoirier parents: 6056 diff changeset	2159 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	2160 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3213 57d31bdbebe8 added mmx implementation of h264_chroma_mc2 lorenm parents: 3211 diff changeset	2161 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
57d31bdbebe8 added mmx implementation of h264_chroma_mc2 lorenm parents: 3211 diff changeset	2162 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
2633 72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2163 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2164 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2165 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2166 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
2707 360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math) lorenm parents: 2696 diff changeset	2167 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math) lorenm parents: 2696 diff changeset	2168 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3645 47821be55b6c mmx implementation of deblocking strength decision. lorenm parents: 3576 diff changeset	2169 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
2633 72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2170
2902 3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2171 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2172 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2173 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2174 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2175 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2176 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2177 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2178 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2179
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2180 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2181 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2182 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2183 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2184 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2185 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2186 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2187 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2188
5949 d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_ aurel parents: 5948 diff changeset	2189 if (ENABLE_CAVS_DECODER)
5950 e419e6d4e7eb cosmetics: indentation aurel parents: 5949 diff changeset	2190 ff_cavsdsp_init_mmx2(c, avctx);
5949 d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_ aurel parents: 5948 diff changeset	2191
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_ aurel parents: 5948 diff changeset	2192 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
5950 e419e6d4e7eb cosmetics: indentation aurel parents: 5949 diff changeset	2193 ff_vc1dsp_init_mmx(c, avctx);
5933 6ce8f15fc02b add VC-1 MMX DSP functions, under MIT license. gpoirier parents: 5912 diff changeset	2194
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	2195 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2196 } else if (mm_flags & MM_3DNOW) {
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2197 c->prefetch = prefetch_3dnow;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2198
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2199 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2200 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	2201
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2202 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2203 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2204 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	2205
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2206 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2207 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2208
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2209 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2210 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2211 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2212
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2213 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2214 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2215 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2216 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2217 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2218 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2219 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2220 }
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2221
6327 5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2222 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2223 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2224 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2225 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2226 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2227 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2228
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2229 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2230 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2231 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2232 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2233 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2234 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2235
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2236 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2237 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2238 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2239 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
3807 6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2240
6057 03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding. gpoirier parents: 6056 diff changeset	2241 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	2242 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
0 986e461dc072 Initial revision glantau parents: diff changeset	2243 }
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2244
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2245
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2246 #define H264_QPEL_FUNCS(x, y, CPU)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2247 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2248 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2249 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2250 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2251 if((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)){
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2252 // these functions are slower than mmx on AMD, but faster on Intel
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2253 /* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2254 c->put_pixels_tab[0][0] = put_pixels16_sse2;
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2255 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2256 */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2257 H264_QPEL_FUNCS(0, 0, sse2);
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2258 }
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2259 if(mm_flags & MM_SSE2){
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2260 H264_QPEL_FUNCS(0, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2261 H264_QPEL_FUNCS(0, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2262 H264_QPEL_FUNCS(0, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2263 H264_QPEL_FUNCS(1, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2264 H264_QPEL_FUNCS(1, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2265 H264_QPEL_FUNCS(1, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2266 H264_QPEL_FUNCS(2, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2267 H264_QPEL_FUNCS(2, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2268 H264_QPEL_FUNCS(2, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2269 H264_QPEL_FUNCS(3, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2270 H264_QPEL_FUNCS(3, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2271 H264_QPEL_FUNCS(3, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2272 }
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2273 #ifdef HAVE_SSSE3
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2274 if(mm_flags & MM_SSSE3){
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2275 H264_QPEL_FUNCS(1, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2276 H264_QPEL_FUNCS(1, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2277 H264_QPEL_FUNCS(1, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2278 H264_QPEL_FUNCS(1, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2279 H264_QPEL_FUNCS(2, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2280 H264_QPEL_FUNCS(2, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2281 H264_QPEL_FUNCS(2, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2282 H264_QPEL_FUNCS(2, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2283 H264_QPEL_FUNCS(3, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2284 H264_QPEL_FUNCS(3, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2285 H264_QPEL_FUNCS(3, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2286 H264_QPEL_FUNCS(3, 3, ssse3);
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	2287 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2288 }
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2289 #endif
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2290
4589 30261f4ed12d Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated. diego parents: 4436 diff changeset	2291 #ifdef CONFIG_SNOW_DECODER
5591 642588a60570 update mmx code to latest snow changes michael parents: 5587 diff changeset	2292 if(mm_flags & MM_SSE2 & 0){
3210 81cafbc23b8d snow mmx+sse2 optimizations, part 4 corey parents: 3207 diff changeset	2293 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
5602 3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS ramiro parents: 5601 diff changeset	2294 #ifdef HAVE_7REGS
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2295 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
5601 b26025b9586d workaround gcc bug, untested as my gcc is not complaining michael parents: 5594 diff changeset	2296 #endif
3211 b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock gpoirier parents: 3210 diff changeset	2297 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2298 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2299 else{
5594 384629ebcb93 avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum michael parents: 5591 diff changeset	2300 if(mm_flags & MM_MMXEXT){
3210 81cafbc23b8d snow mmx+sse2 optimizations, part 4 corey parents: 3207 diff changeset	2301 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
5602 3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS ramiro parents: 5601 diff changeset	2302 #ifdef HAVE_7REGS
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2303 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
5601 b26025b9586d workaround gcc bug, untested as my gcc is not complaining michael parents: 5594 diff changeset	2304 #endif
5594 384629ebcb93 avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum michael parents: 5591 diff changeset	2305 }
3211 b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock gpoirier parents: 3210 diff changeset	2306 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2307 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2308 #endif
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2309
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2310 if(mm_flags & MM_3DNOW){
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2311 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2312 c->vector_fmul = vector_fmul_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2313 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2314 c->float_to_int16 = float_to_int16_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2315 }
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2316 if(mm_flags & MM_3DNOWEXT)
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2317 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2318 if(mm_flags & MM_SSE){
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2319 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2320 c->vector_fmul = vector_fmul_sse;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2321 c->float_to_int16 = float_to_int16_sse;
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2322 c->vector_fmul_reverse = vector_fmul_reverse_sse;
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2323 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2324 }
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2325 if(mm_flags & MM_3DNOW)
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2326 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
0 986e461dc072 Initial revision glantau parents: diff changeset	2327 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	2328
6403 9a736918fd90 split encoding part of dsputil_mmx into its own file aurel parents: 6391 diff changeset	2329 if (ENABLE_ENCODERS)
9a736918fd90 split encoding part of dsputil_mmx into its own file aurel parents: 6391 diff changeset	2330 dsputilenc_init_mmx(c, avctx);
9a736918fd90 split encoding part of dsputil_mmx into its own file aurel parents: 6391 diff changeset	2331
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2332 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2333 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2334 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2335 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2336 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2337
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2338 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2339 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2340 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2341 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2342
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2343 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2344 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2345 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2346 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2347
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2348 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2349 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2350 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2351 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2352
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2353 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2354 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2355 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2356 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2357
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2358 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2359 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2360 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2361 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2362
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2363 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2364 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2365 #endif
0 986e461dc072 Initial revision glantau parents: diff changeset	2366 }

Mercurial > libavcodec.hg

annotate i386/dsputil_mmx.c @ 6408:7af599600f2d libavcodec