libavcodec.hg: i386/dsputil

annotate i386/dsputil_mmx.c @ 8340:834a77844ba3 libavcodec

ARM: NEON optimised h264_idct_dc_add

author	mru
date	Mon, 15 Dec 2008 22:12:54 +0000
parents	08b0f63a91c5
children	de2509cf3c44

rev	line source
0 986e461dc072 Initial revision glantau parents: diff changeset	1 /*
986e461dc072 Initial revision glantau parents: diff changeset	2 * MMX optimized DSP utils
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739 07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise michael parents: 1729 diff changeset	4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
0 986e461dc072 Initial revision glantau parents: diff changeset	5 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	8 * FFmpeg is free software; you can redistribute it and/or
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	9 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 422 diff changeset	10 * License as published by the Free Software Foundation; either
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	11 * version 2.1 of the License, or (at your option) any later version.
0 986e461dc072 Initial revision glantau parents: diff changeset	12 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	13 * FFmpeg is distributed in the hope that it will be useful,
0 986e461dc072 Initial revision glantau parents: diff changeset	14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change glantau parents: 422 diff changeset	16 * Lesser General Public License for more details.
0 986e461dc072 Initial revision glantau parents: diff changeset	17 *
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	18 * You should have received a copy of the GNU Lesser General Public
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	19 * License along with FFmpeg; if not, write to the Free Software
3036 0b546eab515d Update licensing information: The FSF changed postal address. diego parents: 2979 diff changeset	20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0 986e461dc072 Initial revision glantau parents: diff changeset	21 *
986e461dc072 Initial revision glantau parents: diff changeset	22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision glantau parents: diff changeset	23 */
986e461dc072 Initial revision glantau parents: diff changeset	24
6763 f7cbb7733146 Use full path for #includes from another directory. diego parents: 6755 diff changeset	25 #include "libavutil/x86_cpu.h"
f7cbb7733146 Use full path for #includes from another directory. diego parents: 6755 diff changeset	26 #include "libavcodec/dsputil.h"
f7cbb7733146 Use full path for #includes from another directory. diego parents: 6755 diff changeset	27 #include "libavcodec/h263.h"
f7cbb7733146 Use full path for #includes from another directory. diego parents: 6755 diff changeset	28 #include "libavcodec/mpegvideo.h"
f7cbb7733146 Use full path for #includes from another directory. diego parents: 6755 diff changeset	29 #include "libavcodec/simple_idct.h"
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	30 #include "dsputil_mmx.h"
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	31 #include "mmx.h"
5014 42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c aurel parents: 5010 diff changeset	32 #include "vp3dsp_mmx.h"
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c aurel parents: 5010 diff changeset	33 #include "vp3dsp_sse2.h"
6585 0ec61ed36c29 Add a header file to declare Xvid IDCT functions. diego parents: 6557 diff changeset	34 #include "idct_xvid.h"
0 986e461dc072 Initial revision glantau parents: diff changeset	35
1729 a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	36 //#undef NDEBUG
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	37 //#include <assert.h>
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	38
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	39 int mm_flags; /* multimedia extension flags */
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	40
0 986e461dc072 Initial revision glantau parents: diff changeset	41 /* pixel operations */
5947 37a03989871b use ff_ prefix for extern vars aurel parents: 5946 diff changeset	42 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
37a03989871b use ff_ prefix for extern vars aurel parents: 5946 diff changeset	43 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	44
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	45 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	46 {0x8000000080000000ULL, 0x8000000080000000ULL};
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	47
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	48 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	49 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
8317 08b0f63a91c5 avoid POSIX reserved _t suffix aurel parents: 8288 diff changeset	50 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
08b0f63a91c5 avoid POSIX reserved _t suffix aurel parents: 8288 diff changeset	51 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	52 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
8317 08b0f63a91c5 avoid POSIX reserved _t suffix aurel parents: 8288 diff changeset	53 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
8317 08b0f63a91c5 avoid POSIX reserved _t suffix aurel parents: 8288 diff changeset	55 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
08b0f63a91c5 avoid POSIX reserved _t suffix aurel parents: 8288 diff changeset	56 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	57 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
6333 beb52d4a5efe constant was excessively aligned lorenm parents: 6331 diff changeset	60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	61 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	62
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	63 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
8032 0839f325edb5 MMX VP3 Loop Filter conrad parents: 8031 diff changeset	66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
8032 0839f325edb5 MMX VP3 Loop Filter conrad parents: 8031 diff changeset	68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
5946 55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	69 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	70 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	71
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	72 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
55251379b5b1 make ff_p* vars extern so that they can be used in various _mmx.c files aurel* parents: 5933 diff changeset	73 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
5737 efa3c1f9259a sse2 version of compute_autocorr(). lorenm parents: 5602 diff changeset	74
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	75 #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	76 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	77
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	78 #define MOVQ_BFE(regd) \
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	79 __asm__ volatile ( \
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	80 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	81 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	82
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	83 #ifndef PIC
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	84 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	85 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	86 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	87 // for a shared library it is better to build the constants in registers like this than to load them from memory
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	88 // pcmpeqd -> -1
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	89 #define MOVQ_BONE(regd) \
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	90 __asm__ volatile ( \
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	91 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	92 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	93 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	94
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	95 #define MOVQ_WTWO(regd) \
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	96 __asm__ volatile ( \
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	97 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	98 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	99 "psllw $1, %%" #regd " \n\t"::)
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	100
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	101 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	102
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	103 // using regr as temporary and for the output result
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	104 // first argument is unmodified and second is trashed
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	105 // regfe is supposed to contain 0xfefefefefefefefe
d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	106 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	107 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	108 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	109 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	110 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	111 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	112 "paddb " #regb ", " #regr " \n\t"
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	113
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	114 #define PAVGB_MMX(rega, regb, regr, regfe) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	115 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	116 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	117 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	118 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	119 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	120 "psubb " #regb ", " #regr " \n\t"
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	121
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	122 // mm6 is supposed to contain 0xfefefefefefefefe
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	123 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	124 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	125 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	126 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	127 "pand " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	128 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	129 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	130 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	131 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	132 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	133 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	134 "paddb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	135 "paddb " #regd ", " #regp " \n\t"
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	136
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	137 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	138 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	139 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	140 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	141 "por " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	142 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	143 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	144 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	145 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	146 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	147 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	148 "psubb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	149 "psubb " #regd ", " #regp " \n\t"
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	150
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	151 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	152 /* MMX no rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	153 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	154 #define SET_RND MOVQ_WONE
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	155 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	156 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	157
8073 915bc657348f Rename template included sources from .h to _template.c. flameeyes parents: 8041 diff changeset	158 #include "dsputil_mmx_rnd_template.c"
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	159
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	160 #undef DEF
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	161 #undef SET_RND
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	162 #undef PAVGBP
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	163 #undef PAVGB
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	164 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	165 /* MMX rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	166
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	167 #define DEF(x, y) x ## _ ## y ##_mmx
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	168 #define SET_RND MOVQ_WTWO
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	169 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	170 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	171
8073 915bc657348f Rename template included sources from .h to _template.c. flameeyes parents: 8041 diff changeset	172 #include "dsputil_mmx_rnd_template.c"
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	173
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	174 #undef DEF
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	175 #undef SET_RND
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	176 #undef PAVGBP
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	177 #undef PAVGB
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	178
0 986e461dc072 Initial revision glantau parents: diff changeset	179 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	180 /* 3Dnow specific */
986e461dc072 Initial revision glantau parents: diff changeset	181
986e461dc072 Initial revision glantau parents: diff changeset	182 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision glantau parents: diff changeset	183 #define PAVGB "pavgusb"
986e461dc072 Initial revision glantau parents: diff changeset	184
8073 915bc657348f Rename template included sources from .h to _template.c. flameeyes parents: 8041 diff changeset	185 #include "dsputil_mmx_avg_template.c"
0 986e461dc072 Initial revision glantau parents: diff changeset	186
986e461dc072 Initial revision glantau parents: diff changeset	187 #undef DEF
986e461dc072 Initial revision glantau parents: diff changeset	188 #undef PAVGB
986e461dc072 Initial revision glantau parents: diff changeset	189
986e461dc072 Initial revision glantau parents: diff changeset	190 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	191 /* MMX2 specific */
986e461dc072 Initial revision glantau parents: diff changeset	192
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 342 diff changeset	193 #define DEF(x) x ## _mmx2
0 986e461dc072 Initial revision glantau parents: diff changeset	194
986e461dc072 Initial revision glantau parents: diff changeset	195 /* Introduced only in MMX2 set */
986e461dc072 Initial revision glantau parents: diff changeset	196 #define PAVGB "pavgb"
986e461dc072 Initial revision glantau parents: diff changeset	197
8073 915bc657348f Rename template included sources from .h to _template.c. flameeyes parents: 8041 diff changeset	198 #include "dsputil_mmx_avg_template.c"
0 986e461dc072 Initial revision glantau parents: diff changeset	199
986e461dc072 Initial revision glantau parents: diff changeset	200 #undef DEF
986e461dc072 Initial revision glantau parents: diff changeset	201 #undef PAVGB
986e461dc072 Initial revision glantau parents: diff changeset	202
6327 5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	203 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	204 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
6321 57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	205 #define put_pixels16_mmx2 put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	206 #define put_pixels8_mmx2 put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	207 #define put_pixels4_mmx2 put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	208 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	209 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	210 #define put_pixels16_3dnow put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	211 #define put_pixels8_3dnow put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	212 #define put_pixels4_3dnow put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	213 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	214 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	215
0 986e461dc072 Initial revision glantau parents: diff changeset	216 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	217 /* standard MMX */
986e461dc072 Initial revision glantau parents: diff changeset	218
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	219 void put_pixels_clamped_mmx(const DCTELEM block, uint8_t pixels, int line_size)
0 986e461dc072 Initial revision glantau parents: diff changeset	220 {
986e461dc072 Initial revision glantau parents: diff changeset	221 const DCTELEM *p;
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	222 uint8_t *pix;
0 986e461dc072 Initial revision glantau parents: diff changeset	223
986e461dc072 Initial revision glantau parents: diff changeset	224 /* read the pixels */
986e461dc072 Initial revision glantau parents: diff changeset	225 p = block;
986e461dc072 Initial revision glantau parents: diff changeset	226 pix = pixels;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	227 /* unrolled loop */
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	228 __asm__ volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	229 "movq %3, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	230 "movq 8%3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	231 "movq 16%3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	232 "movq 24%3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	233 "movq 32%3, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	234 "movq 40%3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	235 "movq 48%3, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	236 "movq 56%3, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	237 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	238 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	239 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	240 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	241 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	242 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	243 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	244 "movq %%mm6, (%0, %2) \n\t"
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	245 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size3), "m"(p)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	246 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	247 pix += line_size*4;
986e461dc072 Initial revision glantau parents: diff changeset	248 p += 32;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	249
    // An exact copy of the asm block above (with its "m" operand) makes the
    // compiler generate some very strange code,
    // so this second block passes p in a register ("r") instead.
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	253 __asm__ volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	254 "movq (%3), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	255 "movq 8(%3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	256 "movq 16(%3), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	257 "movq 24(%3), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	258 "movq 32(%3), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	259 "movq 40(%3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	260 "movq 48(%3), %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	261 "movq 56(%3), %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	262 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	263 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	264 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	265 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	266 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	267 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	268 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	269 "movq %%mm6, (%0, %2) \n\t"
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	270 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	271 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	272 }
986e461dc072 Initial revision glantau parents: diff changeset	273
3089 072dbc669253 MSVC-compatible __align8/__align16 declaration diego parents: 3036 diff changeset	274 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
1985 b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler melanson parents: 1984 diff changeset	275 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler melanson parents: 1984 diff changeset	276
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	277 void put_signed_pixels_clamped_mmx(const DCTELEM block, uint8_t pixels, int line_size)
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	278 {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	279 int i;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	280
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	281 movq_m2r(*vector128, mm1);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	282 for (i = 0; i < 8; i++) {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	283 movq_m2r(*(block), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	284 packsswb_m2r(*(block + 4), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	285 block += 8;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	286 paddb_r2r(mm1, mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	287 movq_r2m(mm0, *pixels);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	288 pixels += line_size;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	289 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	290 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	291
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	292 void add_pixels_clamped_mmx(const DCTELEM block, uint8_t pixels, int line_size)
0 986e461dc072 Initial revision glantau parents: diff changeset	293 {
986e461dc072 Initial revision glantau parents: diff changeset	294 const DCTELEM *p;
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	295 uint8_t *pix;
0 986e461dc072 Initial revision glantau parents: diff changeset	296 int i;
986e461dc072 Initial revision glantau parents: diff changeset	297
986e461dc072 Initial revision glantau parents: diff changeset	298 /* read the pixels */
986e461dc072 Initial revision glantau parents: diff changeset	299 p = block;
986e461dc072 Initial revision glantau parents: diff changeset	300 pix = pixels;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	301 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	302 i = 4;
342 8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler kabi parents: 324 diff changeset	303 do {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	304 __asm__ volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	305 "movq (%2), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	306 "movq 8(%2), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	307 "movq 16(%2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	308 "movq 24(%2), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	309 "movq %0, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	310 "movq %1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	311 "movq %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	312 "punpcklbw %%mm7, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	313 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	314 "paddsw %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	315 "paddsw %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	316 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	317 "punpcklbw %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	318 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	319 "paddsw %%mm6, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	320 "paddsw %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	321 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	322 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	323 "movq %%mm0, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	324 "movq %%mm2, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	325 :"+m"(pix), "+m"((pix+line_size))
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	326 :"r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	327 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	328 pix += line_size*2;
986e461dc072 Initial revision glantau parents: diff changeset	329 p += 16;
342 8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler kabi parents: 324 diff changeset	330 } while (--i);
0 986e461dc072 Initial revision glantau parents: diff changeset	331 }
986e461dc072 Initial revision glantau parents: diff changeset	332
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	333 static void put_pixels4_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	334 {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	335 __asm__ volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	336 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	337 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	338 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	339 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	340 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	341 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	342 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	343 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	344 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	345 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	346 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	347 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	348 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	349 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	350 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	351 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	352 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	353 : "+g"(h), "+r" (pixels), "+r" (block)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	354 : "r"((x86_reg)line_size)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	355 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	356 );
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	357 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	358
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	359 static void put_pixels8_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	360 {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	361 __asm__ volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	362 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	363 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	364 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	365 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	366 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	367 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	368 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	369 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	370 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	371 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	372 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	373 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	374 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	375 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	376 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	377 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	378 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	379 : "+g"(h), "+r" (pixels), "+r" (block)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	380 : "r"((x86_reg)line_size)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	381 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	382 );
0 986e461dc072 Initial revision glantau parents: diff changeset	383 }
986e461dc072 Initial revision glantau parents: diff changeset	384
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	385 static void put_pixels16_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	386 {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	387 __asm__ volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	388 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	389 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	390 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	391 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	392 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	393 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	394 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	395 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	396 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	397 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	398 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	399 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	400 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	401 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	402 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	403 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	404 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	405 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	406 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	407 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	408 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	409 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	410 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	411 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	412 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	413 : "+g"(h), "+r" (pixels), "+r" (block)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	414 : "r"((x86_reg)line_size)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	415 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	416 );
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	417 }
45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	418
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	419 static void put_pixels16_sse2(uint8_t block, const uint8_t pixels, int line_size, int h)
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	420 {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	421 __asm__ volatile(
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	422 "1: \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	423 "movdqu (%1), %%xmm0 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	424 "movdqu (%1,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	425 "movdqu (%1,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	426 "movdqu (%1,%4), %%xmm3 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	427 "movdqa %%xmm0, (%2) \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	428 "movdqa %%xmm1, (%2,%3) \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	429 "movdqa %%xmm2, (%2,%3,2) \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	430 "movdqa %%xmm3, (%2,%4) \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	431 "subl $4, %0 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	432 "lea (%1,%3,4), %1 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	433 "lea (%2,%3,4), %2 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	434 "jnz 1b \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	435 : "+g"(h), "+r" (pixels), "+r" (block)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	436 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	437 : "memory"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	438 );
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	439 }
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	440
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	441 static void avg_pixels16_sse2(uint8_t block, const uint8_t pixels, int line_size, int h)
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	442 {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	443 __asm__ volatile(
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	444 "1: \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	445 "movdqu (%1), %%xmm0 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	446 "movdqu (%1,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	447 "movdqu (%1,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	448 "movdqu (%1,%4), %%xmm3 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	449 "pavgb (%2), %%xmm0 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	450 "pavgb (%2,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	451 "pavgb (%2,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	452 "pavgb (%2,%4), %%xmm3 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	453 "movdqa %%xmm0, (%2) \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	454 "movdqa %%xmm1, (%2,%3) \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	455 "movdqa %%xmm2, (%2,%3,2) \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	456 "movdqa %%xmm3, (%2,%4) \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	457 "subl $4, %0 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	458 "lea (%1,%3,4), %1 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	459 "lea (%2,%3,4), %2 \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	460 "jnz 1b \n\t"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	461 : "+g"(h), "+r" (pixels), "+r" (block)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	462 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	463 : "memory"
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	464 );
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	465 }
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	466
8288 800444234375 clear_block mmx lorenm parents: 8250 diff changeset	467 #define CLEAR_BLOCKS(name,n) \
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	468 static void name(DCTELEM *blocks)\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	469 {\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	470 __asm__ volatile(\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	471 "pxor %%mm7, %%mm7 \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	472 "mov %1, %%"REG_a" \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	473 "1: \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	474 "movq %%mm7, (%0, %%"REG_a") \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	475 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	476 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	477 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	478 "add $32, %%"REG_a" \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	479 " js 1b \n\t"\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	480 : : "r" (((uint8_t )blocks)+128n),\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	481 "i" (-128*n)\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	482 : "%"REG_a\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	483 );\
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	484 }
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	485 CLEAR_BLOCKS(clear_blocks_mmx, 6)
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	486 CLEAR_BLOCKS(clear_block_mmx, 1)
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	487
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	488 static void clear_block_sse(DCTELEM *block)
296 c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	489 {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	490 __asm__ volatile(
8288 800444234375 clear_block mmx lorenm parents: 8250 diff changeset	491 "xorps %%xmm0, %%xmm0 \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	492 "movaps %%xmm0, (%0) \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	493 "movaps %%xmm0, 16(%0) \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	494 "movaps %%xmm0, 32(%0) \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	495 "movaps %%xmm0, 48(%0) \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	496 "movaps %%xmm0, 64(%0) \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	497 "movaps %%xmm0, 80(%0) \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	498 "movaps %%xmm0, 96(%0) \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	499 "movaps %%xmm0, 112(%0) \n"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	500 :: "r"(block)
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	501 : "memory"
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	502 );
296 c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	503 }
c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	504
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	505 static void add_bytes_mmx(uint8_t dst, uint8_t src, int w){
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	506 x86_reg i=0;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	507 __asm__ volatile(
7087 9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16 reimar parents: 6763 diff changeset	508 "jmp 2f \n\t"
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	509 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	510 "movq (%1, %0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	511 "movq (%2, %0), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	512 "paddb %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	513 "movq %%mm1, (%2, %0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	514 "movq 8(%1, %0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	515 "movq 8(%2, %0), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	516 "paddb %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	517 "movq %%mm1, 8(%2, %0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	518 "add $16, %0 \n\t"
7087 9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16 reimar parents: 6763 diff changeset	519 "2: \n\t"
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	520 "cmp %3, %0 \n\t"
7087 9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16 reimar parents: 6763 diff changeset	521 " js 1b \n\t"
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	522 : "+r" (i)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	523 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	524 );
725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	525 for(; i<w; i++)
725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	526 dst[i+0] += src[i+0];
725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	527 }
725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	528
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	529 static void add_bytes_l2_mmx(uint8_t dst, uint8_t src1, uint8_t *src2, int w){
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	530 x86_reg i=0;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	531 __asm__ volatile(
7087 9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16 reimar parents: 6763 diff changeset	532 "jmp 2f \n\t"
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	533 "1: \n\t"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	534 "movq (%2, %0), %%mm0 \n\t"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	535 "movq 8(%2, %0), %%mm1 \n\t"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	536 "paddb (%3, %0), %%mm0 \n\t"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	537 "paddb 8(%3, %0), %%mm1 \n\t"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	538 "movq %%mm0, (%1, %0) \n\t"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	539 "movq %%mm1, 8(%1, %0) \n\t"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	540 "add $16, %0 \n\t"
7087 9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16 reimar parents: 6763 diff changeset	541 "2: \n\t"
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	542 "cmp %4, %0 \n\t"
7087 9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16 reimar parents: 6763 diff changeset	543 " js 1b \n\t"
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	544 : "+r" (i)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	545 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	546 );
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	547 for(; i<w; i++)
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	548 dst[i] = src1[i] + src2[i];
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	549 }
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	550
/*
 * Shared MMX core of the H.263 loop filter, used inside an __asm__ statement.
 *
 * Operands expected from the enclosing asm statement:
 *   %0 .. %3  four 8-byte memory operands, the four lines across the edge
 *   %4        2*strength (only its low byte is used, replicated to 8 bytes)
 *   %5        ff_pb_FC, applied as a byte mask before the final >>2
 *
 * The macro only reads its operands. On exit the filtered lines are left in
 * registers for the caller to store:
 *   mm5 = new %0,  mm3 = new %1,  mm4 = new %2,  mm6 = new %3
 * mm0-mm7 are all overwritten.
 */
#define H263_LOOP_FILTER \
        /* mm0/mm1 = %0 - %3 as 16-bit words (low/high 4 bytes) */\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq %0, %%mm0 \n\t"\
        "movq %0, %%mm1 \n\t"\
        "movq %3, %%mm2 \n\t"\
        "movq %3, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm3, %%mm1 \n\t"\
        /* mm4/mm5 = 4*(%2 - %1) + (%0 - %3) as 16-bit words */\
        "movq %1, %%mm2 \n\t"\
        "movq %1, %%mm3 \n\t"\
        "movq %2, %%mm4 \n\t"\
        "movq %2, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        "psubw %%mm3, %%mm5 \n\t"\
        "psllw $2, %%mm4 \n\t"\
        "psllw $2, %%mm5 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm1, %%mm5 \n\t"\
        /* split into sign masks (mm6/mm7) and magnitude, magnitude >>= 3 */\
        "pxor %%mm6, %%mm6 \n\t"\
        "pcmpgtw %%mm4, %%mm6 \n\t"\
        "pcmpgtw %%mm5, %%mm7 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        "pxor %%mm7, %%mm5 \n\t"\
        "psubw %%mm6, %%mm4 \n\t"\
        "psubw %%mm7, %%mm5 \n\t"\
        "psrlw $3, %%mm4 \n\t"\
        "psrlw $3, %%mm5 \n\t"\
        /* back to bytes: mm4 = magnitudes, mm6 = per-byte sign mask */\
        "packuswb %%mm5, %%mm4 \n\t"\
        "packsswb %%mm7, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        /* replicate the low byte of %4 into all 8 bytes of mm2 */\
        "movd %4, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        /* mm2 = sat(%4 - m) - sat(sat(%4 - m) - m), m = magnitude in mm4 */\
        "psubusb %%mm4, %%mm2 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "psubusb %%mm4, %%mm3 \n\t"\
        "psubb %%mm3, %%mm2 \n\t"\
        /* add mm2 to %1 and subtract it from %2 with unsigned saturation;   */\
        /* the xor with the sign mask before and after flips the direction    */\
        /* for bytes whose difference was negative                            */\
        "movq %1, %%mm3 \n\t"\
        "movq %2, %%mm4 \n\t"\
        "pxor %%mm6, %%mm3 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        "paddusb %%mm2, %%mm3 \n\t"\
        "psubusb %%mm2, %%mm4 \n\t"\
        "pxor %%mm6, %%mm3 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        /* correction for the outer lines: min(|%0 - %3|, 2*mm2), masked with */\
        /* %5 and shifted right by 2, then given back the sign of (%0 - %3)   */\
        "paddusb %%mm2, %%mm2 \n\t"\
        "packsswb %%mm1, %%mm0 \n\t"\
        "pcmpgtb %%mm0, %%mm7 \n\t"\
        "pxor %%mm7, %%mm0 \n\t"\
        "psubb %%mm7, %%mm0 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "psubusb %%mm2, %%mm0 \n\t"\
        "psubb %%mm0, %%mm1 \n\t"\
        "pand %5, %%mm1 \n\t"\
        "psrlw $2, %%mm1 \n\t"\
        "pxor %%mm7, %%mm1 \n\t"\
        "psubb %%mm7, %%mm1 \n\t"\
        /* mm5 = %0 - correction, mm6 = %3 + correction */\
        "movq %0, %%mm5 \n\t"\
        "movq %3, %%mm6 \n\t"\
        "psubb %%mm1, %%mm5 \n\t"\
        "paddb %%mm1, %%mm6 \n\t"
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	621
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	622 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	623 if(ENABLE_ANY_H263) {
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	624 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	625
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	626 __asm__ volatile(
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	627
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	628 H263_LOOP_FILTER
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	629
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	630 "movq %%mm3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	631 "movq %%mm4, %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	632 "movq %%mm5, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	633 "movq %%mm6, %3 \n\t"
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	634 : "+m" ((uint64_t)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	635 "+m" ((uint64_t)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	636 "+m" ((uint64_t)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	637 "+m" ((uint64_t)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	638 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	639 );
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	640 }
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	641 }
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	642
/**
 * Transpose a 4x4 block of bytes.
 * Reads 4 bytes from each of the rows src + k*src_stride (k = 0..3) and
 * writes 4 bytes to each of the rows dst + k*dst_stride, so that output
 * row k holds byte k of every input row.
 *
 * @param dst        destination block
 * @param src        source block
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd %4, %%mm0 \n\t"
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        /* interleave bytes of rows 0/1 and rows 2/3 ... */
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        /* ... then interleave the resulting words: mm0 = columns 0,1  mm1 = columns 2,3 */
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"

        /* every row is accessed as one 32-bit memory operand */
        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        : "m" (*(uint32_t*)(src + 0*src_stride)),
          "m" (*(uint32_t*)(src + 1*src_stride)),
          "m" (*(uint32_t*)(src + 2*src_stride)),
          "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	671
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	672 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	673 if(ENABLE_ANY_H263) {
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	674 const int strength= ff_h263_loop_filter_strength[qscale];
6181 f3da7b2592aa Use DECLARE_ALIGNED reimar parents: 6135 diff changeset	675 DECLARE_ALIGNED(8, uint64_t, temp[4]);
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	676 uint8_t btemp= (uint8_t)temp;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	677
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	678 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	679
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	680 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	681 transpose4x4(btemp+4, src + 4*stride, 8, stride);
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	682 __asm__ volatile(
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	683 H263_LOOP_FILTER // 5 3 4 6
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	684
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	685 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	686 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	687 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	688 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	689 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	690 );
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	691
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	692 __asm__ volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	693 "movq %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	694 "movq %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	695 "punpcklbw %%mm3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	696 "punpcklbw %%mm6, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	697 "punpckhbw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	698 "punpckhbw %%mm6, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	699 "movq %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	700 "movq %%mm1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	701 "punpcklwd %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	702 "punpcklwd %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	703 "punpckhwd %%mm4, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	704 "punpckhwd %%mm0, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	705 "movd %%mm5, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	706 "punpckhdq %%mm5, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	707 "movd %%mm5, (%0,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	708 "movd %%mm3, (%0,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	709 "punpckhdq %%mm3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	710 "movd %%mm3, (%0,%3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	711 "movd %%mm1, (%1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	712 "punpckhdq %%mm1, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	713 "movd %%mm1, (%1,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	714 "movd %%mm6, (%1,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	715 "punpckhdq %%mm6, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	716 "movd %%mm6, (%1,%3) \n\t"
2505 86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	717 :: "r" (src),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	718 "r" (src + 4*stride),
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	719 "r" ((x86_reg) stride ),
33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	720 "r" ((x86_reg)(3*stride))
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	721 );
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	722 }
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	723 }
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	724
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	725 /* draw the edges of width 'w' of an image of size width, height
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	726 this mmx version can only handle w==8 || w==16 */
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	727 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	728 {
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	729 uint8_t ptr, last_line;
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	730 int i;
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	731
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	732 last_line = buf + (height - 1) * wrap;
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	733 /* left and right */
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	734 ptr = buf;
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	735 if(w==8)
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	736 {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	737 __asm__ volatile(
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	738 "1: \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	739 "movd (%0), %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	740 "punpcklbw %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	741 "punpcklwd %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	742 "punpckldq %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	743 "movq %%mm0, -8(%0) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	744 "movq -8(%0, %2), %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	745 "punpckhbw %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	746 "punpckhwd %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	747 "punpckhdq %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	748 "movq %%mm1, (%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	749 "add %1, %0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	750 "cmp %3, %0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	751 " jb 1b \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	752 : "+r" (ptr)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	753 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	754 );
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	755 }
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	756 else
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	757 {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	758 __asm__ volatile(
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	759 "1: \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	760 "movd (%0), %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	761 "punpcklbw %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	762 "punpcklwd %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	763 "punpckldq %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	764 "movq %%mm0, -8(%0) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	765 "movq %%mm0, -16(%0) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	766 "movq -8(%0, %2), %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	767 "punpckhbw %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	768 "punpckhwd %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	769 "punpckhdq %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	770 "movq %%mm1, (%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	771 "movq %%mm1, 8(%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	772 "add %1, %0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	773 "cmp %3, %0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	774 " jb 1b \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	775 : "+r" (ptr)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	776 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	777 );
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	778 }
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	779
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	780 for(i=0;i<w;i+=4) {
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	781 /* top and bottom (and hopefully also the corners) */
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	782 ptr= buf - (i + 1) * wrap - w;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	783 __asm__ volatile(
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	784 "1: \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	785 "movq (%1, %0), %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	786 "movq %%mm0, (%0) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	787 "movq %%mm0, (%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	788 "movq %%mm0, (%0, %2, 2) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	789 "movq %%mm0, (%0, %3) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	790 "add $8, %0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	791 "cmp %4, %0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	792 " jb 1b \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	793 : "+r" (ptr)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	794 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	795 );
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	796 ptr= last_line + (i + 1) * wrap - w;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	797 __asm__ volatile(
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	798 "1: \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	799 "movq (%1, %0), %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	800 "movq %%mm0, (%0) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	801 "movq %%mm0, (%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	802 "movq %%mm0, (%0, %2, 2) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	803 "movq %%mm0, (%0, %3) \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	804 "add $8, %0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	805 "cmp %4, %0 \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	806 " jb 1b \n\t"
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	807 : "+r" (ptr)
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	808 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	809 );
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	810 }
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	811 }
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	812
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	813 #define PAETH(cpu, abs3)\
7460 2ced44037814 Mark add_png_paeth_prediction_* functions which are only used within this file diego parents: 7286 diff changeset	814 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	815 {\
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	816 x86_reg i = -bpp;\
33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	817 x86_reg end = w-3;\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	818 __asm__ volatile(\
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	819 "pxor %%mm7, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	820 "movd (%1,%0), %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	821 "movd (%2,%0), %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	822 "punpcklbw %%mm7, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	823 "punpcklbw %%mm7, %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	824 "add %4, %0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	825 "1: \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	826 "movq %%mm1, %%mm2 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	827 "movd (%2,%0), %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	828 "movq %%mm2, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	829 "punpcklbw %%mm7, %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	830 "movq %%mm2, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	831 "psubw %%mm1, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	832 "psubw %%mm0, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	833 "movq %%mm3, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	834 "paddw %%mm4, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	835 abs3\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	836 "movq %%mm4, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	837 "pminsw %%mm5, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	838 "pcmpgtw %%mm6, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	839 "pcmpgtw %%mm5, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	840 "movq %%mm4, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	841 "pand %%mm3, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	842 "pandn %%mm3, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	843 "pandn %%mm0, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	844 "movd (%3,%0), %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	845 "pand %%mm1, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	846 "pand %%mm4, %%mm2 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	847 "punpcklbw %%mm7, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	848 "movq %6, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	849 "paddw %%mm6, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	850 "paddw %%mm2, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	851 "paddw %%mm3, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	852 "pand %%mm5, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	853 "movq %%mm0, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	854 "packuswb %%mm3, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	855 "movd %%mm3, (%1,%0) \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	856 "add %4, %0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	857 "cmp %5, %0 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	858 "jle 1b \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	859 :"+r"(i)\
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	860 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	861 "m"(ff_pw_255)\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	862 :"memory"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	863 );\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	864 }
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	865
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	866 #define ABS3_MMX2\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	867 "psubw %%mm5, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	868 "pmaxsw %%mm7, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	869 "pxor %%mm6, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	870 "pxor %%mm7, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	871 "psubw %%mm3, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	872 "psubw %%mm4, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	873 "pmaxsw %%mm6, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	874 "pmaxsw %%mm7, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	875 "pxor %%mm7, %%mm7 \n"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	876
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	877 #define ABS3_SSSE3\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	878 "pabsw %%mm3, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	879 "pabsw %%mm4, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	880 "pabsw %%mm5, %%mm5 \n"
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	881
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	882 PAETH(mmx2, ABS3_MMX2)
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	883 #ifdef HAVE_SSSE3
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	884 PAETH(ssse3, ABS3_SSSE3)
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	885 #endif
0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	886
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	887 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	888 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	889 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	890 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	891 "movq "#in7", " #m3 " \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	892 "movq "#in0", %%mm5 \n\t" /* D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	893 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	894 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	895 "movq "#in1", %%mm5 \n\t" /* C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	896 "movq "#in2", %%mm6 \n\t" /* B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	897 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	898 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	899 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	900 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	901 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	902 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	903 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	904 "psraw $5, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	905 "packuswb %%mm5, %%mm5 \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	906 OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	907
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	908 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	909 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	910 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	911 \
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	912 __asm__ volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	913 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	914 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	915 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	916 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	917 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	918 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	919 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	920 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	921 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	922 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	923 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	924 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	925 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	926 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	927 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	928 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	929 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	930 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	931 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	932 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	933 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	934 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	935 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	936 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	937 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	938 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	939 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	940 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	941 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	942 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	943 "movq %%mm0, %5 \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	944 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	945 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	946 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	947 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	948 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	949 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	950 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	951 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	952 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	953 "paddw %%mm0, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	954 "paddw %%mm5, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	955 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	956 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	957 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	958 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	959 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	960 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	961 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	962 "paddw %%mm2, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	963 "paddw %%mm6, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	964 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	965 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	966 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	967 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	968 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	969 "movq %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	970 "packuswb %%mm3, %%mm1 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	971 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	972 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	973 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	974 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	975 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	976 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	977 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	978 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	979 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	980 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	981 "paddw %%mm1, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	982 "paddw %%mm4, %%mm0 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	983 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	984 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	985 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	986 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	987 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	988 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	989 "paddw %%mm3, %%mm2 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	990 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	991 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	992 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	993 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	994 "paddw %%mm2, %%mm6 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	995 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	996 "paddw %6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	997 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	998 "psraw $5, %%mm0 \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	999 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1000 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1001 "paddw %%mm5, %%mm3 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1002 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1003 "paddw %%mm4, %%mm6 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1004 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1005 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1006 "paddw %%mm1, %%mm4 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1007 "paddw %%mm2, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1008 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1009 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1010 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1011 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1012 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1013 "paddw %6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1014 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1015 "psraw $5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1016 "packuswb %%mm4, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1017 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1018 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1019 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1020 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1021 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1022 " jnz 1b \n\t"\
6513 713c4fd84e0b Hardcode register to prevent aparent miscompilation. michael parents: 6512 diff changeset	1023 : "+a"(src), "+c"(dst), "+D"(h)\
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	1024 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1025 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1026 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1027 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1028 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1029 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1030 int i;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1031 int16_t temp[16];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1032 /* quick HACK, XXX FIXME MUST be optimized */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1033 for(i=0; i<h; i++)\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1034 {\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1035 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1036 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1037 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1038 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1039 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1040 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1041 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1042 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1043 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1044 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1045 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1046 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1047 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1048 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1049 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1050 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1051 __asm__ volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1052 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1053 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1054 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1055 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1056 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1057 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1058 "packuswb %%mm1, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1059 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1060 "movq 16(%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1061 "movq 24(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1062 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1063 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1064 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1065 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1066 "packuswb %%mm1, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1067 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1068 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1069 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1070 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1071 dst+=dstStride;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1072 src+=srcStride;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1073 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1074 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1075 \
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	1076 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1077 __asm__ volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1078 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1079 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1080 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1081 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1082 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1083 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1084 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1085 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1086 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1087 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1088 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1089 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1090 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1091 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1092 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1093 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1094 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1095 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1096 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1097 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1098 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1099 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1100 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1101 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1102 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1103 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1104 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
6512 33ac9c5524cc remove unused temp michael parents: 6437 diff changeset	1105 "paddw %5, %%mm6 \n\t"\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1106 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1107 "psraw $5, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1108 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1109 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1110 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1111 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1112 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1113 "paddw %%mm5, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1114 "paddw %%mm6, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1115 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1116 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1117 "paddw %%mm6, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1118 "paddw %%mm5, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1119 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1120 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1121 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1122 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1123 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
6512 33ac9c5524cc remove unused temp michael parents: 6437 diff changeset	1124 "paddw %5, %%mm1 \n\t"\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1125 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1126 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1127 "packuswb %%mm3, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1128 OP_MMX2(%%mm0, (%1), %%mm4, q)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1129 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1130 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1131 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1132 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1133 " jnz 1b \n\t"\
6513 713c4fd84e0b Hardcode register to prevent aparent miscompilation. michael parents: 6512 diff changeset	1134 : "+a"(src), "+c"(dst), "+d"(h)\
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	1135 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1136 : "memory"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1137 );\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1138 }\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1139 \
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1140 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1141 int i;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1142 int16_t temp[8];\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1143 /* quick HACK, XXX FIXME MUST be optimized */\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1144 for(i=0; i<h; i++)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1145 {\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1146 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1147 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1148 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1149 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1150 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1151 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1152 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1153 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1154 __asm__ volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1155 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1156 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1157 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1158 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1159 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1160 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1161 "packuswb %%mm1, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1162 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1163 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1164 :"memory"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1165 );\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1166 dst+=dstStride;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1167 src+=srcStride;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1168 }\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1169 }
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1170
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1171 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1172 \
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1173 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1174 uint64_t temp[17*4];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1175 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1176 int count= 17;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1177 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1178 /*FIXME unroll */\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1179 __asm__ volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1180 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1181 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1182 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1183 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1184 "movq 8(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1185 "movq 8(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1186 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1187 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1188 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1189 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1190 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1191 "movq %%mm1, 17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1192 "movq %%mm2, 2*17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1193 "movq %%mm3, 3*17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1194 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1195 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1196 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1197 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1198 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	1199 : "r" ((x86_reg)srcStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1200 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1201 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1202 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1203 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1204 count=4;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1205 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1206 /*FIXME reorder for speed */\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1207 __asm__ volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1208 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1209 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1210 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1211 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1212 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1213 "movq 24(%0), %%mm3 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1214 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1215 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1216 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1217 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1218 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1219 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1220 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1221 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1222 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1223 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1224 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1225 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1226 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1227 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1228 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1229 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1230 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1231 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1232 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1233 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1234 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1235 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1236 "add %4, %1 \n\t" \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1237 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1238 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1239 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1240 "add $136, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1241 "add %6, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1242 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1243 " jnz 1b \n\t"\
958 9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped) michaelni parents: 954 diff changeset	1244 \
967 274b518c4ecb PIC / ebx fix michaelni parents: 966 diff changeset	1245 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	1246 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1247 :"memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1248 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1249 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1250 \
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	1251 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	1252 uint64_t temp[9*2];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1253 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1254 int count= 9;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1255 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1256 /*FIXME unroll */\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1257 __asm__ volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1258 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1259 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1260 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1261 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1262 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1263 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1264 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1265 "movq %%mm1, 9*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1266 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1267 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1268 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1269 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1270 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	1271 : "r" ((x86_reg)srcStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1272 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1273 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1274 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1275 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1276 count=2;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1277 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1278 /*FIXME reorder for speed */\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1279 __asm__ volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1280 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1281 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1282 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1283 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1284 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1285 "movq 24(%0), %%mm3 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1286 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1287 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1288 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1289 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1290 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1291 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1292 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1293 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1294 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1295 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1296 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1297 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1298 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1299 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1300 "add $72, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1301 "add %6, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1302 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1303 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1304 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	1305 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	1306 : "r"((x86_reg)dstStride), "r"(2(x86_reg)dstStride), /"m"(ff_pw_20), "m"(ff_pw_3),/ "m"(ROUNDER), "g"(4-6(x86_reg)dstStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1307 : "memory"\
7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	1308 );\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1309 }\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1310 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1311 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t dst, uint8_t src, int stride){\
6321 57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	1312 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1313 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1314 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1315 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1316 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1317 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1318 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1319 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1320 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1321 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1322 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1323 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1324 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1325 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1326 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1327 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1328 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1329 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1330 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1331 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1332 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1333 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1334 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1335 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1336 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1337 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1338 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1339 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1340 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t dst, uint8_t src, int stride){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1341 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1342 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1343 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1344 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1345 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1346 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1347 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1348 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1349 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1350 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1351 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1352 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1353 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1354 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1355 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1356 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1357 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1358 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1359 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1360 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1361 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1362 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1363 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1364 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1365 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1366 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1367 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1368 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1369 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1370 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1371 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1372 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1373 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1374 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1375 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1376 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1377 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1378 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1379 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1380 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1381 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1382 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1383 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1384 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1385 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1386 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1387 uint64_t half[8 + 9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1388 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1389 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1390 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1391 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1392 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1393 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1394 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1395 uint64_t half[8 + 9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1396 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1397 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1398 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1399 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1400 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1401 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1402 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1403 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1404 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1405 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1406 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1407 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1408 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1409 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1410 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1411 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1412 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1413 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1414 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1415 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1416 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1417 uint64_t half[9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1418 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1419 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1420 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1421 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1422 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t dst, uint8_t src, int stride){\
6321 57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel_mc00 lorenm* parents: 6320 diff changeset	1423 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1424 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1425 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1426 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1427 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1428 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1429 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1430 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1431 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1432 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1433 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1434 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1435 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1436 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1437 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1438 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1439 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1440 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1441 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1442 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1443 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1444 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1445 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1446 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1447 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1448 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1449 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1450 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1451 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t dst, uint8_t src, int stride){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1452 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1453 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1454 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1455 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1456 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1457 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1458 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1459 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1460 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1461 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1462 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1463 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1464 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1465 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1466 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1467 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1468 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1469 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1470 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1471 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1472 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1473 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1474 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1475 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1476 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1477 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1478 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1479 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1480 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1481 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1482 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1483 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1484 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1485 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1486 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1487 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1488 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1489 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1490 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1491 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1492 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1493 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1494 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1495 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1496 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1497 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1498 uint64_t half[162 + 172];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1499 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1500 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1501 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1502 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1503 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1504 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1505 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1506 uint64_t half[162 + 172];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1507 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1508 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1509 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1510 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1511 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1512 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1513 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1514 uint64_t half[17*2];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1515 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1516 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1517 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1518 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1519 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1520 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1521 uint64_t half[17*2];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1522 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1523 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	1524 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	1525 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1526 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	1527 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1528 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1529 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1530 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1531 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1532 }
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1533
/* PUT_OP: expands to the asm text "mov<size> a, b" followed by a newline/tab.
 * In the AT&T operand order used by the asm in this file that copies a into b.
 * The temp argument is accepted but not used. */
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1534 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* AVG_3DNOW_OP: expands to three asm instructions:
 *   mov<size> b, temp   - load the current contents of b into temp
 *   pavgusb   temp, a   - average temp into a
 *   mov<size> a, b      - store a back to b
 * so b ends up holding the average of a and the old b; a and temp are modified. */
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1535 #define AVG_3DNOW_OP(a,b,temp, size) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1536 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1537 "pavgusb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1538 "mov" #size " " #a ", " #b " \n\t"
/* AVG_MMX2_OP: same three-instruction sequence as AVG_3DNOW_OP, but the
 * averaging instruction emitted is pavgb instead of pavgusb:
 *   mov<size> b, temp ; pavgb temp, a ; mov<size> a, b
 * b receives the average of a and the old b; a and temp are modified. */
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1539 #define AVG_MMX2_OP(a,b,temp, size) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1540 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1541 "pavgb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1542 "mov" #size " " #a ", " #b " \n\t"
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1543
/* Instantiate the qpel helper families.
 * QPEL_BASE is expanded once each for put_, avg_ and put_no_rnd_; it receives
 * two store-op macros (for avg_ these are AVG_MMX2_OP and AVG_3DNOW_OP).
 * QPEL_OP is then expanded for the same three prefixes, first with the 3dnow
 * suffix and then with the mmx2 suffix, each with the matching store-op.
 * The rounding variants pass ff_pw_16, the no_rnd variants pass ff_pw_15.
 * NOTE(review): the parameter lists of QPEL_BASE/QPEL_OP are defined earlier
 * in the file; the roles described above are read off the argument values. */
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1544 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1545 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1546 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1547 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1548 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1549 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1550 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1551 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1552 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1553
3807 6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1554 /***********************************/
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1555 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1556
/* QPEL_2TAP_XY: defines the static function
 *   OPNAME 2tap_qpel SIZE _mc XY _ MMX (dst, src, stride)
 * as a thin wrapper that calls OPNAME pixels SIZE HPEL (dst, src, stride, SIZE).
 * HPEL is the full suffix of the pixels function to forward to
 * (it is pasted directly after "pixels<SIZE>"). */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1557 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1558 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1559 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1560 }
/* QPEL_2TAP_L3: defines the static function
 *   OPNAME 2tap_qpel SIZE _mc XY _ MMX (dst, src, stride)
 * as a wrapper around OPNAME 2tap_qpel SIZE _l3_ MMX, called with
 * (dst, src+S0, stride, SIZE, S1, S2).
 * S0 is added to src before the call; S1 and S2 are passed through unchanged.
 * S0/S1/S2 are expanded inside the function body, so they may refer to the
 * wrapper's own stride parameter. */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1561 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1562 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1563 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1564 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1565
/* QPEL_2TAP: emits the 2tap_qpel<SIZE>_mcXY_<MMX> entries for one
 * OPNAME / SIZE / MMX combination:
 *   mc20, mc02  wrappers around pixels<SIZE>_x2_<MMX> and pixels<SIZE>_y2_<MMX>
 *   mc22        wrapper around pixels<SIZE>_xy2_mmx (suffix is literally "mmx",
 *               not the MMX argument)
 *   mc00        const function pointer to OPNAME qpel<SIZE>_mc00_<MMX>
 *   mc21, mc12  const function pointers to the mc20 and mc02 wrappers
 *   mc32        pixels<SIZE>_y2_<MMX> applied at src+1
 *   mc23        pixels<SIZE>_x2_<MMX> applied at src+stride
 *   mc10, mc30, mc01, mc03, mc11, mc31, mc13, mc33
 *               QPEL_2TAP_L3 wrappers with the (S0, S1, S2) triples listed below
 * The last QPEL_2TAP_L3 line ends in a backslash, so the macro definition
 * extends over the following empty line. */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1566 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1567 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1568 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1569 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1570 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1571 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1572 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1573 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1574 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1575 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1576 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1577 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1578 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1579 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1580 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1581 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1582 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1583 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1584 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1585 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1586 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1587 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1588 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1589 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1590
/* Expand QPEL_2TAP for every combination of put_/avg_, block size 16/8 and
 * the mmx2/3dnow suffixes (eight expansions in total). */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1591 QPEL_2TAP(put_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1592 QPEL_2TAP(avg_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1593 QPEL_2TAP(put_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1594 QPEL_2TAP(avg_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1595 QPEL_2TAP(put_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1596 QPEL_2TAP(avg_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1597 QPEL_2TAP(put_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1598 QPEL_2TAP(avg_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1599
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	1600
/* just_return: a function that does nothing but return. It sits inside
 * "#if 0", so it is never compiled. */
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	1601 #if 0
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	1602 static void just_return() { return; }
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	1603 #endif
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	1604
/**
 * MMX implementation of gmc for a block 8 pixels wide and h rows high.
 *
 * Each output byte is a weighted sum of the four source bytes src[0], src[1],
 * src[stride] and src[stride+1], with weights (s-dx)(s-dy), dx(s-dy),
 * (s-dx)dy and dx dy where s = 1<<shift; r is added and the sum is shifted
 * right by 2*shift, then packed to bytes with unsigned saturation.
 * The block is processed in two passes of 4 columns each.
 *
 * Falls back to ff_gmc_c() with the same arguments when the full-pel offset
 * is not constant over the block (tested at the block corners) or when any of
 * dxx/dxy/dyx/dyy has one of its 4 lowest bits set.
 *
 * dst            destination; 4 bytes are stored at dst[x + y*stride] per step
 * src            source; advanced by ix + iy*stride before use
 * stride         line step, used for src, dst and the edge buffer
 * h              number of rows
 * ox, oy         start offset; ox>>(16+shift) and oy>>(16+shift) give ix, iy
 * dxx, dyx       offset increments applied per column
 * dxy, dyy       offset increments applied per row
 * shift          weight scale exponent (s = 1<<shift)
 * r              rounding term added before the final shift
 * width, height  extents used to decide whether edge emulation is needed
 *
 * The asm statements carry state in MMX registers from one statement to the
 * next (mm6 = s and mm7 = 0 from the first; mm4/mm5 = dx/dy from the second)
 * and none of them lists clobbers. No emms is executed in this function.
 *
 * NOTE(review): as shown here dst and src are declared as plain uint8_t, yet
 * the body indexes and reassigns them as pointers, and the fallback test
 * contains "\|" sequences; both look like text-extraction damage
 * (lost '*' declarators, escaped '|') - confirm against the upstream file.
 */
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1605 static void gmc_mmx(uint8_t dst, uint8_t src, int stride, int h, int ox, int oy,
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1606 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1607 const int w = 8;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1608 const int ix = ox>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1609 const int iy = oy>>(16+shift);
/* Offsets and increments with their 4 lowest bits dropped; these feed the
 * 16-bit per-column accumulators dx4/dy4 and the per-row steps dxy4/dyy4. */
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1610 const int oxs = ox>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1611 const int oys = oy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1612 const int dxxs = dxx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1613 const int dxys = dxy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1614 const int dyxs = dyx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1615 const int dyys = dyy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1616 const uint16_t r4[4] = {r,r,r,r};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1617 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1618 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1619 const uint64_t shift2 = 2*shift;
/* Variable-length scratch buffer of (h+1)*stride bytes, only filled when
 * edge emulation is needed below. */
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1620 uint8_t edge_buf[(h+1)*stride];
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1621 int x, y;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1622
/* Offset drift across the block: dxw/dyh subtract the nominal one-pixel step
 * (1<<(16+shift)) per column/row, dxh/dyw are the cross terms. */
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1623 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1624 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1625 const int dxh = dxy*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1626 const int dyw = dyx*(w-1);
/* Take the C fallback if adding any of the drifts to ox/oy changes a bit at
 * or above position 16+shift, or if the increments have low bits that the
 * >>4 above would discard. */
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1627 if( // non-constant fullpel offset (3% of blocks)
6196 166bef5cad01 add parenthesis, fix warning: i386/dsputil_mmx.c:2618: warning: suggest parentheses around arithmetic in operand of \| bcoudurier parents: 6195 diff changeset	1628 ((ox^(ox+dxw)) \| (ox^(ox+dxh)) \| (ox^(ox+dxw+dxh)) \|
166bef5cad01 add parenthesis, fix warning: i386/dsputil_mmx.c:2618: warning: suggest parentheses around arithmetic in operand of \| bcoudurier parents: 6195 diff changeset	1629 (oy^(oy+dyw)) \| (oy^(oy+dyh)) \| (oy^(oy+dyw+dyh))) >> (16+shift)
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1630 // uses more than 16 bits of subpel mv (only at huge resolution)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1631 \|\| (dxx\|dxy\|dyx\|dyy)&15 )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1632 {
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1633 //FIXME could still use mmx for some of the rows
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1634 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1635 return;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1636 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1637
/* Move src to the full-pel position. If ix or iy (compared as unsigned)
 * is not below width-w / height-h, build a (w+1) x (h+1) patch in edge_buf
 * with ff_emulated_edge_mc() and read from that instead. */
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1638 src += ix + iy*stride;
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1639 if( (unsigned)ix >= width-w \|\|
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1640 (unsigned)iy >= height-h )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1641 {
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1642 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1643 src = edge_buf;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1644 }
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1645
/* Set up constants kept in registers for the loops below:
 * mm6 = s (1<<shift) replicated into all four words, mm7 = 0. */
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1646 __asm__ volatile(
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1647 "movd %0, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1648 "pxor %%mm7, %%mm7 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1649 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1650 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1651 :: "r"(1<<shift)
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1652 );
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1653
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1654 for(x=0; x<w; x+=4){
/* Per-column offset accumulators for columns x..x+3. They start one row
 * step (dxys/dyys) behind because the row loop adds the step before use. */
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1655 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1656 oxs - dxys + dxxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1657 oxs - dxys + dxxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1658 oxs - dxys + dxxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1659 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1660 oys - dyys + dyxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1661 oys - dyys + dyxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1662 oys - dyys + dyxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1663
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1664 for(y=0; y<h; y++){
/* Advance the accumulators by one row step, write them back to dx4/dy4,
 * and leave the top 4 bits of each 16-bit lane in mm4 (dx) and mm5 (dy).
 * NOTE(review): the shift count 12 is fixed while s depends on shift;
 * confirm this path is only used with a shift for which 4 bits is right. */
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1665 __asm__ volatile(
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1666 "movq %0, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1667 "movq %1, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1668 "paddw %2, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1669 "paddw %3, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1670 "movq %%mm4, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1671 "movq %%mm5, %1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1672 "psrlw $12, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1673 "psrlw $12, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1674 : "+m"(dx4), "+m"(dy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1675 : "m"(dxy4), "m"(dyy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1676 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1677
/* Blend four pixels: form the four weight products from mm4/mm5/mm6, load
 * 4 bytes from each of the four neighbour positions and widen them to words
 * with mm7, multiply, add r4 and the partial sums, shift right by shift2,
 * pack to bytes and store 4 bytes to dst[x+y*stride].
 * The memory operands name a single element (src[0], *r4, ...) although the
 * instructions access 4 or 8 bytes starting there. */
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1678 __asm__ volatile(
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1679 "movq %%mm6, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1680 "movq %%mm6, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1681 "psubw %%mm4, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1682 "psubw %%mm5, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1683 "movq %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1684 "movq %%mm4, %%mm3 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1685 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1686 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1687 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1688 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1689
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1690 "movd %4, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1691 "movd %3, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1692 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1693 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1694 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1695 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1696
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1697 "movd %2, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1698 "movd %1, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1699 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1700 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1701 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1702 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	1703 "paddw %5, %%mm1 \n\t"
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1704 "paddw %%mm3, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1705 "paddw %%mm1, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1706 "paddw %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1707
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1708 "psrlw %6, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1709 "packuswb %%mm0, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1710 "movd %%mm0, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1711
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1712 : "=m"(dst[x+y*stride])
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1713 : "m"(src[0]), "m"(src[1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1714 "m"(src[stride]), "m"(src[stride+1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1715 "m"(*r4), "m"(shift2)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1716 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1717 src += stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1718 }
/* Rewind src by the h rows just walked and step 4 columns to the right. */
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1719 src += 4-h*stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1720 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1721 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	1722
/*
 * PREFETCH(name, op) expands to
 *     static void name(void *mem, int stride, int h)
 * which issues the prefetch instruction `op` on the byte at mem and then on
 * the byte stride bytes further on, once per row, for h rows.
 *
 * The row counter is only tested after a row has been touched, so one
 * prefetch is always issued; callers presumably pass h >= 1 -- TODO confirm.
 */
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *row = mem;\
    for(;;){\
        __asm__ volatile(#op" %0" :: "m"(*row));\
        row += stride;\
        if(!--h)\
            break;\
    }\
}
PREFETCH(prefetch_mmx2, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	1734
2754 a49f140179e9 sort H.264 mmx dsp functions into their own file lorenm parents: 2753 diff changeset	1735 #include "h264dsp_mmx.c"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1736
6009 ecfdc0bfb233 typo/clarification diego parents: 5963 diff changeset	1737 /* CAVS specific */
3524 419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1738 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
6522 dfa76e0734e5 Add missed call to ff_cavsdsp_init_3dnow() in dsputil_init_mmx() zuxy parents: 6513 diff changeset	1739 void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx);
3524 419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1740
/**
 * CAVS qpel8 "mc00" put.
 *
 * Forwards dst, src and stride unchanged to put_pixels8_mmx() with a fixed
 * row count of 8.
 *
 * @param dst    destination pixel buffer
 * @param src    source pixel buffer
 * @param stride passed through as the helper's line size
 */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
/**
 * CAVS qpel8 "mc00" avg.
 *
 * Forwards dst, src and stride unchanged to avg_pixels8_mmx() with a fixed
 * row count of 8.
 *
 * @param dst    destination pixel buffer
 * @param src    source pixel buffer
 * @param stride passed through as the helper's line size
 */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
/**
 * CAVS qpel16 "mc00" put.
 *
 * Forwards dst, src and stride unchanged to put_pixels16_mmx() with a fixed
 * row count of 16.
 *
 * @param dst    destination pixel buffer
 * @param src    source pixel buffer
 * @param stride passed through as the helper's line size
 */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
/**
 * CAVS qpel16 "mc00" avg.
 *
 * Forwards dst, src and stride unchanged to avg_pixels16_mmx() with a fixed
 * row count of 16.
 *
 * @param dst    destination pixel buffer
 * @param src    source pixel buffer
 * @param stride passed through as the helper's line size
 */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	1753
5948 db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1754 /* VC1 specific */
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1755 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1756
/**
 * VC-1 mspel "mc00" put.
 *
 * Forwards dst, src and stride unchanged to put_pixels8_mmx() with a fixed
 * row count of 8.
 *
 * @param dst    destination pixel buffer
 * @param src    source pixel buffer (not written through)
 * @param stride passed through as the helper's line size
 * @param rnd    accepted for signature compatibility; not used here
 */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}
db875a610973 build vc1dsp_mmx.c in its own compilation unit aurel parents: 5947 diff changeset	1760
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1761 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1762 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1763 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1764
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1765 /* XXX: those functions should be suppressed ASAP when all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1766 converted */
4020 723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure diego parents: 4001 diff changeset	1767 #ifdef CONFIG_GPL
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1768 static void ff_libmpeg2mmx_idct_put(uint8_t dest, int line_size, DCTELEM block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1769 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1770 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1771 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1772 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1773 static void ff_libmpeg2mmx_idct_add(uint8_t dest, int line_size, DCTELEM block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1774 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1775 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1776 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1777 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1778 static void ff_libmpeg2mmx2_idct_put(uint8_t dest, int line_size, DCTELEM block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1779 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1780 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1781 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1782 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1783 static void ff_libmpeg2mmx2_idct_add(uint8_t dest, int line_size, DCTELEM block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1784 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1785 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1786 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	1787 }
4020 723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure diego parents: 4001 diff changeset	1788 #endif
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1789 static void ff_idct_xvid_mmx_put(uint8_t dest, int line_size, DCTELEM block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1790 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1791 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1792 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1793 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1794 static void ff_idct_xvid_mmx_add(uint8_t dest, int line_size, DCTELEM block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1795 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1796 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1797 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1798 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1799 static void ff_idct_xvid_mmx2_put(uint8_t dest, int line_size, DCTELEM block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1800 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1801 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1802 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1803 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1804 static void ff_idct_xvid_mmx2_add(uint8_t dest, int line_size, DCTELEM block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1805 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1806 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1807 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	1808 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1809
/**
 * 3DNow! Vorbis inverse channel coupling, performed in place on mag and ang.
 *
 * Two consecutive floats of each array are handled per iteration (64-bit
 * movq loads/stores, i advances by 2). With m = mag[k], a = ang[k] and
 * a' = a with its sign bit flipped when m >= 0.0 (a' = a otherwise), each
 * element pair becomes
 *     ang[k] = m + (a >= 0.0 ? a' : 0)
 *     mag[k] = m - (a >= 0.0 ? 0  : a')
 *
 * mm7 is zeroed by a separate asm statement and is expected to still be
 * zero inside the loop; no register clobbers are declared.
 * The loop reads/writes whole pairs, so blocksize is presumably even --
 * TODO confirm against callers.
 *
 * @param mag       magnitude vector, updated in place
 * @param ang       angle vector, updated in place
 * @param blocksize number of floats in each vector
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile("pxor %%mm7, %%mm7":); // mm7 = 0.0 in both lanes
    for(i=0; i<blocksize; i+=2) {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"               // mm0 = m
            "movq %1, %%mm1 \n\t"               // mm1 = a
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t"         // mm2 = mask(m >= 0.0)
            "pfcmpge %%mm7, %%mm3 \n\t"         // mm3 = mask(a >= 0.0)
            "pslld $31, %%mm2 \n\t"             // keep only the sign bit of the m mask
            "pxor %%mm2, %%mm1 \n\t"            // mm1 = a' (a, negated when m >= 0.0)
            "movq %%mm3, %%mm4 \n\t"
            "pand %%mm1, %%mm3 \n\t"            // mm3 = (a >= 0.0) ? a' : 0
            "pandn %%mm1, %%mm4 \n\t"           // mm4 = (a >= 0.0) ? 0 : a'
            "pfadd %%mm0, %%mm3 \n\t"           // new ang = m + mm3
            "pfsub %%mm4, %%mm0 \n\t"           // new mag = m - mm4
            "movq %%mm3, %1 \n\t"
            "movq %%mm0, %0 \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"                          // operands name one float; two are touched
        );
    }
    __asm__ volatile("femms"); // leave MMX/3DNow! state
}
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1837 static void vorbis_inverse_coupling_sse(float mag, float ang, int blocksize)
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1838 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1839 int i;
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1840
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1841 __asm__ volatile(
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1842 "movaps %0, %%xmm5 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1843 ::"m"(ff_pdw_80000000[0])
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1844 );
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1845 for(i=0; i<blocksize; i+=4) {
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1846 __asm__ volatile(
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1847 "movaps %0, %%xmm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1848 "movaps %1, %%xmm1 \n\t"
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1849 "xorps %%xmm2, %%xmm2 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1850 "xorps %%xmm3, %%xmm3 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1851 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1852 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1853 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1854 "xorps %%xmm2, %%xmm1 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1855 "movaps %%xmm3, %%xmm4 \n\t"
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1856 "andps %%xmm1, %%xmm3 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	1857 "andnps %%xmm1, %%xmm4 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1858 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1859 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1860 "movaps %%xmm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1861 "movaps %%xmm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1862 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1863 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1864 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1865 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1866 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	1867
/* Selectors for optional code fragments: IF1 keeps its argument verbatim,
 * IF0 expands to nothing. */
#define IF1(code) code
#define IF0(code)
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1870
/*
 * MIX5(mono, stereo): SSE mixing loop over five sample rows spaced 0x400
 * bytes apart. `mono` and `stereo` are each IF1 or IF0 and switch the
 * mode-specific lines on or off.
 *
 * Names taken from the expansion site:
 *   i       - register-sized byte offset (operand %0); the loop adds 16 per
 *             pass and repeats while the result is negative
 *   samples - samples[0]+len is the base address (operand %1)
 *   len     - see samples
 *   matrix  - coefficient table (operand %2)
 *
 * Three scalars are read from matrix at byte offsets 0, 8 and 24 and each is
 * broadcast to all four lanes. The rows at 0 and 0x800 are scaled by the
 * first, the row at 0x400 by the second, the rows at 0xc00 and 0x1000 by the
 * third.
 *   stereo: row 0     = row0 + row@0x400 + row@0xc00
 *           row 0x400 = row@0x800 + row@0x400 + row@0x1000
 *   mono:   row 0     = sum of all five scaled rows
 *
 * The expansion already ends in ");". xmm0-xmm7 are used without being
 * declared as clobbers.
 */
#define MIX5(mono,stereo)\
    __asm__ volatile(\
        "movss 0(%2), %%xmm5 \n"\
        "movss 8(%2), %%xmm6 \n"\
        "movss 24(%2), %%xmm7 \n"\
        "shufps $0, %%xmm5, %%xmm5 \n"\
        "shufps $0, %%xmm6, %%xmm6 \n"\
        "shufps $0, %%xmm7, %%xmm7 \n"\
        "1: \n"\
        /* load four samples from each of the five rows */\
        "movaps (%0,%1), %%xmm0 \n"\
        "movaps 0x400(%0,%1), %%xmm1 \n"\
        "movaps 0x800(%0,%1), %%xmm2 \n"\
        "movaps 0xc00(%0,%1), %%xmm3 \n"\
        "movaps 0x1000(%0,%1), %%xmm4 \n"\
        /* scale */\
        "mulps %%xmm5, %%xmm0 \n"\
        "mulps %%xmm6, %%xmm1 \n"\
        "mulps %%xmm5, %%xmm2 \n"\
        "mulps %%xmm7, %%xmm3 \n"\
        "mulps %%xmm7, %%xmm4 \n"\
        /* accumulate */\
        stereo("addps %%xmm1, %%xmm0 \n")\
        "addps %%xmm1, %%xmm2 \n"\
        "addps %%xmm3, %%xmm0 \n"\
        "addps %%xmm4, %%xmm2 \n"\
        mono("addps %%xmm2, %%xmm0 \n")\
        /* store and advance */\
        "movaps %%xmm0, (%0,%1) \n"\
        stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i)\
        :"r"(samples[0]+len), "r"(matrix)\
        :"memory"\
    );
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1903
/*
 * MIX_MISC(stereo): generic SSE mixing loop; `stereo` is IF1 or IF0 and
 * switches the second-output lines on or off.
 *
 * Names taken from the expansion site:
 *   i           - register-sized byte offset (operand %0); stepped by 16,
 *                 outer loop repeats while the result is negative
 *   j, k        - scratch registers (operands %1, %2), written here
 *   samples/len - samples[0]+len is the base address (operand %3)
 *   matrix_simd, in_ch
 *               - matrix_simd+in_ch is operand %4 and
 *                 (intptr_t)-32*(in_ch-1) is operand %5
 *
 * Per group of four samples: the row at offset 0 is scaled by xmm6 (and, for
 * stereo, a copy of it by xmm7). The inner loop then walks the following
 * rows, 1024 bytes apart, multiplying each by the vector at (%4,k) (stereo:
 * also the one 16 bytes later) and accumulating, with k stepped by 32 while
 * it stays negative. The sums are stored to the row at offset 0 (stereo:
 * also the row 1024 bytes on).
 *
 * xmm6/xmm7 are not loaded here; they must already hold the coefficients for
 * the first row when the statement starts.
 * The inner loop body runs before k is tested, so one extra row is always
 * read; in_ch is presumably >= 2 -- TODO confirm against callers.
 *
 * The expansion already ends in ");".
 */
#define MIX_MISC(stereo)\
    __asm__ volatile(\
        "1: \n"\
        "movaps (%3,%0), %%xmm0 \n"\
 stereo("movaps %%xmm0, %%xmm1 \n")\
        "mulps %%xmm6, %%xmm0 \n"\
 stereo("mulps %%xmm7, %%xmm1 \n")\
        "lea 1024(%3,%0), %1 \n"\
        "mov %5, %2 \n"\
        "2: \n"\
        "movaps (%1), %%xmm2 \n"\
 stereo("movaps %%xmm2, %%xmm3 \n")\
        "mulps (%4,%2), %%xmm2 \n"\
 stereo("mulps 16(%4,%2), %%xmm3 \n")\
        "addps %%xmm2, %%xmm0 \n"\
 stereo("addps %%xmm3, %%xmm1 \n")\
        "add $1024, %1 \n"\
        "add $32, %2 \n"\
        "jl 2b \n"\
        "movaps %%xmm0, (%3,%0) \n"\
 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i), "=&r"(j), "=&r"(k)\
        :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
        :"memory"\
    );
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1931
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1932 static void ac3_downmix_sse(float (samples)[256], float (matrix)[2], int out_ch, int in_ch, int len)
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1933 {
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1934 int (matrix_cmp)[2] = (int()[2])matrix;
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1935 intptr_t i,j,k;
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1936
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1937 i = -len*sizeof(float);
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1938 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]\|matrix_cmp[2][0]\|matrix_cmp[3][1]\|matrix_cmp[4][0]\|(matrix_cmp[1][0]^matrix_cmp[1][1])\|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1939 MIX5(IF0,IF1);
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1940 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1941 MIX5(IF1,IF0);
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1942 } else {
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1943 DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]);
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1944 j = 2in_chsizeof(float);
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1945 __asm__ volatile(
7563 8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1946 "1: \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1947 "sub $8, %0 \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1948 "movss (%2,%0), %%xmm6 \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1949 "movss 4(%2,%0), %%xmm7 \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1950 "shufps $0, %%xmm6, %%xmm6 \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1951 "shufps $0, %%xmm7, %%xmm7 \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1952 "movaps %%xmm6, (%1,%0,4) \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1953 "movaps %%xmm7, 16(%1,%0,4) \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1954 "jg 1b \n"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1955 :"+&r"(j)
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1956 :"r"(matrix_simd), "r"(matrix)
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1957 :"memory"
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1958 );
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1959 if(out_ch == 2) {
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1960 MIX_MISC(IF1);
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1961 } else {
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1962 MIX_MISC(IF0);
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1963 }
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1964 }
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1965 }
8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	1966
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1967 static void vector_fmul_3dnow(float dst, const float src, int len){
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	1968 x86_reg i = (len-4)*4;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1969 __asm__ volatile(
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1970 "1: \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1971 "movq (%1,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1972 "movq 8(%1,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1973 "pfmul (%2,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1974 "pfmul 8(%2,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1975 "movq %%mm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1976 "movq %%mm1, 8(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1977 "sub $16, %0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1978 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1979 "femms \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1980 :"+r"(i)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1981 :"r"(dst), "r"(src)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1982 :"memory"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1983 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1984 }
/**
 * In-place element-wise product using SSE: dst[k] *= src[k].
 *
 * The asm walks a byte offset from (len-8)*4 down towards 0, handling
 * 32 bytes (8 floats) per pass. The loop body runs before the first test,
 * so one pass is always executed.
 * NOTE(review): presumably len is a positive multiple of 8 and both buffers
 * are 16-byte aligned (movaps/mulps memory operands) -- confirm with callers.
 */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	1985 static void vector_fmul_sse(float dst, const float src, int len){
/* byte offset of the last group of 8 floats; shared by dst and src */
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	1986 x86_reg i = (len-8)*4;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	1987 __asm__ volatile(
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1988 "1: \n\t"
/* load 8 floats from dst (%1), multiply by src (%2), write back to dst */
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1989 "movaps (%1,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1990 "movaps 16(%1,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1991 "mulps (%2,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1992 "mulps 16(%2,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1993 "movaps %%xmm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1994 "movaps %%xmm1, 16(%1,%0) \n\t"
/* step back 32 bytes; keep looping while the offset is still >= 0 */
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1995 "sub $32, %0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1996 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1997 :"+r"(i)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1998 :"r"(dst), "r"(src)
/* the buffers are reached only through register operands, hence "memory" */
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	1999 :"memory"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2000 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2001 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2002
/**
 * Multiply src0 by src1 read in reverse order, using 3DNow!ext:
 * dst[k] = src0[k] * src1[len-1-k].
 *
 * dst and src0 are addressed with a byte offset that starts at len*4-16 and
 * moves down 16 bytes (4 floats) per pass, while the src1 pointer moves up
 * 16 bytes per pass. The loop body runs before the first test, so one pass
 * is always executed.
 * NOTE(review): presumably len is a positive multiple of 4 -- confirm with
 * callers.
 */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2003 static void vector_fmul_reverse_3dnow2(float dst, const float src0, const float *src1, int len){
/* byte offset of the last group of 4 floats in dst/src0 */
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	2004 x86_reg i = len*4-16;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2005 __asm__ volatile(
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2006 "1: \n\t"
/* pswapd exchanges the two floats of each pair; loading the upper pair
 * first therefore yields the 4 src1 floats in reversed order */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2007 "pswapd 8(%1), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2008 "pswapd (%1), %%mm1 \n\t"
/* multiply by src0 (%3) and store to dst (%2) at the same offset */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2009 "pfmul (%3,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2010 "pfmul 8(%3,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2011 "movq %%mm0, (%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2012 "movq %%mm1, 8(%2,%0) \n\t"
/* src1 advances while the dst/src0 offset retreats */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2013 "add $16, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2014 "sub $16, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2015 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2016 :"+r"(i), "+r"(src1)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2017 :"r"(dst), "r"(src0)
/* The asm loads and stores through pointers passed only as register
 * operands, so the compiler must be told that memory is accessed. */
:"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2018 );
/* leave MMX/3DNow! state so that later x87 code finds a clean FPU */
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2019 __asm__ volatile("femms");
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2020 }
/**
 * Multiply src0 by src1 read in reverse order, using SSE:
 * dst[k] = src0[k] * src1[len-1-k].
 *
 * dst and src0 are addressed with a byte offset that starts at len*4-32 and
 * moves down 32 bytes (8 floats) per pass, while the src1 pointer moves up
 * 32 bytes per pass. The loop body runs before the first test, so one pass
 * is always executed.
 * NOTE(review): presumably len is a positive multiple of 8 and all buffers
 * are 16-byte aligned (movaps/mulps memory operands) -- confirm with callers.
 */
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2021 static void vector_fmul_reverse_sse(float dst, const float src0, const float *src1, int len){
/* byte offset of the last group of 8 floats in dst/src0 */
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	2022 x86_reg i = len*4-32;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2023 __asm__ volatile(
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2024 "1: \n\t"
/* fetch 8 src1 floats, upper four first */
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2025 "movaps 16(%1), %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2026 "movaps (%1), %%xmm1 \n\t"
/* shuffle control 0x1b reverses the order of the four lanes */
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2027 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2028 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
/* multiply by src0 (%3) and store to dst (%2) at the same offset */
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2029 "mulps (%3,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2030 "mulps 16(%3,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2031 "movaps %%xmm0, (%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2032 "movaps %%xmm1, 16(%2,%0) \n\t"
/* src1 advances while the dst/src0 offset retreats */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2033 "add $32, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2034 "sub $32, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2035 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2036 :"+r"(i), "+r"(src1)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2037 :"r"(dst), "r"(src0)
/* The asm loads and stores through pointers passed only as register
 * operands, so the compiler must be told that memory is accessed. */
:"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2038 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2039 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2040
/**
 * Multiply-accumulate using 3DNow!: result[k] = src0[k]*src1[k] + src2[k].
 *
 * Two cases are handled in asm, both only when src3 == 0:
 *  - step == 2: each result is written to every second float slot of dst
 *    (8-byte stride);
 *  - step == 1: results are written contiguously.
 * Everything else is forwarded to ff_vector_fmul_add_add_c().
 * The asm loops consume 16 bytes (4 floats) of each source per pass, from
 * the end of the arrays towards the start, and always run at least once.
 * NOTE(review): presumably len is a positive multiple of 4 -- confirm with
 * callers.
 */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2041 static void vector_fmul_add_add_3dnow(float dst, const float src0, const float *src1,
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2042 const float *src2, int src3, int len, int step){
/* byte offset of the last group of 4 source floats */
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	2043 x86_reg i = (len-4)*4;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2044 if(step == 2 && src3 == 0){
/* output slots are two floats apart, so the matching dst position is
 * twice the source index */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2045 dst += (len-4)*2;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2046 __asm__ volatile(
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2047 "1: \n\t"
/* mm0/mm1 = src0 (%2) * src1 (%3) + src2 (%4), two floats each */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2048 "movq (%2,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2049 "movq 8(%2,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2050 "pfmul (%3,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2051 "pfmul 8(%3,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2052 "pfadd (%4,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2053 "pfadd 8(%4,%0), %%mm1 \n\t"
/* scatter: low floats go to byte offsets 0 and 16, then the high floats
 * (shifted down by 32 bits) go to offsets 8 and 24 */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2054 "movd %%mm0, (%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2055 "movd %%mm1, 16(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2056 "psrlq $32, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2057 "psrlq $32, %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2058 "movd %%mm0, 8(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2059 "movd %%mm1, 24(%1) \n\t"
/* dst retreats 32 bytes for every 16 bytes of source consumed */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2060 "sub $32, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2061 "sub $16, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2062 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2063 :"+r"(i), "+r"(dst)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2064 :"r"(src0), "r"(src1), "r"(src2)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2065 :"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2066 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2067 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2068 else if(step == 1 && src3 == 0){
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2069 __asm__ volatile(
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2070 "1: \n\t"
/* same arithmetic as above; here dst (%1) shares the source offset */
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2071 "movq (%2,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2072 "movq 8(%2,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2073 "pfmul (%3,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2074 "pfmul 8(%3,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2075 "pfadd (%4,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2076 "pfadd 8(%4,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2077 "movq %%mm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2078 "movq %%mm1, 8(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2079 "sub $16, %0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2080 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2081 :"+r"(i)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2082 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2083 :"memory"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2084 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2085 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2086 else
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2087 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
/* runs on every path, including after the C fallback */
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2088 __asm__ volatile("femms");
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2089 }
/**
 * Multiply-accumulate using SSE: result[k] = src0[k]*src1[k] + src2[k].
 *
 * Two cases are handled in asm, both only when src3 == 0:
 *  - step == 2: each result is written to every second float slot of dst
 *    (8-byte stride);
 *  - step == 1: results are written contiguously.
 * Everything else is forwarded to ff_vector_fmul_add_add_c().
 * The asm loops consume 32 bytes (8 floats) of each source per pass, from
 * the end of the arrays towards the start, and always run at least once.
 * NOTE(review): presumably len is a positive multiple of 8 and the buffers
 * touched by movaps/mulps/addps are 16-byte aligned -- confirm with callers.
 */
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2090 static void vector_fmul_add_add_sse(float dst, const float src0, const float *src1,
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2091 const float *src2, int src3, int len, int step){
/* byte offset of the last group of 8 source floats */
6755 33896780c612 Do not misuse long as the size of a register in x86. ramiro parents: 6601 diff changeset	2092 x86_reg i = (len-8)*4;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2093 if(step == 2 && src3 == 0){
/* output slots are two floats apart, so the matching dst position is
 * twice the source index */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2094 dst += (len-8)*2;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2095 __asm__ volatile(
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2096 "1: \n\t"
/* xmm0/xmm1 = src0 (%2) * src1 (%3) + src2 (%4), four floats each */
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2097 "movaps (%2,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2098 "movaps 16(%2,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2099 "mulps (%3,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2100 "mulps 16(%3,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2101 "addps (%4,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2102 "addps 16(%4,%0), %%xmm1 \n\t"
/* scatter lane 0 of each register (byte offsets 0 and 32) */
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2103 "movss %%xmm0, (%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2104 "movss %%xmm1, 32(%1) \n\t"
/* bring lane 2 down to lane 0 of a scratch register and store it
 * (byte offsets 16 and 48) */
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2105 "movhlps %%xmm0, %%xmm2 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2106 "movhlps %%xmm1, %%xmm3 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2107 "movss %%xmm2, 16(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2108 "movss %%xmm3, 48(%1) \n\t"
/* shuffle control 0xb1 swaps neighbouring lanes, exposing lanes 1 and 3 */
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2109 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2110 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
/* former lane 1 (byte offsets 8 and 40) */
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2111 "movss %%xmm0, 8(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2112 "movss %%xmm1, 40(%1) \n\t"
/* former lane 3 (byte offsets 24 and 56) */
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2113 "movhlps %%xmm0, %%xmm2 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2114 "movhlps %%xmm1, %%xmm3 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2115 "movss %%xmm2, 24(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2116 "movss %%xmm3, 56(%1) \n\t"
/* dst retreats 64 bytes for every 32 bytes of source consumed */
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2117 "sub $64, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2118 "sub $32, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2119 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2120 :"+r"(i), "+r"(dst)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2121 :"r"(src0), "r"(src1), "r"(src2)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2122 :"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2123 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2124 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2125 else if(step == 1 && src3 == 0){
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2126 __asm__ volatile(
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2127 "1: \n\t"
/* same arithmetic as above; here dst (%1) shares the source offset */
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2128 "movaps (%2,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2129 "movaps 16(%2,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2130 "mulps (%3,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2131 "mulps 16(%3,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2132 "addps (%4,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2133 "addps 16(%4,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2134 "movaps %%xmm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2135 "movaps %%xmm1, 16(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2136 "sub $32, %0 \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2137 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2138 :"+r"(i)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2139 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2140 :"memory"
f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2141 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2142 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2143 else
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2144 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2145 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2146
/**
 * Windowed combination of src0 and src1 into dst, using 3DNow!ext.
 *
 * Fast path (HAVE_6REGS defined and add_bias == 0): two byte offsets sweep
 * towards each other. i climbs from -len*4 to 0 and is applied to dst+len,
 * src0+len and win+len; j descends from len*4-8 and is applied to dst+len,
 * src1 and win+len. Each pass handles one pair of floats per offset; the
 * pairs read at j are lane-swapped with pswapd so they line up with the
 * pairs read at i. With s0 = src0 pair, s1 = swapped src1 pair,
 * wi = window pair at i and wj = swapped window pair at j, it stores
 *   at offset i:  s0*wj - s1*wi
 *   at offset j:  s0*wi + s1*wj   (lane-swapped back)
 * Any other case is handed to ff_vector_fmul_window_c().
 * NOTE(review): presumably len is a positive multiple of 2 -- confirm with
 * callers.
 */
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2147 static void vector_fmul_window_3dnow2(float dst, const float src0, const float *src1,
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2148 const float *win, float add_bias, int len){
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2149 #ifdef HAVE_6REGS
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2150 if(add_bias == 0){
/* i: negative byte offset climbing to 0; j: positive byte offset descending */
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2151 x86_reg i = -len*4;
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2152 x86_reg j = len*4-8;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2153 __asm__ volatile(
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2154 "1: \n"
/* mm0 = wi, mm1 = wj (swapped), mm4 = s0, mm5 = s1 (swapped) */
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2155 "pswapd (%5,%1), %%mm1 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2156 "movq (%5,%0), %%mm0 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2157 "pswapd (%4,%1), %%mm5 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2158 "movq (%3,%0), %%mm4 \n"
/* keep copies of both window pairs: each is needed in two products */
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2159 "movq %%mm0, %%mm2 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2160 "movq %%mm1, %%mm3 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2161 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2162 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2163 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2164 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
/* mm2 = s0*wi + s1*wj, mm1 = s0*wj - s1*wi */
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2165 "pfadd %%mm3, %%mm2 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2166 "pfsub %%mm0, %%mm1 \n"
/* undo the lane swap before storing on the j side */
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2167 "pswapd %%mm2, %%mm2 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2168 "movq %%mm1, (%2,%0) \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2169 "movq %%mm2, (%2,%1) \n"
/* the add comes last so that jl tests i: loop while i is still negative */
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2170 "sub $8, %1 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2171 "add $8, %0 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2172 "jl 1b \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2173 "femms \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2174 :"+r"(i), "+r"(j)
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2175 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/* The asm loads and stores through pointers passed only as register
 * operands, so the compiler must be told that memory is accessed. */
:"memory"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2176 );
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2177 }else
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2178 #endif
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2179 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2180 }
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2181
/**
 * Windowed combination of src0 and src1 into dst, using SSE.
 *
 * Fast path (HAVE_6REGS defined and add_bias == 0): two byte offsets sweep
 * towards each other. i climbs from -len*4 to 0 and is applied to dst+len,
 * src0+len and win+len; j descends from len*4-16 and is applied to dst+len,
 * src1 and win+len. Each pass handles four floats per offset; the groups
 * read at j are lane-reversed (shufps 0x1b) so they line up with the groups
 * read at i. With s0 = src0 group, s1 = reversed src1 group,
 * wi = window group at i and wj = reversed window group at j, it stores
 *   at offset i:  s0*wj - s1*wi
 *   at offset j:  s0*wi + s1*wj   (lane-reversed back)
 * Any other case is handed to ff_vector_fmul_window_c().
 * NOTE(review): presumably len is a positive multiple of 4 and all buffers
 * are 16-byte aligned (movaps) -- confirm with callers.
 */
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2182 static void vector_fmul_window_sse(float dst, const float src0, const float *src1,
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2183 const float *win, float add_bias, int len){
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2184 #ifdef HAVE_6REGS
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2185 if(add_bias == 0){
/* i: negative byte offset climbing to 0; j: positive byte offset descending */
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2186 x86_reg i = -len*4;
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2187 x86_reg j = len*4-16;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2188 __asm__ volatile(
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2189 "1: \n"
/* xmm0 = wi, xmm1 = wj, xmm4 = s0, xmm5 = s1 (j-side not yet reversed) */
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2190 "movaps (%5,%1), %%xmm1 \n"
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2191 "movaps (%5,%0), %%xmm0 \n"
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2192 "movaps (%4,%1), %%xmm5 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2193 "movaps (%3,%0), %%xmm4 \n"
/* reverse the lane order of the two j-side groups */
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2194 "shufps $0x1b, %%xmm1, %%xmm1 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2195 "shufps $0x1b, %%xmm5, %%xmm5 \n"
/* keep copies of both window groups: each is needed in two products */
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2196 "movaps %%xmm0, %%xmm2 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2197 "movaps %%xmm1, %%xmm3 \n"
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2198 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2199 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2200 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2201 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
/* xmm2 = s0*wi + s1*wj, xmm1 = s0*wj - s1*wi */
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2202 "addps %%xmm3, %%xmm2 \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2203 "subps %%xmm0, %%xmm1 \n"
/* undo the lane reversal before storing on the j side */
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2204 "shufps $0x1b, %%xmm2, %%xmm2 \n"
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2205 "movaps %%xmm1, (%2,%0) \n"
fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2206 "movaps %%xmm2, (%2,%1) \n"
/* the add comes last so that jl tests i: loop while i is still negative */
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2207 "sub $16, %1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2208 "add $16, %0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2209 "jl 1b \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2210 :"+r"(i), "+r"(j)
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2211 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/* The asm loads and stores through pointers passed only as register
 * operands, so the compiler must be told that memory is accessed. */
:"memory"
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2212 );
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2213 }else
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2214 #endif
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2215 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2216 }
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2217
/**
 * Convert 32-bit integers to float and scale them, using SSE:
 * dst[k] = (float)src[k] * mul.
 *
 * Both arrays are addressed from their ends (dst+len, src+len) with a
 * negative byte offset that climbs from -4*len to 0 in steps of 32 bytes
 * (8 values). The loop body runs before the first test, so one pass is
 * always executed.
 * NOTE(review): presumably len is a positive multiple of 8 and dst is
 * 16-byte aligned (movaps stores) -- confirm with callers.
 */
7564 7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2218 static void int32_to_float_fmul_scalar_sse(float dst, const int src, float mul, int len)
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2219 {
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2220 x86_reg i = -4*len;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2221 __asm__ volatile(
/* broadcast mul into all four lanes of xmm4 */
7564 7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2222 "movss %3, %%xmm4 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2223 "shufps $0, %%xmm4, %%xmm4 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2224 "1: \n"
/* cvtpi2ps converts two integers at a time into the low half of a register */
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2225 "cvtpi2ps (%2,%0), %%xmm0 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2226 "cvtpi2ps 8(%2,%0), %%xmm1 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2227 "cvtpi2ps 16(%2,%0), %%xmm2 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2228 "cvtpi2ps 24(%2,%0), %%xmm3 \n"
/* merge the low halves pairwise into two full 4-float vectors */
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2229 "movlhps %%xmm1, %%xmm0 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2230 "movlhps %%xmm3, %%xmm2 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2231 "mulps %%xmm4, %%xmm0 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2232 "mulps %%xmm4, %%xmm2 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2233 "movaps %%xmm0, (%1,%0) \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2234 "movaps %%xmm2, 16(%1,%0) \n"
/* loop while the offset is still negative */
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2235 "add $32, %0 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2236 "jl 1b \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2237 :"+r"(i)
7567 75841957d08b gcc chokes on xmm constraints, so pessimize int32_to_float_fmul_scalar_sse a little lorenm parents: 7565 diff changeset	2238 :"r"(dst+len), "r"(src+len), "m"(mul)
/* The asm reads src and writes dst through pointers passed only as register
 * operands, so the compiler must be told that memory is accessed. */
:"memory"
7564 7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2239 );
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2240 }
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2241
/**
 * Convert 32-bit integers to float and scale them, using SSE2:
 * dst[k] = (float)src[k] * mul.
 *
 * Both arrays are addressed from their ends (dst+len, src+len) with a
 * negative byte offset that climbs from -4*len to 0 in steps of 32 bytes
 * (8 values). The loop body runs before the first test, so one pass is
 * always executed.
 * NOTE(review): presumably len is a positive multiple of 8 and both buffers
 * are 16-byte aligned (cvtdq2ps memory operand, movaps stores) -- confirm
 * with callers.
 */
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2242 static void int32_to_float_fmul_scalar_sse2(float dst, const int src, float mul, int len)
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2243 {
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2244 x86_reg i = -4*len;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2245 __asm__ volatile(
/* broadcast mul into all four lanes of xmm4 */
7564 7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2246 "movss %3, %%xmm4 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2247 "shufps $0, %%xmm4, %%xmm4 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2248 "1: \n"
/* cvtdq2ps converts four integers per instruction */
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2249 "cvtdq2ps (%2,%0), %%xmm0 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2250 "cvtdq2ps 16(%2,%0), %%xmm1 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2251 "mulps %%xmm4, %%xmm0 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2252 "mulps %%xmm4, %%xmm1 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2253 "movaps %%xmm0, (%1,%0) \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2254 "movaps %%xmm1, 16(%1,%0) \n"
/* loop while the offset is still negative */
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2255 "add $32, %0 \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2256 "jl 1b \n"
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2257 :"+r"(i)
7567 75841957d08b gcc chokes on xmm constraints, so pessimize int32_to_float_fmul_scalar_sse a little lorenm parents: 7565 diff changeset	2258 :"r"(dst+len), "r"(src+len), "m"(mul)
/* The asm reads src and writes dst through pointers passed only as register
 * operands, so the compiler must be told that memory is accessed. */
:"memory"
7564 7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2259 );
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2260 }
7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2261
/*
 * Converts len floats from src to saturated int16 values in dst (3DNow!).
 *
 * Asm preamble: reglen is doubled (the byte size of the int16 output), src is
 * advanced by 2*reglen bytes and dst by reglen bytes so both point at the end
 * of their data, then reglen is negated and used as an index counting up
 * towards zero. Each iteration converts 8 floats with four pf2id, narrows
 * them with packssdw (signed saturation) and writes 16 bytes with two movq.
 * The js test comes after the body, so the body runs at least once. femms
 * is executed once the loop has finished.
 *
 * NOTE(review): presumably len must be a positive multiple of 8 -- confirm
 * with the callers.
 * NOTE(review): dst and src are shown without pointer declarators although
 * they are used as addresses; this listing has probably lost the asterisks
 * -- confirm against the upstream file.
 * NOTE(review): the asm statement lists no clobbers although it writes
 * mm0-mm3 and memory reached through dst.
 */
7218 7f3d6509628b Fix x86-64 michael parents: 7217 diff changeset	2262 static void float_to_int16_3dnow(int16_t dst, const float src, long len){
8041 24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions. reimar parents: 8035 diff changeset	2263 x86_reg reglen = len;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2264 // not bit-exact: pf2id uses different rounding than C and SSE
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2265 __asm__ volatile(
7217 726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2266 "add %0 , %0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2267 "lea (%2,%0,2) , %2 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2268 "add %0 , %1 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2269 "neg %0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2270 "1: \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2271 "pf2id (%2,%0,2) , %%mm0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2272 "pf2id 8(%2,%0,2) , %%mm1 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2273 "pf2id 16(%2,%0,2) , %%mm2 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2274 "pf2id 24(%2,%0,2) , %%mm3 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2275 "packssdw %%mm1 , %%mm0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2276 "packssdw %%mm3 , %%mm2 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2277 "movq %%mm0 , (%1,%0) \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2278 "movq %%mm2 , 8(%1,%0) \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2279 "add $16 , %0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2280 " js 1b \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2281 "femms \n\t"
8041 24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions. reimar parents: 8035 diff changeset	2282 :"+r"(reglen), "+r"(dst), "+r"(src)
7217 726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow() michael parents: 7087 diff changeset	2283 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2284 }
/*
 * Converts len floats from src to saturated int16 values in dst, using the
 * SSE cvtps2pi instruction with MMX registers as the destination.
 *
 * Asm preamble: reglen is doubled (the byte size of the int16 output), src is
 * advanced by 2*reglen bytes and dst by reglen bytes so both point at the end
 * of their data, then reglen is negated and used as an index counting up
 * towards zero. Each iteration converts 8 floats with four cvtps2pi, narrows
 * them with packssdw (signed saturation) and writes 16 bytes with two movq.
 * The js test comes after the body, so the body runs at least once. emms is
 * executed once the loop has finished.
 *
 * NOTE(review): presumably len must be a positive multiple of 8 -- confirm
 * with the callers.
 * NOTE(review): dst and src are shown without pointer declarators although
 * they are used as addresses; this listing has probably lost the asterisks
 * -- confirm against the upstream file.
 * NOTE(review): the asm statement lists no clobbers although it writes
 * mm0-mm3 and memory reached through dst.
 */
7218 7f3d6509628b Fix x86-64 michael parents: 7217 diff changeset	2285 static void float_to_int16_sse(int16_t dst, const float src, long len){
8041 24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions. reimar parents: 8035 diff changeset	2286 x86_reg reglen = len;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2287 __asm__ volatile(
7219 f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2288 "add %0 , %0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2289 "lea (%2,%0,2) , %2 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2290 "add %0 , %1 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2291 "neg %0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2292 "1: \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2293 "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2294 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2295 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2296 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2297 "packssdw %%mm1 , %%mm0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2298 "packssdw %%mm3 , %%mm2 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2299 "movq %%mm0 , (%1,%0) \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2300 "movq %%mm2 , 8(%1,%0) \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2301 "add $16 , %0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2302 " js 1b \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2303 "emms \n\t"
8041 24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions. reimar parents: 8035 diff changeset	2304 :"+r"(reglen), "+r"(dst), "+r"(src)
7219 f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse() michael parents: 7218 diff changeset	2305 );
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2306 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2307
/*
 * Converts len floats from src to saturated int16 values in dst, using SSE2.
 *
 * Asm preamble: reglen is doubled (the byte size of the int16 output), src is
 * advanced by 2*reglen bytes and dst by reglen bytes so both point at the end
 * of their data, then reglen is negated and used as an index counting up
 * towards zero. Each iteration converts 8 floats with two cvtps2dq, narrows
 * them with packssdw (signed saturation) and writes 16 bytes with one movdqa.
 * The js test comes after the body, so the body runs at least once.
 *
 * NOTE(review): movdqa and the memory-operand form of cvtps2dq need 16-byte
 * aligned addresses; presumably callers guarantee that, and a len that is a
 * positive multiple of 8 -- confirm.
 * NOTE(review): dst and src are shown without pointer declarators although
 * they are used as addresses; this listing has probably lost the asterisks
 * -- confirm against the upstream file.
 * NOTE(review): the asm statement lists no clobbers although it writes
 * xmm0, xmm1 and memory reached through dst.
 */
7226 e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2308 static void float_to_int16_sse2(int16_t dst, const float src, long len){
8041 24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions. reimar parents: 8035 diff changeset	2309 x86_reg reglen = len;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2310 __asm__ volatile(
7226 e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2311 "add %0 , %0 \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2312 "lea (%2,%0,2) , %2 \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2313 "add %0 , %1 \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2314 "neg %0 \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2315 "1: \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2316 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2317 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2318 "packssdw %%xmm1 , %%xmm0 \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2319 "movdqa %%xmm0 , (%1,%0) \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2320 "add $16 , %0 \n\t"
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2321 " js 1b \n\t"
8041 24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions. reimar parents: 8035 diff changeset	2322 :"+r"(reglen), "+r"(dst), "+r"(src)
7226 e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2323 );
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2324 }
e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2325
/*
 * Selects the 6-channel interleaving converters.
 *
 * With HAVE_YASM defined, three external functions (sse, 3dnow, 3dn2) are
 * only declared here. Without it, the same three names become macros that
 * forward to the generic float_to_int16_interleave_misc_* routines with the
 * channel count fixed at 6; the 3dn2 name falls back to the 3dnow variant.
 * In both configurations the sse2 name is an alias for the sse one.
 *
 * NOTE(review): the external definitions are presumably in a yasm source
 * file that is not visible from here -- confirm.
 */
7568 bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2326 #ifdef HAVE_YASM
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2327 void ff_float_to_int16_interleave6_sse(int16_t dst, const float *src, int len);
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2328 void ff_float_to_int16_interleave6_3dnow(int16_t dst, const float *src, int len);
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2329 void ff_float_to_int16_interleave6_3dn2(int16_t dst, const float *src, int len);
7565 474c7ae4b431 special case 6 channel version of float_to_int16_interleave lorenm parents: 7564 diff changeset	2330 #else
7568 bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2331 #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2332 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2333 #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
7565 474c7ae4b431 special case 6 channel version of float_to_int16_interleave lorenm parents: 7564 diff changeset	2334 #endif
7568 bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2335 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
7565 474c7ae4b431 special case 6 channel version of float_to_int16_interleave lorenm parents: 7564 diff changeset	2336
/*
 * FLOAT_TO_INT16_INTERLEAVE(cpu, body) expands to two functions:
 *
 * float_to_int16_interleave_misc_<cpu>: generic path for any channel count.
 *   Each channel is converted with float_to_int16_<cpu> into a 16-byte
 *   aligned temporary array of len int16 values, which is then copied into
 *   dst at indices c, c+channels, c+2*channels, ...
 *
 * float_to_int16_interleave_<cpu>: dispatcher on the channel count.
 *   channels == 1: converts src[0] straight into dst.
 *   channels == 2: inline asm. The preamble turns reglen into len*4 bytes,
 *                  adds it to dst, src0 and src1 so all three point at the
 *                  end of their data, negates it, and then runs the
 *                  caller-supplied body string, which sees reglen as %0,
 *                  dst as %1, src0 as %2 and src1 as %3.
 *   channels == 6: calls ff_float_to_int16_interleave6_<cpu>.
 *   otherwise:     calls the generic misc function.
 *
 * NOTE(review): tmp is a variable-length array sized by len, so its stack
 * use grows with len; presumably callers keep len small -- confirm.
 * NOTE(review): dst is shown without a pointer declarator although it is
 * indexed (dst[j]); this listing has probably lost an asterisk there and on
 * src -- confirm against the upstream file.
 */
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2337 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2338 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
7565 474c7ae4b431 special case 6 channel version of float_to_int16_interleave lorenm parents: 7564 diff changeset	2339 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t dst, const float *src, long len, int channels){\
7286 e267f2519248 float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous. lorenm parents: 7278 diff changeset	2340 DECLARE_ALIGNED_16(int16_t, tmp[len]);\
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2341 int i,j,c;\
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2342 for(c=0; c<channels; c++){\
7286 e267f2519248 float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous. lorenm parents: 7278 diff changeset	2343 float_to_int16_##cpu(tmp, src[c], len);\
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2344 for(i=0, j=c; i<len; i++, j+=channels)\
7286 e267f2519248 float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous. lorenm parents: 7278 diff changeset	2345 dst[j] = tmp[i];\
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2346 }\
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2347 }\
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2348 \
7286 e267f2519248 float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous. lorenm parents: 7278 diff changeset	2349 static void float_to_int16_interleave_##cpu(int16_t dst, const float *src, long len, int channels){\
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2350 if(channels==1)\
7286 e267f2519248 float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous. lorenm parents: 7278 diff changeset	2351 float_to_int16_##cpu(dst, src[0], len);\
7565 474c7ae4b431 special case 6 channel version of float_to_int16_interleave lorenm parents: 7564 diff changeset	2352 else if(channels==2){\
8041 24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions. reimar parents: 8035 diff changeset	2353 x86_reg reglen = len; \
7286 e267f2519248 float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous. lorenm parents: 7278 diff changeset	2354 const float *src0 = src[0];\
e267f2519248 float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous. lorenm parents: 7278 diff changeset	2355 const float *src1 = src[1];\
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2356 __asm__ volatile(\
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2357 "shl $2, %0 \n"\
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2358 "add %0, %1 \n"\
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2359 "add %0, %2 \n"\
7286 e267f2519248 float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous. lorenm parents: 7278 diff changeset	2360 "add %0, %3 \n"\
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2361 "neg %0 \n"\
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2362 body\
8041 24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions. reimar parents: 8035 diff changeset	2363 :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2364 );\
7565 474c7ae4b431 special case 6 channel version of float_to_int16_interleave lorenm parents: 7564 diff changeset	2365 }else if(channels==6){\
7568 bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2366 ff_float_to_int16_interleave6_##cpu(dst, src, len);\
7565 474c7ae4b431 special case 6 channel version of float_to_int16_interleave lorenm parents: 7564 diff changeset	2367 }else\
474c7ae4b431 special case 6 channel version of float_to_int16_interleave lorenm parents: 7564 diff changeset	2368 float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2369 }
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2370
/*
 * 3DNow! instantiation; the string below is the stereo loop body.
 *
 * Per iteration it converts 4 floats from each of the two sources (%2 and
 * %3) with pf2id, narrows each group to int16 with packssdw, interleaves
 * the two groups word by word with punpcklwd / punpckhwd and writes the
 * resulting 16 bytes with two movq. The index %0 advances by 16 bytes and
 * the loop repeats while it is negative (js); femms runs after the loop.
 *
 * NOTE(review): presumably len must be a positive multiple of 4 for the
 * stereo path -- confirm with the callers.
 */
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2371 FLOAT_TO_INT16_INTERLEAVE(3dnow,
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2372 "1: \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2373 "pf2id (%2,%0), %%mm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2374 "pf2id 8(%2,%0), %%mm1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2375 "pf2id (%3,%0), %%mm2 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2376 "pf2id 8(%3,%0), %%mm3 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2377 "packssdw %%mm1, %%mm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2378 "packssdw %%mm3, %%mm2 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2379 "movq %%mm0, %%mm1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2380 "punpcklwd %%mm2, %%mm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2381 "punpckhwd %%mm2, %%mm1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2382 "movq %%mm0, (%1,%0)\n"
7278 6c140c15ee8c 10l, float_to_int16_interleave_sse/3dnow wrote the wrong samples lorenm parents: 7263 diff changeset	2383 "movq %%mm1, 8(%1,%0)\n"
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2384 "add $16, %0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2385 "js 1b \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2386 "femms \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2387 )
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2388
/*
 * SSE instantiation; the string below is the stereo loop body.
 *
 * Per iteration it converts 4 floats from each of the two sources (%2 and
 * %3) with cvtps2pi into MMX registers, narrows each group to int16 with
 * packssdw, interleaves the two groups word by word with punpcklwd /
 * punpckhwd and writes the resulting 16 bytes with two movq. The index %0
 * advances by 16 bytes and the loop repeats while it is negative (js);
 * emms runs after the loop.
 *
 * NOTE(review): presumably len must be a positive multiple of 4 for the
 * stereo path -- confirm with the callers.
 */
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2389 FLOAT_TO_INT16_INTERLEAVE(sse,
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2390 "1: \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2391 "cvtps2pi (%2,%0), %%mm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2392 "cvtps2pi 8(%2,%0), %%mm1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2393 "cvtps2pi (%3,%0), %%mm2 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2394 "cvtps2pi 8(%3,%0), %%mm3 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2395 "packssdw %%mm1, %%mm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2396 "packssdw %%mm3, %%mm2 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2397 "movq %%mm0, %%mm1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2398 "punpcklwd %%mm2, %%mm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2399 "punpckhwd %%mm2, %%mm1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2400 "movq %%mm0, (%1,%0)\n"
7278 6c140c15ee8c 10l, float_to_int16_interleave_sse/3dnow wrote the wrong samples lorenm parents: 7263 diff changeset	2401 "movq %%mm1, 8(%1,%0)\n"
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2402 "add $16, %0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2403 "js 1b \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2404 "emms \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2405 )
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2406
/*
 * SSE2 instantiation; the string below is the stereo loop body.
 *
 * Per iteration it converts 4 floats from each of the two sources (%2 and
 * %3) with cvtps2dq and packs both groups into xmm0 with packssdw, so the
 * low half holds the first source and the high half the second. movhlps
 * copies that high half into the low half of xmm1, punpcklwd interleaves
 * the two word by word, and movdqa writes the 16 bytes. The index %0
 * advances by 16 bytes and the loop repeats while it is negative (js).
 *
 * NOTE(review): movdqa and the memory-operand form of cvtps2dq need 16-byte
 * aligned addresses; presumably callers guarantee that, and a len that is a
 * positive multiple of 4 -- confirm.
 */
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2407 FLOAT_TO_INT16_INTERLEAVE(sse2,
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2408 "1: \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2409 "cvtps2dq (%2,%0), %%xmm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2410 "cvtps2dq (%3,%0), %%xmm1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2411 "packssdw %%xmm1, %%xmm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2412 "movhlps %%xmm0, %%xmm1 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2413 "punpcklwd %%xmm1, %%xmm0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2414 "movdqa %%xmm0, (%1,%0) \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2415 "add $16, %0 \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2416 "js 1b \n"
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2417 )
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2418
/*
 * Interleaving float to int16 conversion for the 3dn2 variant.
 *
 * Only the 6-channel case has a dedicated routine
 * (ff_float_to_int16_interleave6_3dn2); every other channel count is
 * forwarded unchanged to float_to_int16_interleave_3dnow.
 */
7568 bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2419 static void float_to_int16_interleave_3dn2(int16_t dst, const float *src, long len, int channels){
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2420 if(channels==6)
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2421 ff_float_to_int16_interleave6_3dn2(dst, src, len);
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2422 else
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2423 float_to_int16_interleave_3dnow(dst, src, len, channels);
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2424 }
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2425
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2426
/*
 * Prototypes for the SSE2 and MMX Snow routines: horizontal and vertical
 * compose97i, and inner_add_yblock. Only the declarations appear here.
 *
 * NOTE(review): the definitions are presumably in a separate source file
 * that is not visible from here -- confirm.
 * NOTE(review): several parameters (b0..b5, obmc) are shown without pointer
 * declarators; this listing has probably lost asterisks -- confirm against
 * the upstream file.
 */
8250 cf4d575b1982 Delete unnecessary 'extern' keywords. diego parents: 8104 diff changeset	2427 void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
cf4d575b1982 Delete unnecessary 'extern' keywords. diego parents: 8104 diff changeset	2428 void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
cf4d575b1982 Delete unnecessary 'extern' keywords. diego parents: 8104 diff changeset	2429 void ff_snow_vertical_compose97i_sse2(IDWTELEM b0, IDWTELEM b1, IDWTELEM b2, IDWTELEM b3, IDWTELEM b4, IDWTELEM b5, int width);
cf4d575b1982 Delete unnecessary 'extern' keywords. diego parents: 8104 diff changeset	2430 void ff_snow_vertical_compose97i_mmx(IDWTELEM b0, IDWTELEM b1, IDWTELEM b2, IDWTELEM b3, IDWTELEM b4, IDWTELEM b5, int width);
cf4d575b1982 Delete unnecessary 'extern' keywords. diego parents: 8104 diff changeset	2431 void ff_snow_inner_add_yblock_sse2(const uint8_t obmc, const int obmc_stride, uint8_t * block, int b_w, int b_h,
cf4d575b1982 Delete unnecessary 'extern' keywords. diego parents: 8104 diff changeset	2432 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
cf4d575b1982 Delete unnecessary 'extern' keywords. diego parents: 8104 diff changeset	2433 void ff_snow_inner_add_yblock_mmx(const uint8_t obmc, const int obmc_stride, uint8_t * block, int b_w, int b_h,
cf4d575b1982 Delete unnecessary 'extern' keywords. diego parents: 8104 diff changeset	2434 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2435
7238 08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2436
/*
 * Adds the int16 vector v2 into v1 element by element (v1[i] += v2[i]),
 * using SSE2 paddw, which wraps on overflow.
 *
 * o starts at minus the byte size of order int16 values, and both pointers
 * are moved to the end of their data, so (pointer, o) walks from the first
 * element up to offset 0. Each iteration handles 32 bytes (16 elements);
 * the js test comes after the body, so the body runs at least once.
 * v2 is loaded with movdqu (no alignment needed); v1 is read as a paddw
 * memory operand and written with movdqa, both of which need a 16-byte
 * aligned address.
 *
 * NOTE(review): presumably order must be a positive multiple of 16 and v1
 * 16-byte aligned -- confirm with the callers.
 * NOTE(review): the asm statement lists no clobbers although it writes
 * xmm0, xmm1 and memory reached through v1.
 */
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2437 static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2438 {
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2439 x86_reg o = -(order << 1);
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2440 v1 += order;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2441 v2 += order;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2442 __asm__ volatile(
7238 08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2443 "1: \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2444 "movdqu (%1,%2), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2445 "movdqu 16(%1,%2), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2446 "paddw (%0,%2), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2447 "paddw 16(%0,%2), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2448 "movdqa %%xmm0, (%0,%2) \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2449 "movdqa %%xmm1, 16(%0,%2) \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2450 "add $32, %2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2451 "js 1b \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2452 : "+r"(v1), "+r"(v2), "+r"(o)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2453 );
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2454 }
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2455
/*
 * Subtracts the int16 vector v2 from v1 element by element (v1[i] -= v2[i]),
 * using SSE2 psubw, which wraps on overflow.
 *
 * o starts at minus the byte size of order int16 values, and both pointers
 * are moved to the end of their data, so (pointer, o) walks from the first
 * element up to offset 0. Each iteration handles 32 bytes (16 elements);
 * the js test comes after the body, so the body runs at least once.
 * v1 is loaded and stored with movdqa (16-byte aligned address needed);
 * v2 is loaded with movdqu (no alignment needed).
 *
 * NOTE(review): presumably order must be a positive multiple of 16 and v1
 * 16-byte aligned -- confirm with the callers.
 * NOTE(review): the asm statement lists no clobbers although it writes
 * xmm0-xmm3 and memory reached through v1.
 */
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2456 static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2457 {
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2458 x86_reg o = -(order << 1);
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2459 v1 += order;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2460 v2 += order;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2461 __asm__ volatile(
7238 08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2462 "1: \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2463 "movdqa (%0,%2), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2464 "movdqa 16(%0,%2), %%xmm2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2465 "movdqu (%1,%2), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2466 "movdqu 16(%1,%2), %%xmm3 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2467 "psubw %%xmm1, %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2468 "psubw %%xmm3, %%xmm2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2469 "movdqa %%xmm0, (%0,%2) \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2470 "movdqa %%xmm2, 16(%0,%2) \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2471 "add $32, %2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2472 "js 1b \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2473 : "+r"(v1), "+r"(v2), "+r"(o)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2474 );
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2475 }
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2476
/*
 * Returns a shifted dot product of the int16 vectors v1 and v2, using SSE2.
 *
 * o starts at minus the byte size of order int16 values, and both pointers
 * are moved to the end of their data, so (pointer, o) walks from the first
 * element up to offset 0. Each iteration handles 32 bytes (16 elements):
 * v1 is loaded with movdqu (no alignment needed), multiplied pairwise
 * against v2 with pmaddwd (v2 is a memory operand, which needs a 16-byte
 * aligned address), and the 32-bit partial sums are accumulated in xmm7.
 * The js test comes after the body, so the body runs at least once.
 *
 * Reduction after the loop: the upper two 32-bit lanes of xmm7 are added
 * onto the lower two, psrad then shifts right arithmetically by the count
 * read from sh (a copy of shift), and only after that are the two remaining
 * lanes added together (pshuflw $0x4E swaps them). The arithmetic shift is
 * therefore applied to two partial sums separately, not to the final total.
 * The low 32 bits are returned.
 *
 * NOTE(review): presumably order must be a positive multiple of 16 and v2
 * 16-byte aligned -- confirm with the callers.
 * NOTE(review): the asm statement lists no clobbers although it writes
 * xmm0, xmm1, xmm2 and xmm7.
 */
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2477 static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2478 {
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2479 int res = 0;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2480 DECLARE_ALIGNED_16(int64_t, sh);
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2481 x86_reg o = -(order << 1);
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2482
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2483 v1 += order;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2484 v2 += order;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2485 sh = shift;
8031 eebc7209c47f Convert asm keyword into __asm__. flameeyes parents: 7880 diff changeset	2486 __asm__ volatile(
7238 08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2487 "pxor %%xmm7, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2488 "1: \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2489 "movdqu (%0,%3), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2490 "movdqu 16(%0,%3), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2491 "pmaddwd (%1,%3), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2492 "pmaddwd 16(%1,%3), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2493 "paddd %%xmm0, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2494 "paddd %%xmm1, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2495 "add $32, %3 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2496 "js 1b \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2497 "movhlps %%xmm7, %%xmm2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2498 "paddd %%xmm2, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2499 "psrad %4, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2500 "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2501 "paddd %%xmm2, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2502 "movd %%xmm7, %2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2503 : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2504 : "m"(sh)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2505 );
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2506 return res;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2507 }
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2508
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2509 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0 986e461dc072 Initial revision glantau parents: diff changeset	2510 {
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2511 mm_flags = mm_support();
1115 74a46d77e061 * support FF_MM_FORCE kabi parents: 1092 diff changeset	2512
1122 ddc3b0140b8f * oooooops - sorry for this one - wrong logic kabi parents: 1115 diff changeset	2513 if (avctx->dsp_mask) {
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2514 if (avctx->dsp_mask & FF_MM_FORCE)
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2515 mm_flags \|= (avctx->dsp_mask & 0xffff);
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2516 else
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	2517 mm_flags &= ~(avctx->dsp_mask & 0xffff);
1122 ddc3b0140b8f * oooooops - sorry for this one - wrong logic kabi parents: 1115 diff changeset	2518 }
1115 74a46d77e061 * support FF_MM_FORCE kabi parents: 1092 diff changeset	2519
631 47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>) michaelni parents: 629 diff changeset	2520 #if 0
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2521 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2522 if (mm_flags & FF_MM_MMX)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2523 av_log(avctx, AV_LOG_INFO, " mmx");
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2524 if (mm_flags & FF_MM_MMXEXT)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2525 av_log(avctx, AV_LOG_INFO, " mmxext");
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2526 if (mm_flags & FF_MM_3DNOW)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2527 av_log(avctx, AV_LOG_INFO, " 3dnow");
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2528 if (mm_flags & FF_MM_SSE)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2529 av_log(avctx, AV_LOG_INFO, " sse");
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2530 if (mm_flags & FF_MM_SSE2)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2531 av_log(avctx, AV_LOG_INFO, " sse2");
771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2532 av_log(avctx, AV_LOG_INFO, "\n");
0 986e461dc072 Initial revision glantau parents: diff changeset	2533 #endif
986e461dc072 Initial revision glantau parents: diff changeset	2534
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2535 if (mm_flags & FF_MM_MMX) {
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2536 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2537
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2538 if(avctx->lowres==0){
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2539 if(idct_algo==FF_IDCT_AUTO \|\| idct_algo==FF_IDCT_SIMPLEMMX){
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2540 c->idct_put= ff_simple_idct_put_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2541 c->idct_add= ff_simple_idct_add_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2542 c->idct = ff_simple_idct_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2543 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3717 ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel diego parents: 3712 diff changeset	2544 #ifdef CONFIG_GPL
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2545 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2546 if(mm_flags & FF_MM_MMXEXT){
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2547 c->idct_put= ff_libmpeg2mmx2_idct_put;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2548 c->idct_add= ff_libmpeg2mmx2_idct_add;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2549 c->idct = ff_mmxext_idct;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2550 }else{
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2551 c->idct_put= ff_libmpeg2mmx_idct_put;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2552 c->idct_add= ff_libmpeg2mmx_idct_add;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2553 c->idct = ff_mmx_idct;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2554 }
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	2555 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3717 ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel diego parents: 3712 diff changeset	2556 #endif
7880 2d3d9b4181d7 Ensure MMX/SSE2 VP3 IDCT selection isn't disabled when only Theora is enabled conrad parents: 7876 diff changeset	2557 }else if((ENABLE_VP3_DECODER \|\| ENABLE_VP5_DECODER \|\| ENABLE_VP6_DECODER \|\| ENABLE_THEORA_DECODER) &&
7876 3fd591f125b5 MMX/SSE2 VP3 IDCT are bitexact now that the dequantization matrices are permutated correctly conrad parents: 7759 diff changeset	2558 idct_algo==FF_IDCT_VP3){
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2559 if(mm_flags & FF_MM_SSE2){
2696 9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2560 c->idct_put= ff_vp3_idct_put_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2561 c->idct_add= ff_vp3_idct_add_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2562 c->idct = ff_vp3_idct_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2563 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2564 }else{
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2565 c->idct_put= ff_vp3_idct_put_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2566 c->idct_add= ff_vp3_idct_add_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2567 c->idct = ff_vp3_idct_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2568 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	2569 }
3524 419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	2570 }else if(idct_algo==FF_IDCT_CAVS){
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	2571 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2572 }else if(idct_algo==FF_IDCT_XVIDMMX){
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2573 if(mm_flags & FF_MM_SSE2){
6601 f76581c16848 Add a new xvid-style IDCT using SSE2. astrange parents: 6585 diff changeset	2574 c->idct_put= ff_idct_xvid_sse2_put;
f76581c16848 Add a new xvid-style IDCT using SSE2. astrange parents: 6585 diff changeset	2575 c->idct_add= ff_idct_xvid_sse2_add;
f76581c16848 Add a new xvid-style IDCT using SSE2. astrange parents: 6585 diff changeset	2576 c->idct = ff_idct_xvid_sse2;
f76581c16848 Add a new xvid-style IDCT using SSE2. astrange parents: 6585 diff changeset	2577 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2578 }else if(mm_flags & FF_MM_MMXEXT){
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2579 c->idct_put= ff_idct_xvid_mmx2_put;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2580 c->idct_add= ff_idct_xvid_mmx2_add;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2581 c->idct = ff_idct_xvid_mmx2;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2582 }else{
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2583 c->idct_put= ff_idct_xvid_mmx_put;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2584 c->idct_add= ff_idct_xvid_mmx_add;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2585 c->idct = ff_idct_xvid_mmx;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2586 }
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2587 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2588 }
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	2589
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2590 c->put_pixels_clamped = put_pixels_clamped_mmx;
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	2591 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2592 c->add_pixels_clamped = add_pixels_clamped_mmx;
8288 800444234375 clear_block mmx lorenm parents: 8250 diff changeset	2593 c->clear_block = clear_block_mmx;
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2594 c->clear_blocks = clear_blocks_mmx;
8288 800444234375 clear_block mmx lorenm parents: 8250 diff changeset	2595 if (mm_flags & FF_MM_SSE)
800444234375 clear_block mmx lorenm parents: 8250 diff changeset	2596 c->clear_block = clear_block_sse;
415 1c3f42442fba * added simple test main - see comments about how to kabi parents: 402 diff changeset	2597
6327 5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2598 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2599 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2600 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2601 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2602 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2603
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2604 SET_HPEL_FUNCS(put, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2605 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2606 SET_HPEL_FUNCS(avg, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2607 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2608 SET_HPEL_FUNCS(put, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2609 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2610 SET_HPEL_FUNCS(avg, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2611 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	2612
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2613 c->gmc= gmc_mmx;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2614
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	2615 c->add_bytes= add_bytes_mmx;
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	2616 c->add_bytes_l2= add_bytes_l2_mmx;
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	2617
6437 5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	2618 c->draw_edges = draw_edges_mmx;
5154ab444372 move draw_edges() into dsputil aurel parents: 6403 diff changeset	2619
5277 7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs aurel parents: 5255 diff changeset	2620 if (ENABLE_ANY_H263) {
5278 ef85411bb7e8 cosmetics: indentation aurel parents: 5277 diff changeset	2621 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
ef85411bb7e8 cosmetics: indentation aurel parents: 5277 diff changeset	2622 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
5277 7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs aurel parents: 5255 diff changeset	2623 }
6057 03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding. gpoirier parents: 6056 diff changeset	2624 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	2625 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
6057 03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding. gpoirier parents: 6056 diff changeset	2626 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	2627
3173 9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	2628 c->h264_idct_dc_add=
9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	2629 c->h264_idct_add= ff_h264_idct_add_mmx;
3174 b65cbae9d940 h264_idct8_add_mmx lorenm parents: 3173 diff changeset	2630 c->h264_idct8_dc_add=
b65cbae9d940 h264_idct8_add_mmx lorenm parents: 3173 diff changeset	2631 c->h264_idct8_add= ff_h264_idct8_add_mmx;
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2632 if (mm_flags & FF_MM_SSE2)
6320 ffb2a7b80d6d ff_h264_idct8_add_sse2. lorenm parents: 6196 diff changeset	2633 c->h264_idct8_add= ff_h264_idct8_add_sse2;
3173 9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	2634
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2635 if (mm_flags & FF_MM_MMXEXT) {
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2636 c->prefetch = prefetch_mmx2;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2637
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2638 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2639 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	2640
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2641 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2642 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2643 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415 1c3f42442fba * added simple test main - see comments about how to kabi parents: 402 diff changeset	2644
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2645 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2646 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	2647
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2648 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2649 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2650 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2651
3105 2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall lorenm parents: 3089 diff changeset	2652 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall lorenm parents: 3089 diff changeset	2653 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
2745 42d3e9068e32 MMX for H.264 iDCT (adapted from x264) lorenm parents: 2732 diff changeset	2654
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2655 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2656 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2657 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2658 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2659 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2660 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2661 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2662
8035 56f973432109 Cosmetics: reindent conrad parents: 8034 diff changeset	2663 if (ENABLE_VP3_DECODER \|\| ENABLE_THEORA_DECODER) {
56f973432109 Cosmetics: reindent conrad parents: 8034 diff changeset	2664 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
56f973432109 Cosmetics: reindent conrad parents: 8034 diff changeset	2665 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
8034 9b690041e298 Combine non-bitexact sections conrad parents: 8033 diff changeset	2666 }
8033 b661cf8690a1 VP3 loop filter is mmx2 not mmx conrad parents: 8032 diff changeset	2667 }
b661cf8690a1 VP3 loop filter is mmx2 not mmx conrad parents: 8032 diff changeset	2668
6327 5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2669 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2670 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2671 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2672 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2673 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2674 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2675 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2676 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2677 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2678 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2679 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2680 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2681 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2682 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2683 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2684 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2685 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2686
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2687 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2688 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2689 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2690 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2691 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2692 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2693
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2694 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2695 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2696 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2697 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2698 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2699 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2700
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2701 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2702 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2703 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2704 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	2705
6057 03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding. gpoirier parents: 6056 diff changeset	2706 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	2707 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3213 57d31bdbebe8 added mmx implementation of h264_chroma_mc2 lorenm parents: 3211 diff changeset	2708 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
57d31bdbebe8 added mmx implementation of h264_chroma_mc2 lorenm parents: 3211 diff changeset	2709 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
2633 72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2710 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2711 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2712 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2713 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
2707 360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math) lorenm parents: 2696 diff changeset	2714 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math) lorenm parents: 2696 diff changeset	2715 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3645 47821be55b6c mmx implementation of deblocking strength decision. lorenm parents: 3576 diff changeset	2716 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
2633 72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	2717
2902 3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2718 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2719 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2720 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2721 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2722 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2723 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2724 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2725 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2726
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2727 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2728 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2729 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2730 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2731 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2732 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2733 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2734 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	2735
5949 d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_ aurel parents: 5948 diff changeset	2736 if (ENABLE_CAVS_DECODER)
5950 e419e6d4e7eb cosmetics: indentation aurel parents: 5949 diff changeset	2737 ff_cavsdsp_init_mmx2(c, avctx);
5949 d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_ aurel parents: 5948 diff changeset	2738
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_ aurel parents: 5948 diff changeset	2739 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
5950 e419e6d4e7eb cosmetics: indentation aurel parents: 5949 diff changeset	2740 ff_vc1dsp_init_mmx(c, avctx);
5933 6ce8f15fc02b add VC-1 MMX DSP functions, under MIT license. gpoirier parents: 5912 diff changeset	2741
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	2742 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2743 } else if (mm_flags & FF_MM_3DNOW) {
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2744 c->prefetch = prefetch_3dnow;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2745
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2746 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2747 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	2748
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2749 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2750 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2751 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	2752
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2753 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2754 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2755
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2756 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2757 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	2758 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2759
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2760 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2761 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2762 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2763 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2764 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2765 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2766 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2767 }
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2768
6327 5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2769 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2770 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2771 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2772 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2773 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2774 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2775
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2776 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2777 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2778 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2779 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2780 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2781 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2782
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2783 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2784 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2785 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication lorenm parents: 6322 diff changeset	2786 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
3807 6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2787
6057 03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding. gpoirier parents: 6056 diff changeset	2788 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	2789 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
6522 dfa76e0734e5 Add missed call to ff_cavsdsp_init_3dnow() in dsputil_init_mmx() zuxy parents: 6513 diff changeset	2790
dfa76e0734e5 Add missed call to ff_cavsdsp_init_3dnow() in dsputil_init_mmx() zuxy parents: 6513 diff changeset	2791 if (ENABLE_CAVS_DECODER)
dfa76e0734e5 Add missed call to ff_cavsdsp_init_3dnow() in dsputil_init_mmx() zuxy parents: 6513 diff changeset	2792 ff_cavsdsp_init_3dnow(c, avctx);
0 986e461dc072 Initial revision glantau parents: diff changeset	2793 }
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2794
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2795
/*
 * H264_QPEL_FUNCS(x, y, CPU): fill slot x+y*4 of the H.264 qpel tables of
 * the context `c` (which must be in scope at the expansion site).
 * Row [0] of put_/avg_h264_qpel_pixels_tab receives the qpel16 variant and
 * row [1] the qpel8 variant, each named <put|avg>_h264_qpel<N>_mc<x><y>_<CPU>.
 * x and y are token-pasted into the function name, so they must be literal
 * digits rather than expressions.
 * NOTE(review): the expansion is four bare statements with no
 * do { } while (0) wrapper, so it is only safe inside a braced block; an
 * unbraced `if (...) H264_QPEL_FUNCS(...)` would guard the first statement only.
 */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2796 #define H264_QPEL_FUNCS(x, y, CPU)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2797 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2798 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2799 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2800 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2801 if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2802 // these functions are slower than mmx on AMD, but faster on Intel
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2803 /* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2804 c->put_pixels_tab[0][0] = put_pixels16_sse2;
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2805 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2806 */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2807 H264_QPEL_FUNCS(0, 0, sse2);
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2808 }
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2809 if(mm_flags & FF_MM_SSE2){
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2810 H264_QPEL_FUNCS(0, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2811 H264_QPEL_FUNCS(0, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2812 H264_QPEL_FUNCS(0, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2813 H264_QPEL_FUNCS(1, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2814 H264_QPEL_FUNCS(1, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2815 H264_QPEL_FUNCS(1, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2816 H264_QPEL_FUNCS(2, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2817 H264_QPEL_FUNCS(2, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2818 H264_QPEL_FUNCS(2, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2819 H264_QPEL_FUNCS(3, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2820 H264_QPEL_FUNCS(3, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2821 H264_QPEL_FUNCS(3, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2822 }
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2823 #ifdef HAVE_SSSE3
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2824 if(mm_flags & FF_MM_SSSE3){
6336 ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2825 H264_QPEL_FUNCS(1, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2826 H264_QPEL_FUNCS(1, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2827 H264_QPEL_FUNCS(1, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2828 H264_QPEL_FUNCS(1, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2829 H264_QPEL_FUNCS(2, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2830 H264_QPEL_FUNCS(2, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2831 H264_QPEL_FUNCS(2, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2832 H264_QPEL_FUNCS(2, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2833 H264_QPEL_FUNCS(3, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2834 H264_QPEL_FUNCS(3, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2835 H264_QPEL_FUNCS(3, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3. lorenm parents: 6335 diff changeset	2836 H264_QPEL_FUNCS(3, 3, ssse3);
6557 e1208c4f8898 h264 chroma mc ssse3 lorenm parents: 6522 diff changeset	2837 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd;
e1208c4f8898 h264 chroma mc ssse3 lorenm parents: 6522 diff changeset	2838 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
e1208c4f8898 h264 chroma mc ssse3 lorenm parents: 6522 diff changeset	2839 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
e1208c4f8898 h264 chroma mc ssse3 lorenm parents: 6522 diff changeset	2840 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
e1208c4f8898 h264 chroma mc ssse3 lorenm parents: 6522 diff changeset	2841 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
6384 0a403ade8c81 simd and unroll png_filter_row lorenm parents: 6336 diff changeset	2842 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
6331 c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2843 }
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2844 #endif
c57670e07668 ssse3 h264 motion compensation. lorenm parents: 6329 diff changeset	2845
4589 30261f4ed12d Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated. diego parents: 4436 diff changeset	2846 #ifdef CONFIG_SNOW_DECODER
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2847 if(mm_flags & FF_MM_SSE2 & 0){
3210 81cafbc23b8d snow mmx+sse2 optimizations, part 4 corey parents: 3207 diff changeset	2848 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
5602 3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS ramiro parents: 5601 diff changeset	2849 #ifdef HAVE_7REGS
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2850 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
5601 b26025b9586d workaround gcc bug, untested as my gcc is not complaining michael parents: 5594 diff changeset	2851 #endif
3211 b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock gpoirier parents: 3210 diff changeset	2852 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2853 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2854 else{
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2855 if(mm_flags & FF_MM_MMXEXT){
3210 81cafbc23b8d snow mmx+sse2 optimizations, part 4 corey parents: 3207 diff changeset	2856 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
5602 3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS ramiro parents: 5601 diff changeset	2857 #ifdef HAVE_7REGS
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2858 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
5601 b26025b9586d workaround gcc bug, untested as my gcc is not complaining michael parents: 5594 diff changeset	2859 #endif
5594 384629ebcb93 avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum michael parents: 5591 diff changeset	2860 }
3211 b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock gpoirier parents: 3210 diff changeset	2861 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2862 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	2863 #endif
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2864
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2865 if(mm_flags & FF_MM_3DNOW){
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2866 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2867 c->vector_fmul = vector_fmul_3dnow;
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2868 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2869 c->float_to_int16 = float_to_int16_3dnow;
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2870 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2871 }
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2872 }
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2873 if(mm_flags & FF_MM_3DNOWEXT){
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2874 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2875 c->vector_fmul_window = vector_fmul_window_3dnow2;
7568 bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2876 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2877 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm lorenm parents: 7567 diff changeset	2878 }
7263 fc843d00867c exploit mdct symmetry lorenm parents: 7261 diff changeset	2879 }
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2880 if(mm_flags & FF_MM_SSE){
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2881 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
7563 8390efaa0c03 simd downmix lorenm parents: 7548 diff changeset	2882 c->ac3_downmix = ac3_downmix_sse;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2883 c->vector_fmul = vector_fmul_sse;
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2884 c->vector_fmul_reverse = vector_fmul_reverse_sse;
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	2885 c->vector_fmul_add_add = vector_fmul_add_add_sse;
7261 032a49f033e8 simplify vorbis windowing lorenm parents: 7238 diff changeset	2886 c->vector_fmul_window = vector_fmul_window_sse;
7564 7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2887 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
7548 3d1b177a1b8c cosmetics in dsp init lorenm parents: 7460 diff changeset	2888 c->float_to_int16 = float_to_int16_sse;
3d1b177a1b8c cosmetics in dsp init lorenm parents: 7460 diff changeset	2889 c->float_to_int16_interleave = float_to_int16_interleave_sse;
7226 e707d79a5ffd float_to_int16_sse2() michael parents: 7219 diff changeset	2890 }
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2891 if(mm_flags & FF_MM_3DNOW)
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	2892 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
8104 0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h. rathann parents: 8073 diff changeset	2893 if(mm_flags & FF_MM_SSE2){
7564 7cf793954871 simd int->float lorenm parents: 7563 diff changeset	2894 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
7548 3d1b177a1b8c cosmetics in dsp init lorenm parents: 7460 diff changeset	2895 c->float_to_int16 = float_to_int16_sse2;
3d1b177a1b8c cosmetics in dsp init lorenm parents: 7460 diff changeset	2896 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
7238 08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2897 c->add_int16 = add_int16_sse2;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2898 c->sub_int16 = sub_int16_sse2;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2899 c->scalarproduct_int16 = scalarproduct_int16_sse2;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions kostya parents: 7226 diff changeset	2900 }
0 986e461dc072 Initial revision glantau parents: diff changeset	2901 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	2902
6403 9a736918fd90 split encoding part of dsputil_mmx into its own file aurel parents: 6391 diff changeset	2903 if (ENABLE_ENCODERS)
9a736918fd90 split encoding part of dsputil_mmx into its own file aurel parents: 6391 diff changeset	2904 dsputilenc_init_mmx(c, avctx);
9a736918fd90 split encoding part of dsputil_mmx into its own file aurel parents: 6391 diff changeset	2905
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2906 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2907 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2908 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2909 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2910 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2911
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2912 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2913 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2914 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2915 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2916
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2917 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2918 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2919 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2920 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2921
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2922 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2923 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2924 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2925 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2926
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2927 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2928 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2929 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2930 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2931
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2932 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2933 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2934 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2935 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2936
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2937 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2938 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2939 #endif
0 986e461dc072 Initial revision glantau parents: diff changeset	2940 }

Mercurial > libavcodec.hg

annotate i386/dsputil_mmx.c @ 8340:834a77844ba3 libavcodec