libavcodec.hg: i386/dsputil

annotate i386/dsputil_mmx.c @ 5591:642588a60570 libavcodec

update mmx code to latest snow changes; note, the code likely can overflow and thus needs some more changes; sse2 updated too but disabled as it is untested

author	michael
date	Sat, 25 Aug 2007 15:20:56 +0000
parents	3ae03eacbe9f
children	384629ebcb93

rev	line source
0 986e461dc072 Initial revision glantau parents: diff changeset	1 /*
986e461dc072 Initial revision glantau parents: diff changeset	2 * MMX optimized DSP utils
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739 07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise michael parents: 1729 diff changeset	4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
0 986e461dc072 Initial revision glantau parents: diff changeset	5 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	8 * FFmpeg is free software; you can redistribute it and/or
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	9 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change glantau parents: 422 diff changeset	10 * License as published by the Free Software Foundation; either
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	11 * version 2.1 of the License, or (at your option) any later version.
0 986e461dc072 Initial revision glantau parents: diff changeset	12 *
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	13 * FFmpeg is distributed in the hope that it will be useful,
0 986e461dc072 Initial revision glantau parents: diff changeset	14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change glantau parents: 422 diff changeset	16 * Lesser General Public License for more details.
0 986e461dc072 Initial revision glantau parents: diff changeset	17 *
429 718a22dc121f license/copyright change glantau parents: 422 diff changeset	18 * You should have received a copy of the GNU Lesser General Public
3947 c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library' diego parents: 3932 diff changeset	19 * License along with FFmpeg; if not, write to the Free Software
3036 0b546eab515d Update licensing information: The FSF changed postal address. diego parents: 2979 diff changeset	20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0 986e461dc072 Initial revision glantau parents: diff changeset	21 *
986e461dc072 Initial revision glantau parents: diff changeset	22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision glantau parents: diff changeset	23 */
986e461dc072 Initial revision glantau parents: diff changeset	24
5010 d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header diego parents: 5007 diff changeset	25 #include "dsputil.h"
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header diego parents: 5007 diff changeset	26 #include "simple_idct.h"
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header diego parents: 5007 diff changeset	27 #include "mpegvideo.h"
3398 e0927bc44a10 Move REG_* macros from libavcodec/i386/mmx.h to libavutil/x86_cpu.h lucabe parents: 3250 diff changeset	28 #include "x86_cpu.h"
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	29 #include "mmx.h"
5014 42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c aurel parents: 5010 diff changeset	30 #include "vp3dsp_mmx.h"
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c aurel parents: 5010 diff changeset	31 #include "vp3dsp_sse2.h"
5277 7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs aurel parents: 5255 diff changeset	32 #include "h263.h"
0 986e461dc072 Initial revision glantau parents: diff changeset	33
1729 a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	34 //#undef NDEBUG
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	35 //#include <assert.h>
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	36
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	37 extern void ff_idct_xvid_mmx(short *block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	38 extern void ff_idct_xvid_mmx2(short *block);
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	39
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	40 int mm_flags; /* multimedia extension flags */
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	41
0 986e461dc072 Initial revision glantau parents: diff changeset	42 /* pixel operations */
1845 3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov)) michael parents: 1784 diff changeset	43 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov)) michael parents: 1784 diff changeset	44 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov)) michael parents: 1784 diff changeset	45 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
0 986e461dc072 Initial revision glantau parents: diff changeset	46
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	47 static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	48 {0x8000000080000000ULL, 0x8000000080000000ULL};
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	49
1845 3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov)) michael parents: 1784 diff changeset	50 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov)) michael parents: 1784 diff changeset	51 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
2633 72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	52 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	53 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	54 static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
1845 3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov)) michael parents: 1784 diff changeset	55 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	56 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
2754 a49f140179e9 sort H.264 mmx dsp functions into their own file lorenm parents: 2753 diff changeset	57 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
1845 3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov)) michael parents: 1784 diff changeset	58 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	59
3645 47821be55b6c mmx implementation of deblocking strength decision. lorenm parents: 3576 diff changeset	60 static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
47821be55b6c mmx implementation of deblocking strength decision. lorenm parents: 3576 diff changeset	61 static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
47821be55b6c mmx implementation of deblocking strength decision. lorenm parents: 3576 diff changeset	62 static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
2707 360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math) lorenm parents: 2696 diff changeset	63 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
4127 d011a097bb85 optimize H264_DEBLOCK_P0_Q0 michael parents: 4020 diff changeset	64 static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
d011a097bb85 optimize H264_DEBLOCK_P0_Q0 michael parents: 4020 diff changeset	65 static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
1845 3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov)) michael parents: 1784 diff changeset	66 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	67
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	68 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	69 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	70
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	71 #define MOVQ_WONE(regd) \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	72 __asm __volatile ( \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	73 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	74 "psrlw $15, %%" #regd ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	75
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	76 #define MOVQ_BFE(regd) \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	77 __asm __volatile ( \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	78 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	79 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	80
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	81 #ifndef PIC
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	82 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	83 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	84 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	85 // for shared library it's better to use this way for accessing constants
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	86 // pcmpeqd -> -1
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	87 #define MOVQ_BONE(regd) \
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	88 __asm __volatile ( \
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	89 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	90 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	91 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	92
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	93 #define MOVQ_WTWO(regd) \
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	94 __asm __volatile ( \
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	95 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	96 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	97 "psllw $1, %%" #regd " \n\t"::)
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	98
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	99 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	100
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	101 // using regr as temporary and for the output result
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	102 // first argument is unmodified and second is trashed
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	103 // regfe is supposed to contain 0xfefefefefefefefe
d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	104 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	105 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	106 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	107 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	108 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	109 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	110 "paddb " #regb ", " #regr " \n\t"
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	111
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	112 #define PAVGB_MMX(rega, regb, regr, regfe) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	113 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	114 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	115 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	116 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	117 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	118 "psubb " #regb ", " #regr " \n\t"
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	119
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	120 // mm6 is supposed to contain 0xfefefefefefefefe
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	121 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	122 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	123 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	124 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	125 "pand " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	126 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	127 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	128 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	129 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	130 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	131 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	132 "paddb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	133 "paddb " #regd ", " #regp " \n\t"
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	134
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	135 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	136 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	137 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	138 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	139 "por " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	140 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	141 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	142 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	143 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	144 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	145 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	146 "psubb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	147 "psubb " #regd ", " #regp " \n\t"
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	148
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	149 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	150 /* MMX no rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	151 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	152 #define SET_RND MOVQ_WONE
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	153 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	154 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	155
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	156 #include "dsputil_mmx_rnd.h"
444 a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC kabi parents: 438 diff changeset	157
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	158 #undef DEF
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	159 #undef SET_RND
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	160 #undef PAVGBP
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	161 #undef PAVGB
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	162 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	163 /* MMX rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	164
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	165 #define DEF(x, y) x ## _ ## y ##_mmx
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	166 #define SET_RND MOVQ_WTWO
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	167 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	168 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445 62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	169
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	170 #include "dsputil_mmx_rnd.h"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	171
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file kabi parents: 444 diff changeset	172 #undef DEF
448 e8c8ca9106aa * removed MANGLE from macros for setting constants kabi parents: 446 diff changeset	173 #undef SET_RND
446 efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with kabi parents: 445 diff changeset	174 #undef PAVGBP
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	175 #undef PAVGB
387 b8f3affeb8e1 shared lib support (req by kabi) ... michaelni parents: 386 diff changeset	176
0 986e461dc072 Initial revision glantau parents: diff changeset	177 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	178 /* 3Dnow specific */
986e461dc072 Initial revision glantau parents: diff changeset	179
986e461dc072 Initial revision glantau parents: diff changeset	180 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision glantau parents: diff changeset	181 #define PAVGB "pavgusb"
986e461dc072 Initial revision glantau parents: diff changeset	182
986e461dc072 Initial revision glantau parents: diff changeset	183 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision glantau parents: diff changeset	184
986e461dc072 Initial revision glantau parents: diff changeset	185 #undef DEF
986e461dc072 Initial revision glantau parents: diff changeset	186 #undef PAVGB
986e461dc072 Initial revision glantau parents: diff changeset	187
986e461dc072 Initial revision glantau parents: diff changeset	188 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	189 /* MMX2 specific */
986e461dc072 Initial revision glantau parents: diff changeset	190
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 342 diff changeset	191 #define DEF(x) x ## _mmx2
0 986e461dc072 Initial revision glantau parents: diff changeset	192
986e461dc072 Initial revision glantau parents: diff changeset	193 /* Introduced only in MMX2 set */
986e461dc072 Initial revision glantau parents: diff changeset	194 #define PAVGB "pavgb"
986e461dc072 Initial revision glantau parents: diff changeset	195
986e461dc072 Initial revision glantau parents: diff changeset	196 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision glantau parents: diff changeset	197
986e461dc072 Initial revision glantau parents: diff changeset	198 #undef DEF
986e461dc072 Initial revision glantau parents: diff changeset	199 #undef PAVGB
986e461dc072 Initial revision glantau parents: diff changeset	200
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	201 #define SBUTTERFLY(a,b,t,n,m)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	202 "mov" #m " " #a ", " #t " \n\t" /* abcd */\
3416 fb9d94637fb2 #define SBUTTERFLY outside CONFIG_ENCODERS mru parents: 3398 diff changeset	203 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
fb9d94637fb2 #define SBUTTERFLY outside CONFIG_ENCODERS mru parents: 3398 diff changeset	204 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
fb9d94637fb2 #define SBUTTERFLY outside CONFIG_ENCODERS mru parents: 3398 diff changeset	205
4939 3409dc0e7797 cosmetics: remove duplicate transpose macro lorenm parents: 4796 diff changeset	206 #define TRANSPOSE4(a,b,c,d,t)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	207 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	208 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	209 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	210 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
4939 3409dc0e7797 cosmetics: remove duplicate transpose macro lorenm parents: 4796 diff changeset	211
0 986e461dc072 Initial revision glantau parents: diff changeset	212 /***********************************/
986e461dc072 Initial revision glantau parents: diff changeset	213 /* standard MMX */
986e461dc072 Initial revision glantau parents: diff changeset	214
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	215 #ifdef CONFIG_ENCODERS
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	216 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) /* copies 8 rows of 8 bytes from pixels (rows line_size bytes apart) into block, zero-extending every byte to a 16-bit word; writes 128 bytes to block */
0 986e461dc072 Initial revision glantau parents: diff changeset	217 {
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 342 diff changeset	218 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	219 "mov $-128, %%"REG_a" \n\t" /* byte offset relative to block+64; counts up from -128 to 0 (lands on block[0] assuming DCTELEM is 16 bits wide -- typedef not visible here) */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	220 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, interleaved below as the high byte of each word */
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	221 ASMALIGN(4)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	222 "1: \n\t" /* each pass handles two source rows; 4 passes in total */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	223 "movq (%0), %%mm0 \n\t" /* 8 bytes of the current row */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	224 "movq (%0, %2), %%mm2 \n\t" /* 8 bytes of the following row (pixels + line_size) */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	225 "movq %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	226 "movq %%mm2, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	227 "punpcklbw %%mm7, %%mm0 \n\t" /* low 4 bytes of row 0 -> 4 words */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	228 "punpckhbw %%mm7, %%mm1 \n\t" /* high 4 bytes of row 0 -> 4 words */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	229 "punpcklbw %%mm7, %%mm2 \n\t" /* low 4 bytes of row 1 -> 4 words */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	230 "punpckhbw %%mm7, %%mm3 \n\t" /* high 4 bytes of row 1 -> 4 words */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	231 "movq %%mm0, (%1, %%"REG_a") \n\t" /* store 32 output bytes per pass */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	232 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	233 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	234 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	235 "add %3, %0 \n\t" /* pixels += 2 * line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	236 "add $32, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	237 "js 1b \n\t" /* loop while the offset is still negative */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 342 diff changeset	238 : "+r" (pixels) /* %0: source pointer, advanced inside the loop */
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	239 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2) /* %1: one past the last output element, %2: row stride, %3: two-row stride */
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	240 : "%"REG_a, "memory" /* "memory": the stores through %1 are not expressed as output operands */
386 f49629bab18d hopefully faster mmx2&3dnow MC michaelni parents: 342 diff changeset	241 );
0 986e461dc072 Initial revision glantau parents: diff changeset	242 }
986e461dc072 Initial revision glantau parents: diff changeset	243
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	244 static inline void diff_pixels_mmx(DCTELEM block, const uint8_t s1, const uint8_t *s2, int stride)
324 9c6f056f0e41 fixed mpeg4 time stuff on encoding michaelni parents: 296 diff changeset	245 {
9c6f056f0e41 fixed mpeg4 time stuff on encoding michaelni parents: 296 diff changeset	246 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	247 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	248 "mov $-128, %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	249 ASMALIGN(4)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	250 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	251 "movq (%0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	252 "movq (%1), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	253 "movq %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	254 "movq %%mm2, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	255 "punpcklbw %%mm7, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	256 "punpckhbw %%mm7, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	257 "punpcklbw %%mm7, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	258 "punpckhbw %%mm7, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	259 "psubw %%mm2, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	260 "psubw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	261 "movq %%mm0, (%2, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	262 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	263 "add %3, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	264 "add %3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	265 "add $16, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	266 "jnz 1b \n\t"
324 9c6f056f0e41 fixed mpeg4 time stuff on encoding michaelni parents: 296 diff changeset	267 : "+r" (s1), "+r" (s2)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	268 : "r" (block+64), "r" ((long)stride)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	269 : "%"REG_a
324 9c6f056f0e41 fixed mpeg4 time stuff on encoding michaelni parents: 296 diff changeset	270 );
9c6f056f0e41 fixed mpeg4 time stuff on encoding michaelni parents: 296 diff changeset	271 }
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	272 #endif //CONFIG_ENCODERS
324 9c6f056f0e41 fixed mpeg4 time stuff on encoding michaelni parents: 296 diff changeset	273
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	274 void put_pixels_clamped_mmx(const DCTELEM block, uint8_t pixels, int line_size)
0 986e461dc072 Initial revision glantau parents: diff changeset	275 {
986e461dc072 Initial revision glantau parents: diff changeset	276 const DCTELEM *p;
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	277 uint8_t *pix;
0 986e461dc072 Initial revision glantau parents: diff changeset	278
986e461dc072 Initial revision glantau parents: diff changeset	279 /* read the pixels */
986e461dc072 Initial revision glantau parents: diff changeset	280 p = block;
986e461dc072 Initial revision glantau parents: diff changeset	281 pix = pixels;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	282 /* unrolled loop */
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	283 __asm __volatile(
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	284 "movq %3, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	285 "movq 8%3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	286 "movq 16%3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	287 "movq 24%3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	288 "movq 32%3, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	289 "movq 40%3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	290 "movq 48%3, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	291 "movq 56%3, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	292 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	293 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	294 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	295 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	296 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	297 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	298 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	299 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	300 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size3), "m"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	301 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	302 pix += line_size*4;
986e461dc072 Initial revision glantau parents: diff changeset	303 p += 32;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	304
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	305 // if here would be an exact copy of the code above
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	306 // compiler would generate some very strange code
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	307 // thus using "r"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	308 __asm __volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	309 "movq (%3), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	310 "movq 8(%3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	311 "movq 16(%3), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	312 "movq 24(%3), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	313 "movq 32(%3), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	314 "movq 40(%3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	315 "movq 48(%3), %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	316 "movq 56(%3), %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	317 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	318 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	319 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	320 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	321 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	322 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	323 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	324 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	325 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	326 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	327 }
986e461dc072 Initial revision glantau parents: diff changeset	328
3089 072dbc669253 MSVC-compatible __align8/__align16 declaration diego parents: 3036 diff changeset	329 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
1985 b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler melanson parents: 1984 diff changeset	330 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler melanson parents: 1984 diff changeset	331
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	332 void put_signed_pixels_clamped_mmx(const DCTELEM block, uint8_t pixels, int line_size)
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	333 {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	334 int i;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	335
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	336 movq_m2r(*vector128, mm1);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	337 for (i = 0; i < 8; i++) {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	338 movq_m2r(*(block), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	339 packsswb_m2r(*(block + 4), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	340 block += 8;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	341 paddb_r2r(mm1, mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	342 movq_r2m(mm0, *pixels);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	343 pixels += line_size;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	344 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	345 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	346
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	347 void add_pixels_clamped_mmx(const DCTELEM block, uint8_t pixels, int line_size)
0 986e461dc072 Initial revision glantau parents: diff changeset	348 {
986e461dc072 Initial revision glantau parents: diff changeset	349 const DCTELEM *p;
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	350 uint8_t *pix;
0 986e461dc072 Initial revision glantau parents: diff changeset	351 int i;
986e461dc072 Initial revision glantau parents: diff changeset	352
986e461dc072 Initial revision glantau parents: diff changeset	353 /* read the pixels */
986e461dc072 Initial revision glantau parents: diff changeset	354 p = block;
986e461dc072 Initial revision glantau parents: diff changeset	355 pix = pixels;
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	356 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	357 i = 4;
342 8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler kabi parents: 324 diff changeset	358 do {
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	359 __asm __volatile(
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	360 "movq (%2), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	361 "movq 8(%2), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	362 "movq 16(%2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	363 "movq 24(%2), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	364 "movq %0, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	365 "movq %1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	366 "movq %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	367 "punpcklbw %%mm7, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	368 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	369 "paddsw %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	370 "paddsw %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	371 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	372 "punpcklbw %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	373 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	374 "paddsw %%mm6, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	375 "paddsw %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	376 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	377 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	378 "movq %%mm0, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	379 "movq %%mm2, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	380 :"+m"(pix), "+m"((pix+line_size))
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	381 :"r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	382 :"memory");
0 986e461dc072 Initial revision glantau parents: diff changeset	383 pix += line_size*2;
986e461dc072 Initial revision glantau parents: diff changeset	384 p += 16;
342 8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler kabi parents: 324 diff changeset	385 } while (--i);
0 986e461dc072 Initial revision glantau parents: diff changeset	386 }
986e461dc072 Initial revision glantau parents: diff changeset	387
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	388 static void put_pixels4_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	389 {
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	390 __asm __volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	391 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	392 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	393 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	394 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	395 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	396 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	397 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	398 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	399 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	400 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	401 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	402 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	403 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	404 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	405 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	406 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	407 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	408 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	409 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	410 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	411 );
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	412 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	413
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	414 static void put_pixels8_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
0 986e461dc072 Initial revision glantau parents: diff changeset	415 {
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	416 __asm __volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	417 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	418 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	419 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	420 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	421 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	422 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	423 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	424 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	425 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	426 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	427 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	428 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	429 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	430 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	431 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	432 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	433 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	434 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	435 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	436 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	437 );
0 986e461dc072 Initial revision glantau parents: diff changeset	438 }
986e461dc072 Initial revision glantau parents: diff changeset	439
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	440 static void put_pixels16_mmx(uint8_t block, const uint8_t pixels, int line_size, int h)
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	441 {
45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	442 __asm __volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	443 "lea (%3, %3), %%"REG_a" \n\t"
3576 f7125bf10892 Support for MacIntel, last part: balign directives gpoirier parents: 3574 diff changeset	444 ASMALIGN(3)
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	445 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	446 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	447 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	448 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	449 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	450 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	451 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	452 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	453 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	454 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	455 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	456 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	457 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	458 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	459 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	460 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	461 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	462 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	463 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	464 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	465 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	466 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	467 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	468 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	469 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	470 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	471 );
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	472 }
45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	473
296 c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	474 static void clear_blocks_mmx(DCTELEM *blocks)
c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	475 {
471 d7f65ea52aaa * reimplemented remaing avg_ pixel functions kabi parents: 448 diff changeset	476 __asm __volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	477 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	478 "mov $-128*6, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	479 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	480 "movq %%mm7, (%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	481 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	482 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	483 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	484 "add $32, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	485 " js 1b \n\t"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	486 : : "r" (((uint8_t )blocks)+1286)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	487 : "%"REG_a
296 c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	488 );
c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	489 }
c1a8a1b4a24b sizeof(s->block) isnt 6462 anymore bugfix michaelni parents: 294 diff changeset	490
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	491 #ifdef CONFIG_ENCODERS
/**
 * Sum all pixels of a 16x16 block using MMX.
 *
 * @param pix        pointer to the first pixel of the block
 * @param line_size  byte distance between the starts of consecutive rows
 * @return the sum of the 16*16 pixel values; only the low 16 bits of the
 *         accumulated total are kept (the maximum, 256*255 = 65280, fits)
 *
 * The loop index starts at -16*line_size and counts up towards zero, so the
 * base register holds pix + 16*line_size and the loop ends when the index
 * stops being negative.
 * No emms is executed here; the MMX state is left as the asm leaves it.
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, used for byte->word unpacking */
        "pxor %%mm6, %%mm6              \n\t" /* mm6 = four 16-bit partial sums */
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        /* horizontal add of the four words of mm6 into its lowest word */
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t" /* discard the garbage in the upper word */
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
        : "memory" /* pixel data is read through a register operand */
    );

    return sum;
}
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	529 #endif //CONFIG_ENCODERS
688 894b61908734 pix_sum16_mmx() michaelni parents: 651 diff changeset	530
/**
 * Add src to dst byte by byte: dst[i] += src[i] for 0 <= i < w.
 * Each byte wraps around modulo 256 (paddb / uint8_t arithmetic).
 *
 * @param dst  buffer that is read and updated in place
 * @param src  buffer that is added to dst
 * @param w    number of bytes to process; values <= 0 do nothing
 *
 * The MMX loop handles 16 bytes per iteration for as long as at least 16
 * bytes remain; the scalar loop after it handles the remaining 0..15 bytes.
 * The loop condition is evaluated before the first iteration and with a
 * signed comparison, so that w < 16 (where w-15 is zero or negative) never
 * touches memory beyond w bytes.
 * No emms is executed here.
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    __asm__ volatile(
        "jmp 2f                         \n\t" /* test the bound before the first pass */
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "2:                             \n\t"
        "cmp %3, %0                     \n\t"
        " jl 1b                         \n\t" /* signed: continue while i < w-15 */
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
        : "memory" /* dst is written and src read through register operands */
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	552
/**
 * Core of the H.263 loop filter as an inline-asm instruction sequence,
 * processing 8 byte positions at once.
 *
 * Operands expected from the enclosing asm statement:
 *   %0, %1, %2, %3  four 64-bit memory operands: the four consecutive
 *                   rows (or transposed columns) across the filtered edge
 *   %4              32-bit value loaded with movd; its low byte is
 *                   replicated to all 8 bytes (callers pass 2*strength)
 *   %5              64-bit mask applied with pand before a word shift by 2
 *                   (callers pass ff_pb_FC)
 *
 * Results are left in registers and nothing is stored by the macro itself:
 *   mm5 = filtered %0, mm3 = filtered %1, mm4 = filtered %2, mm6 = filtered %3
 * All of mm0-mm7 are overwritten.
 */
#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t" /* mm0/mm1 = %0 - %3 as words */\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t" /* mm4/mm5 = 4*(%2 - %1) + (%0 - %3) */\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t" /* mm6/mm7 = sign masks of mm4/mm5 */\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t" /* absolute value */\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t" /* mm4 = magnitude per byte */\
        "packsswb %%mm7, %%mm6          \n\t" /* mm6 = sign mask per byte */\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t" /* low byte of %4 in all 8 bytes */\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t" /* mm2 = correction magnitude for %1/%2 */\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t" /* mm3 = filtered %1 */\
        "pxor %%mm6, %%mm4              \n\t" /* mm4 = filtered %2 */\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t" /* mask so the word shift cannot leak between bytes */\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t" /* mm5 = filtered %0 */\
        "paddb %%mm1, %%mm6             \n\t" /* mm6 = filtered %3 */
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	623
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	624 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	625 if(ENABLE_ANY_H263) {
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	626 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	627
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	628 asm volatile(
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	629
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	630 H263_LOOP_FILTER
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	631
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	632 "movq %%mm3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	633 "movq %%mm4, %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	634 "movq %%mm5, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	635 "movq %%mm6, %3 \n\t"
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	636 : "+m" ((uint64_t)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	637 "+m" ((uint64_t)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	638 "+m" ((uint64_t)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	639 "+m" ((uint64_t)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	640 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	641 );
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	642 }
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	643 }
c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	644
/**
 * Transpose a 4x4 block of bytes: dst[x][y] = src[y][x] for 0 <= x,y < 4.
 *
 * @param dst         destination; 4 bytes are written to each of 4 rows
 * @param src         source; 4 bytes are read from each of 4 rows
 * @param dst_stride  byte distance between consecutive destination rows
 * @param src_stride  byte distance between consecutive source rows
 *
 * Every memory access is declared as an "m" operand, so the compiler knows
 * exactly which 32-bit words are read and written.
 * mm0-mm3 are overwritten; no emms is executed here.
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t" /* interleave rows 0 and 1 */
        "punpcklbw %%mm3, %%mm2         \n\t" /* interleave rows 2 and 3 */
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t" /* mm0 = columns 0 and 1 */
        "punpckhwd %%mm2, %%mm1         \n\t" /* mm1 = columns 2 and 3 */
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	673
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	674 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	675 if(ENABLE_ANY_H263) {
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	676 const int strength= ff_h263_loop_filter_strength[qscale];
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	677 uint64_t temp[4] __attribute__ ((aligned(8)));
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	678 uint8_t btemp= (uint8_t)temp;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	679
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	680 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	681
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	682 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	683 transpose4x4(btemp+4, src + 4*stride, 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	684 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	685 H263_LOOP_FILTER // 5 3 4 6
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	686
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	687 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	688 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	689 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	690 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	691 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	692 );
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	693
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	694 asm volatile(
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	695 "movq %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	696 "movq %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	697 "punpcklbw %%mm3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	698 "punpcklbw %%mm6, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	699 "punpckhbw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	700 "punpckhbw %%mm6, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	701 "movq %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	702 "movq %%mm1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	703 "punpcklwd %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	704 "punpcklwd %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	705 "punpckhwd %%mm4, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	706 "punpckhwd %%mm0, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	707 "movd %%mm5, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	708 "punpckhdq %%mm5, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	709 "movd %%mm5, (%0,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	710 "movd %%mm3, (%0,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	711 "punpckhdq %%mm3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	712 "movd %%mm3, (%0,%3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	713 "movd %%mm1, (%1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	714 "punpckhdq %%mm1, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	715 "movd %%mm1, (%1,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	716 "movd %%mm6, (%1,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	717 "punpckhdq %%mm6, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	718 "movd %%mm6, (%1,%3) \n\t"
2505 86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	719 :: "r" (src),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	720 "r" (src + 4*stride),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	721 "r" ((long) stride ),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<) michael parents: 2293 diff changeset	722 "r" ((long)(3*stride))
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	723 );
5394 e9a6215f4e3a help some gcc version to optimize out those functions aurel parents: 5278 diff changeset	724 }
1648 de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	725 }
de28264c3dc3 h263_h_loop_filter_mmx michael parents: 1647 diff changeset	726
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	727 #ifdef CONFIG_ENCODERS
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	728 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	729 int tmp;
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	730 asm volatile (
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	731 "movl $16,%%ecx\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	732 "pxor %%mm0,%%mm0\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	733 "pxor %%mm7,%%mm7\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	734 "1:\n"
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	735 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	736 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	737
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	738 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	739
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	740 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	741 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	742
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	743 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	744 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	745 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	746
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	747 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	748 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	749
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	750 "pmaddwd %%mm3,%%mm3\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	751 "pmaddwd %%mm4,%%mm4\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	752
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	753 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	754 pix2^2+pix3^2+pix6^2+pix7^2) */
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	755 "paddd %%mm3,%%mm4\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	756 "paddd %%mm2,%%mm7\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	757
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	758 "add %2, %0\n"
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	759 "paddd %%mm4,%%mm7\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	760 "dec %%ecx\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	761 "jnz 1b\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	762
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	763 "movq %%mm7,%%mm1\n"
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	764 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	765 "paddd %%mm7,%%mm1\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	766 "movd %%mm1,%1\n"
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	767 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
997 4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	768 return tmp;
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	769 }
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	770
/**
 * Sum of squared differences between two 8-pixel-wide blocks (MMX).
 *
 * @param v         not used by this function
 * @param pix1      first row of the first block
 * @param pix2      first row of the second block
 * @param line_size distance in bytes between consecutive rows (same for both blocks)
 * @param h         number of rows; two rows are handled per loop iteration and
 *                  the loop counter is h >> 1, so h must be even and at least 2
 * @return the sum over all rows of (pix1[x] - pix2[x])^2 for x in 0..7
 *
 * No emms is issued here, so the MMX state is still active on return.
 */
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"              /* ecx = h/2: two rows per iteration */
      "pxor %%mm0,%%mm0\n"          /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"          /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"           /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"           /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"        /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"        /* mm4 = pix2[1][0-7] */

      /* wanted: |mm1-mm2| and |mm3-mm4| per byte */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n"     /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n"     /* mm4 now spread over (mm3,mm4) */

      /* square each 16-bit difference and add neighbouring pairs */
      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"         /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"         /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      /* fold the two 32-bit partial sums of mm7 into one */
      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"          /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixel data is read through pix1/pix2, which the
         compiler only sees as register operands */
      : "%ecx", "memory");
    return tmp;
}
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	831
/**
 * Sum of squared differences between two 16-pixel-wide blocks (MMX).
 *
 * @param v         not used by this function
 * @param pix1      first row of the first block
 * @param pix2      first row of the second block
 * @param line_size distance in bytes between consecutive rows (same for both blocks)
 * @param h         number of rows; one row is handled per loop iteration and
 *                  the counter is only tested after decrementing, so h must
 *                  be at least 1
 * @return the sum over all rows of (pix1[x] - pix2[x])^2 for x in 0..15
 *
 * No emms is issued here, so the MMX state is still active on return.
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"             /* ecx = row counter */
      "pxor %%mm0,%%mm0\n"          /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"          /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"           /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"           /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"          /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"          /* mm4 = pix2[8-15] */

      /* wanted: |mm1-mm2| and |mm3-mm4| per byte */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n"     /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n"     /* mm4 now spread over (mm3,mm4) */

      /* square each 16-bit difference and add neighbouring pairs */
      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"                 /* pix1 += line_size */
      "add %3,%1\n"                 /* pix2 += line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      /* fold the two 32-bit partial sums of mm7 into one */
      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"          /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixel data is read through pix1/pix2, which the
         compiler only sees as register operands */
      : "%ecx", "memory");
    return tmp;
}
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications) michaelni parents: 984 diff changeset	891
/**
 * Sum of squared differences between two 16-pixel-wide blocks (SSE2).
 *
 * @param v         not used by this function
 * @param pix1      first row of the first block (loaded with movdqu)
 * @param pix2      first row of the second block (loaded with movdqu)
 * @param line_size distance in bytes between consecutive rows (same for both blocks)
 * @param h         number of rows; two rows are handled per loop iteration and
 *                  the loop counter is h >> 1, so h must be even and at least 2
 * @return the sum over all rows of (pix1[x] - pix2[x])^2 for x in 0..15
 *
 * NOTE(review): xmm0-xmm7 are modified but not listed as clobbers.
 */
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "shr $1,%2\n"                   /* h /= 2: two rows per iteration */
      "pxor %%xmm0,%%xmm0\n"          /* xmm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"          /* xmm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"          /* xmm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"          /* xmm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n"       /* xmm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n"       /* xmm4 = pix2[1][0-15] */

      /* wanted: |xmm1-xmm2| and |xmm3-xmm4| per byte */
      /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"     /* xmm1 now spread over (xmm1,xmm2) */
      "punpcklbw %%xmm0,%%xmm3\n"     /* xmm4 now spread over (xmm3,xmm4) */

      /* square each 16-bit difference and add neighbouring pairs */
      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"           /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"           /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      /* fold the four 32-bit partial sums of xmm7 into the low dword */
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"           /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"           /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size)
      /* "memory": the pixel data is read through pix1/pix2, which the
         compiler only sees as register operands */
      : "memory");
    return tmp;
}
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8) lorenm parents: 2892 diff changeset	953
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	954 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	955 int tmp;
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	956 asm volatile (
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	957 "movl %3,%%ecx\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	958 "pxor %%mm7,%%mm7\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	959 "pxor %%mm6,%%mm6\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	960
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	961 "movq (%0),%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	962 "movq %%mm0, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	963 "psllq $8, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	964 "psrlq $8, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	965 "psrlq $8, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	966 "movq %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	967 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	968 "punpcklbw %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	969 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	970 "punpckhbw %%mm7,%%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	971 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	972 "psubw %%mm1, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	973 "psubw %%mm3, %%mm2\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	974
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	975 "add %2,%0\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	976
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	977 "movq (%0),%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	978 "movq %%mm4, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	979 "psllq $8, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	980 "psrlq $8, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	981 "psrlq $8, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	982 "movq %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	983 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	984 "punpcklbw %%mm7,%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	985 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	986 "punpckhbw %%mm7,%%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	987 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	988 "psubw %%mm1, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	989 "psubw %%mm3, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	990 "psubw %%mm4, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	991 "psubw %%mm5, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	992 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	993 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	994 "pcmpgtw %%mm0, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	995 "pcmpgtw %%mm2, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	996 "pxor %%mm3, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	997 "pxor %%mm1, %%mm2\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	998 "psubw %%mm3, %%mm0\n"
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	999 "psubw %%mm1, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1000 "paddw %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1001 "paddw %%mm2, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1002
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1003 "add %2,%0\n"
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1004 "1:\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1005
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1006 "movq (%0),%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1007 "movq %%mm0, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1008 "psllq $8, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1009 "psrlq $8, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1010 "psrlq $8, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1011 "movq %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1012 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1013 "punpcklbw %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1014 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1015 "punpckhbw %%mm7,%%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1016 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1017 "psubw %%mm1, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1018 "psubw %%mm3, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1019 "psubw %%mm0, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1020 "psubw %%mm2, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1021 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1022 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1023 "pcmpgtw %%mm4, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1024 "pcmpgtw %%mm5, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1025 "pxor %%mm3, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1026 "pxor %%mm1, %%mm5\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1027 "psubw %%mm3, %%mm4\n"
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1028 "psubw %%mm1, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1029 "paddw %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1030 "paddw %%mm5, %%mm6\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1031
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1032 "add %2,%0\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1033
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1034 "movq (%0),%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1035 "movq %%mm4, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1036 "psllq $8, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1037 "psrlq $8, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1038 "psrlq $8, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1039 "movq %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1040 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1041 "punpcklbw %%mm7,%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1042 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1043 "punpckhbw %%mm7,%%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1044 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1045 "psubw %%mm1, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1046 "psubw %%mm3, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1047 "psubw %%mm4, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1048 "psubw %%mm5, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1049 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1050 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1051 "pcmpgtw %%mm0, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1052 "pcmpgtw %%mm2, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1053 "pxor %%mm3, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1054 "pxor %%mm1, %%mm2\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1055 "psubw %%mm3, %%mm0\n"
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1056 "psubw %%mm1, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1057 "paddw %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1058 "paddw %%mm2, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1059
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1060 "add %2,%0\n"
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1061 "subl $2, %%ecx\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1062 " jnz 1b\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1063
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1064 "movq %%mm6, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1065 "punpcklwd %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1066 "punpckhwd %%mm7,%%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1067 "paddd %%mm0, %%mm6\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1068
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1069 "movq %%mm6,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1070 "psrlq $32, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1071 "paddd %%mm6,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1072 "movd %%mm0,%1\n"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1073 : "+r" (pix1), "=r"(tmp)
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	1074 : "r" ((long)line_size) , "g" (h-2)
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1075 : "%ecx");
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1076 return tmp;
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1077 }
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1078
/**
 * High-frequency "noise" measure of a block, MMX version.
 *
 * The inline assembly handles columns 0..7: for every row it forms the
 * horizontal gradient pix[x] - pix[x+1] (x = 0..7, so 9 bytes per row are
 * read), then accumulates the absolute difference of that gradient between
 * each pair of vertically adjacent rows (h-1 pairs in total).
 * The remaining columns are delegated to hf_noise8_mmx(pix+8, ...).
 *
 * @param pix1      first pixel of the block (advanced by the asm; the
 *                  untouched copy in pix is used for the second half)
 * @param line_size distance in bytes between two rows
 * @param h         number of rows. The loop counter starts at h-2, is
 *                  decremented by 2 and tested for zero only after the loop
 *                  body has run, so h has to be even and at least 4.
 * @return sum of the asm result and hf_noise8_mmx() for pix+8
 *
 * No emms is issued here; the MMX state is left as the asm leaves it.
 */
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"              /* ecx = h-2: rows left after the two pre-loop rows */
      "pxor %%mm7,%%mm7\n"           /* mm7 = 0, used for byte->word unpacking */
      "pxor %%mm6,%%mm6\n"           /* mm6 = four 16-bit accumulators */

      /* row 0: gradient p[x]-p[x+1] as words in mm0 (x=0..3) / mm2 (x=4..7) */
      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      /* row 1: gradient in mm4/mm5, then |grad(row0) - grad(row1)| -> mm6 */
      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      /* abs(): build sign masks with pcmpgtw, then (x ^ mask) - mask */
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      /* even loop half: new row in mm0/mm2, previous gradient is in mm4/mm5 */
      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      /* odd loop half: new row in mm4/mm5, previous gradient is in mm0/mm2 */
      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      /* widen the four word accumulators to dwords and add them up */
      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((long)line_size) , "g" (h-2)
      /* "memory": the pixels are read through %0, which the compiler only
         knows as a register value, not as a memory input */
      : "%ecx", "memory");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1192
2864 95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes. mru parents: 2754 diff changeset	1193 static int nsse16_mmx(void p, uint8_t pix1, uint8_t * pix2, int line_size, int h) {
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes. mru parents: 2754 diff changeset	1194 MpegEncContext *c = p;
2940 8aa244d7c274 use sse16_sse2() in nsse lorenm parents: 2922 diff changeset	1195 int score1, score2;
8aa244d7c274 use sse16_sse2() in nsse lorenm parents: 2922 diff changeset	1196
8aa244d7c274 use sse16_sse2() in nsse lorenm parents: 2922 diff changeset	1197 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
8aa244d7c274 use sse16_sse2() in nsse lorenm parents: 2922 diff changeset	1198 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
8aa244d7c274 use sse16_sse2() in nsse lorenm parents: 2922 diff changeset	1199 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1200
4001 34fdffe98bd0 Rename ABS macro to FFABS. diego parents: 3947 diff changeset	1201 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
34fdffe98bd0 Rename ABS macro to FFABS. diego parents: 3947 diff changeset	1202 else return score1 + FFABS(score2)*8;
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1203 }
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1204
2864 95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes. mru parents: 2754 diff changeset	1205 static int nsse8_mmx(void p, uint8_t pix1, uint8_t * pix2, int line_size, int h) {
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes. mru parents: 2754 diff changeset	1206 MpegEncContext *c = p;
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1207 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1208 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1209
4001 34fdffe98bd0 Rename ABS macro to FFABS. diego parents: 3947 diff changeset	1210 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
34fdffe98bd0 Rename ABS macro to FFABS. diego parents: 3947 diff changeset	1211 else return score1 + FFABS(score2)*8;
2067 f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1212 }
f37b6ffc81ed sse8 and nsse in mmx michael parents: 2024 diff changeset	1213
/**
 * Vertical sum of absolute differences inside one 16-pixel-wide block,
 * plain MMX version.
 *
 * Adds up |pix[y][x] - pix[y-1][x]| for x = 0..15 and y = 1..h-1.
 * The sum is kept in 16-bit lanes and the result is masked with 0xFFFF,
 * so it is only exact while the total stays below 65536.
 *
 * @param v         unused
 * @param pix       first pixel of the block, must be 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between two rows, multiple of 8
 * @param h         number of rows. One row pair is handled before the
 *                  loop and two per iteration, with the counter tested
 *                  only after the loop body, so h has to be even and at
 *                  least 4.
 * @return the accumulated sum, modulo 65536
 *
 * No emms is issued here; the MMX state is left as the asm leaves it.
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    /* long rather than int: a pointer does not fit into int on x86_64 */
    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* SUM: load the next row into mm2/mm3 and keep a copy in out0/out1 for the
   following step; |in - row| per byte is built from two saturating
   subtractions OR-ed together, widened to words and added to mm6.
   in0/in1 are destroyed. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"           /* mm6 = four 16-bit accumulators */
      "pxor %%mm7,%%mm7\n"           /* mm7 = 0 for unpacking */
      "movq (%0),%%mm0\n"            /* row 0, columns 0..7 */
      "movq 8(%0),%%mm1\n"           /* row 0, columns 8..15 */
      "add %2,%0\n"
      "subl $2, %%ecx\n"             /* rows 0 and 1 are consumed before the loop */
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two rows per iteration; register roles swap between the halves */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* horizontal add of the four word lanes into the lowest word */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixels are read through %0, which the compiler only
         knows as a register value, not as a memory input */
      : "%ecx", "memory");
    /* only the lowest word holds the total; the upper bits are leftovers */
    return tmp & 0xFFFF;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	1275
/**
 * Vertical SAD of a 16-pixel-wide block against itself (MMX2, psadbw).
 *
 * Adds up |pix[y][x] - pix[y+1][x]| for x in 0..15 over every pair of
 * vertically adjacent rows among the first h rows.
 *
 * No emms is issued here; the MMX state is left active on return.
 *
 * @param v         unused
 * @param pix       top-left pixel of the block; must be 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between two rows; must be a multiple of 8
 * @param h         number of rows. The counter starts at h - 2 and is
 *                  decremented by 2 until it reaches zero, so h has to be
 *                  even and at least 4 for the loop to terminate as intended.
 * @return the accumulated sum; it is kept in the low 16-bit lane of mm6,
 *         so it wraps modulo 65536
 */
static int vsad_intra16_mmx2(void *v, uint8_t *pix, uint8_t *dummy, int line_size, int h) {
    int tmp;

    /* uintptr_t instead of int: only the low bits matter, but the cast
     * must not truncate a 64-bit pointer. */
    assert((((uintptr_t)pix) & 7) == 0);
    assert((line_size & 7) == 0);

/* Load the next row into out0/out1, advance pix by one row and add the SAD
 * between that row and the previous one (held in in0/in1) to mm6.
 * in0/in1 are overwritten with the psadbw results. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"            /* mm6 = running sum */
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"             /* first row -> mm0:mm1 */
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        /* two rows per iteration, ping-ponging between mm0:mm1 and mm4:mm5 */
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* "memory": the pixel rows are read through pix, not via operands */
        : "%ecx", "memory");
    return tmp;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	1316
/**
 * Vertical SAD of the difference signal pix1 - pix2 over a 16-pixel-wide
 * block (plain MMX, no psadbw).
 *
 * For each row the bytewise difference pix1 - pix2 is taken modulo 256
 * (psubb) and XORed with 0x80. The function then adds up the absolute
 * differences between these biased rows for every pair of vertically
 * adjacent rows among the first h rows.
 *
 * No emms is issued here; the MMX state is left active on return.
 *
 * @param v         unused
 * @param pix1      first block; must be 8-byte aligned
 * @param pix2      second block; must be 8-byte aligned
 * @param line_size distance in bytes between two rows (applied to both
 *                  blocks); must be a multiple of 8
 * @param h         number of rows. The counter starts at h - 2 and is
 *                  decremented by 2 until it reaches zero, so h has to be
 *                  even and at least 4 for the loop to terminate as intended.
 * @return the sum reduced to its low 15 bits (the result is masked with 0x7FFF)
 */
static int vsad16_mmx(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    int tmp;

    /* uintptr_t instead of int: only the low bits matter, but the cast
     * must not truncate a 64-bit pointer. */
    assert((((uintptr_t)pix1) & 7) == 0);
    assert((((uintptr_t)pix2) & 7) == 0);
    assert((line_size & 7) == 0);

/* Build the next biased difference row in out0/out1 (via mm2/mm3), advance
 * both pointers, and add |previous row - new row| to mm6.
 * The absolute difference is formed from two saturating subtractions ORed
 * together. The bytes are widened with mm7 (0x80 in every byte), so each
 * word carries an extra 0x8000; four such words are added per lane, which
 * makes the bias vanish modulo 2^16. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"            /* mm6 = four 16-bit partial sums */
        "pcmpeqw %%mm7,%%mm7\n"         /* mm7 = 0x80 in every byte: */
        "psllw $15, %%mm7\n"            /*   words 0x8000 ...        */
        "packsswb %%mm7, %%mm7\n"       /*   ... saturated to -128   */
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"          /* first biased difference row */
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        /* two rows per iteration, ping-ponging between mm0:mm1 and mm4:mm5 */
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        /* horizontal add of the four lanes; only the low word is meaningful */
        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* "memory": the pixel rows are read through pix1/pix2, not via operands */
        : "%ecx", "memory");
    /* the upper bits of tmp hold leftovers of the lane shuffle; drop them */
    return tmp & 0x7FFF;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	1395
/**
 * Vertical SAD of the difference signal pix1 - pix2 over a 16-pixel-wide
 * block (MMX2, psadbw).
 *
 * For each row the bytewise difference pix1 - pix2 is taken modulo 256
 * (psubb) and XORed with 0x80. The function then adds up the absolute
 * differences between these biased rows for every pair of vertically
 * adjacent rows among the first h rows.
 *
 * No emms is issued here; the MMX state is left active on return.
 *
 * @param v         unused
 * @param pix1      first block; must be 8-byte aligned
 * @param pix2      second block; must be 8-byte aligned
 * @param line_size distance in bytes between two rows (applied to both
 *                  blocks); must be a multiple of 8
 * @param h         number of rows. The counter starts at h - 2 and is
 *                  decremented by 2 until it reaches zero, so h has to be
 *                  even and at least 4 for the loop to terminate as intended.
 * @return the accumulated sum; it is kept in the low 16-bit lane of mm6,
 *         so it wraps modulo 65536
 */
static int vsad16_mmx2(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    int tmp;

    /* uintptr_t instead of int: only the low bits matter, but the cast
     * must not truncate a 64-bit pointer. */
    assert((((uintptr_t)pix1) & 7) == 0);
    assert((((uintptr_t)pix2) & 7) == 0);
    assert((line_size & 7) == 0);

/* Build the next biased difference row in out0/out1, advance both pointers,
 * and add the SAD between it and the previous row (in0/in1) to mm6.
 * in0/in1 are overwritten with the psadbw results. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"            /* mm6 = running sum */
        "pcmpeqw %%mm7,%%mm7\n"         /* mm7 = 0x80 in every byte: */
        "psllw $15, %%mm7\n"            /*   words 0x8000 ...        */
        "packsswb %%mm7, %%mm7\n"       /*   ... saturated to -128   */
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"          /* first biased difference row */
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        /* two rows per iteration, ping-ponging between mm0:mm1 and mm4:mm5 */
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* "memory": the pixel rows are read through pix1/pix2, not via operands */
        : "%ecx", "memory");
    return tmp;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	1453
/**
 * Bytewise difference of two buffers: dst[i] = src1[i] - src2[i] (modulo 256)
 * for i in 0..w-1.
 *
 * Full 16-byte groups are handled with MMX (two 8-byte psubb per
 * iteration); the remaining bytes are handled by the scalar loop.
 *
 * No emms is issued here; the MMX state is left active on return.
 *
 * @param dst  output buffer, at least w bytes
 * @param src1 minuend, at least w bytes
 * @param src2 subtrahend, at least w bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;

    /* The asm loop always executes its body once before testing, and the
     * test is an unsigned compare against w-15. Without this guard a width
     * below 16 would touch 16 bytes anyway, and a width below 15 would turn
     * the limit into a huge unsigned value and keep running. */
    if (w >= 16)
        __asm__ volatile(
            "1:                             \n\t"
            "movq  (%2, %0), %%mm0          \n\t"
            "movq  (%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, (%3, %0)           \n\t"
            "movq 8(%2, %0), %%mm0          \n\t"
            "movq 8(%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, 8(%3, %0)          \n\t"
            "add $16, %0                    \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
            /* "memory": dst is written and src1/src2 are read behind the
             * compiler's back */
            : "memory"
        );
    /* scalar tail (or the whole job when w < 16) */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1527 8ffd0c00e6df mmx2 optimization of huffyuv median encoding michael parents: 1324 diff changeset	1475
/**
 * Subtract the median prediction from a row of samples (MMX2).
 *
 * For each position the predictor is the median of L, T and L + T - LT
 * (bytewise, modulo 256), where T = src1[i], LT = src1[i-1] and
 * L = src2[i-1]; dst[i] receives src2[i] minus that predictor.
 *
 * The asm loop works on 8 bytes per iteration and always runs at least
 * once, so it loads and stores in whole groups of 8 up to w rounded up to a
 * multiple of 8. At i = 0 it also loads one byte before src1 and src2;
 * dst[0] is therefore recomputed afterwards from *left and *left_top.
 *
 * No emms is issued here; the MMX state is left active on return.
 *
 * @param dst      output row
 * @param src1     row used for T and LT
 * @param src2     row used for L and for the sample being predicted
 * @param w        number of samples
 * @param left     in: left neighbour for position 0; out: src2[w-1]
 * @param left_top in: top-left neighbour for position 0; out: src1[w-1]
 */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t" // median of L, T, L + T - LT
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
        /* "memory": dst is written and src1/src2 are read behind the
         * compiler's back */
        : "memory"
    );

    l= *left;
    lt= *left_top;

    /* redo the first sample with the real left / top-left neighbours */
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
8ffd0c00e6df mmx2 optimization of huffyuv median encoding michael parents: 1324 diff changeset	1511
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1512 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1513 "mov"#m" "#p1", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1514 "mov"#m" "#p2", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1515 "punpcklbw "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1516 "punpcklbw "#a", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1517 "psubw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1518
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1519 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1520 uint8_t p1b=p1, p2b=p2;\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1521 asm volatile(\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1522 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1523 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1524 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1525 "add %4, %1 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1526 "add %4, %2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1527 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1528 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1529 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1530 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1531 "mov"#m1" "#mm"0, %0 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1532 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1533 "mov"#m1" %0, "#mm"0 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1534 : "=m"(temp), "+r"(p1b), "+r"(p2b)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1535 : "r"((long)stride), "r"((long)stride*3)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1536 );\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1537 }
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1538
/* Two instantiations of DIFF_PIXELS_8.
 * DIFF_PIXELS_4x8 works in the %%mm registers with suffixes d / q;
 * DIFF_PIXELS_8x8 works in the %%xmm registers with suffixes q / dqa.
 * The second suffix is pasted after "mov" to spill and reload register 0
 * through temp; the first is forwarded to DIFF_PIXELS_1 (presumably the
 * load width -- confirm against DIFF_PIXELS_1). */
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1539 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1540 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1541
/* TRANSPOSE8(a..h, t): transpose built from three rounds of SBUTTERFLY
 * (wd, then dq, then qdq) over eight registers; presumably an 8x8 matrix of
 * 16-bit words -- confirm against SBUTTERFLY.
 * The results do not end up in the original register order; see the
 * permutation note below.
 * ARCH_X86_64 variant: uses %%xmm8 as the scratch register, t is unused.
 * Other variant: has no spare register, so h is spilled and reloaded
 * through the memory operand t (slots t and 16+t); t must therefore name
 * at least 32 bytes of movdqa-accessible (16-byte aligned) memory. */
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1542 #ifdef ARCH_X86_64
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1543 // permutes 01234567 -> 05736421
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1544 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1545 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1546 SBUTTERFLY(c,d,b,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1547 SBUTTERFLY(e,f,d,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1548 SBUTTERFLY(g,h,f,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1549 SBUTTERFLY(a,c,h,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1550 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1551 SBUTTERFLY(e,g,b,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1552 SBUTTERFLY(d,f,g,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1553 SBUTTERFLY(a,e,f,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1554 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1555 SBUTTERFLY(h,b,d,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1556 SBUTTERFLY(c,g,b,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1557 "movdqa %%xmm8, "#g" \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1558 #else
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1559 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1560 "movdqa "#h", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1561 SBUTTERFLY(a,b,h,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1562 "movdqa "#h", 16"#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1563 "movdqa "#t", "#h" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1564 SBUTTERFLY(c,d,b,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1565 SBUTTERFLY(e,f,d,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1566 SBUTTERFLY(g,h,f,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1567 SBUTTERFLY(a,c,h,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1568 "movdqa "#h", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1569 "movdqa 16"#t", "#h" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1570 SBUTTERFLY(h,b,c,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1571 SBUTTERFLY(e,g,b,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1572 SBUTTERFLY(d,f,g,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1573 SBUTTERFLY(a,e,f,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1574 SBUTTERFLY(h,d,e,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1575 "movdqa "#h", 16"#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1576 "movdqa "#t", "#h" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1577 SBUTTERFLY(h,b,d,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1578 SBUTTERFLY(c,g,b,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1579 "movdqa 16"#t", "#g" \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1580 #endif
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1581
/* LBUTTERFLY2(a1,b1,a2,b2): two interleaved in-place butterflies on packed
 * 16-bit words (wrapping arithmetic). For each pair (a,b):
 *   a <- a + b
 *   b <- b - a      (computed as 2*b - (a+b), so no scratch register is used)
 * The two pairs are independent of each other. */
1153 2725c8eb3c81 faster hadamard transform michaelni parents: 1122 diff changeset	1582 #define LBUTTERFLY2(a1,b1,a2,b2)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1583 "paddw " #b1 ", " #a1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1584 "paddw " #b2 ", " #a2 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1585 "paddw " #b1 ", " #b1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1586 "paddw " #b2 ", " #b2 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1587 "psubw " #a1 ", " #b1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1588 "psubw " #a2 ", " #b2 " \n\t"
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	1589
/* HADAMARD8(m0..m7): three butterfly stages over eight registers, pairing
 * registers at distance 1 (0-1, 2-3, 4-5, 6-7), then distance 2
 * (0-2, 1-3, 4-6, 5-7), then distance 4 (0-4, 1-5, 2-6, 3-7).
 * All work is done in place via LBUTTERFLY2; no scratch register is needed.
 * Note: the last line ends with a continuation backslash, so the line that
 * follows the macro must stay blank. */
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1590 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1591 LBUTTERFLY2(m0, m1, m2, m3)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1592 LBUTTERFLY2(m4, m5, m6, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1593 LBUTTERFLY2(m0, m2, m1, m3)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1594 LBUTTERFLY2(m4, m6, m5, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1595 LBUTTERFLY2(m0, m4, m1, m5)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1596 LBUTTERFLY2(m2, m6, m3, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1597
/* HADAMARD48: HADAMARD8 applied to the eight MMX registers %%mm0..%%mm7. */
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1598 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	1599
/* MMABS_MMX(a,z): per-word absolute value of a, using z as scratch.
 * z is zeroed, then set to an all-ones mask in every word where a < 0;
 * (a ^ mask) - mask negates exactly those words. z is clobbered. */
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1600 #define MMABS_MMX(a,z)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1601 "pxor " #z ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1602 "pcmpgtw " #a ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1603 "pxor " #z ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1604 "psubw " #z ", " #a " \n\t"
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	1605
/* MMABS_MMX2(a,z): per-word absolute value of a, using z as scratch.
 * Computes z = 0 - a and then a = max(a, z) with the signed word maximum
 * (pmaxsw). z is clobbered. */
1153 2725c8eb3c81 faster hadamard transform michaelni parents: 1122 diff changeset	1606 #define MMABS_MMX2(a,z)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1607 "pxor " #z ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1608 "psubw " #a ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1609 "pmaxsw " #z ", " #a " \n\t"
1153 2725c8eb3c81 faster hadamard transform michaelni parents: 1122 diff changeset	1610
/* MMABS_SSSE3(a,z): per-word absolute value of a via a single pabsw.
 * z is accepted only to keep the same signature as the other MMABS_*
 * variants; it is neither read nor written. */
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1611 #define MMABS_SSSE3(a,z)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1612 "pabsw " #a ", " #a " \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1613
/* MMABS_SUM(a,z,sum): replace a by its per-word absolute value (MMABS, with
 * z as scratch) and add it to sum with unsigned saturation (paddusw).
 * MMABS is not defined here; it has to be defined to one of the MMABS_*
 * variants before this macro is expanded. */
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1614 #define MMABS_SUM(a,z, sum)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1615 MMABS(a,z)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1616 "paddusw " #a ", " #sum " \n\t"
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1617
/* MMABS_SUM_8x8_NOSPILL: saturating sum of the per-word absolute values of
 * %%xmm0..%%xmm7, left in %%xmm0.
 * Even registers (2,4,6) are accumulated into %%xmm0 and odd registers
 * (3,5,7) into %%xmm1, forming two independent chains that are merged by
 * the final paddusw. %%xmm8 and %%xmm9 serve as MMABS scratch, so this
 * variant needs registers beyond %%xmm7. All of %%xmm1..%%xmm9 are
 * clobbered. */
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1618 #define MMABS_SUM_8x8_NOSPILL\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1619 MMABS(%%xmm0, %%xmm8)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1620 MMABS(%%xmm1, %%xmm9)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1621 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1622 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1623 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1624 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1625 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1626 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1627 "paddusw %%xmm1, %%xmm0 \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1628
/* MMABS_SUM_8x8_SSE2: saturating sum of the per-word absolute values of
 * %%xmm0..%%xmm7, left in %%xmm0.
 * On ARCH_X86_64 this is simply MMABS_SUM_8x8_NOSPILL.
 * Otherwise only %%xmm0..%%xmm7 are used: %%xmm7 is first spilled to the
 * 16 bytes at (%1) so it can serve as the MMABS scratch register, and the
 * spilled value is reloaded into %%xmm2 (already consumed by then) for the
 * last accumulation. (%1) must therefore be writable, 16-byte aligned
 * memory. */
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1629 #ifdef ARCH_X86_64
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1630 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1631 #else
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1632 #define MMABS_SUM_8x8_SSE2\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1633 "movdqa %%xmm7, (%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1634 MMABS(%%xmm0, %%xmm7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1635 MMABS(%%xmm1, %%xmm7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1636 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1637 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1638 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1639 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1640 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1641 "movdqa (%1), %%xmm2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1642 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1643 "paddusw %%xmm1, %%xmm0 \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1644 #endif
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	1645
/* LOAD4(o,a,b,c,d): load four consecutive 8-byte values from byte offsets
 * o, o+8, o+16 and o+24 relative to asm operand %1 into a, b, c and d.
 * Note: the last line ends with a continuation backslash, so the line that
 * follows the macro must stay blank. */
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	1646 #define LOAD4(o, a, b, c, d)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1647 "movq "#o"(%1), "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1648 "movq "#o"+8(%1), "#b" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1649 "movq "#o"+16(%1), "#c" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1650 "movq "#o"+24(%1), "#d" \n\t"\
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	1651
/* STORE4(o,a,b,c,d): store a, b, c and d as four consecutive 8-byte values
 * at byte offsets o, o+8, o+16 and o+24 relative to asm operand %1.
 * Counterpart of LOAD4. The last line ends with a continuation backslash,
 * so the line that follows the macro must stay blank. */
caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	1652 #define STORE4(o, a, b, c, d)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1653 "movq "#a", "#o"(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1654 "movq "#b", "#o"+8(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1655 "movq "#c", "#o"+16(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1656 "movq "#d", "#o"+24(%1) \n\t"\
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	1657
4988 689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1658 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1659 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1660 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
/* HSUM_MMX(a,t,dst): horizontal saturating sum of the four 16-bit words of
 * the 64-bit register a, using only shifts and adds.
 * Step 1 folds the upper 32 bits onto the lower 32 bits, step 2 folds
 * word 1 onto word 0. movd then copies the low 32 bits of a to dst; only
 * the low 16 bits of dst hold the sum, the upper 16 bits are a by-product
 * and must be masked off by the user. a and t are clobbered. */
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1661 #define HSUM_MMX(a, t, dst)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1662 "movq "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1663 "psrlq $32, "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1664 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1665 "movq "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1666 "psrlq $16, "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1667 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1668 "movd "#a", "#dst" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1669
/* HSUM_MMX2(a,t,dst): horizontal saturating sum of the four 16-bit words of
 * the 64-bit register a, using pshufw instead of shifts.
 * pshufw $0x0E places words 2 and 3 of a in the low two words of t;
 * pshufw $0x01 places word 1 of a in the low word of t. After the two
 * saturating adds the low word of a holds the sum. movd copies the low
 * 32 bits of a to dst; only the low 16 bits are meaningful.
 * a and t are clobbered. */
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1670 #define HSUM_MMX2(a, t, dst)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1671 "pshufw $0x0E, "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1672 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1673 "pshufw $0x01, "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1674 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1675 "movd "#a", "#dst" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1676
/* HSUM_SSE2(a,t,dst): horizontal saturating sum of the eight 16-bit words
 * of the 128-bit register a.
 * movhlps folds the upper 64 bits onto the lower 64 bits; the two pshuflw
 * steps ($0x0E, then $0x01) then fold the remaining four words down to
 * word 0, as in HSUM_MMX2. movd copies the low 32 bits of a to dst; only
 * the low 16 bits are meaningful. a and t are clobbered. */
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1677 #define HSUM_SSE2(a, t, dst)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1678 "movhlps "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1679 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1680 "pshuflw $0x0E, "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1681 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1682 "pshuflw $0x01, "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1683 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1684 "movd "#a", "#dst" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1685
/*
 * HADAMARD8_DIFF_MMX(cpu): defines hadamard8_diff_<cpu>() and, through
 * WARPER8_16_SQ, hadamard8_diff16_<cpu>().
 *
 * Generated function:
 *   s       unused by the generated body (kept for the common signature)
 *   src1    first pixel block
 *   src2    second pixel block
 *   stride  byte distance between rows, shared by src1 and src2
 *   h       must be 8 (asserted)
 *   returns the low 16 bits of the saturating sum of absolute transform
 *           coefficients of (src1 - src2)
 *
 * The block is processed as two 4-column halves: DIFF_PIXELS_4x8 loads the
 * differences for columns 0-3 and then columns 4-7 (src1+4, src2+4); each
 * half goes through HADAMARD48 and TRANSPOSE4 and is parked in temp. A
 * second HADAMARD48 pass is then run over the stored data, absolute values
 * are accumulated with saturating adds (MMABS / MMABS_SUM) and reduced with
 * HSUM.
 *
 * MMABS and HSUM are not defined here; they must be defined to the variants
 * matching `cpu` before this macro is expanded.
 * temp is addressed by byte offset from operand %1; the highest slot used
 * is 96(%1), i.e. temp[12].
 * NOTE(review): the first asm statement never writes %0; sum appears to be
 * listed there only so that temp is operand %1 in both statements. Neither
 * statement declares a "memory" clobber although temp is written -- confirm
 * this is safe with the compilers in use.
 */
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1686 #define HADAMARD8_DIFF_MMX(cpu) \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1687 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1688 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1689 int sum;\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1690 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1691 assert(h==8);\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1692 \
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1693 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1694 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1695 asm volatile(\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1696 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1697 \
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1698 "movq %%mm7, 96(%1) \n\t"\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1699 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1700 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1701 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1702 \
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1703 "movq 96(%1), %%mm7 \n\t"\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1704 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1705 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1706 \
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1707 : "=r" (sum)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1708 : "r"(temp)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1709 );\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1710 \
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1711 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1712 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1713 asm volatile(\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1714 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1715 \
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1716 "movq %%mm7, 96(%1) \n\t"\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1717 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1718 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1719 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1720 \
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1721 "movq 96(%1), %%mm7 \n\t"\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1722 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1723 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1724 "movq %%mm6, %%mm7 \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1725 "movq %%mm0, %%mm6 \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1726 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1727 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1728 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1729 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1730 "movq %%mm7, 64(%1) \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1731 MMABS(%%mm0, %%mm7)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1732 MMABS(%%mm1, %%mm7)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1733 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1734 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1735 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1736 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1737 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1738 "movq 64(%1), %%mm2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1739 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1740 "paddusw %%mm1, %%mm0 \n\t"\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1741 "movq %%mm0, 64(%1) \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1742 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1743 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1744 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1745 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1746 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1747 "movq %%mm7, (%1) \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1748 MMABS(%%mm0, %%mm7)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1749 MMABS(%%mm1, %%mm7)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1750 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1751 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1752 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1753 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1754 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1755 "movq (%1), %%mm2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1756 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1757 "paddusw 64(%1), %%mm0 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1758 "paddusw %%mm1, %%mm0 \n\t"\
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1759 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1760 HSUM(%%mm0, %%mm1, %0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1761 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1762 : "=r" (sum)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1763 : "r"(temp)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1764 );\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1765 return sum&0xFFFF;\
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1766 }\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1767 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1768
/*
 * HADAMARD8_DIFF_SSE2(cpu)
 *
 * Defines hadamard8_diff_<cpu>() and, through WARPER8_16_SQ, the companion
 * hadamard8_diff16_<cpu>().
 *
 * Parameters of the generated function:
 *   s       opaque context pointer; not referenced by this body
 *   src1    first pixel block
 *   src2    second pixel block
 *   stride  passed through to DIFF_PIXELS_8x8
 *   h       must be 8 (asserted)
 *
 * The asm statement runs HADAMARD8, TRANSPOSE8 (which is handed the 16-byte
 * aligned scratch buffer as "(%1)"), HADAMARD8 again, then MMABS_SUM_8x8 and
 * HSUM_SSE2 to produce the result in operand %0. MMABS and MMABS_SUM_8x8 must
 * be defined by the caller before expanding this macro.
 * NOTE(review): the asm expects xmm0-xmm7 to still hold whatever
 * DIFF_PIXELS_8x8 left in them -- confirm against DIFF_PIXELS_8x8.
 *
 * Returns the low 16 bits of the horizontal sum.
 *
 * The "memory" clobber is required because the asm accesses the scratch
 * buffer through a pointer that is only passed as an "r" operand.
 */
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
        : "memory"\
    );\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	1790
/* Plain MMX variant: expand HADAMARD8_DIFF_MMX with MMABS/HSUM bound to the
 * *_MMX helpers, then drop the bindings again. */
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1791 #define MMABS(a,z) MMABS_MMX(a,z)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1792 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1793 HADAMARD8_DIFF_MMX(mmx)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1794 #undef MMABS
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1795 #undef HSUM
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1796
/* MMX2 and SSE2 variants share one set of bindings: both are expanded with
 * MMABS = MMABS_MMX2; MMABS_SUM_8x8 = MMABS_SUM_8x8_SSE2 is consumed by
 * HADAMARD8_DIFF_SSE2, and HSUM = HSUM_MMX2 is the binding in effect for
 * HADAMARD8_DIFF_MMX (the SSE2 macro names HSUM_SSE2 directly). */
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1797 #define MMABS(a,z) MMABS_MMX2(a,z)
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1798 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1799 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1800 HADAMARD8_DIFF_MMX(mmx2)
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1801 HADAMARD8_DIFF_SSE2(sse2)
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1802 #undef MMABS
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1803 #undef MMABS_SUM_8x8
4946 c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx lorenm parents: 4939 diff changeset	1804 #undef HSUM
1153 2725c8eb3c81 faster hadamard transform michaelni parents: 1122 diff changeset	1805
/* SSSE3 variant (only when HAVE_SSSE3 is defined): reuses the SSE2 function
 * template with MMABS = MMABS_SSSE3 and MMABS_SUM_8x8 = MMABS_SUM_8x8_NOSPILL. */
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1806 #ifdef HAVE_SSSE3
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1807 #define MMABS(a,z) MMABS_SSSE3(a,z)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1808 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1809 HADAMARD8_DIFF_SSE2(ssse3)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1810 #undef MMABS
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1811 #undef MMABS_SUM_8x8
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	1812 #endif
4749 7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding. lorenm parents: 4589 diff changeset	1813
/*
 * DCT_SAD4(m, mm, o): asm fragment that processes four vectors.
 *   m   suffix appended to "mov" to form the load instruction (e.g. q, dqa)
 *   mm  register name prefix (e.g. %%mm, %%xmm)
 *   o   base byte offset
 * Loads from byte offsets o+0, o+16, o+32 and o+48 relative to the pointer in
 * asm operand %1 into registers <mm>2..<mm>5, then passes each to MMABS_SUM,
 * alternating <mm>6/<mm>7 as the second argument and <mm>0/<mm>1 as the third.
 * MMABS_SUM is defined elsewhere -- presumably it accumulates the absolute
 * value of the first argument into the third; confirm there.
 */
4988 689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1814 #define DCT_SAD4(m,mm,o)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1815 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1816 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1817 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1818 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1819 MMABS_SUM(mm##2, mm##6, mm##0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1820 MMABS_SUM(mm##3, mm##7, mm##1)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1821 MMABS_SUM(mm##4, mm##6, mm##0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1822 MMABS_SUM(mm##5, mm##7, mm##1)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1823
/*
 * DCT_SAD_MMX: asm body for the 64-bit MMX register variants.
 * Clears the two accumulators mm0/mm1, runs DCT_SAD4 with movq at base
 * offsets 0, 8, 64 and 72 (together with DCT_SAD4's +0/+16/+32/+48 steps this
 * touches bytes 0..127 of the buffer in operand %1), merges mm1 into mm0 with
 * an unsigned saturating word add, and reduces mm0 into operand %0 via HSUM.
 */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1824 #define DCT_SAD_MMX\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1825 "pxor %%mm0, %%mm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1826 "pxor %%mm1, %%mm1 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1827 DCT_SAD4(q, %%mm, 0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1828 DCT_SAD4(q, %%mm, 8)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1829 DCT_SAD4(q, %%mm, 64)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1830 DCT_SAD4(q, %%mm, 72)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1831 "paddusw %%mm1, %%mm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1832 HSUM(%%mm0, %%mm1, %0)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1833
/*
 * DCT_SAD_SSE2: asm body for the 128-bit XMM register variants.
 * Clears xmm0/xmm1, runs DCT_SAD4 with movdqa (an aligned 16-byte load, so
 * the buffer in operand %1 must be 16-byte aligned) at base offsets 0 and 64,
 * covering bytes 0..127, merges xmm1 into xmm0 with an unsigned saturating
 * word add, and reduces xmm0 into operand %0 via HSUM.
 */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1834 #define DCT_SAD_SSE2\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1835 "pxor %%xmm0, %%xmm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1836 "pxor %%xmm1, %%xmm1 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1837 DCT_SAD4(dqa, %%xmm, 0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1838 DCT_SAD4(dqa, %%xmm, 64)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1839 "paddusw %%xmm1, %%xmm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1840 HSUM(%%xmm0, %%xmm1, %0)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1841
/*
 * DCT_SAD_FUNC(cpu)
 *
 * Defines sum_abs_dctelem_<cpu>(block). The function body is a single asm
 * statement whose text is whatever DCT_SAD expands to at the point of
 * instantiation (DCT_SAD_MMX or DCT_SAD_SSE2); HSUM and MMABS must be bound
 * by the caller as well.
 *
 *   block   coefficient buffer, read by the asm through operand %1
 *   return  low 16 bits of the value the asm leaves in operand %0
 *
 * The asm reads the buffer through a pointer passed only as an "r" operand,
 * so the "memory" clobber is needed to make the compiler commit pending
 * stores to the buffer before the asm runs.
 */
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
        :"memory"\
    );\
    return sum&0xFFFF;\
}
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1852
/* sum_abs_dctelem_mmx: MMX asm body with the *_MMX abs / horizontal-sum
 * helpers. DCT_SAD stays bound to DCT_SAD_MMX for the mmx2 variant below. */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1853 #define DCT_SAD DCT_SAD_MMX
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1854 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1855 #define MMABS(a,z) MMABS_MMX(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1856 DCT_SAD_FUNC(mmx)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1857 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1858 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1859
/* sum_abs_dctelem_mmx2: same asm body with the *_MMX2 helpers. MMABS is
 * intentionally NOT undefined afterwards: the sse2 variant is expanded with
 * MMABS still bound to MMABS_MMX2. */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1860 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1861 #define MMABS(a,z) MMABS_MMX2(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1862 DCT_SAD_FUNC(mmx2)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1863 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1864 #undef DCT_SAD
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1865
/* sum_abs_dctelem_sse2: SSE2 asm body and HSUM_SSE2, MMABS inherited from the
 * mmx2 section above. DCT_SAD and HSUM remain bound for the ssse3 variant. */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1866 #define DCT_SAD DCT_SAD_SSE2
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1867 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1868 DCT_SAD_FUNC(sse2)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1869 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1870
/* sum_abs_dctelem_ssse3 (only when HAVE_SSSE3 is defined): SSE2 asm body and
 * HSUM_SSE2 with MMABS = MMABS_SSSE3. */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1871 #ifdef HAVE_SSSE3
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1872 #define MMABS(a,z) MMABS_SSSE3(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1873 DCT_SAD_FUNC(ssse3)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1874 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1875 #endif
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1876 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1877 #undef DCT_SAD
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	1878
/**
 * Sum of squared differences between a vector of int8_t and a vector of
 * int16_t: returns sum over i of (pix2[i] - pix1[i])^2.
 *
 * @param pix1 size signed 8-bit samples
 * @param pix2 size signed 16-bit samples
 * @param size number of elements; must be a positive multiple of 8, because
 *             the loop consumes 8 elements per iteration (walking from the
 *             end of the arrays towards the start) and always runs at least
 *             once
 * @return the accumulated 32-bit sum
 *
 * The differences are formed with 16-bit wrapping arithmetic (psubw).
 * MMX registers mm0-mm4 are overwritten and no emms is executed here.
 * The "memory" clobber tells the compiler that the asm reads the arrays,
 * which are only passed as address ("r") operands.
 */
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"            /* mm4 = accumulator (2 dwords) */
        "1: \n"
        "sub $8, %0 \n"                   /* sets the flags tested by jg below */
        "movq (%2,%0), %%mm2 \n"          /* 8 bytes of pix1 */
        "movq (%3,%0,2), %%mm0 \n"        /* pix2[i+0..3] */
        "movq 8(%3,%0,2), %%mm1 \n"       /* pix2[i+4..7] */
        "punpckhbw %%mm2, %%mm3 \n"       /* pix1 bytes 4..7 into high byte of each word */
        "punpcklbw %%mm2, %%mm2 \n"       /* pix1 bytes 0..3 into high byte of each word */
        "psraw $8, %%mm3 \n"              /* arithmetic shift = sign extension */
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"         /* square and add adjacent pairs */
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"                        /* MMX ops leave EFLAGS untouched */
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"           /* fold high dword into low dword */
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
        :"memory"
    );
    return sum;
}
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding. lorenm parents: 4589 diff changeset	1909
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	1910 #endif //CONFIG_ENCODERS
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	1911
/* The "no_rnd" full-pel names are plain aliases: each forwards its four
 * arguments unchanged to the corresponding put_pixels*_mmx. */
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1912 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1913 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1914
/*
 * QPEL_V_LOW: asm fragment producing one output vector of the vertical
 * lowpass filter.
 *
 * With x1 = m3 + m4, x2 = in2 + m5, x3 = in1 + m6 and x4 = in0 + in7 it
 * computes (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 on 16-bit lanes, packs the
 * result to unsigned bytes with saturation in mm5 and hands it to
 * OP(%%mm5, out, %%mm7, d).
 *
 * Register effects visible below: m3 is overwritten (first with x1, then with
 * in7), and mm4, mm5 and mm6 are always used as scratch by name, regardless
 * of what is passed as m4/m5/m6. The pw_20 and pw_3 parameters are not
 * referenced; the constants are taken from ff_pw_20 / ff_pw_3 via MANGLE().
 */
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1915 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1916 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1917 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1918 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1919 "movq "#in7", " #m3 " \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1920 "movq "#in0", %%mm5 \n\t" /* D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1921 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1922 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1923 "movq "#in1", %%mm5 \n\t" /* C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1924 "movq "#in2", %%mm6 \n\t" /* B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1925 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1926 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1927 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1928 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1929 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1930 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1931 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1932 "psraw $5, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1933 "packuswb %%mm5, %%mm5 \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1934 OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1935
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1936 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	1937 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t dst, uint8_t src, int dstStride, int srcStride, int h){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1938 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1939 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1940 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1941 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1942 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1943 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1944 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1945 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1946 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1947 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1948 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1949 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1950 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1951 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1952 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1953 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1954 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1955 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1956 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1957 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1958 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1959 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1960 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1961 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1962 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1963 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1964 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1965 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1966 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1967 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1968 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1969 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1970 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1971 "movq %%mm0, %5 \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1972 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	1973 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1974 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1975 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1976 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1977 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1978 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1979 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1980 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1981 "paddw %%mm0, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1982 "paddw %%mm5, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1983 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1984 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1985 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1986 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1987 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1988 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1989 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1990 "paddw %%mm2, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1991 "paddw %%mm6, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1992 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1993 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1994 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1995 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1996 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1997 "movq %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	1998 "packuswb %%mm3, %%mm1 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	1999 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2000 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2001 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2002 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2003 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2004 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2005 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2006 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2007 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2008 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2009 "paddw %%mm1, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2010 "paddw %%mm4, %%mm0 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2011 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2012 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2013 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2014 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2015 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2016 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2017 "paddw %%mm3, %%mm2 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2018 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2019 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2020 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2021 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2022 "paddw %%mm2, %%mm6 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2023 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2024 "paddw %6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2025 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2026 "psraw $5, %%mm0 \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2027 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2028 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2029 "paddw %%mm5, %%mm3 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2030 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2031 "paddw %%mm4, %%mm6 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2032 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2033 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2034 "paddw %%mm1, %%mm4 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2035 "paddw %%mm2, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2036 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2037 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2038 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2039 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2040 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2041 "paddw %6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2042 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2043 "psraw $5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2044 "packuswb %%mm4, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2045 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2046 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2047 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2048 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2049 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2050 " jnz 1b \n\t"\
967 274b518c4ecb PIC / ebx fix michaelni parents: 966 diff changeset	2051 : "+a"(src), "+c"(dst), "+m"(h)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	2052 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2053 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2054 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2055 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2056 \
/* Horizontal qpel lowpass over a 16-pixel-wide block, h rows (3DNow! variant).\
 * For every row the filter 20a - 6b + 3c - d is evaluated in plain C into\
 * temp[0..15], reading src[0..16]; taps that would fall outside that range\
 * are folded back inside it (see the first and last few rows of the table).\
 * The asm then adds ROUNDER, shifts right by 5, packs the words to bytes with\
 * unsigned saturation and passes each 8-byte half to OP_3DNOW together with\
 * the destination address.  OP_3DNOW and ROUNDER come from the macro that\
 * instantiates this function.\
 * dst/src are advanced by dstStride/srcStride after each row. */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2057 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2058 int i;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2059 int16_t temp[16]; /* unrounded filter sums for one row */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2060 /* quick HACK, XXX FIXME MUST be optimized */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2061 for(i=0; i<h; i++)\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2062 {\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2063 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2064 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2065 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2066 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2067 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2068 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2069 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2070 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2071 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2072 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2073 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2074 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2075 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2076 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2077 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2078 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2079 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2080 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2081 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2082 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2083 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2084 "psraw $5, %%mm0 \n\t" /* >> 5 (arithmetic) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2085 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2086 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 saturated bytes */\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2087 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2088 "movq 16(%0), %%mm0 \n\t" /* temp[8..11] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2089 "movq 24(%0), %%mm1 \n\t" /* temp[12..15] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2090 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2091 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2092 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2093 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2094 "packuswb %%mm1, %%mm0 \n\t"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2095 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2096 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0=temp, %1=dst, %2=ROUNDER */\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2097 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2098 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2099 dst+=dstStride;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2100 src+=srcStride;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2101 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2102 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2103 \
/* Horizontal qpel lowpass over an 8-pixel-wide block, h rows (MMX2 variant).\
 * The whole row loop lives in one asm statement: each iteration loads 8 bytes\
 * at src plus 4 bytes at src+5, forms 20a - 6b + 3c - d for the left and the\
 * right four outputs, adds ROUNDER (%6), shifts right by 5, packs both halves\
 * to 8 saturated bytes and passes them to OP_MMX2 with the address in %1.\
 * Then src += srcStride, dst += dstStride, and h is decremented.\
 * Operands: %0=src (eax), %1=dst (ecx), %2=h (memory), %3=srcStride,\
 * %4=dstStride, %5=temp, %6=ROUNDER.  %5 is not referenced by the instructions\
 * visible in this function; it keeps the operand numbering as it is.\
 * The loop is bottom-tested, so h must be at least 1.\
 * OP_MMX2, ROUNDER and MANGLE are provided elsewhere in the file. */\
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	2104 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2105 uint64_t temp;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2106 \
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2107 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2108 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used for byte->word unpacking */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2109 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2110 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2111 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2112 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2113 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2114 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2115 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2116 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2117 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2118 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2119 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2120 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2121 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2122 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2123 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2124 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2125 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2126 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2127 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2128 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2129 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2130 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2131 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2132 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2133 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2134 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2135 "paddw %6, %%mm6 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2136 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2137 "psraw $5, %%mm0 \n\t" /* >> 5: left four outputs, as words */\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2138 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2139 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2140 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2141 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2142 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2143 "paddw %%mm5, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2144 "paddw %%mm6, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2145 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2146 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2147 "paddw %%mm6, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2148 "paddw %%mm5, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2149 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2150 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2151 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2152 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2153 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2154 "paddw %6, %%mm1 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2155 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2156 "psraw $5, %%mm3 \n\t" /* >> 5: right four outputs, as words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2157 "packuswb %%mm3, %%mm0 \n\t" /* both halves -> 8 saturated bytes */\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2158 OP_MMX2(%%mm0, (%1), %%mm4, q)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2159 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2160 "add %3, %0 \n\t" /* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2161 "add %4, %1 \n\t" /* dst += dstStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2162 "decl %2 \n\t" /* --h */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2163 " jnz 1b \n\t"\
967 274b518c4ecb PIC / ebx fix michaelni parents: 966 diff changeset	2164 : "+a"(src), "+c"(dst), "+m"(h)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	2165 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2166 : "memory"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2167 );\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2168 }\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2169 \
/* Horizontal qpel lowpass over an 8-pixel-wide block, h rows (3DNow! variant).\
 * For every row the filter 20a - 6b + 3c - d is evaluated in plain C into\
 * temp[0..7], reading src[0..8]; taps that would fall outside that range are\
 * folded back inside it.  The asm then adds ROUNDER, shifts right by 5, packs\
 * the eight words to bytes with unsigned saturation and passes them to\
 * OP_3DNOW together with the destination address.  OP_3DNOW and ROUNDER come\
 * from the macro that instantiates this function.\
 * dst/src are advanced by dstStride/srcStride after each row. */\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2170 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2171 int i;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2172 int16_t temp[8]; /* unrounded filter sums for one row */\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2173 /* quick HACK, XXX FIXME MUST be optimized */\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2174 for(i=0; i<h; i++)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2175 {\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2176 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2177 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2178 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2179 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2180 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2181 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2182 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2183 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2184 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2185 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2186 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2187 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2188 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2189 "psraw $5, %%mm0 \n\t" /* >> 5 (arithmetic) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2190 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2191 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 saturated bytes */\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2192 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2193 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0=temp, %1=dst, %2=ROUNDER */\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2194 :"memory"\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2195 );\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2196 dst+=dstStride;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2197 src+=srcStride;\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2198 }\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2199 }
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2200
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2201 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2202 \
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2203 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t dst, uint8_t src, int dstStride, int srcStride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2204 uint64_t temp[17*4];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2205 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2206 int count= 17;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2207 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2208 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2209 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2210 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2211 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2212 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2213 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2214 "movq 8(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2215 "movq 8(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2216 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2217 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2218 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2219 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2220 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2221 "movq %%mm1, 17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2222 "movq %%mm2, 2178(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2223 "movq %%mm3, 3178(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2224 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2225 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2226 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2227 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2228 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	2229 : "r" ((long)srcStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2230 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2231 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2232 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2233 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2234 count=4;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2235 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2236 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2237 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2238 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2239 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2240 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2241 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2242 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2243 "movq 24(%0), %%mm3 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2244 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2245 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2246 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2247 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2248 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2249 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2250 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2251 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2252 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2253 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2254 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2255 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2256 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2258 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2259 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2260 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2261 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2262 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2263 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2264 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2265 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2266 "add %4, %1 \n\t" \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2267 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2268 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2269 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2270 "add $136, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2271 "add %6, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2272 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2273 " jnz 1b \n\t"\
958 9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped) michaelni parents: 954 diff changeset	2274 \
967 274b518c4ecb PIC / ebx fix michaelni parents: 966 diff changeset	2275 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	2276 : "r"((long)dstStride), "r"(2(long)dstStride), /"m"(ff_pw_20), "m"(ff_pw_3),/ "m"(ROUNDER), "g"(4-14(long)dstStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2277 :"memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2278 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2279 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2280 \
1057 bb5de8a59da8 * static,const,compiler warning cleanup kabi parents: 997 diff changeset	2281 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t dst, uint8_t src, int dstStride, int srcStride){\
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	2282 uint64_t temp[9*2];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2283 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2284 int count= 9;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2285 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2286 /FIXME unroll /\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2287 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2288 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2289 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2290 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2291 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2292 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2293 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2294 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2295 "movq %%mm1, 9*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2296 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2297 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2298 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2299 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2300 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	2301 : "r" ((long)srcStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2302 : "memory"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2303 );\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2304 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2305 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2306 count=2;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2307 \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2308 /FIXME reorder for speed /\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2309 asm volatile(\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2310 /"pxor %%mm7, %%mm7 \n\t"/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2311 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2312 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2313 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2314 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2315 "movq 24(%0), %%mm3 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2316 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2317 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2318 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2319 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2320 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2321 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2322 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2323 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2324 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2325 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2326 "add %4, %1 \n\t"\
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2327 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2328 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2329 \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2330 "add $72, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2331 "add %6, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2332 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2333 " jnz 1b \n\t"\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2334 \
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	2335 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2293 15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>) michael parents: 2256 diff changeset	2336 : "r"((long)dstStride), "r"(2(long)dstStride), /"m"(ff_pw_20), "m"(ff_pw_3),/ "m"(ROUNDER), "g"(4-6(long)dstStride)\
966 7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2337 : "memory"\
7ef9226f430e more gcc bug workarounds michaelni parents: 961 diff changeset	2338 );\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2339 }\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2340 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2341 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t dst, uint8_t src, int stride){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2342 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2343 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2344 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2345 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2346 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2347 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2348 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2349 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2350 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2351 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2352 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2353 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2354 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2355 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2356 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2357 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2358 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2359 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2360 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2361 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2362 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2363 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2364 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2365 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2366 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2367 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2368 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2369 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2370 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t dst, uint8_t src, int stride){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2371 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2372 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2373 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2374 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2375 uint64_t temp[8];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2376 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2377 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2378 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2379 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2380 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2381 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2382 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2383 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2384 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2385 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2386 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2387 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2388 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2389 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2390 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2391 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2392 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2393 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2394 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2395 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2396 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2397 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2398 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2399 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2400 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2401 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2402 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2403 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2404 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2405 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2406 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2407 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2408 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2409 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2410 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2411 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2412 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2413 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2414 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2415 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2416 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2417 uint64_t half[8 + 9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2418 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2419 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2420 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2421 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2422 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2423 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2424 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2425 uint64_t half[8 + 9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2426 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2427 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2428 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2429 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2430 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2431 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2432 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2433 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2434 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2435 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2436 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2437 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2438 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2439 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2440 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2441 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2442 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2443 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2444 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2445 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2446 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2447 uint64_t half[9];\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2448 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2449 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2450 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2451 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2452 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t dst, uint8_t src, int stride){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2453 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2454 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2455 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2456 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2457 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2458 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2459 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2460 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2461 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2462 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2463 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2464 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2465 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2466 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2467 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2468 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2469 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2470 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2471 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2472 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2473 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2474 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2475 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2476 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2477 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2478 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2479 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2480 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2481 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t dst, uint8_t src, int stride){\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2482 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2483 }\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2484 \
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2485 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2486 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2487 uint8_t * const half= (uint8_t*)temp;\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2488 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2489 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2490 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2491 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2492 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2493 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2494 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2495 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2496 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2497 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2498 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2499 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2500 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2501 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2502 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2503 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2504 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2505 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2506 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2507 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2508 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2509 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2510 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2511 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2512 uint8_t * const halfHV= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2513 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2514 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2515 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2516 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2517 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2518 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2519 uint64_t half[162 + 172];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2520 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2521 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2522 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2523 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2524 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2525 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2526 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2527 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2528 uint64_t half[162 + 172];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2529 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2530 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2531 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2532 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2533 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2534 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2535 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2536 uint64_t half[162 + 172];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2537 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2538 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2539 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2540 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2541 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2542 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2543 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2544 uint64_t half[17*2];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2545 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2546 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2547 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2548 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2549 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2550 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2551 uint64_t half[17*2];\
e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2552 uint8_t * const halfH= ((uint8_t*)half);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2553 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster michael parents: 2067 diff changeset	2554 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	2555 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2556 }\
1064 b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t kabi parents: 1057 diff changeset	2557 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t dst, uint8_t src, int stride){\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2558 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2559 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2560 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2561 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2562 }
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2563
/* PUT_OP: asm-template fragment for a plain move.
 * Expands to the string "mov<size> <a>, <b> \n\t" (arguments are stringified).
 * The temp argument is accepted but not used in the expansion. */
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2564 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* AVG_3DNOW_OP: asm-template fragment that emits three instructions, in order:
 *   mov<size> <b>, <temp>
 *   pavgusb   <temp>, <a>
 *   mov<size> <a>, <b>
 * NOTE(review): with source-first operand order this reads b into temp,
 * averages it into a and writes a back to b -- confirm against the asm
 * dialect used by the callers. */
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2565 #define AVG_3DNOW_OP(a,b,temp, size) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2566 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2567 "pavgusb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2568 "mov" #size " " #a ", " #b " \n\t"
/* AVG_MMX2_OP: same three-instruction sequence as AVG_3DNOW_OP, but the
 * averaging instruction emitted is pavgb instead of pavgusb:
 *   mov<size> <b>, <temp>
 *   pavgb     <temp>, <a>
 *   mov<size> <a>, <b> */
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2569 #define AVG_MMX2_OP(a,b,temp, size) \
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2570 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2571 "pavgb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	2572 "mov" #size " " #a ", " #b " \n\t"
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2573
/* Instantiate the qpel function families.
 * Each invocation passes a name prefix (put_, avg_, put_no_rnd_), a constant
 * (ff_pw_16 for put_/avg_, ff_pw_15 for put_no_rnd_), the rounding infix used
 * when pasting helper names (_ or _no_rnd_), and the store/average op macro(s).
 * QPEL_BASE receives two op macros (for avg_: the MMX2 one and the 3DNow one);
 * QPEL_OP receives one op macro plus the CPU suffix (3dnow or mmx2) that is
 * pasted onto every generated function name.
 * NOTE(review): parameter roles are inferred from the visible part of QPEL_OP;
 * the QPEL_BASE definition is not shown here -- confirm. */
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2574 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2575 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2576 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2577 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2578 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2579 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2580 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	2581 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2582 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2583
3807 6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2584 /***********************************/
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2585 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2586
/* QPEL_2TAP_XY: defines the static function
 *   <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX>(dst, src, stride)
 * as a thin wrapper that forwards to
 *   <OPNAME>pixels<SIZE><HPEL>(dst, src, stride, SIZE).
 * HPEL is the full name suffix of the target helper (e.g. _x2_<MMX>),
 * so the caller of this macro chooses which helper is used. */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2587 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2588 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2589 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2590 }
/* QPEL_2TAP_L3: defines the static function
 *   <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX>(dst, src, stride)
 * as a thin wrapper that forwards to
 *   <OPNAME>2tap_qpel<SIZE>_l3_<MMX>(dst, src+S0, stride, SIZE, S1, S2).
 * S0 is added to src; S1 and S2 are passed through unchanged. They may be
 * expressions that mention the wrapper's stride parameter.
 * NOTE(review): the _l3_ helper is defined elsewhere; presumably S1/S2 are
 * offsets of two further source positions relative to src+S0 -- confirm. */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2591 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2592 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2593 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2594 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2595
/* QPEL_2TAP: generates the sixteen <OPNAME>2tap_qpel<SIZE>_mcXY_<MMX> entries
 * for one (OPNAME, SIZE, MMX) combination:
 *   - mc20, mc02, mc22: wrappers around the pixels<SIZE> _x2_, _y2_ and _xy2_
 *     helpers. mc22 always uses the _xy2_mmx helper, whatever MMX is.
 *   - mc00: const function pointer to <OPNAME>qpel<SIZE>_mc00_<MMX>.
 *   - mc21, mc12: const function pointers aliasing the mc20 and mc02
 *     entries generated above.
 *   - mc32: calls the _y2_ helper on src+1.
 *   - mc23: calls the _x2_ helper on src+stride.
 *   - mc10, mc30, mc01, mc03, mc11, mc31, mc13, mc33: wrappers around the
 *     _l3_ helper, each with its own (S0, S1, S2) offset triple.
 * The final line ends in a continuation backslash, so the line that follows
 * the macro must stay empty. */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2596 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2597 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2598 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2599 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2600 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2601 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2602 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2603 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2604 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2605 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2606 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2607 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2608 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2609 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t dst, uint8_t src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2610 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2611 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2612 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2613 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2614 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2615 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2616 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2617 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2618 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2619 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2620
/* Instantiate the 2-tap qpel function sets: put_ and avg_ variants, for
 * block sizes 16 and 8, once with the mmx2 suffix and once with 3dnow. */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2621 QPEL_2TAP(put_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2622 QPEL_2TAP(avg_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2623 QPEL_2TAP(put_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2624 QPEL_2TAP(avg_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2625 QPEL_2TAP(put_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2626 QPEL_2TAP(avg_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2627 QPEL_2TAP(put_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2628 QPEL_2TAP(avg_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2629
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	2630
/* Disabled code: the #if 0 guard compiles this out. just_return() is an
 * empty function that returns immediately. */
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	2631 #if 0
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2632 static void just_return() { return; }
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	2633 #endif
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	2634
/* SET_QPEL_FUNC: fills three members of the context pointed to by c
 * (a variable named c must be in scope at the point of use):
 *   c->put_<postfix1>        = put_<postfix2>;
 *   c->put_no_rnd_<postfix1> = put_no_rnd_<postfix2>;
 *   c->avg_<postfix1>        = avg_<postfix2>;
 * The expansion is three complete statements, each with its own semicolon,
 * so a call site needs no trailing semicolon. Because it is not a single
 * statement it must not be used as the unbraced body of an if/else/loop. */
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2635 #define SET_QPEL_FUNC(postfix1, postfix2) \
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2636 c->put_ ## postfix1 = put_ ## postfix2;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2637 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	2638 c->avg_ ## postfix1 = avg_ ## postfix2;
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2639
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2640 static void gmc_mmx(uint8_t dst, uint8_t src, int stride, int h, int ox, int oy,
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2641 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2642 const int w = 8;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2643 const int ix = ox>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2644 const int iy = oy>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2645 const int oxs = ox>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2646 const int oys = oy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2647 const int dxxs = dxx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2648 const int dxys = dxy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2649 const int dyxs = dyx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2650 const int dyys = dyy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2651 const uint16_t r4[4] = {r,r,r,r};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2652 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2653 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2654 const uint64_t shift2 = 2*shift;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2655 uint8_t edge_buf[(h+1)*stride];
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2656 int x, y;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2657
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2658 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2659 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2660 const int dxh = dxy*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2661 const int dyw = dyx*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2662 if( // non-constant fullpel offset (3% of blocks)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2663 (ox^(ox+dxw) \| ox^(ox+dxh) \| ox^(ox+dxw+dxh) \|
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2664 oy^(oy+dyw) \| oy^(oy+dyh) \| oy^(oy+dyw+dyh)) >> (16+shift)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2665 // uses more than 16 bits of subpel mv (only at huge resolution)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2666 \|\| (dxx\|dxy\|dyx\|dyy)&15 )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2667 {
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2668 //FIXME could still use mmx for some of the rows
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2669 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2670 return;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2671 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2672
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2673 src += ix + iy*stride;
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2674 if( (unsigned)ix >= width-w \|\|
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2675 (unsigned)iy >= height-h )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2676 {
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2677 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2678 src = edge_buf;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2679 }
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2680
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2681 asm volatile(
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2682 "movd %0, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2683 "pxor %%mm7, %%mm7 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2684 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2685 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2686 :: "r"(1<<shift)
fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2687 );
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2688
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2689 for(x=0; x<w; x+=4){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2690 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2691 oxs - dxys + dxxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2692 oxs - dxys + dxxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2693 oxs - dxys + dxxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2694 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2695 oys - dyys + dyxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2696 oys - dyys + dyxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2697 oys - dyys + dyxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2698
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2699 for(y=0; y<h; y++){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2700 asm volatile(
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2701 "movq %0, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2702 "movq %1, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2703 "paddw %2, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2704 "paddw %3, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2705 "movq %%mm4, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2706 "movq %%mm5, %1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2707 "psrlw $12, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2708 "psrlw $12, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2709 : "+m"(dx4), "+m"(dy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2710 : "m"(dxy4), "m"(dyy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2711 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2712
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2713 asm volatile(
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2714 "movq %%mm6, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2715 "movq %%mm6, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2716 "psubw %%mm4, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2717 "psubw %%mm5, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2718 "movq %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2719 "movq %%mm4, %%mm3 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2720 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2721 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2722 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2723 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2724
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2725 "movd %4, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2726 "movd %3, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2727 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2728 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2729 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2730 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2731
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2732 "movd %2, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2733 "movd %1, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2734 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2735 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2736 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2737 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
3250 fec9bc8d63fc gmc_mmx tweaks lorenm parents: 3248 diff changeset	2738 "paddw %5, %%mm1 \n\t"
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2739 "paddw %%mm3, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2740 "paddw %%mm1, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2741 "paddw %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2742
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2743 "psrlw %6, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2744 "packuswb %%mm0, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2745 "movd %%mm0, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2746
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2747 : "=m"(dst[x+y*stride])
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2748 : "m"(src[0]), "m"(src[1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2749 "m"(src[stride]), "m"(src[stride+1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2750 "m"(*r4), "m"(shift2)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2751 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2752 src += stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2753 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2754 src += 4-h*stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2755 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2756 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	2757
#ifdef CONFIG_ENCODERS

/*
 * dsputil_mmx_qns.h is included once per instruction-set variant below.
 * Before each inclusion these macros are (re)defined, and they are undefined
 * again afterwards so that the next variant starts from a clean slate:
 *   DEF(x)              - pastes the variant suffix (_mmx, _3dnow, _ssse3) onto x
 *   SET_RND             - MOVQ_WONE for MMX, expands to nothing for the others
 *   SCALE_OFFSET        - per-variant constant (1, 0, -1)
 *   PMULHRW(x, y, s, o) - multiplies x and y by s, keeping a high part of the
 *                         products (see the instruction summary below)
 *   PHADDD(a, t)        - adds the upper 32 bits of a to its lower 32 bits,
 *                         using t as a scratch register
 * NOTE(review): how the header consumes these macros is not visible from
 * here - confirm against dsputil_mmx_qns.h.
 */

/* MMX flavour of PHADDD: copy a to t, shift a right by 32 bits, add t back. */
#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"
/*
    pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
    pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
    pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
/* Plain MMX only has the truncating pmulhw, so the o operand is added
 * afterwards and the sum is shifted right by one bit.
 * NOTE(review): o is presumably the constant loaded by SET_RND - confirm. */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

/* 3DNow! variant: pmulhrw replaces the pmulhw/paddw/psraw sequence, so
 * SET_RND is empty and the o argument of PMULHRW is unused.  PHADDD is
 * still the MMX definition from above. */
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
/* SSSE3 variant: pmulhrsw for the multiply, and a pshufw-based PHADDD that
 * replaces the MMX one defined above. */
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#endif //HAVE_SSSE3

/* Drop PHADDD whether or not the SSSE3 variant was built, so the MMX
 * definition does not leak into the rest of the file when HAVE_SSSE3 is
 * not defined. */
#undef PHADDD

#endif /* CONFIG_ENCODERS */
2754 a49f140179e9 sort H.264 mmx dsp functions into their own file lorenm parents: 2753 diff changeset	2823
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2824 #define PREFETCH(name, op) \
4172 608e2dfcb86e adding more static keywords mru parents: 4127 diff changeset	2825 static void name(void *mem, int stride, int h){\
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2826 const uint8_t *p= mem;\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2827 do{\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2828 asm volatile(#op" %0" :: "m"(*p));\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2829 p+= stride;\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2830 }while(--h);\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2831 }
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2832 PREFETCH(prefetch_mmx2, prefetcht0)
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2833 PREFETCH(prefetch_3dnow, prefetch)
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2834 #undef PREFETCH
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	2835
2754 a49f140179e9 sort H.264 mmx dsp functions into their own file lorenm parents: 2753 diff changeset	2836 #include "h264dsp_mmx.c"
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	2837
3524 419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	2838 /* AVS specific */
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	2839 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	2840
/**
 * CAVS qpel 8-wide "put" entry for the mc00 position.
 * Forwards to put_pixels8_mmx with a fixed height of 8.
 *
 * @param dst    destination pixels
 * @param src    source pixels
 * @param stride passed through unchanged to put_pixels8_mmx
 */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
/**
 * CAVS qpel 8-wide "avg" entry for the mc00 position.
 * Forwards to avg_pixels8_mmx with a fixed height of 8.
 *
 * @param dst    destination pixels
 * @param src    source pixels
 * @param stride passed through unchanged to avg_pixels8_mmx
 */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
/**
 * CAVS qpel 16-wide "put" entry for the mc00 position.
 * Forwards to put_pixels16_mmx with a fixed height of 16.
 *
 * @param dst    destination pixels
 * @param src    source pixels
 * @param stride passed through unchanged to put_pixels16_mmx
 */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
/**
 * CAVS qpel 16-wide "avg" entry for the mc00 position.
 * Forwards to avg_pixels16_mmx with a fixed height of 16.
 *
 * @param dst    destination pixels
 * @param src    source pixels
 * @param stride passed through unchanged to avg_pixels16_mmx
 */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	2853
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2854 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2855 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2856 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2857
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2858 /* XXX: those functions should be suppressed ASAP when all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2859 converted */
4020 723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure diego parents: 4001 diff changeset	2860 #ifdef CONFIG_GPL
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2861 static void ff_libmpeg2mmx_idct_put(uint8_t dest, int line_size, DCTELEM block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2862 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2863 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2864 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2865 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2866 static void ff_libmpeg2mmx_idct_add(uint8_t dest, int line_size, DCTELEM block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2867 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2868 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2869 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2870 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2871 static void ff_libmpeg2mmx2_idct_put(uint8_t dest, int line_size, DCTELEM block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2872 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2873 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2874 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2875 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2876 static void ff_libmpeg2mmx2_idct_add(uint8_t dest, int line_size, DCTELEM block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2877 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2878 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2879 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	2880 }
4020 723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure diego parents: 4001 diff changeset	2881 #endif
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2882 static void ff_idct_xvid_mmx_put(uint8_t dest, int line_size, DCTELEM block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2883 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2884 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2885 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2886 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2887 static void ff_idct_xvid_mmx_add(uint8_t dest, int line_size, DCTELEM block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2888 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2889 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2890 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2891 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2892 static void ff_idct_xvid_mmx2_put(uint8_t dest, int line_size, DCTELEM block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2893 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2894 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2895 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2896 }
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2897 static void ff_idct_xvid_mmx2_add(uint8_t dest, int line_size, DCTELEM block)
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2898 {
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2899 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2900 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	2901 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	2902
/**
 * Vorbis inverse channel coupling, 3DNow! version, applied in place.
 *
 * Each iteration loads two floats from mag[i] and ang[i] and stores two floats
 * back, so an odd blocksize makes the last iteration touch index blocksize.
 * With m = mag[i], a = ang[i] and t = (m >= 0.0 ? -a : a) the result is:
 *     ang[i] = (a >= 0.0) ? m + t : m
 *     mag[i] = (a >= 0.0) ? m     : m - t
 *
 * mm7 is zeroed by a separate asm statement before the loop and every
 * iteration relies on it still being zero. femms is issued at the end.
 *
 * @param mag       magnitude vector, updated in place
 * @param ang       angle vector, updated in place
 * @param blocksize number of floats to process
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        __asm__ volatile(
            "movq    %0,    %%mm0 \n\t"
            "movq    %1,    %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // mm2 = (m >= 0.0) ? ~0 : 0
            "pfcmpge %%mm7, %%mm3 \n\t" // mm3 = (a >= 0.0) ? ~0 : 0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t" // t = a, sign flipped where m >= 0.0
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t" // mm3 = (a >= 0.0) ? t : 0
            "pandn   %%mm1, %%mm4 \n\t" // mm4 = (a >= 0.0) ? 0 : t
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ( (a >= 0.0) & t)
            "pfsub   %%mm4, %%mm0 \n\t" // m = m - (!(a >= 0.0) & t)
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    __asm__ volatile("femms");
}
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2930 static void vorbis_inverse_coupling_sse(float mag, float ang, int blocksize)
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2931 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2932 int i;
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2933
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2934 asm volatile(
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2935 "movaps %0, %%xmm5 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2936 ::"m"(ff_pdw_80000000[0])
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2937 );
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2938 for(i=0; i<blocksize; i+=4) {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2939 asm volatile(
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2940 "movaps %0, %%xmm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2941 "movaps %1, %%xmm1 \n\t"
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2942 "xorps %%xmm2, %%xmm2 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2943 "xorps %%xmm3, %%xmm3 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2944 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2945 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2946 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2947 "xorps %%xmm2, %%xmm1 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2948 "movaps %%xmm3, %%xmm4 \n\t"
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2949 "andps %%xmm1, %%xmm3 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	2950 "andnps %%xmm1, %%xmm4 \n\t"
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2951 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2952 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2953 "movaps %%xmm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2954 "movaps %%xmm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2955 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2956 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2957 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2958 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2959 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	2960
/**
 * In-place element-wise product using 3DNow!: dst[k] *= src[k].
 *
 * The loop starts at byte offset (len-4)*4 and walks down in 16-byte steps
 * (four floats) until the offset becomes negative. The body always runs at
 * least once, so len has to be a positive multiple of 4. femms is issued
 * inside the asm once the loop is done.
 *
 * @param dst multiplicand and destination
 * @param src multiplier
 * @param len number of floats
 */
static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;
    __asm__ volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge  1b \n\t"
        "femms \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
/**
 * In-place element-wise product using SSE: dst[k] *= src[k].
 *
 * The loop starts at byte offset (len-8)*4 and walks down in 32-byte steps
 * (eight floats) until the offset becomes negative. The body always runs at
 * least once, so len has to be a positive multiple of 8. All accesses use
 * movaps / mulps with memory operands, so dst and src must be 16-byte aligned.
 *
 * @param dst multiplicand and destination
 * @param src multiplier
 * @param len number of floats
 */
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    __asm__ volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge  1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	2996
/**
 * Multiply by a reversed vector using extended 3DNow! (pswapd):
 * dst[k] = src0[k] * src1[len-1-k].
 *
 * src1 is read forwards four floats at a time (each pair swapped by pswapd)
 * while dst and src0 are addressed from byte offset len*4-16 downwards in
 * 16-byte steps. The body always runs at least once, so len has to be a
 * positive multiple of 4. femms is issued afterwards.
 *
 * The asm stores through dst, which is only passed as a register input, so
 * the "memory" clobber is what tells the compiler that memory changes.
 *
 * @param dst  destination
 * @param src0 vector read in forward order
 * @param src1 vector read in reverse order
 * @param len  number of floats
 */
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    __asm__ volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory"
    );
    __asm__ volatile("femms");
}
/**
 * Multiply by a reversed vector using SSE:
 * dst[k] = src0[k] * src1[len-1-k].
 *
 * src1 is read forwards eight floats at a time and each group of four is
 * reversed with shufps $0x1b, while dst and src0 are addressed from byte
 * offset len*4-32 downwards in 32-byte steps. The body always runs at least
 * once, so len has to be a positive multiple of 8. All accesses use movaps /
 * mulps with memory operands, so dst, src0 and src1 must be 16-byte aligned.
 *
 * The asm stores through dst, which is only passed as a register input, so
 * the "memory" clobber is what tells the compiler that memory changes.
 *
 * @param dst  destination
 * @param src0 vector read in forward order
 * @param src1 vector read in reverse order
 * @param len  number of floats
 */
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    __asm__ volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t"
        "movaps          (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t"
        "mulps      16(%3,%0), %%xmm1 \n\t"
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"
        "sub    $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory"
    );
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3034
/**
 * Multiply-add with optional output stride, 3DNow! version.
 *
 * Two asm fast paths exist, both only for src3 == 0:
 *  - step == 2: dst[2*k] = src0[k]*src1[k] + src2[k]; the odd destination
 *    slots are not written.
 *  - step == 1: dst[k]   = src0[k]*src1[k] + src2[k].
 * Every other combination is forwarded unchanged to ff_vector_fmul_add_add_c().
 *
 * Both fast paths walk the sources from byte offset (len-4)*4 downwards in
 * 16-byte steps and always run the body at least once, so len has to be a
 * positive multiple of 4. femms is issued on every path, including after the
 * C fallback.
 *
 * @param dst  destination
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param src3 extra term; only 0 is handled by the asm paths
 * @param len  number of source floats
 * @param step destination stride in floats; 1 and 2 have asm paths
 */
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    if(step == 2 && src3 == 0){
        /* point dst at the output slot of source element len-4 */
        dst += (len-4)*2;
        __asm__ volatile(
            "1: \n\t"
            "movq   (%2,%0),  %%mm0 \n\t"
            "movq  8(%2,%0),  %%mm1 \n\t"
            "pfmul  (%3,%0),  %%mm0 \n\t"
            "pfmul 8(%3,%0),  %%mm1 \n\t"
            "pfadd  (%4,%0),  %%mm0 \n\t"
            "pfadd 8(%4,%0),  %%mm1 \n\t"
            "movd     %%mm0,   (%1) \n\t" // element 0 -> dst[0]
            "movd     %%mm1, 16(%1) \n\t" // element 2 -> dst[4]
            "psrlq      $32,  %%mm0 \n\t"
            "psrlq      $32,  %%mm1 \n\t"
            "movd     %%mm0,  8(%1) \n\t" // element 1 -> dst[2]
            "movd     %%mm1, 24(%1) \n\t" // element 3 -> dst[6]
            "sub  $32, %1 \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        __asm__ volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq  %%mm0,   (%1,%0) \n\t"
            "movq  %%mm1,  8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    __asm__ volatile("femms");
}
/**
 * Multiply-add with optional output stride, SSE version.
 *
 * Two asm fast paths exist, both only for src3 == 0:
 *  - step == 2: dst[2*k] = src0[k]*src1[k] + src2[k]; the odd destination
 *    slots are not written.
 *  - step == 1: dst[k]   = src0[k]*src1[k] + src2[k].
 * Every other combination is forwarded unchanged to ff_vector_fmul_add_add_c().
 *
 * Both fast paths walk the sources from byte offset (len-8)*4 downwards in
 * 32-byte steps and always run the body at least once, so len has to be a
 * positive multiple of 8. The sources are accessed with movaps / mulps /
 * addps memory operands and must be 16-byte aligned; in the step == 1 path
 * dst is stored with movaps and must be aligned as well.
 *
 * @param dst  destination
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param src3 extra term; only 0 is handled by the asm paths
 * @param len  number of source floats
 * @param step destination stride in floats; 1 and 2 have asm paths
 */
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    if(step == 2 && src3 == 0){
        /* point dst at the output slot of source element len-8 */
        dst += (len-8)*2;
        __asm__ volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss     %%xmm0,   (%1) \n\t" // element 0 -> dst[0]
            "movss     %%xmm1, 32(%1) \n\t" // element 4 -> dst[8]
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 16(%1) \n\t" // element 2 -> dst[4]
            "movss     %%xmm3, 48(%1) \n\t" // element 6 -> dst[12]
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t" // swap neighbours: 1,0,3,2
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss     %%xmm0,  8(%1) \n\t" // element 1 -> dst[2]
            "movss     %%xmm1, 40(%1) \n\t" // element 5 -> dst[10]
            "movhlps   %%xmm0, %%xmm2 \n\t"
            "movhlps   %%xmm1, %%xmm3 \n\t"
            "movss     %%xmm2, 24(%1) \n\t" // element 3 -> dst[6]
            "movss     %%xmm3, 56(%1) \n\t" // element 7 -> dst[14]
            "sub  $64, %1 \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        __asm__ volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3140
/**
 * Convert floats to 16-bit integers using 3DNow!.
 *
 * Each iteration converts four floats (src[i..i+3]) with pf2id, packs them
 * to int16 with signed saturation (packssdw) and stores 8 bytes at dst[i].
 * The loop advances by four, so len is expected to be a multiple of 4;
 * otherwise the last iteration reads and writes past len. femms is issued at
 * the end.
 *
 * The memory operands only name the first element of each group while the
 * instructions access whole 8-byte quantities, so a "memory" clobber is used
 * to make the full extent of the accesses visible to the compiler.
 *
 * @param dst destination samples
 * @param src source floats
 * @param len number of samples
 */
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            :"memory"
        );
    }
    __asm__ volatile("femms");
}
/**
 * Convert an array of floats to signed 16-bit integers using SSE.
 *
 * Each loop iteration converts four samples: two cvtps2pi instructions
 * turn two pairs of floats into 32-bit integers (rounded according to the
 * current MXCSR rounding mode), packssdw narrows them to int16_t with
 * signed saturation, and movq stores the four results.
 *
 * @param dst destination buffer; four int16_t are written per iteration
 * @param src source buffer; four floats are read per iteration
 * @param len number of samples; the loop advances in steps of 4, so a
 *            length that is not a multiple of 4 reads and writes past
 *            src[len-1] / dst[len-1]
 */
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            /* The operands name single elements, but each cvtps2pi reads
             * two floats and movq writes four int16_t; the clobber tells
             * the compiler about the memory the constraints do not cover. */
            :"memory"
        );
    }
    /* Leave MMX state so that later x87 floating point code works. */
    __asm__ volatile("emms");
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3170
#ifdef CONFIG_SNOW_DECODER
/* Prototypes of the MMX/SSE2 Snow routines; they are only declared here,
 * the definitions live outside this file. */

/* Horizontal 9/7 inverse wavelet composition of one line of coefficients. */
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);

/* Vertical 9/7 inverse wavelet composition over six coefficient lines. */
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);

/* OBMC block accumulation into the slice buffer. */
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
#endif
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	3181
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3182 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0 986e461dc072 Initial revision glantau parents: diff changeset	3183 {
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3184 mm_flags = mm_support();
1115 74a46d77e061 * support FF_MM_FORCE kabi parents: 1092 diff changeset	3185
1122 ddc3b0140b8f * oooooops - sorry for this one - wrong logic kabi parents: 1115 diff changeset	3186 if (avctx->dsp_mask) {
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3187 if (avctx->dsp_mask & FF_MM_FORCE)
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3188 mm_flags \|= (avctx->dsp_mask & 0xffff);
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3189 else
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3190 mm_flags &= ~(avctx->dsp_mask & 0xffff);
1122 ddc3b0140b8f * oooooops - sorry for this one - wrong logic kabi parents: 1115 diff changeset	3191 }
1115 74a46d77e061 * support FF_MM_FORCE kabi parents: 1092 diff changeset	3192
631 47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>) michaelni parents: 629 diff changeset	3193 #if 0
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	3194 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3195 if (mm_flags & MM_MMX)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	3196 av_log(avctx, AV_LOG_INFO, " mmx");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3197 if (mm_flags & MM_MMXEXT)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	3198 av_log(avctx, AV_LOG_INFO, " mmxext");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3199 if (mm_flags & MM_3DNOW)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	3200 av_log(avctx, AV_LOG_INFO, " 3dnow");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3201 if (mm_flags & MM_SSE)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	3202 av_log(avctx, AV_LOG_INFO, " sse");
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3203 if (mm_flags & MM_SSE2)
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	3204 av_log(avctx, AV_LOG_INFO, " sse2");
771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	3205 av_log(avctx, AV_LOG_INFO, "\n");
0 986e461dc072 Initial revision glantau parents: diff changeset	3206 #endif
986e461dc072 Initial revision glantau parents: diff changeset	3207
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3208 if (mm_flags & MM_MMX) {
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3209 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3210
1232 e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>) michaelni parents: 1186 diff changeset	3211 #ifdef CONFIG_ENCODERS
2024 f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>) michael parents: 1985 diff changeset	3212 const int dct_algo = avctx->dct_algo;
1565 1a9a63f59849 minor mmx2 optimization if the dct michael parents: 1530 diff changeset	3213 if(dct_algo==FF_DCT_AUTO \|\| dct_algo==FF_DCT_MMX){
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3214 if(mm_flags & MM_SSE2){
1765 e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>) michael parents: 1739 diff changeset	3215 c->fdct = ff_fdct_sse2;
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3216 }else if(mm_flags & MM_MMXEXT){
1565 1a9a63f59849 minor mmx2 optimization if the dct michael parents: 1530 diff changeset	3217 c->fdct = ff_fdct_mmx2;
1a9a63f59849 minor mmx2 optimization if the dct michael parents: 1530 diff changeset	3218 }else{
1a9a63f59849 minor mmx2 optimization if the dct michael parents: 1530 diff changeset	3219 c->fdct = ff_fdct_mmx;
1a9a63f59849 minor mmx2 optimization if the dct michael parents: 1530 diff changeset	3220 }
1a9a63f59849 minor mmx2 optimization if the dct michael parents: 1530 diff changeset	3221 }
1232 e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>) michaelni parents: 1186 diff changeset	3222 #endif //CONFIG_ENCODERS
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3223 if(avctx->lowres==0){
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3224 if(idct_algo==FF_IDCT_AUTO \|\| idct_algo==FF_IDCT_SIMPLEMMX){
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3225 c->idct_put= ff_simple_idct_put_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3226 c->idct_add= ff_simple_idct_add_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3227 c->idct = ff_simple_idct_mmx;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3228 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3717 ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel diego parents: 3712 diff changeset	3229 #ifdef CONFIG_GPL
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3230 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3231 if(mm_flags & MM_MMXEXT){
2256 7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3232 c->idct_put= ff_libmpeg2mmx2_idct_put;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3233 c->idct_add= ff_libmpeg2mmx2_idct_add;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3234 c->idct = ff_mmxext_idct;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3235 }else{
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3236 c->idct_put= ff_libmpeg2mmx_idct_put;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3237 c->idct_add= ff_libmpeg2mmx_idct_add;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3238 c->idct = ff_mmx_idct;
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3239 }
7e0b2e86afa9 1/2 resolution decoding michael parents: 2217 diff changeset	3240 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3717 ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel diego parents: 3712 diff changeset	3241 #endif
5007 f7edc4fe94db Make vp3dsp.c compilation optional. takis* parents: 4988 diff changeset	3242 }else if((ENABLE_VP3_DECODER \|\| ENABLE_VP5_DECODER \|\| ENABLE_VP6_DECODER) &&
f7edc4fe94db Make vp3dsp.c compilation optional. takis* parents: 4988 diff changeset	3243 idct_algo==FF_IDCT_VP3 &&
3721 2000e401593d disable vp3 mmx idct for theora files to avoid artifacts aurel parents: 3717 diff changeset	3244 avctx->codec->id!=CODEC_ID_THEORA &&
3712 f7f75f718efb Enables back the mmx/sse optimized version of the vp3 idct. aurel parents: 3666 diff changeset	3245 !(avctx->flags & CODEC_FLAG_BITEXACT)){
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3246 if(mm_flags & MM_SSE2){
2696 9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3247 c->idct_put= ff_vp3_idct_put_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3248 c->idct_add= ff_vp3_idct_add_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3249 c->idct = ff_vp3_idct_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3250 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3251 }else{
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3252 ff_vp3_dsp_init_mmx();
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3253 c->idct_put= ff_vp3_idct_put_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3254 c->idct_add= ff_vp3_idct_add_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3255 c->idct = ff_vp3_idct_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3256 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API michael parents: 2691 diff changeset	3257 }
3524 419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	3258 }else if(idct_algo==FF_IDCT_CAVS){
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	3259 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3260 }else if(idct_algo==FF_IDCT_XVIDMMX){
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3261 if(mm_flags & MM_MMXEXT){
2868 666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3262 c->idct_put= ff_idct_xvid_mmx2_put;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3263 c->idct_add= ff_idct_xvid_mmx2_add;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3264 c->idct = ff_idct_xvid_mmx2;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3265 }else{
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3266 c->idct_put= ff_idct_xvid_mmx_put;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3267 c->idct_add= ff_idct_xvid_mmx_add;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3268 c->idct = ff_idct_xvid_mmx;
666064f710d4 xvids mmx&mmx2 idcts michael parents: 2864 diff changeset	3269 }
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3270 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3271 }
1868 771dcc2d4a0c use optimized VP3 functions where appropriate melanson parents: 1845 diff changeset	3272
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3273 #ifdef CONFIG_ENCODERS
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3274 c->get_pixels = get_pixels_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3275 c->diff_pixels = diff_pixels_mmx;
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3276 #endif //CONFIG_ENCODERS
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3277 c->put_pixels_clamped = put_pixels_clamped_mmx;
1984 ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and melanson parents: 1977 diff changeset	3278 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3279 c->add_pixels_clamped = add_pixels_clamped_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3280 c->clear_blocks = clear_blocks_mmx;
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3281 #ifdef CONFIG_ENCODERS
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3282 c->pix_sum = pix_sum16_mmx;
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3283 #endif //CONFIG_ENCODERS
415 1c3f42442fba * added simple test main - see comments about how to kabi parents: 402 diff changeset	3284
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3285 c->put_pixels_tab[0][0] = put_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3286 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3287 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3288 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
0 986e461dc072 Initial revision glantau parents: diff changeset	3289
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3290 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3291 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3292 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3293 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	3294
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3295 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3296 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3297 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3298 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
415 1c3f42442fba * added simple test main - see comments about how to kabi parents: 402 diff changeset	3299
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3300 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3301 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3302 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3303 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3304
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3305 c->put_pixels_tab[1][0] = put_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3306 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3307 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3308 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
0 986e461dc072 Initial revision glantau parents: diff changeset	3309
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3310 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3311 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3312 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3313 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	3314
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3315 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3316 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3317 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3318 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	3319
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3320 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3321 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3322 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3323 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	3324
3248 7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	3325 c->gmc= gmc_mmx;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C) lorenm parents: 3215 diff changeset	3326
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	3327 c->add_bytes= add_bytes_mmx;
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3328 #ifdef CONFIG_ENCODERS
866 725ef4ea3ecc huffyuv michaelni parents: 853 diff changeset	3329 c->diff_bytes= diff_bytes_mmx;
4988 689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	3330 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	3331
936 caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	3332 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
caa77cd960c0 qpel encoding michaelni parents: 866 diff changeset	3333 c->hadamard8_diff[1]= hadamard8_diff_mmx;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	3334
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3335 c->pix_norm1 = pix_norm1_mmx;
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3336 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3337 c->sse[1] = sse8_mmx;
1729 a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	3338 c->vsad[4]= vsad_intra16_mmx;
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	3339
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3340 c->nsse[0] = nsse16_mmx;
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3341 c->nsse[1] = nsse8_mmx;
1729 a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	3342 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	3343 c->vsad[0] = vsad16_mmx;
a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	3344 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	3345
1784 65f7bd09f37b quantizer noise shaping optimization michael parents: 1772 diff changeset	3346 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
65f7bd09f37b quantizer noise shaping optimization michael parents: 1772 diff changeset	3347 c->try_8x8basis= try_8x8basis_mmx;
65f7bd09f37b quantizer noise shaping optimization michael parents: 1772 diff changeset	3348 }
65f7bd09f37b quantizer noise shaping optimization michael parents: 1772 diff changeset	3349 c->add_8x8basis= add_8x8basis_mmx;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	3350
4749 7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding. lorenm parents: 4589 diff changeset	3351 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding. lorenm parents: 4589 diff changeset	3352
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3353 #endif //CONFIG_ENCODERS
1647 c943c1d2d099 h263_v_loop_filter_mmx michael parents: 1566 diff changeset	3354
5277 7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs aurel parents: 5255 diff changeset	3355 if (ENABLE_ANY_H263) {
5278 ef85411bb7e8 cosmetics: indentation aurel parents: 5277 diff changeset	3356 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
ef85411bb7e8 cosmetics: indentation aurel parents: 5277 diff changeset	3357 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
5277 7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs aurel parents: 5255 diff changeset	3358 }
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3359 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	3360 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	3361
3173 9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	3362 c->h264_idct_dc_add=
9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	3363 c->h264_idct_add= ff_h264_idct_add_mmx;
3174 b65cbae9d940 h264_idct8_add_mmx lorenm parents: 3173 diff changeset	3364 c->h264_idct8_dc_add=
b65cbae9d940 h264_idct8_add_mmx lorenm parents: 3173 diff changeset	3365 c->h264_idct8_add= ff_h264_idct8_add_mmx;
3173 9a2cc7b0fbdb h264_idct_add only needs mmx1 lorenm parents: 3105 diff changeset	3366
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3367 if (mm_flags & MM_MMXEXT) {
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	3368 c->prefetch = prefetch_mmx2;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	3369
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3370 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3371 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	3372
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3373 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3374 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3375 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415 1c3f42442fba * added simple test main - see comments about how to kabi parents: 402 diff changeset	3376
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3377 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3378 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	3379
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3380 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3381 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3382 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3383
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3384 #ifdef CONFIG_ENCODERS
4988 689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	3385 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1153 2725c8eb3c81 faster hadamard transform michaelni parents: 1122 diff changeset	3386 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
2725c8eb3c81 faster hadamard transform michaelni parents: 1122 diff changeset	3387 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1729 a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	3388 c->vsad[4]= vsad_intra16_mmx2;
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3389 #endif //CONFIG_ENCODERS
1153 2725c8eb3c81 faster hadamard transform michaelni parents: 1122 diff changeset	3390
3105 2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall lorenm parents: 3089 diff changeset	3391 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall lorenm parents: 3089 diff changeset	3392 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
2745 42d3e9068e32 MMX for H.264 iDCT (adapted from x264) lorenm parents: 2732 diff changeset	3393
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3394 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3395 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3396 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3397 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3398 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3399 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3400 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1772 8cd5257195c9 vsad16_mmx2 only applies if encoders are turned on melanson parents: 1765 diff changeset	3401 #ifdef CONFIG_ENCODERS
1729 a4a5e7521339 interlaced dct decision cleanup michael parents: 1708 diff changeset	3402 c->vsad[0] = vsad16_mmx2;
1772 8cd5257195c9 vsad16_mmx2 only applies if encoders are turned on melanson parents: 1765 diff changeset	3403 #endif //CONFIG_ENCODERS
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3404 }
959 3ec070eef24a qpel in b frames bugfixes michaelni parents: 958 diff changeset	3405
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	3406 #if 1
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3407 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3408 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3409 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3410 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3411 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3412 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3413 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3414 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3415 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3416 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3417 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3418 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3419 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3420 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3421 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3422 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3423 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3424 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3425 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3426 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3427 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3428 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3429 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3430 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3431 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3432 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3433 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3434 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3435 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3436 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3437 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3438 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
961 f8c5babc7b4e 1000l (push & esp) using mangle now ... michaelni parents: 959 diff changeset	3439 #endif
1527 8ffd0c00e6df mmx2 optimization of huffyuv median encoding michael parents: 1324 diff changeset	3440
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3441 //FIXME 3dnow too
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3442 #define dspfunc(PFX, IDX, NUM) \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3443 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3444 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3445 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3446 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3447 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3448 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3449 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3450 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3451 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3452 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3453 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3454 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3455 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3456 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3457 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3458 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3459
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3460 dspfunc(put_h264_qpel, 0, 16);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3461 dspfunc(put_h264_qpel, 1, 8);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3462 dspfunc(put_h264_qpel, 2, 4);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3463 dspfunc(avg_h264_qpel, 0, 16);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3464 dspfunc(avg_h264_qpel, 1, 8);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3465 dspfunc(avg_h264_qpel, 2, 4);
3807 6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3466
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3467 dspfunc(put_2tap_qpel, 0, 16);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3468 dspfunc(put_2tap_qpel, 1, 8);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3469 dspfunc(avg_2tap_qpel, 0, 16);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3470 dspfunc(avg_2tap_qpel, 1, 8);
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3471 #undef dspfunc
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3472
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3473 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	3474 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3213 57d31bdbebe8 added mmx implementation of h264_chroma_mc2 lorenm parents: 3211 diff changeset	3475 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
57d31bdbebe8 added mmx implementation of h264_chroma_mc2 lorenm parents: 3211 diff changeset	3476 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
2633 72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	3477 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	3478 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	3479 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	3480 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
2707 360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math) lorenm parents: 2696 diff changeset	3481 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math) lorenm parents: 2696 diff changeset	3482 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3645 47821be55b6c mmx implementation of deblocking strength decision. lorenm parents: 3576 diff changeset	3483 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
2633 72e6ffa1f3a5 MMX for H.264 deblocking filter lorenm parents: 2505 diff changeset	3484
2902 3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3485 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3486 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3487 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3488 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3489 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3490 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3491 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3492 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3493
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3494 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3495 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3496 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3497 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3498 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3499 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3500 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3501 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup. lorenm parents: 2899 diff changeset	3502
3524 419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	3503 #ifdef CONFIG_CAVS_DECODER
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	3504 ff_cavsdsp_init_mmx2(c, avctx);
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	3505 #endif
419409926166 some MMX optimizations for the CAVS decoder stefang parents: 3496 diff changeset	3506
1686 68abbec33289 Here are juste two added #ifdef CONFIG_ENCODERS to allow michael parents: 1648 diff changeset	3507 #ifdef CONFIG_ENCODERS
1527 8ffd0c00e6df mmx2 optimization of huffyuv median encoding michael parents: 1324 diff changeset	3508 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1686 68abbec33289 Here are juste two added #ifdef CONFIG_ENCODERS to allow michael parents: 1648 diff changeset	3509 #endif //CONFIG_ENCODERS
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3510 } else if (mm_flags & MM_3DNOW) {
3215 06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	3511 c->prefetch = prefetch_3dnow;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264. lorenm parents: 3213 diff changeset	3512
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3513 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3514 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393 bf164fce2c14 removed debug function glantau parents: 387 diff changeset	3515
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3516 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3517 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3518 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651 45e8f39fda50 put/avg_pixels16 michaelni parents: 631 diff changeset	3519
853 eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3520 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3521 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3522
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3523 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3524 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension kabi parents: 706 diff changeset	3525 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3526
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3527 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3528 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3529 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3530 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3531 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3532 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3533 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3534 }
984 e162c09efbe7 qpel fix michaelni parents: 967 diff changeset	3535
954 13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3536 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3537 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3538 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3539 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3540 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3541 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3542 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3543 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3544 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3545 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3546 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3547 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3548 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3549 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3550 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3551 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3552 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3553 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3554 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3555 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3556 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3557 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3558 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3559 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3560 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3561 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3562 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3563 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3564 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3565 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3566 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
13aec7e50c52 qpel in mmx2/3dnow michaelni parents: 936 diff changeset	3567 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
2209 c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3568
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3569 #define dspfunc(PFX, IDX, NUM) \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3570 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3571 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3572 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3573 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3574 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3575 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3576 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3577 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3578 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3579 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3580 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3581 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3582 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3583 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3584 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3585 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3586
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3587 dspfunc(put_h264_qpel, 0, 16);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3588 dspfunc(put_h264_qpel, 1, 8);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3589 dspfunc(put_h264_qpel, 2, 4);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3590 dspfunc(avg_h264_qpel, 0, 16);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3591 dspfunc(avg_h264_qpel, 1, 8);
c4a476971abc h264 luma motion compensation in mmx2/3dnow michael parents: 2207 diff changeset	3592 dspfunc(avg_h264_qpel, 2, 4);
2732 473ee06ec3a1 MMX code for (put\|avg)_h264_chroma_mc8 hzoli parents: 2707 diff changeset	3593
3807 6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3594 dspfunc(put_2tap_qpel, 0, 16);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3595 dspfunc(put_2tap_qpel, 1, 8);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3596 dspfunc(avg_2tap_qpel, 0, 16);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3597 dspfunc(avg_2tap_qpel, 1, 8);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast. lorenm parents: 3777 diff changeset	3598
2979 bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting diego parents: 2967 diff changeset	3599 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
2922 d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx. lorenm parents: 2902 diff changeset	3600 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
0 986e461dc072 Initial revision glantau parents: diff changeset	3601 }
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	3602
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3603 #ifdef CONFIG_ENCODERS
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3604 if(mm_flags & MM_SSE2){
4988 689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	3605 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3606 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3607 c->hadamard8_diff[1]= hadamard8_diff_sse2;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3608 }
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3609
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3610 #ifdef HAVE_SSSE3
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3611 if(mm_flags & MM_SSSE3){
5024 8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3612 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3613 c->try_8x8basis= try_8x8basis_ssse3;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3614 }
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3615 c->add_8x8basis= add_8x8basis_ssse3;
4988 689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it. lorenm parents: 4987 diff changeset	3616 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
4987 02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3617 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3618 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3619 }
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3620 #endif
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3621 #endif
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels. lorenm parents: 4946 diff changeset	3622
4589 30261f4ed12d Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated. diego parents: 4436 diff changeset	3623 #ifdef CONFIG_SNOW_DECODER
5591 642588a60570 update mmx code to latest snow changes michael parents: 5587 diff changeset	3624 if(mm_flags & MM_SSE2 & 0){
3210 81cafbc23b8d snow mmx+sse2 optimizations, part 4 corey parents: 3207 diff changeset	3625 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	3626 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
3211 b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock gpoirier parents: 3210 diff changeset	3627 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	3628 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	3629 else{
3210 81cafbc23b8d snow mmx+sse2 optimizations, part 4 corey parents: 3207 diff changeset	3630 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	3631 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
3211 b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock gpoirier parents: 3210 diff changeset	3632 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3207 33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	3633 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose(). gpoirier parents: 3174 diff changeset	3634 #endif
3536 545a15c19c91 sse & sse2 implementations of vorbis channel coupling. lorenm parents: 3524 diff changeset	3635
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3636 if(mm_flags & MM_3DNOW){
5024 8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3637 #ifdef CONFIG_ENCODERS
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3638 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3639 c->try_8x8basis= try_8x8basis_3dnow;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3640 }
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3641 c->add_8x8basis= add_8x8basis_3dnow;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw zuxy parents: 5014 diff changeset	3642 #endif //CONFIG_ENCODERS
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3643 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3644 c->vector_fmul = vector_fmul_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3645 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3646 c->float_to_int16 = float_to_int16_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3647 }
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3648 if(mm_flags & MM_3DNOWEXT)
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3649 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3650 if(mm_flags & MM_SSE){
3557 8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus michael parents: 3541 diff changeset	3651 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3652 c->vector_fmul = vector_fmul_sse;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3653 c->float_to_int16 = float_to_int16_sse;
3569 c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	3654 c->vector_fmul_reverse = vector_fmul_reverse_sse;
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse michael parents: 3568 diff changeset	3655 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3568 945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing. lorenm parents: 3561 diff changeset	3656 }
4197 bbe0bc387a19 revert bad checkin mru parents: 4196 diff changeset	3657 if(mm_flags & MM_3DNOW)
3574 f549d1e685f7 vorbis simd tweaks lorenm parents: 3569 diff changeset	3658 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
0 986e461dc072 Initial revision glantau parents: diff changeset	3659 }
2967 ef2149182f1c COSMETICS: Remove all trailing whitespace. diego parents: 2940 diff changeset	3660
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3661 #ifdef CONFIG_ENCODERS
1092 f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_* michaelni parents: 1065 diff changeset	3662 dsputil_init_pix_mmx(c, avctx);
1530 3b31998fe22f disable encoders where appropriate (patch courtesy of BERO melanson parents: 1527 diff changeset	3663 #endif //CONFIG_ENCODERS
247 6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3664 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3665 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3666 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3667 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3668 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3669
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3670 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3671 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3672 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3673 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3674
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3675 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3676 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3677 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3678 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3679
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3680 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3681 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3682 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3683 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3684
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3685 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3686 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3687 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3688 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3689
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3690 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3691 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3692 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3693 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3694
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3695 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3696 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC kabi parents: 188 diff changeset	3697 #endif
0 986e461dc072 Initial revision glantau parents: diff changeset	3698 }

Mercurial > libavcodec.hg

annotate i386/dsputil_mmx.c @ 5591:642588a60570 libavcodec