annotate i386/dsputil_mmx.c @ 6391:3dc36ec2dcad libavcodec

__asm __volatile -> asm volatile, improves code consistency and works (as far as that is possible) with the Sun C compiler.
author reimar
date Sun, 24 Feb 2008 14:46:22 +0000
parents 0a403ade8c81
children 9a736918fd90
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1 /*
986e461dc072 Initial revision
glantau
parents:
diff changeset
2 * MMX optimized DSP utils
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739
07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents: 1729
diff changeset
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
5 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
8 * FFmpeg is free software; you can redistribute it and/or
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
9 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
10 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
11 * version 2.1 of the License, or (at your option) any later version.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
12 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
13 * FFmpeg is distributed in the hope that it will be useful,
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
16 * Lesser General Public License for more details.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
17 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
18 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
19 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2979
diff changeset
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
21 *
986e461dc072 Initial revision
glantau
parents:
diff changeset
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision
glantau
parents:
diff changeset
23 */
986e461dc072 Initial revision
glantau
parents:
diff changeset
24
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
25 #include "dsputil.h"
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
26 #include "dsputil_mmx.h"
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
27 #include "simple_idct.h"
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
28 #include "mpegvideo.h"
3398
e0927bc44a10 Move REG_* macros from libavcodec/i386/mmx.h to libavutil/x86_cpu.h
lucabe
parents: 3250
diff changeset
29 #include "x86_cpu.h"
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
30 #include "mmx.h"
5014
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
31 #include "vp3dsp_mmx.h"
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
32 #include "vp3dsp_sse2.h"
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
33 #include "h263.h"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
34
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
35 //#undef NDEBUG
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
36 //#include <assert.h>
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
37
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
38 extern void ff_idct_xvid_mmx(short *block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
39 extern void ff_idct_xvid_mmx2(short *block);
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
40
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
41 int mm_flags; /* multimedia extension flags */
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
42
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
43 /* pixel operations */
5947
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
44 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
45 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
46
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
47 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
48 {0x8000000080000000ULL, 0x8000000080000000ULL};
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
49
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
50 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
51 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
52 DECLARE_ALIGNED_16(const xmm_t, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
53 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
55 DECLARE_ALIGNED_16(const xmm_t, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
56 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
6329
5969caa9190d clean up an ugliness introduced in r11826. this syntax will require fewer changes when adding future sse2 code.
lorenm
parents: 6327
diff changeset
57 DECLARE_ALIGNED_16(const xmm_t, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
6333
beb52d4a5efe constant was excessively aligned
lorenm
parents: 6331
diff changeset
61 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
62 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
63
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
69 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
70
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
71 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
72 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
5737
efa3c1f9259a sse2 version of compute_autocorr().
lorenm
parents: 5602
diff changeset
73
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
74 #define JUMPALIGN() asm volatile (ASMALIGN(3)::)
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
75 #define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
76
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
77 #define MOVQ_WONE(regd) \
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
78 asm volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
79 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
80 "psrlw $15, %%" #regd ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
81
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
82 #define MOVQ_BFE(regd) \
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
83 asm volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
84 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
85 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
86
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
87 #ifndef PIC
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
88 #define MOVQ_BONE(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
89 #define MOVQ_WTWO(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
90 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
91 // for a shared library (PIC) it is better to generate these constants in registers than to load them from memory
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
92 // pcmpeqd -> -1
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
93 #define MOVQ_BONE(regd) \
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
94 asm volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
95 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
96 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
97 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
98
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
99 #define MOVQ_WTWO(regd) \
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
100 asm volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
101 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
102 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
103 "psllw $1, %%" #regd " \n\t"::)
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
104
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
105 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
106
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
107 // using regr as temporary and for the output result
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
108 // first argument is unmodified and second is trashed
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
109 // regfe is supposed to contain 0xfefefefefefefefe
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
110 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
111 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
112 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
113 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
114 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
115 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
116 "paddb " #regb ", " #regr " \n\t"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
117
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
118 #define PAVGB_MMX(rega, regb, regr, regfe) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
119 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
120 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
121 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
122 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
123 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
124 "psubb " #regb ", " #regr " \n\t"
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
125
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
126 // mm6 is supposed to contain 0xfefefefefefefefe
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
127 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
128 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
129 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
130 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
131 "pand " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
132 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
133 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
134 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
135 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
136 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
137 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
138 "paddb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
139 "paddb " #regd ", " #regp " \n\t"
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
140
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
141 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
142 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
143 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
144 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
145 "por " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
146 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
147 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
148 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
149 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
150 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
151 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
152 "psubb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
153 "psubb " #regd ", " #regp " \n\t"
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
154
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
155 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
156 /* MMX no rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
157 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
158 #define SET_RND MOVQ_WONE
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
159 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
160 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
161
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
162 #include "dsputil_mmx_rnd.h"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
163
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
164 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
165 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
166 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
167 #undef PAVGB
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
168 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
169 /* MMX rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
170
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
171 #define DEF(x, y) x ## _ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
172 #define SET_RND MOVQ_WTWO
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
173 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
174 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
175
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
176 #include "dsputil_mmx_rnd.h"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
177
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
178 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
179 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
180 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
181 #undef PAVGB
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
182
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
183 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
184 /* 3Dnow specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
185
986e461dc072 Initial revision
glantau
parents:
diff changeset
186 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision
glantau
parents:
diff changeset
187 #define PAVGB "pavgusb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
188
986e461dc072 Initial revision
glantau
parents:
diff changeset
189 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
190
986e461dc072 Initial revision
glantau
parents:
diff changeset
191 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
192 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
193
986e461dc072 Initial revision
glantau
parents:
diff changeset
194 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
195 /* MMX2 specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
196
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
197 #define DEF(x) x ## _mmx2
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
198
986e461dc072 Initial revision
glantau
parents:
diff changeset
199 /* Introduced only in MMX2 set */
986e461dc072 Initial revision
glantau
parents:
diff changeset
200 #define PAVGB "pavgb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
201
986e461dc072 Initial revision
glantau
parents:
diff changeset
202 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
203
986e461dc072 Initial revision
glantau
parents:
diff changeset
204 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
205 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
206
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
207 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
208 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
209 #define put_pixels16_mmx2 put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
210 #define put_pixels8_mmx2 put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
211 #define put_pixels4_mmx2 put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
212 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
213 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
214 #define put_pixels16_3dnow put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
215 #define put_pixels8_3dnow put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
216 #define put_pixels4_3dnow put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
217 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
218 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
219
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
220 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
221 /* standard MMX */
986e461dc072 Initial revision
glantau
parents:
diff changeset
222
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
223 #ifdef CONFIG_ENCODERS
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
224 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
225 {
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
226 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
227 "mov $-128, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
228 "pxor %%mm7, %%mm7 \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
229 ASMALIGN(4)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
230 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
231 "movq (%0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
232 "movq (%0, %2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
233 "movq %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
234 "movq %%mm2, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
235 "punpcklbw %%mm7, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
236 "punpckhbw %%mm7, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
237 "punpcklbw %%mm7, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
238 "punpckhbw %%mm7, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
239 "movq %%mm0, (%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
240 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
241 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
242 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
243 "add %3, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
244 "add $32, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
245 "js 1b \n\t"
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
246 : "+r" (pixels)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
247 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
248 : "%"REG_a
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
249 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
250 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
251
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
252 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
253 {
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
254 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
255 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
256 "mov $-128, %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
257 ASMALIGN(4)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
258 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
259 "movq (%0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
260 "movq (%1), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
261 "movq %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
262 "movq %%mm2, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
263 "punpcklbw %%mm7, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
264 "punpckhbw %%mm7, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
265 "punpcklbw %%mm7, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
266 "punpckhbw %%mm7, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
267 "psubw %%mm2, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
268 "psubw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
269 "movq %%mm0, (%2, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
270 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
271 "add %3, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
272 "add %3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
273 "add $16, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
274 "jnz 1b \n\t"
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
275 : "+r" (s1), "+r" (s2)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
276 : "r" (block+64), "r" ((long)stride)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
277 : "%"REG_a
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
278 );
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
279 }
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
280 #endif //CONFIG_ENCODERS
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
281
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
282 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
283 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
284 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
285 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
286
986e461dc072 Initial revision
glantau
parents:
diff changeset
287 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
288 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
289 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
290 /* unrolled loop */
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
291 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
292 "movq %3, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
293 "movq 8%3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
294 "movq 16%3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
295 "movq 24%3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
296 "movq 32%3, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
297 "movq 40%3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
298 "movq 48%3, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
299 "movq 56%3, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
300 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
301 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
302 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
303 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
304 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
305 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
306 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
307 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
308 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
309 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
310 pix += line_size*4;
986e461dc072 Initial revision
glantau
parents:
diff changeset
311 p += 32;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
312
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
313 // if here would be an exact copy of the code above
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
314 // compiler would generate some very strange code
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
315 // thus using "r"
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
316 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
317 "movq (%3), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
318 "movq 8(%3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
319 "movq 16(%3), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
320 "movq 24(%3), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
321 "movq 32(%3), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
322 "movq 40(%3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
323 "movq 48(%3), %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
324 "movq 56(%3), %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
325 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
326 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
327 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
328 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
329 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
330 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
331 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
332 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
333 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
334 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
335 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
336
3089
072dbc669253 MSVC-compatible __align8/__align16 declaration
diego
parents: 3036
diff changeset
337 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
1985
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
338 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
339
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
340 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
341 {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
342 int i;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
343
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
344 movq_m2r(*vector128, mm1);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
345 for (i = 0; i < 8; i++) {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
346 movq_m2r(*(block), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
347 packsswb_m2r(*(block + 4), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
348 block += 8;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
349 paddb_r2r(mm1, mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
350 movq_r2m(mm0, *pixels);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
351 pixels += line_size;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
352 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
353 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
354
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
355 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
356 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
357 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
358 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
359 int i;
986e461dc072 Initial revision
glantau
parents:
diff changeset
360
986e461dc072 Initial revision
glantau
parents:
diff changeset
361 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
362 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
363 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
364 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
365 i = 4;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
366 do {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
367 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
368 "movq (%2), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
369 "movq 8(%2), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
370 "movq 16(%2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
371 "movq 24(%2), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
372 "movq %0, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
373 "movq %1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
374 "movq %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
375 "punpcklbw %%mm7, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
376 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
377 "paddsw %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
378 "paddsw %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
379 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
380 "punpcklbw %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
381 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
382 "paddsw %%mm6, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
383 "paddsw %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
384 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
385 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
386 "movq %%mm0, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
387 "movq %%mm2, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
388 :"+m"(*pix), "+m"(*(pix+line_size))
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
389 :"r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
390 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
391 pix += line_size*2;
986e461dc072 Initial revision
glantau
parents:
diff changeset
392 p += 16;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
393 } while (--i);
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
394 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
395
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
/*
 * Copy a 4-byte-wide column of pixels from `pixels` to `block`; source and
 * destination both advance by line_size bytes per row.
 *
 * One pass of the loop moves four rows (two pairs) and then subtracts 4 from
 * h; the loop ends only when h reaches exactly zero.
 * NOTE(review): h therefore appears to need to be a positive multiple of 4 --
 * confirm against the callers.
 */
396 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
397 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
398 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
399 "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2 * line_size */
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
400 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
401 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
402 "movd (%1), %%mm0 \n\t" /* first pair of rows: load 4 bytes each */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
403 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
404 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
405 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
406 "add %%"REG_a", %1 \n\t" /* pixels += 2 * line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
407 "add %%"REG_a", %2 \n\t" /* block += 2 * line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
408 "movd (%1), %%mm0 \n\t" /* second pair of rows */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
409 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
410 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
411 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
412 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
413 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
414 "subl $4, %0 \n\t" /* h -= 4 */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
415 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
416 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
417 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
418 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
419 );
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
420 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
421
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
/*
 * Copy an 8-byte-wide column of pixels from `pixels` to `block`; source and
 * destination both advance by line_size bytes per row.
 *
 * One pass of the loop moves four rows (two pairs) and then subtracts 4 from
 * h; the loop ends only when h reaches exactly zero.
 * NOTE(review): h therefore appears to need to be a positive multiple of 4 --
 * confirm against the callers.
 */
422 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
423 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
424 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
425 "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2 * line_size */
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
426 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
427 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
428 "movq (%1), %%mm0 \n\t" /* first pair of rows: load 8 bytes each */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
429 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
430 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
431 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
432 "add %%"REG_a", %1 \n\t" /* pixels += 2 * line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
433 "add %%"REG_a", %2 \n\t" /* block += 2 * line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
434 "movq (%1), %%mm0 \n\t" /* second pair of rows */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
435 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
436 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
437 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
438 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
439 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
440 "subl $4, %0 \n\t" /* h -= 4 */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
441 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
442 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
443 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
444 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
445 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
446 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
447
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
/*
 * Copy a 16-byte-wide column of pixels from `pixels` to `block`; source and
 * destination both advance by line_size bytes per row. Each row is moved as
 * two 8-byte halves (offsets 0 and 8).
 *
 * One pass of the loop moves four rows (two pairs) and then subtracts 4 from
 * h; the loop ends only when h reaches exactly zero.
 * NOTE(review): h therefore appears to need to be a positive multiple of 4 --
 * confirm against the callers.
 */
448 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
449 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
450 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
451 "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2 * line_size */
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
452 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
453 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
454 "movq (%1), %%mm0 \n\t" /* first pair of rows: load both 8-byte halves */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
455 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
456 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
457 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
458 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
459 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
460 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
461 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
462 "add %%"REG_a", %1 \n\t" /* pixels += 2 * line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
463 "add %%"REG_a", %2 \n\t" /* block += 2 * line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
464 "movq (%1), %%mm0 \n\t" /* second pair of rows */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
465 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
466 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
467 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
468 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
469 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
470 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
471 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
472 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
473 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
474 "subl $4, %0 \n\t" /* h -= 4 */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
475 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
476 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
477 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
478 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
479 );
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
480 }
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
481
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
/*
 * Copy a 16-byte-wide column of pixels from `pixels` to `block`, four rows
 * per loop pass; source and destination both use a row pitch of line_size.
 *
 * Loads use movdqu (no alignment requirement on `pixels`); stores use
 * movdqa, which requires every destination row address to be 16-byte
 * aligned.
 *
 * h is decremented by 4 per pass and the loop ends only when it reaches
 * exactly zero.
 * NOTE(review): h therefore appears to need to be a positive multiple of 4 --
 * confirm against the callers.
 */
482 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
483 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
484 asm volatile(
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
485 "1: \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
486 "movdqu (%1), %%xmm0 \n\t" /* rows 0..3 of this pass */
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
487 "movdqu (%1,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
488 "movdqu (%1,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
489 "movdqu (%1,%4), %%xmm3 \n\t" /* %4 = 3 * line_size */
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
490 "movdqa %%xmm0, (%2) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
491 "movdqa %%xmm1, (%2,%3) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
492 "movdqa %%xmm2, (%2,%3,2) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
493 "movdqa %%xmm3, (%2,%4) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
494 "subl $4, %0 \n\t" /* h -= 4; sets the flags tested by jnz below */
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
495 "lea (%1,%3,4), %1 \n\t" /* pixels += 4 * line_size (lea leaves the flags untouched) */
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
496 "lea (%2,%3,4), %2 \n\t" /* block += 4 * line_size */
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
497 "jnz 1b \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
498 : "+g"(h), "+r" (pixels), "+r" (block)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
499 : "r"((long)line_size), "r"(3L*line_size)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
500 : "memory"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
501 );
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
502 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
503
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
504 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
505 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
506 asm volatile(
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
507 "1: \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
508 "movdqu (%1), %%xmm0 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
509 "movdqu (%1,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
510 "movdqu (%1,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
511 "movdqu (%1,%4), %%xmm3 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
512 "pavgb (%2), %%xmm0 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
513 "pavgb (%2,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
514 "pavgb (%2,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
515 "pavgb (%2,%4), %%xmm3 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
516 "movdqa %%xmm0, (%2) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
517 "movdqa %%xmm1, (%2,%3) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
518 "movdqa %%xmm2, (%2,%3,2) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
519 "movdqa %%xmm3, (%2,%4) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
520 "subl $4, %0 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
521 "lea (%1,%3,4), %1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
522 "lea (%2,%3,4), %2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
523 "jnz 1b \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
524 : "+g"(h), "+r" (pixels), "+r" (block)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
525 : "r"((long)line_size), "r"(3L*line_size)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
526 : "memory"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
527 );
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
528 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
529
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
530 static void clear_blocks_mmx(DCTELEM *blocks)
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
531 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
532 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
533 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
534 "mov $-128*6, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
535 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
536 "movq %%mm7, (%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
537 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
538 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
539 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
540 "add $32, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
541 " js 1b \n\t"
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
542 : : "r" (((uint8_t *)blocks)+128*6)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
543 : "%"REG_a
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
544 );
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
545 }
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
546
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
547 #ifdef CONFIG_ENCODERS
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
548 static int pix_sum16_mmx(uint8_t * pix, int line_size){
688
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
549 const int h=16;
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
550 int sum;
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
551 long index= -line_size*h;
688
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
552
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
553 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
554 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
555 "pxor %%mm6, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
556 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
557 "movq (%2, %1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
558 "movq (%2, %1), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
559 "movq 8(%2, %1), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
560 "movq 8(%2, %1), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
561 "punpcklbw %%mm7, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
562 "punpckhbw %%mm7, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
563 "punpcklbw %%mm7, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
564 "punpckhbw %%mm7, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
565 "paddw %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
566 "paddw %%mm2, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
567 "paddw %%mm1, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
568 "paddw %%mm3, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
569 "add %3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
570 " js 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
571 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
572 "psrlq $32, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
573 "paddw %%mm5, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
574 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
575 "psrlq $16, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
576 "paddw %%mm5, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
577 "movd %%mm6, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
578 "andl $0xFFFF, %0 \n\t"
688
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
579 : "=&r" (sum), "+r" (index)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
580 : "r" (pix - index), "r" ((long)line_size)
688
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
581 );
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
582
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
583 return sum;
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
584 }
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
585 #endif //CONFIG_ENCODERS
688
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
586
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
587 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
588 long i=0;
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
589 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
590 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
591 "movq (%1, %0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
592 "movq (%2, %0), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
593 "paddb %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
594 "movq %%mm1, (%2, %0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
595 "movq 8(%1, %0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
596 "movq 8(%2, %0), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
597 "paddb %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
598 "movq %%mm1, 8(%2, %0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
599 "add $16, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
600 "cmp %3, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
601 " jb 1b \n\t"
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
602 : "+r" (i)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
603 : "r"(src), "r"(dst), "r"((long)w-15)
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
604 );
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
605 for(; i<w; i++)
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
606 dst[i+0] += src[i+0];
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
607 }
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
608
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
609 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
610 long i=0;
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
611 asm volatile(
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
612 "1: \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
613 "movq (%2, %0), %%mm0 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
614 "movq 8(%2, %0), %%mm1 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
615 "paddb (%3, %0), %%mm0 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
616 "paddb 8(%3, %0), %%mm1 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
617 "movq %%mm0, (%1, %0) \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
618 "movq %%mm1, 8(%1, %0) \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
619 "add $16, %0 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
620 "cmp %4, %0 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
621 " jb 1b \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
622 : "+r" (i)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
623 : "r"(dst), "r"(src1), "r"(src2), "r"((long)w-15)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
624 );
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
625 for(; i<w; i++)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
626 dst[i] = src1[i] + src2[i];
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
627 }
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
628
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
629 #define H263_LOOP_FILTER \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
630 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
631 "movq %0, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
632 "movq %0, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
633 "movq %3, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
634 "movq %3, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
635 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
636 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
637 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
638 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
639 "psubw %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
640 "psubw %%mm3, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
641 "movq %1, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
642 "movq %1, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
643 "movq %2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
644 "movq %2, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
645 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
646 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
647 "punpcklbw %%mm7, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
648 "punpckhbw %%mm7, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
649 "psubw %%mm2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
650 "psubw %%mm3, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
651 "psllw $2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
652 "psllw $2, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
653 "paddw %%mm0, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
654 "paddw %%mm1, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
655 "pxor %%mm6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
656 "pcmpgtw %%mm4, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
657 "pcmpgtw %%mm5, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
658 "pxor %%mm6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
659 "pxor %%mm7, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
660 "psubw %%mm6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
661 "psubw %%mm7, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
662 "psrlw $3, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
663 "psrlw $3, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
664 "packuswb %%mm5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
665 "packsswb %%mm7, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
666 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
667 "movd %4, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
668 "punpcklbw %%mm2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
669 "punpcklbw %%mm2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
670 "punpcklbw %%mm2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
671 "psubusb %%mm4, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
672 "movq %%mm2, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
673 "psubusb %%mm4, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
674 "psubb %%mm3, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
675 "movq %1, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
676 "movq %2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
677 "pxor %%mm6, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
678 "pxor %%mm6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
679 "paddusb %%mm2, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
680 "psubusb %%mm2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
681 "pxor %%mm6, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
682 "pxor %%mm6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
683 "paddusb %%mm2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
684 "packsswb %%mm1, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
685 "pcmpgtb %%mm0, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
686 "pxor %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
687 "psubb %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
688 "movq %%mm0, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
689 "psubusb %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
690 "psubb %%mm0, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
691 "pand %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
692 "psrlw $2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
693 "pxor %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
694 "psubb %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
695 "movq %0, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
696 "movq %3, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
697 "psubb %%mm1, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
698 "paddb %%mm1, %%mm6 \n\t"
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
699
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
700 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
701 if(ENABLE_ANY_H263) {
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
702 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
703
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
704 asm volatile(
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
705
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
706 H263_LOOP_FILTER
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
707
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
708 "movq %%mm3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
709 "movq %%mm4, %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
710 "movq %%mm5, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
711 "movq %%mm6, %3 \n\t"
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
712 : "+m" (*(uint64_t*)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
713 "+m" (*(uint64_t*)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
714 "+m" (*(uint64_t*)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
715 "+m" (*(uint64_t*)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
716 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
717 );
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
718 }
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
719 }
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
720
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
721 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
722 asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
723 "movd %4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
724 "movd %5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
725 "movd %6, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
726 "movd %7, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
727 "punpcklbw %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
728 "punpcklbw %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
729 "movq %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
730 "punpcklwd %%mm2, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
731 "punpckhwd %%mm2, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
732 "movd %%mm0, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
733 "punpckhdq %%mm0, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
734 "movd %%mm0, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
735 "movd %%mm1, %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
736 "punpckhdq %%mm1, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
737 "movd %%mm1, %3 \n\t"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
738
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
739 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
740 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
741 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
742 "=m" (*(uint32_t*)(dst + 3*dst_stride))
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
743 : "m" (*(uint32_t*)(src + 0*src_stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
744 "m" (*(uint32_t*)(src + 1*src_stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
745 "m" (*(uint32_t*)(src + 2*src_stride)),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
746 "m" (*(uint32_t*)(src + 3*src_stride))
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
747 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
748 }
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
749
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
750 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
751 if(ENABLE_ANY_H263) {
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
752 const int strength= ff_h263_loop_filter_strength[qscale];
6181
f3da7b2592aa Use DECLARE_ALIGNED
reimar
parents: 6135
diff changeset
753 DECLARE_ALIGNED(8, uint64_t, temp[4]);
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
754 uint8_t *btemp= (uint8_t*)temp;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
755
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
756 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
757
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
758 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
759 transpose4x4(btemp+4, src + 4*stride, 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
760 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
761 H263_LOOP_FILTER // 5 3 4 6
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
762
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
763 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
764 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
765 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
766 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
767 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
768 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
769
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
770 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
771 "movq %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
772 "movq %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
773 "punpcklbw %%mm3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
774 "punpcklbw %%mm6, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
775 "punpckhbw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
776 "punpckhbw %%mm6, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
777 "movq %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
778 "movq %%mm1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
779 "punpcklwd %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
780 "punpcklwd %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
781 "punpckhwd %%mm4, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
782 "punpckhwd %%mm0, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
783 "movd %%mm5, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
784 "punpckhdq %%mm5, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
785 "movd %%mm5, (%0,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
786 "movd %%mm3, (%0,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
787 "punpckhdq %%mm3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
788 "movd %%mm3, (%0,%3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
789 "movd %%mm1, (%1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
790 "punpckhdq %%mm1, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
791 "movd %%mm1, (%1,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
792 "movd %%mm6, (%1,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
793 "punpckhdq %%mm6, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
794 "movd %%mm6, (%1,%3) \n\t"
2505
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
795 :: "r" (src),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
796 "r" (src + 4*stride),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
797 "r" ((long) stride ),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
798 "r" ((long)(3*stride))
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
799 );
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
800 }
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
801 }
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
802
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
803 #ifdef CONFIG_ENCODERS
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
/**
 * Sum of the squared pixel values of a 16x16 block, MMX version.
 *
 * Both the row count (16) and the row width (two 8-byte loads) are
 * hard-coded in the asm below.
 *
 * @param pix       first pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum over the block of pix[x]^2
 *
 * Uses mm0-mm4 and mm7 and does not execute emms.
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"              /* ecx = number of rows left */
        "pxor %%mm0,%%mm0\n"            /* mm0 = 0, used to widen bytes to words */
        "pxor %%mm7,%%mm7\n"            /* mm7 = running sum (two dwords) */
        "1:\n"
        "movq (%0),%%mm2\n"             /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"            /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"            /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n"       /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n"       /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"            /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n"       /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n"       /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"         /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"         /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"         /* mm3 = (pix12^2+pix13^2,pix14^2+pix15^2) */
        "pmaddwd %%mm4,%%mm4\n"         /* mm4 = (pix8^2+pix9^2,pix10^2+pix11^2) */

        "paddd %%mm1,%%mm2\n"           /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                                  pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"           /* same for pix8-15 */
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"                  /* pix += line_size */
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"            /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"           /* low dword of mm1 = total */
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size)
        /* the block is read through a bare pointer, so earlier stores to it
           must be visible before the asm runs */
        : "%ecx", "memory" );
    return tmp;
}
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
846
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
/**
 * Sum of squared differences between two 8-pixel-wide blocks, MMX version.
 *
 * Two rows are handled per loop iteration and the iteration count is h>>1,
 * so h has to be even and at least 2: an odd last row is ignored, and with
 * h < 2 the counter starts at 0 and the decl/jnz loop does not terminate
 * after the intended number of rows.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between the starts of consecutive rows
 *                  (the same value is applied to both blocks)
 * @param h         number of rows
 * @return sum of (pix1[x] - pix2[x])^2 over the block
 *
 * Uses mm0-mm7 and does not execute emms.
 */
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"                /* ecx = h/2 iterations, two rows each */
        "pxor %%mm0,%%mm0\n"            /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"            /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"             /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"             /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n"          /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n"          /* mm4 = pix2[1][0-7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"             /* mm2 = |row 0 difference| */
        "por %%mm3,%%mm4\n"             /* mm4 = |row 1 difference| */

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"       /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"       /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"           /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n"           /* pix2 += 2*line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"            /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* both blocks are read through bare pointers, so earlier stores to
           them must be visible before the asm runs */
        : "%ecx", "memory");
    return tmp;
}
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
907
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
/**
 * Sum of squared differences between two 16-pixel-wide blocks, MMX version.
 *
 * One row (two 8-byte loads per block) is handled per loop iteration.
 * h has to be at least 1: the counter is only tested after the first
 * decrement, so h == 0 does not stop the loop immediately.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between the starts of consecutive rows
 *                  (the same value is applied to both blocks)
 * @param h         number of rows
 * @return sum of (pix1[x] - pix2[x])^2 over the block
 *
 * Uses mm0-mm7 and does not execute emms.
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"               /* ecx = number of rows left */
        "pxor %%mm0,%%mm0\n"            /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"            /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"             /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"             /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"            /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"            /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"             /* mm2 = |difference| of pixels 0-7 */
        "por %%mm3,%%mm4\n"             /* mm4 = |difference| of pixels 8-15 */

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"       /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"       /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"                   /* pix1 += line_size */
        "add %3,%1\n"                   /* pix2 += line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"            /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* both blocks are read through bare pointers, so earlier stores to
           them must be visible before the asm runs */
        : "%ecx", "memory");
    return tmp;
}
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
967
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
/**
 * Sum of squared differences between two 16-pixel-wide blocks (SSE2).
 *
 * Two rows are handled per loop iteration: h is halved up front and the
 * counter is decremented before it is tested, so the loop only terminates
 * as intended when h is even and at least 2.
 *
 * @param v         unused
 * @param pix1      first block; 16 bytes are read per row (unaligned loads)
 * @param pix2      second block, same layout as pix1
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return sum over all compared pixels of (pix1 - pix2)^2
 *
 * NOTE(review): xmm0-xmm7 are used as scratch but not declared as clobbers;
 * confirm the build never keeps live values in them across this statement.
 */
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "shr $1,%2\n"                   /* two rows per iteration */
        "pxor %%xmm0,%%xmm0\n"          /* xmm0 = 0 (zero-extension source) */
        "pxor %%xmm7,%%xmm7\n"          /* xmm7 = four 32-bit partial sums */
        "1:\n"
        "movdqu (%0),%%xmm1\n"          /* xmm1 = pix1[0][0-15] */
        "movdqu (%1),%%xmm2\n"          /* xmm2 = pix2[0][0-15] */
        "movdqu (%0,%4),%%xmm3\n"       /* xmm3 = pix1[1][0-15] */
        "movdqu (%1,%4),%%xmm4\n"       /* xmm4 = pix2[1][0-15] */

        /* |a-b| on unsigned bytes: saturating a-b and saturating b-a are
         * computed separately; one of them is always 0, so OR-ing the two
         * yields the absolute difference. */
        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"           /* xmm2 = |row 0 difference| */
        "por %%xmm3,%%xmm4\n"           /* xmm4 = |row 1 difference| */

        /* widen to 16-bit lanes so the values can be squared */
        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n"     /* row 0: low half in xmm1, high half in xmm2 */
        "punpcklbw %%xmm0,%%xmm3\n"     /* row 1: low half in xmm3, high half in xmm4 */

        /* multiplying a register by itself squares each word and adds
         * adjacent pairs into 32-bit lanes */
        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"           /* pix1 += 2*line_size */
        "lea (%1,%4,2), %1\n"           /* pix2 += 2*line_size */

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        /* horizontal add of the four 32-bit lanes of xmm7 */
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"           /* shift hi qword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"           /* shift hi dword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((long)line_size)
        /* the pixel data is read through pix1/pix2, which the operand list
         * alone does not tell the compiler */
        : "memory");
    return tmp;
}
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
1029
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
/*
 * Asm fragment: load the 8-pixel row at (%0) and leave its horizontal
 * gradient pix[x] - pix[x+1] as signed 16-bit words, x = 0..3 in `lo` and
 * x = 4..6 in `hi` (the top word of `hi` is 0, since pix[8] is never read).
 * Requires mm7 == 0; overwrites mm1 and mm3.
 */
#define HF_NOISE8_ROW(lo, hi)           \
    "movq (%0),%%" lo "\n"              \
    "movq %%" lo ", %%mm1\n"            \
    "psllq $8, %%" lo "\n"              \
    "psrlq $8, %%mm1\n"                 \
    "psrlq $8, %%" lo "\n"              \
    "movq %%" lo ", %%" hi "\n"         \
    "movq %%mm1, %%mm3\n"               \
    "punpcklbw %%mm7,%%" lo "\n"        \
    "punpcklbw %%mm7,%%mm1\n"           \
    "punpckhbw %%mm7,%%" hi "\n"        \
    "punpckhbw %%mm7,%%mm3\n"           \
    "psubw %%mm1, %%" lo "\n"           \
    "psubw %%mm3, %%" hi "\n"

/*
 * Asm fragment: (dlo,dhi) -= (slo,shi), take the absolute value of each
 * word (sign mask via pcmpgtw, then xor and subtract), and add the result
 * to the word accumulators in mm6.  (slo,shi) is left intact, (dlo,dhi) is
 * destroyed; overwrites mm1 and mm3.
 */
#define HF_NOISE8_ACC(slo, shi, dlo, dhi) \
    "psubw %%" slo ", %%" dlo "\n"      \
    "psubw %%" shi ", %%" dhi "\n"      \
    "pxor %%mm3, %%mm3\n"               \
    "pxor %%mm1, %%mm1\n"               \
    "pcmpgtw %%" dlo ", %%mm3\n\t"      \
    "pcmpgtw %%" dhi ", %%mm1\n\t"      \
    "pxor %%mm3, %%" dlo "\n"           \
    "pxor %%mm1, %%" dhi "\n"           \
    "psubw %%mm3, %%" dlo "\n"          \
    "psubw %%mm1, %%" dhi "\n"          \
    "paddw %%" dlo ", %%" dhi "\n"      \
    "paddw %%" dhi ", %%mm6\n"

/**
 * Sum, over every pair of vertically adjacent rows and x = 0..6, of
 * |(p[y][x] - p[y][x+1]) - (p[y+1][x] - p[y+1][x+1])| for an 8-pixel-wide
 * block (MMX).
 *
 * Rows 0 and 1 are handled before the loop; the loop then consumes two rows
 * per iteration, always runs at least once, and exits only when the counter
 * (initialised to h-2, stepped by 2) reaches exactly zero.  h must therefore
 * be even and at least 4.
 *
 * The statement does not execute emms.
 *
 * @param pix1      top-left pixel of the block; 8 bytes are read per row
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 * @return the accumulated sum described above
 */
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %3,%%ecx\n"               /* ecx = rows left after the first two */
        "pxor %%mm7,%%mm7\n"            /* mm7 = 0 */
        "pxor %%mm6,%%mm6\n"            /* mm6 = four 16-bit partial sums */

        /* rows 0 and 1 */
        HF_NOISE8_ROW("mm0", "mm2")
        "add %2,%0\n"
        HF_NOISE8_ROW("mm4", "mm5")
        HF_NOISE8_ACC("mm4", "mm5", "mm0", "mm2")
        "add %2,%0\n"

        /* On entry (mm4,mm5) hold the gradient of the previous row.  The two
         * halves alternate which register pair receives the new row, so the
         * previous row's gradient never has to be copied. */
        "1:\n"
        HF_NOISE8_ROW("mm0", "mm2")
        HF_NOISE8_ACC("mm0", "mm2", "mm4", "mm5")
        "add %2,%0\n"
        HF_NOISE8_ROW("mm4", "mm5")
        HF_NOISE8_ACC("mm4", "mm5", "mm0", "mm2")
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        /* zero-extend the four word sums to dwords and add them up */
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        /* "memory": the pixel data is read through pix1, which the operand
         * list alone does not tell the compiler */
        : "%ecx", "memory");
    return tmp;
}

#undef HF_NOISE8_ROW
#undef HF_NOISE8_ACC
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1154
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1155 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1156 int tmp;
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1157 uint8_t * pix= pix1;
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1158 asm volatile (
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1159 "movl %3,%%ecx\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1160 "pxor %%mm7,%%mm7\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1161 "pxor %%mm6,%%mm6\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1162
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1163 "movq (%0),%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1164 "movq 1(%0),%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1165 "movq %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1166 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1167 "punpcklbw %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1168 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1169 "punpckhbw %%mm7,%%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1170 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1171 "psubw %%mm1, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1172 "psubw %%mm3, %%mm2\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1173
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1174 "add %2,%0\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1175
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1176 "movq (%0),%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1177 "movq 1(%0),%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1178 "movq %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1179 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1180 "punpcklbw %%mm7,%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1181 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1182 "punpckhbw %%mm7,%%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1183 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1184 "psubw %%mm1, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1185 "psubw %%mm3, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1186 "psubw %%mm4, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1187 "psubw %%mm5, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1188 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1189 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1190 "pcmpgtw %%mm0, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1191 "pcmpgtw %%mm2, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1192 "pxor %%mm3, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1193 "pxor %%mm1, %%mm2\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1194 "psubw %%mm3, %%mm0\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1195 "psubw %%mm1, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1196 "paddw %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1197 "paddw %%mm2, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1198
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1199 "add %2,%0\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1200 "1:\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1201
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1202 "movq (%0),%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1203 "movq 1(%0),%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1204 "movq %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1205 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1206 "punpcklbw %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1207 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1208 "punpckhbw %%mm7,%%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1209 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1210 "psubw %%mm1, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1211 "psubw %%mm3, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1212 "psubw %%mm0, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1213 "psubw %%mm2, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1214 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1215 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1216 "pcmpgtw %%mm4, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1217 "pcmpgtw %%mm5, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1218 "pxor %%mm3, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1219 "pxor %%mm1, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1220 "psubw %%mm3, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1221 "psubw %%mm1, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1222 "paddw %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1223 "paddw %%mm5, %%mm6\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1224
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1225 "add %2,%0\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1226
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1227 "movq (%0),%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1228 "movq 1(%0),%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1229 "movq %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1230 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1231 "punpcklbw %%mm7,%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1232 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1233 "punpckhbw %%mm7,%%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1234 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1235 "psubw %%mm1, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1236 "psubw %%mm3, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1237 "psubw %%mm4, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1238 "psubw %%mm5, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1239 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1240 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1241 "pcmpgtw %%mm0, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1242 "pcmpgtw %%mm2, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1243 "pxor %%mm3, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1244 "pxor %%mm1, %%mm2\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1245 "psubw %%mm3, %%mm0\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1246 "psubw %%mm1, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1247 "paddw %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1248 "paddw %%mm2, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1249
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1250 "add %2,%0\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1251 "subl $2, %%ecx\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1252 " jnz 1b\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1253
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1254 "movq %%mm6, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1255 "punpcklwd %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1256 "punpckhwd %%mm7,%%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1257 "paddd %%mm0, %%mm6\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1258
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1259 "movq %%mm6,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1260 "psrlq $32, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1261 "paddd %%mm6,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1262 "movd %%mm0,%1\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1263 : "+r" (pix1), "=r"(tmp)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1264 : "r" ((long)line_size) , "g" (h-2)
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1265 : "%ecx");
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1266 return tmp + hf_noise8_mmx(pix+8, line_size, h);
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1267 }
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1268
2864
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents: 2754
diff changeset
1269 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents: 2754
diff changeset
1270 MpegEncContext *c = p;
2940
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1271 int score1, score2;
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1272
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1273 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1274 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1275 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1276
4001
34fdffe98bd0 Rename ABS macro to FFABS.
diego
parents: 3947
diff changeset
1277 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
34fdffe98bd0 Rename ABS macro to FFABS.
diego
parents: 3947
diff changeset
1278 else return score1 + FFABS(score2)*8;
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1279 }
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1280
2864
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents: 2754
diff changeset
1281 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents: 2754
diff changeset
1282 MpegEncContext *c = p;
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1283 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1284 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1285
4001
34fdffe98bd0 Rename ABS macro to FFABS.
diego
parents: 3947
diff changeset
1286 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
34fdffe98bd0 Rename ABS macro to FFABS.
diego
parents: 3947
diff changeset
1287 else return score1 + FFABS(score2)*8;
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1288 }
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1289
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical SAD of a single 16-pixel-wide block (plain MMX).
 *
 * Sums |pix[y][x] - pix[y-1][x]| over the 16 columns and over rows 1..h-1.
 *
 * The row counter starts at h, is decremented by 2 before the loop and by 2
 * per iteration, and the loop only exits when it reaches exactly zero, so h
 * must be even and at least 4.
 *
 * @param v         unused
 * @param pix       top-left pixel of the block, 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between rows, multiple of 8
 * @param h         number of rows
 * @return the accumulated sum, truncated to 16 bits
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    /* Cast through long rather than int: a pointer does not fit in an int
     * on x86_64. */
    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* One row step: load the next 16-byte row into mm2/mm3 and keep a copy in
 * out0/out1 for the following step. in0/in1 hold the previous row; the two
 * saturating subtractions OR-ed together give the per-byte absolute
 * difference, which is widened to words against the zero in mm7 and added
 * into the four word lanes of mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator */
      "pxor %%mm7,%%mm7\n"          /* mm7 = 0 for byte->word unpacking */
      "movq (%0),%%mm0\n"           /* first row */
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two row steps per iteration, ping-ponging mm0/mm1 and mm4/mm5 */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* fold the four word lanes of mm6 into the low word of mm0 */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixel rows are read through pix, which is not a
       * memory operand */
      : "%ecx", "memory");
    /* only the low word holds the total; the word above it is a partial sum */
    return tmp & 0xFFFF;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1351
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical SAD of a single 16-pixel-wide block (MMX2, uses psadbw).
 *
 * Sums |pix[y][x] - pix[y-1][x]| over the 16 columns and over rows 1..h-1.
 *
 * The row counter starts at h, is decremented by 2 before the loop and by 2
 * per iteration, and the loop only exits when it reaches exactly zero, so h
 * must be even and at least 4.
 *
 * @param v         unused
 * @param pix       top-left pixel of the block, 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between rows, multiple of 8
 * @param h         number of rows
 * @return the low 32 bits of the accumulator register
 */
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    /* Cast through long rather than int: a pointer does not fit in an int
     * on x86_64. */
    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* One row step: load the next 16-byte row into out0/out1 (kept for the
 * following step), replace the previous row in in0/in1 by its psadbw against
 * the new row, and add both halves into mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator */
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"           /* first row */
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two row steps per iteration, ping-ponging mm0/mm1 and mm4/mm5 */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixel rows are read through pix, which is not a
       * memory operand */
      : "%ecx", "memory");
    return tmp;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1392
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks (plain MMX).
 *
 * For every row the byte-wise difference pix1 - pix2 is formed with wrapping
 * arithmetic (psubb) and XOR-ed with 0x80; the function then sums the
 * absolute differences between these biased rows for vertically adjacent
 * rows, over rows 1..h-1.
 *
 * The row counter starts at h, is decremented by 2 before the loop and by 2
 * per iteration, and the loop only exits when it reaches exactly zero, so h
 * must be even and at least 4.
 *
 * @param v         unused
 * @param pix1      first block, 8-byte aligned
 * @param pix2      second block, 8-byte aligned
 * @param line_size distance in bytes between rows, multiple of 8
 * @param h         number of rows
 * @return the accumulated sum masked with 0x7FFF
 */
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    /* Cast through long rather than int: a pointer does not fit in an int
     * on x86_64. */
    assert( (((long)pix1) & 7) == 0);
    assert( (((long)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* One row step: build the biased difference row (pix1 - pix2) ^ mm7 in
 * mm2/mm3 and keep a copy in out0/out1 for the following step. in0/in1 hold
 * the previous biased row; the two saturating subtractions OR-ed together
 * give the per-byte absolute difference, which is widened to words and added
 * into the four word lanes of mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator */
      /* mm7 = 0x80 in every byte: all-ones words, shifted to 0x8000, then
       * packed with signed saturation */
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"           /* first row of both blocks */
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two row steps per iteration, ping-ponging mm0/mm1 and mm4/mm5 */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* fold the four word lanes of mm6 into the low word of mm0 */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixel rows are read through pix1/pix2, which are not
       * memory operands */
      : "%ecx", "memory");
    /* NOTE(review): this mask drops bit 15 of the 16-bit total, whereas
     * vsad_intra16_mmx masks with 0xFFFF - confirm whether that is intended. */
    return tmp & 0x7FFF;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1471
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks (MMX2,
 * uses psadbw).
 *
 * For every row the byte-wise difference pix1 - pix2 is formed with wrapping
 * arithmetic (psubb) and XOR-ed with 0x80; the function then sums the
 * absolute differences between these biased rows for vertically adjacent
 * rows, over rows 1..h-1.
 *
 * The row counter starts at h, is decremented by 2 before the loop and by 2
 * per iteration, and the loop only exits when it reaches exactly zero, so h
 * must be even and at least 4.
 *
 * @param v         unused
 * @param pix1      first block, 8-byte aligned
 * @param pix2      second block, 8-byte aligned
 * @param line_size distance in bytes between rows, multiple of 8
 * @param h         number of rows
 * @return the low 32 bits of the accumulator register
 */
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    /* Cast through long rather than int: a pointer does not fit in an int
     * on x86_64. */
    assert( (((long)pix1) & 7) == 0);
    assert( (((long)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* One row step: build the biased difference row (pix1 - pix2) ^ mm7 in
 * out0/out1 (kept for the following step), replace the previous biased row
 * in in0/in1 by its psadbw against the new one, and add both halves into
 * mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = accumulator */
      /* mm7 = 0x80 in every byte: all-ones words, shifted to 0x8000, then
       * packed with signed saturation */
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"           /* first row of both blocks */
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      /* two row steps per iteration, ping-ponging mm0/mm1 and mm4/mm5 */
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixel rows are read through pix1/pix2, which are not
       * memory operands */
      : "%ecx", "memory");
    return tmp;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1529
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
/**
 * Byte-wise difference of two buffers: dst[i] = src1[i] - src2[i] (mod 256)
 * for 0 <= i < w.
 *
 * The MMX loop handles 16 bytes per iteration; whatever is left over is
 * finished by the scalar loop at the end.
 *
 * @param dst  destination, at least w bytes
 * @param src1 minuend, at least w bytes
 * @param src2 subtrahend, at least w bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;

    /* The asm loop tests its bound only after a full 16-byte step and
     * compares unsigned, so it must not be entered when fewer than 16 bytes
     * are available: it would overrun the buffers, and for w < 15 the
     * negative bound would keep it looping. */
    if(w>=16){
        asm volatile(
            "1:                             \n\t"
            "movq  (%2, %0), %%mm0          \n\t"
            "movq  (%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, (%3, %0)           \n\t"
            "movq 8(%2, %0), %%mm0          \n\t"
            "movq 8(%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, 8(%3, %0)          \n\t"
            "add $16, %0                    \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
            /* dst is written (and src1/src2 read) through plain register
             * operands, so the compiler has to be told memory changes */
            : "memory"
        );
    }
    /* scalar tail (or the whole job when w < 16) */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1551
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1552 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1553 long i=0;
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1554 uint8_t l, lt;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1555
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1556 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1557 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1558 "movq -1(%1, %0), %%mm0 \n\t" // LT
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1559 "movq (%1, %0), %%mm1 \n\t" // T
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1560 "movq -1(%2, %0), %%mm2 \n\t" // L
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1561 "movq (%2, %0), %%mm3 \n\t" // X
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1562 "movq %%mm2, %%mm4 \n\t" // L
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1563 "psubb %%mm0, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1564 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1565 "movq %%mm4, %%mm5 \n\t" // L
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1566 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1567 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1568 "pminub %%mm2, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1569 "pmaxub %%mm1, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1570 "psubb %%mm4, %%mm3 \n\t" // dst - pred
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1571 "movq %%mm3, (%3, %0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1572 "add $8, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1573 "cmp %4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1574 " jb 1b \n\t"
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1575 : "+r" (i)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1576 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1577 );
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1578
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1579 l= *left;
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1580 lt= *left_top;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1581
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1582 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1583
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1584 *left_top= src1[w-1];
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1585 *left = src2[w-1];
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1586 }
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1587
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * PAETH(cpu, abs3): defines add_png_paeth_prediction_<cpu>().
 *
 * asm operands: %0 = i (byte offset), %1 = dst, %2 = top, %3 = src,
 * %4 = bpp, %5 = end (w-3), %6 = ff_pw_255.
 *
 * i starts at -bpp, so the prologue loads 4 bytes from dst[-bpp] and
 * top[-bpp] (i.e. before the pointers) and widens them to words against a
 * zeroed mm7. Each loop pass then works on 4 bytes at offset i with
 *   a = previously produced dst words (kept in mm0),
 *   b = top[i], c = the top words loaded one step earlier (top[i-bpp]):
 *   mm3 = c - b, mm4 = c - a, mm5 = 2c - a - b
 * `abs3` must replace mm3/mm4/mm5 by their absolute values and leave mm7
 * zero (mm7 is used for unpacking right afterwards). The predictor is
 *   a if |c-b| <= min(|c-a|, |2c-a-b|), else b if |c-a| <= |2c-a-b|, else c.
 * src[i] is added to it, the words are masked with ff_pw_255, packed and
 * 4 bytes are stored at dst[i] (regardless of bpp). i advances by bpp and
 * the loop repeats while i <= end (signed compare); it always runs once.
 */
1588 #define PAETH(cpu, abs3)\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1589 void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1590 {\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1591 long i = -bpp;\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1592 long end = w-3;\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1593 asm volatile(\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1594 "pxor %%mm7, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1595 "movd (%1,%0), %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1596 "movd (%2,%0), %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1597 "punpcklbw %%mm7, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1598 "punpcklbw %%mm7, %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1599 "add %4, %0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1600 "1: \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1601 "movq %%mm1, %%mm2 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1602 "movd (%2,%0), %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1603 "movq %%mm2, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1604 "punpcklbw %%mm7, %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1605 "movq %%mm2, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1606 "psubw %%mm1, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1607 "psubw %%mm0, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1608 "movq %%mm3, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1609 "paddw %%mm4, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1610 abs3\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1611 "movq %%mm4, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1612 "pminsw %%mm5, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1613 "pcmpgtw %%mm6, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1614 "pcmpgtw %%mm5, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1615 "movq %%mm4, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1616 "pand %%mm3, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1617 "pandn %%mm3, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1618 "pandn %%mm0, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1619 "movd (%3,%0), %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1620 "pand %%mm1, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1621 "pand %%mm4, %%mm2 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1622 "punpcklbw %%mm7, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1623 "movq %6, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1624 "paddw %%mm6, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1625 "paddw %%mm2, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1626 "paddw %%mm3, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1627 "pand %%mm5, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1628 "movq %%mm0, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1629 "packuswb %%mm3, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1630 "movd %%mm3, (%1,%0) \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1631 "add %4, %0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1632 "cmp %5, %0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1633 "jle 1b \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1634 :"+r"(i)\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1635 :"r"(dst), "r"(top), "r"(src), "r"((long)bpp), "g"(end),\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1636 "m"(ff_pw_255)\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1637 :"memory"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1638 );\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1639 }
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1640
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * ABS3_MMX2: asm fragment replacing the signed words in mm3, mm4 and mm5 by
 * their absolute values as max(x, 0 - x) (psubw + pmaxsw).
 * Expects mm7 == 0 on entry (the first psubw negates mm5 into mm7),
 * uses mm6 and mm7 as scratch and leaves mm7 zeroed again on exit.
 */
1641 #define ABS3_MMX2\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1642 "psubw %%mm5, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1643 "pmaxsw %%mm7, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1644 "pxor %%mm6, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1645 "pxor %%mm7, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1646 "psubw %%mm3, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1647 "psubw %%mm4, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1648 "pmaxsw %%mm6, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1649 "pmaxsw %%mm7, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1650 "pxor %%mm7, %%mm7 \n"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1651
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * ABS3_SSSE3: asm fragment replacing the signed words in mm3, mm4 and mm5 by
 * their absolute values with pabsw. Touches no other register.
 */
1652 #define ABS3_SSSE3\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1653 "pabsw %%mm3, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1654 "pabsw %%mm4, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1655 "pabsw %%mm5, %%mm5 \n"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1656
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/* Instantiate add_png_paeth_prediction_mmx2(), and add_png_paeth_prediction_ssse3()
 * when HAVE_SSSE3 is defined. */
1657 PAETH(mmx2, ABS3_MMX2)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1658 #ifdef HAVE_SSSE3
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1659 PAETH(ssse3, ABS3_SSSE3)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1660 #endif
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
1661
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * DIFF_PIXELS_1(m,a,t,p1,p2): asm fragment leaving the per-byte difference
 * p1 - p2 as signed 16-bit words in register a.
 *   m  - mov suffix used for the two loads ("mov" #m)
 *   a  - destination register, t - scratch register
 *   p1, p2 - memory operands
 * No zero register is needed: t becomes the bytes of p2 interleaved with the
 * bytes of p1 (p1 in the high byte of each word), a becomes the bytes of p1
 * duplicated into both halves of each word, so a - t cancels the high bytes
 * and leaves p1 - p2.
 * The last line ends in a continuation backslash; the macro body extends
 * onto the following empty line.
 */
1662 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1663 "mov"#m" "#p1", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1664 "mov"#m" "#p2", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1665 "punpcklbw "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1666 "punpcklbw "#a", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1667 "psubw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1668
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp): statement block that loads eight
 * rows from p1 and p2 (row pitch = stride) and leaves the word differences
 * p1 - p2 of row n in register <mm>n, n = 0..7.
 *   m0   - mov suffix for the pixel loads (passed on to DIFF_PIXELS_1)
 *   m1   - mov suffix for spilling/reloading register 0 to/from temp
 *   mm   - register name prefix (e.g. %%mm or %%xmm)
 *   temp - lvalue used as the spill slot for register 0
 *
 * asm operands: %0 = temp, %1 = p1b, %2 = p2b, %3 = stride, %4 = 3*stride.
 * Rows 0-2 are addressed from the original pointers, which are then advanced
 * by 3*stride for rows 3-7. Register 7 is the scratch register for rows 0-6,
 * so for row 7 register 0 is spilled to temp, used as scratch and reloaded.
 *
 * The "+m"(temp) (rather than "=m") works around a gcc 2.95 compile failure.
 * The "memory" clobber is there because the asm reads the pixel rows through
 * p1b/p2b, which are only passed as registers, and because the spill may be
 * wider than the temp lvalue itself (movdqa in the xmm variant).
 * Macro arguments are parenthesised so expression arguments evaluate as
 * intended.
 */
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=(p1), *p2b=(p2);\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1 \n\t"\
        "add %4, %2 \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0 \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0 \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)(stride)), "r"((long)(stride)*3)\
        : "memory"\
    );\
}
5912
f75ee7ea171b tring to workaround gcc 2.95 bug which causes random failures
michael
parents: 5737
diff changeset
1688 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1689
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * Width-specific front ends for DIFF_PIXELS_8:
 *   DIFF_PIXELS_4x8 - movd loads (4 pixels per row), %%mm0..%%mm7, movq spill.
 *   DIFF_PIXELS_8x8 - movq loads (8 pixels per row), %%xmm0..%%xmm7, movdqa
 *                     spill; movdqa requires temp to be 16-byte aligned.
 */
1690 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1691 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1692
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/*
 * LBUTTERFLY2(a1,b1,a2,b2): two interleaved in-place word butterflies.
 * For each pair: a' = a + b, then b' = 2*b - a' = b - a.
 * No scratch register is used.
 */
1693 #define LBUTTERFLY2(a1,b1,a2,b2)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1694 "paddw " #b1 ", " #a1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1695 "paddw " #b2 ", " #a2 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1696 "paddw " #b1 ", " #b1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1697 "paddw " #b2 ", " #b2 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1698 "psubw " #a1 ", " #b1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1699 "psubw " #a2 ", " #b2 " \n\t"
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
1700
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * HADAMARD8(m0..m7): in-place 8-point butterfly network over eight registers,
 * built from three LBUTTERFLY2 stages pairing registers at distance 1, 2 and 4.
 * Each register holds one input element per word lane, so all lanes are
 * transformed in parallel. No scaling is applied.
 * The last line ends in a continuation backslash; the macro body extends
 * onto the following empty line.
 */
1701 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1702 LBUTTERFLY2(m0, m1, m2, m3)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1703 LBUTTERFLY2(m4, m5, m6, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1704 LBUTTERFLY2(m0, m2, m1, m3)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1705 LBUTTERFLY2(m4, m6, m5, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1706 LBUTTERFLY2(m0, m4, m1, m5)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1707 LBUTTERFLY2(m2, m6, m3, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1708
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* HADAMARD48: HADAMARD8 applied to the eight MMX registers mm0..mm7. */
1709 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1710
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/*
 * MMABS_MMX(a,z): a = |a| per signed word, plain MMX only.
 * z is scratch: it becomes the sign mask (0 > a), then a = (a ^ z) - z.
 */
1711 #define MMABS_MMX(a,z)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1712 "pxor " #z ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1713 "pcmpgtw " #a ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1714 "pxor " #z ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1715 "psubw " #z ", " #a " \n\t"
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1716
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/*
 * MMABS_MMX2(a,z): a = |a| per signed word using pmaxsw.
 * z is scratch: z = 0 - a, then a = max(a, z).
 */
1717 #define MMABS_MMX2(a,z)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1718 "pxor " #z ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1719 "psubw " #a ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1720 "pmaxsw " #z ", " #a " \n\t"
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1721
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * MMABS_SSSE3(a,z): a = |a| per signed word using pabsw.
 * z is accepted for signature compatibility with the other MMABS_* variants
 * and is not used.
 */
1722 #define MMABS_SSSE3(a,z)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1723 "pabsw " #a ", " #a " \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1724
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * MMABS_SUM(a,z,sum): a = |a| via MMABS (z as scratch), then
 * sum += a with unsigned word saturation (paddusw).
 * MMABS itself is not defined in this part of the file; presumably it is
 * #defined to one of the MMABS_* variants before each use - TODO confirm.
 */
1725 #define MMABS_SUM(a,z, sum)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1726 MMABS(a,z)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1727 "paddusw " #a ", " #sum " \n\t"
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1728
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * MMABS_SUM_8x8_NOSPILL: per-word saturating sum of |xmm0| .. |xmm7|.
 * Even registers accumulate into xmm0, odd registers into xmm1, and the two
 * partial sums are combined into xmm0 at the end.
 * Uses xmm8 and xmm9 as scratch, so no register has to be spilled to memory;
 * those registers only exist in 64-bit mode.
 */
1729 #define MMABS_SUM_8x8_NOSPILL\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1730 MMABS(%%xmm0, %%xmm8)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1731 MMABS(%%xmm1, %%xmm9)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1732 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1733 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1734 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1735 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1736 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1737 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1738 "paddusw %%xmm1, %%xmm0 \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1739
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * MMABS_SUM_8x8_SSE2: per-word saturating sum of |xmm0| .. |xmm7| into xmm0.
 * With ARCH_X86_64 this is the no-spill version above. Otherwise only
 * xmm0..xmm7 are available, so xmm7 is first saved to the memory at (%1)
 * (movdqa, hence 16-byte aligned) and then serves as the scratch register;
 * its saved value is reloaded into xmm2, which has already been accumulated
 * by then, and added last.
 */
1740 #ifdef ARCH_X86_64
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1741 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1742 #else
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1743 #define MMABS_SUM_8x8_SSE2\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1744 "movdqa %%xmm7, (%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1745 MMABS(%%xmm0, %%xmm7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1746 MMABS(%%xmm1, %%xmm7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1747 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1748 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1749 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1750 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1751 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1752 "movdqa (%1), %%xmm2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1753 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1754 "paddusw %%xmm1, %%xmm0 \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1755 #endif
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1756
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * LOAD4(o,a,b,c,d): movq-load four consecutive 8-byte values from
 * o(%1), o+8(%1), o+16(%1) and o+24(%1) into registers a, b, c, d.
 * The last line ends in a continuation backslash; the macro body extends
 * onto the following empty line.
 */
1757 #define LOAD4(o, a, b, c, d)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1758 "movq "#o"(%1), "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1759 "movq "#o"+8(%1), "#b" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1760 "movq "#o"+16(%1), "#c" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1761 "movq "#o"+24(%1), "#d" \n\t"\
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1762
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/*
 * STORE4(o,a,b,c,d): movq-store registers a, b, c, d to four consecutive
 * 8-byte slots at o(%1), o+8(%1), o+16(%1) and o+24(%1).
 * The last line ends in a continuation backslash; the macro body extends
 * onto the following empty line.
 */
1763 #define STORE4(o, a, b, c, d)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1764 "movq "#a", "#o"(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1765 "movq "#b", "#o"+8(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1766 "movq "#c", "#o"+16(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1767 "movq "#d", "#o"+24(%1) \n\t"\
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1768
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1769 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1770 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1771 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/*
 * HSUM_MMX(a,t,dst): horizontal saturating sum of the four words of MMX
 * register a, using shifts and paddusw; t is scratch, a is destroyed.
 * movd then copies the low 32 bits of a to dst: the low 16 bits hold the
 * total, the upper 16 bits hold a partial sum and must be ignored.
 * The last line ends in a continuation backslash; the macro body extends
 * onto the following empty line.
 */
1772 #define HSUM_MMX(a, t, dst)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1773 "movq "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1774 "psrlq $32, "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1775 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1776 "movq "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1777 "psrlq $16, "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1778 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1779 "movd "#a", "#dst" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1780
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/*
 * HSUM_MMX2(a,t,dst): horizontal saturating sum of the four words of MMX
 * register a, using pshufw to line the words up; t is scratch, a is
 * destroyed. movd copies the low 32 bits of a to dst; only the low 16 bits
 * hold the total.
 * The last line ends in a continuation backslash; the macro body extends
 * onto the following empty line.
 */
1781 #define HSUM_MMX2(a, t, dst)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1782 "pshufw $0x0E, "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1783 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1784 "pshufw $0x01, "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1785 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1786 "movd "#a", "#dst" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1787
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * HSUM_SSE2(a,t,dst): horizontal saturating sum of the eight words of XMM
 * register a. movhlps folds the high half onto the low half, then two
 * pshuflw/paddusw steps reduce the remaining four words; t is scratch, a is
 * destroyed. movd copies the low 32 bits of a to dst; only the low 16 bits
 * hold the total.
 * The last line ends in a continuation backslash; the macro body extends
 * onto the following empty line.
 */
1788 #define HSUM_SSE2(a, t, dst)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1789 "movhlps "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1790 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1791 "pshuflw $0x0E, "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1792 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1793 "pshuflw $0x01, "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1794 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1795 "movd "#a", "#dst" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1796
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/*
 * HADAMARD8_DIFF_MMX(cpu): expands to
 *   static int hadamard8_diff_<cpu>(void *s, uint8_t *src1, uint8_t *src2,
 *                                   int stride, int h)
 * followed by WRAPPER8_16_SQ(...), which is handed the names
 * hadamard8_diff_<cpu> and hadamard8_diff16_<cpu> (the wrapper is defined
 * elsewhere; presumably it builds the 16x16 variant from the 8x8 one).
 *
 * The function asserts h==8, never uses `s` in the visible body, and returns
 * the low 16 bits of a saturating (paddusw) sum of MMABS'd values produced by
 * HADAMARD48 passes over the src1/src2 pixel differences. The block is handled
 * as two 4-column halves: DIFF_PIXELS_4x8 on src1/src2 and then on
 * src1+4/src2+4.
 *
 * `temp` (13 qwords = 104 bytes, 8-byte aligned) is scratch memory addressed
 * through asm operand %1: byte offsets 0, 32 and 64 are handed to
 * STORE4/LOAD4, offset 96 is the spill slot for mm7, and offsets 64 and 0 are
 * reused later for a spill and for the partial sum.
 *
 * The macros MMABS, MMABS_SUM, HSUM, HADAMARD48, TRANSPOSE4, STORE4, LOAD4 and
 * DIFF_PIXELS_4x8 must be defined at the point of expansion; MMABS and HSUM
 * are (re)bound per CPU flavour right before each instantiation.
 *
 * NOTE(review): neither asm statement lists clobbers (no MMX registers, no
 * "memory") although both store through %1 -- confirm this is acceptable for
 * the compilers in use. In the first asm statement the visible instructions
 * only reference %1; the "=r"(sum) output appears to exist so that `temp`
 * gets operand number 1 -- confirm against the helper macros.
 */
1797 #define HADAMARD8_DIFF_MMX(cpu) \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1798 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1799 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1800 int sum;\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1801 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1802 assert(h==8);\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1803 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1804 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]); /* first half: columns 0..3 */\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1805 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1806 asm volatile(\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1807 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1808 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1809 "movq %%mm7, 96(%1) \n\t" /* save mm7; it is passed as the 5th TRANSPOSE4 argument next */\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1810 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1811 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1812 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1813 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1814 "movq 96(%1), %%mm7 \n\t" /* restore the mm7 value saved above */\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1815 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1816 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1817 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1818 : "=r" (sum)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1819 : "r"(temp)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1820 );\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1821 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1822 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]); /* second half: columns 4..7 */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1823 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1824 asm volatile(\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1825 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1826 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1827 "movq %%mm7, 96(%1) \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1828 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1829 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1830 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1831 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1832 "movq 96(%1), %%mm7 \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1833 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1834 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1835 "movq %%mm6, %%mm7 \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1836 "movq %%mm0, %%mm6 \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1837 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1838 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1839 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1840 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1841 "movq %%mm7, 64(%1) \n\t" /* spill mm7: it becomes the MMABS scratch register below */\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1842 MMABS(%%mm0, %%mm7)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1843 MMABS(%%mm1, %%mm7)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1844 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1845 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1846 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1847 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1848 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1849 "movq 64(%1), %%mm2 \n\t" /* reload the spilled eighth row into mm2 */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1850 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1851 "paddusw %%mm1, %%mm0 \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1852 "movq %%mm0, 64(%1) \n\t" /* park this partial sum; added back via paddusw 64(%1) below */\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1853 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1854 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1855 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1856 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1857 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1858 "movq %%mm7, (%1) \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1859 MMABS(%%mm0, %%mm7)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1860 MMABS(%%mm1, %%mm7)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1861 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1862 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1863 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1864 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1865 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1866 "movq (%1), %%mm2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1867 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1868 "paddusw 64(%1), %%mm0 \n\t" /* add the partial sum parked at offset 64 */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1869 "paddusw %%mm1, %%mm0 \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1870 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1871 HSUM(%%mm0, %%mm1, %0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1872 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1873 : "=r" (sum)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1874 : "r"(temp)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1875 );\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1876 return sum&0xFFFF; /* keep only the low 16-bit word written by HSUM */\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1877 }\
6056
558c1fd0ee72 Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents: 6030
diff changeset
1878 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1879
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * HADAMARD8_DIFF_SSE2(cpu): XMM counterpart of HADAMARD8_DIFF_MMX. Expands to
 *   static int hadamard8_diff_<cpu>(void *s, uint8_t *src1, uint8_t *src2,
 *                                   int stride, int h)
 * followed by WRAPPER8_16_SQ(...) with the names hadamard8_diff_<cpu> and
 * hadamard8_diff16_<cpu>.
 *
 * The whole 8x8 difference block is processed in one pass with xmm0..xmm7:
 * DIFF_PIXELS_8x8, HADAMARD8, TRANSPOSE8 (given (%1), i.e. `temp`, as its
 * last argument -- presumably a spill slot), a second HADAMARD8 on the
 * permuted register order, MMABS_SUM_8x8, then HSUM_SSE2 into `sum`.
 * `temp` is 4 qwords (32 bytes), 16-byte aligned. The function asserts h==8,
 * does not use `s` in the visible body, and returns the low 16 bits of `sum`.
 *
 * MMABS_SUM_8x8 (and whatever it expands to, e.g. MMABS) must be bound at the
 * point of instantiation.
 *
 * NOTE(review): the asm statement declares no clobbers (XMM registers or
 * "memory") -- confirm this is acceptable for the compilers in use.
 */
1880 #define HADAMARD8_DIFF_SSE2(cpu) \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1881 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1882 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1883 int sum;\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1884 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1885 assert(h==8);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1886 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1887 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1888 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1889 asm volatile(\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1890 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1891 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1892 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1) /* register order differs from the first pass; presumably follows TRANSPOSE8's output layout */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1893 MMABS_SUM_8x8\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1894 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1895 : "=r" (sum)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1896 : "r"(temp)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1897 );\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1898 return sum&0xFFFF; /* keep only the low 16-bit word written by HSUM_SSE2 */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1899 }\
6056
558c1fd0ee72 Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents: 6030
diff changeset
1900 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1901
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/*
 * Instantiate the hadamard8_diff functions. MMABS, HSUM and MMABS_SUM_8x8 are
 * deliberately re-#defined before each expansion and #undef'd afterwards, so
 * the same function templates pick up CPU-specific helper sequences.
 */
/* Plain MMX flavour: hadamard8_diff_mmx / hadamard8_diff16_mmx. */
1902 #define MMABS(a,z) MMABS_MMX(a,z)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1903 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1904 HADAMARD8_DIFF_MMX(mmx)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1905 #undef MMABS
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1906 #undef HSUM
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1907
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/*
 * MMX2 and SSE2 flavours share one set of bindings: the sse2 instantiation is
 * expanded while MMABS is still bound to MMABS_MMX2.
 */
1908 #define MMABS(a,z) MMABS_MMX2(a,z)
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1909 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1910 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1911 HADAMARD8_DIFF_MMX(mmx2)
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1912 HADAMARD8_DIFF_SSE2(sse2)
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1913 #undef MMABS
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1914 #undef MMABS_SUM_8x8
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1915 #undef HSUM
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1916
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * SSSE3 flavour, only when HAVE_SSSE3 is defined. HSUM is not bound here;
 * HADAMARD8_DIFF_SSE2 names HSUM_SSE2 directly.
 */
1917 #ifdef HAVE_SSSE3
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1918 #define MMABS(a,z) MMABS_SSSE3(a,z)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1919 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1920 HADAMARD8_DIFF_SSE2(ssse3)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1921 #undef MMABS
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1922 #undef MMABS_SUM_8x8
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1923 #endif
4749
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1924
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * DCT_SAD4(m, mm, o): asm fragment that loads four vectors from byte offsets
 * o+0, o+16, o+32 and o+48 relative to asm operand %1 and feeds them to
 * MMABS_SUM.
 *   m  - suffix appended to "mov" to form the load instruction
 *        (q and dqa at the visible call sites)
 *   mm - register-name prefix (%%mm or %%xmm at the visible call sites)
 *   o  - base byte offset
 * Registers <mm>2..<mm>5 receive the loaded data. <mm>6/<mm>7 are passed as
 * the second MMABS_SUM argument and <mm>0/<mm>1 as the third (presumably
 * scratch and running-sum registers; MMABS_SUM is defined elsewhere).
 */
1925 #define DCT_SAD4(m,mm,o)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1926 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1927 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1928 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1929 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1930 MMABS_SUM(mm##2, mm##6, mm##0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1931 MMABS_SUM(mm##3, mm##7, mm##1)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1932 MMABS_SUM(mm##4, mm##6, mm##0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1933 MMABS_SUM(mm##5, mm##7, mm##1)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1934
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * DCT_SAD_MMX: asm body used by DCT_SAD_FUNC for the 64-bit MMX register
 * file. Clears the two accumulators mm0/mm1, runs DCT_SAD4 with 8-byte movq
 * loads at base offsets 0, 8, 64 and 72 (together covering bytes 0..127
 * behind %1), merges the accumulators with a saturating paddusw and reduces
 * mm0 into operand %0 via HSUM. HSUM and MMABS must be bound at expansion
 * time.
 */
1935 #define DCT_SAD_MMX\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1936 "pxor %%mm0, %%mm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1937 "pxor %%mm1, %%mm1 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1938 DCT_SAD4(q, %%mm, 0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1939 DCT_SAD4(q, %%mm, 8)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1940 DCT_SAD4(q, %%mm, 64)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1941 DCT_SAD4(q, %%mm, 72)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1942 "paddusw %%mm1, %%mm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1943 HSUM(%%mm0, %%mm1, %0)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1944
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * DCT_SAD_SSE2: asm body used by DCT_SAD_FUNC for the 128-bit XMM register
 * file. Clears the accumulators xmm0/xmm1, runs DCT_SAD4 with 16-byte movdqa
 * loads at base offsets 0 and 64 (together covering bytes 0..127 behind %1),
 * merges the accumulators with a saturating paddusw and reduces xmm0 into
 * operand %0 via HSUM. movdqa is the aligned load form, so %1 must be
 * 16-byte aligned.
 */
1945 #define DCT_SAD_SSE2\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1946 "pxor %%xmm0, %%xmm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1947 "pxor %%xmm1, %%xmm1 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1948 DCT_SAD4(dqa, %%xmm, 0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1949 DCT_SAD4(dqa, %%xmm, 64)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1950 "paddusw %%xmm1, %%xmm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1951 HSUM(%%xmm0, %%xmm1, %0)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1952
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * DCT_SAD_FUNC(cpu): expands to
 *   static int sum_abs_dctelem_<cpu>(DCTELEM *block)
 * whose body is a single asm statement made of whatever DCT_SAD is bound to
 * at expansion time (DCT_SAD_MMX or DCT_SAD_SSE2 in this file). `block` is
 * asm operand %1 and the result register is %0. The function returns the low
 * 16 bits of the value the asm leaves in `sum`. The accumulation uses
 * paddusw, so the total saturates rather than wraps.
 *
 * NOTE(review): the asm declares no clobbers for the MMX/XMM registers it
 * uses -- confirm this is acceptable for the compilers in use.
 */
1953 #define DCT_SAD_FUNC(cpu) \
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1954 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1955 int sum;\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1956 asm volatile(\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1957 DCT_SAD\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1958 :"=r"(sum)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1959 :"r"(block)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1960 );\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1961 return sum&0xFFFF; /* only the low 16-bit word is the sum */\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1962 }
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1963
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * Instantiate sum_abs_dctelem_{mmx,mmx2,sse2,ssse3}. DCT_SAD, HSUM and MMABS
 * are rebound between expansions; the #define/#undef order below matters
 * because some bindings are intentionally carried over to the next variant.
 */
/* mmx: MMX body, MMX horizontal sum, MMX abs. */
1964 #define DCT_SAD DCT_SAD_MMX
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1965 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1966 #define MMABS(a,z) MMABS_MMX(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1967 DCT_SAD_FUNC(mmx)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1968 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1969 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1970
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/* mmx2: still the MMX body (DCT_SAD unchanged), with MMX2 HSUM and MMABS. */
1971 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1972 #define MMABS(a,z) MMABS_MMX2(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1973 DCT_SAD_FUNC(mmx2)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1974 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1975 #undef DCT_SAD
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1976
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * sse2: SSE2 body and HSUM_SSE2. MMABS was not #undef'd above, so this
 * variant is expanded with MMABS still bound to MMABS_MMX2; it is released
 * right after.
 */
1977 #define DCT_SAD DCT_SAD_SSE2
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1978 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1979 DCT_SAD_FUNC(sse2)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1980 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1981
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/* ssse3 (only if HAVE_SSSE3): reuses the SSE2 DCT_SAD/HSUM bindings with MMABS_SSSE3. */
1982 #ifdef HAVE_SSSE3
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1983 #define MMABS(a,z) MMABS_SSSE3(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1984 DCT_SAD_FUNC(ssse3)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1985 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1986 #endif
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1987 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1988 #undef DCT_SAD
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1989
5255
669a97223dc7 make arguments to ssd_int8_vs_int16() const
mru
parents: 5049
diff changeset
/**
 * Sum of squared differences between an int8 array and an int16 array (MMX).
 *
 * Computes the sum over k of (pix2[k] - pix1[k])^2, eight elements per loop
 * iteration, walking from the end of the arrays towards index 0.
 *
 * @param pix1 signed 8-bit samples
 * @param pix2 signed 16-bit samples
 * @param size number of elements; the loop body runs before the first test
 *             and steps by 8, so it should be a positive multiple of 8
 *             (other values read outside [0, size))
 * @return low 32 bits of the accumulated sum
 *
 * Differences are formed with 16-bit psubw and the running totals with
 * 32-bit paddd, both of which wrap.
 * NOTE(review): the asm lists no clobbers (mm0-mm4, "memory") and no emms is
 * issued here -- confirm callers/compilers cope with that.
 */
1990 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
4749
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1991 int sum;
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1992 long i=size; /* pointer-width index so it can be used in address operands */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1993 asm volatile(
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1994 "pxor %%mm4, %%mm4 \n" /* mm4 = two 32-bit accumulators, cleared */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1995 "1: \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1996 "sub $8, %0 \n" /* i -= 8; the flags set here are tested by jg at the loop bottom */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1997 "movq (%2,%0), %%mm2 \n" /* mm2 = pix1[i..i+7] (8 bytes) */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1998 "movq (%3,%0,2), %%mm0 \n" /* mm0 = pix2[i..i+3] */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1999 "movq 8(%3,%0,2), %%mm1 \n" /* mm1 = pix2[i+4..i+7] */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2000 "punpckhbw %%mm2, %%mm3 \n" /* high byte of each mm3 word = pix1[i+4..i+7]; low byte is stale mm3 data */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2001 "punpcklbw %%mm2, %%mm2 \n" /* high byte of each mm2 word = pix1[i..i+3] */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2002 "psraw $8, %%mm3 \n" /* arithmetic shift: sign-extend to 16 bit, discarding the stale low bytes */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2003 "psraw $8, %%mm2 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2004 "psubw %%mm3, %%mm1 \n" /* mm1 = pix2[i+4..] - pix1[i+4..] */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2005 "psubw %%mm2, %%mm0 \n" /* mm0 = pix2[i..] - pix1[i..] */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2006 "pmaddwd %%mm1, %%mm1 \n" /* square each word, add adjacent pairs into dwords */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2007 "pmaddwd %%mm0, %%mm0 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2008 "paddd %%mm1, %%mm4 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2009 "paddd %%mm0, %%mm4 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2010 "jg 1b \n" /* loop while i > 0 after the subtraction; no instruction in between is an integer ALU op */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2011 "movq %%mm4, %%mm3 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2012 "psrlq $32, %%mm3 \n" /* bring the upper accumulator down */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2013 "paddd %%mm3, %%mm4 \n" /* low dword of mm4 = upper + lower accumulator */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2014 "movd %%mm4, %1 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2015 :"+r"(i), "=r"(sum)
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2016 :"r"(pix1), "r"(pix2)
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2017 );
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2018 return sum;
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2019 }
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
2020
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
2021 #endif //CONFIG_ENCODERS
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
2022
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/*
 * QPEL_V_LOW(...): asm fragment that produces one packed result for the
 * quarter-pel lowpass filter. Going by the per-line comments it evaluates
 *   (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5
 * on 16-bit lanes, packs the result to unsigned bytes with saturation and
 * hands it to OP(%%mm5, out, %%mm7, d).
 *
 *   m3..m6        registers holding/receiving the tap sums; note that the
 *                 body also names %%mm4, %%mm5, %%mm6 and %%mm7 literally, so
 *                 callers presumably have to pass matching registers
 *                 -- TODO confirm against the call sites
 *   pw_20, pw_3   not referenced in the body; the constants are taken from
 *                 MANGLE(ff_pw_20) and MANGLE(ff_pw_3) directly
 *   rnd           operand added as the rounding term before the shift
 *   in0,in1,in2,in7  source operands for the additional taps
 *   out, OP       destination operand and the store/average macro
 */
2023 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2024 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2025 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2026 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2027 "movq "#in7", " #m3 " \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2028 "movq "#in0", %%mm5 \n\t" /* D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2029 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2030 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2031 "movq "#in1", %%mm5 \n\t" /* C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2032 "movq "#in2", %%mm6 \n\t" /* B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2033 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2034 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2035 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2036 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2037 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2038 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rounder */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2039 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 (+ rounder) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2040 "psraw $5, %%mm5 \n\t" /* arithmetic >> 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2041 "packuswb %%mm5, %%mm5 \n\t" /* saturate to unsigned bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2042 OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2043
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2044 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
2045 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2046 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2047 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2048 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2049 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2050 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2051 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2052 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2053 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2054 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2055 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2056 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2057 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2058 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2059 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2060 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2061 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2062 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2063 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2064 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2065 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2066 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2067 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2068 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2069 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2070 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2071 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2072 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2073 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2074 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2075 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2076 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2077 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2078 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2079 "movq %%mm0, %5 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2080 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2081 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2082 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2083 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2084 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2085 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2086 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2087 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2088 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2089 "paddw %%mm0, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2090 "paddw %%mm5, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2091 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2092 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2093 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2094 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2095 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2096 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2097 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2098 "paddw %%mm2, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2099 "paddw %%mm6, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2100 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2101 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2102 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2103 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2104 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2105 "movq %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2106 "packuswb %%mm3, %%mm1 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2107 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2108 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2109 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2110 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2111 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2112 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2113 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2114 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2115 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2116 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2117 "paddw %%mm1, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2118 "paddw %%mm4, %%mm0 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2119 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2120 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2121 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2122 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2123 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2124 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2125 "paddw %%mm3, %%mm2 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2126 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2127 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2128 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2129 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2130 "paddw %%mm2, %%mm6 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2131 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2132 "paddw %6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2133 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2134 "psraw $5, %%mm0 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2135 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2136 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2137 "paddw %%mm5, %%mm3 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2138 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2139 "paddw %%mm4, %%mm6 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2140 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2141 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2142 "paddw %%mm1, %%mm4 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2143 "paddw %%mm2, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2144 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2145 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2146 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2147 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2148 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2149 "paddw %6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2150 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2151 "psraw $5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2152 "packuswb %%mm4, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2153 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2154 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2155 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2156 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2157 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2158 " jnz 1b \n\t"\
6335
950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster.
lorenm
parents: 6333
diff changeset
2159 : "+a"(src), "+c"(dst), "+g"(h)\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2160 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2161 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2162 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2163 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2164 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2165 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2166 int i;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2167 int16_t temp[16];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2168 /* quick HACK, XXX FIXME MUST be optimized */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2169 for(i=0; i<h; i++)\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2170 {\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2171 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2172 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2173 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2174 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2175 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2176 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2177 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2178 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2179 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2180 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2181 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2182 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2183 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2184 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2185 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2186 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2187 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2188 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2189 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2190 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2191 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2192 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2193 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2194 "packuswb %%mm1, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2195 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2196 "movq 16(%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2197 "movq 24(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2198 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2199 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2200 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2201 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2202 "packuswb %%mm1, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2203 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2204 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2205 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2206 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2207 dst+=dstStride;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2208 src+=srcStride;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2209 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2210 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2211 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
2212 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2213 uint64_t temp;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2214 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2215 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2216 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2217 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2218 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2219 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2220 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2221 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2222 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2223 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2224 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2225 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2226 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2227 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2228 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2229 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2230 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2231 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2232 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2233 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2234 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2235 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2236 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2237 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2238 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2239 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2240 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2241 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2242 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2243 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2244 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2245 "psraw $5, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2246 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2247 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2248 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2249 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2250 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2251 "paddw %%mm5, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2252 "paddw %%mm6, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2253 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2254 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2255 "paddw %%mm6, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2256 "paddw %%mm5, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2257 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2258 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2259 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2260 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2261 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2262 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2263 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2264 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2265 "packuswb %%mm3, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2266 OP_MMX2(%%mm0, (%1), %%mm4, q)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2267 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2268 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2269 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2270 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2271 " jnz 1b \n\t"\
6335
950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster.
lorenm
parents: 6333
diff changeset
2272 : "+a"(src), "+c"(dst), "+g"(h)\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2273 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2274 : "memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2275 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2276 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2277 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2278 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* 8-pixel-wide horizontal lowpass over h rows: the filter sums are computed in C, then an asm block rounds, shifts, packs and stores each row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2279 int i;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2280 int16_t temp[8]; /* one row of unrounded 16-bit filter sums, read back by the asm below */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2281 /* quick HACK, XXX FIXME MUST be optimized */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2282 for(i=0; i<h; i++)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2283 {\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2284 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]); /* weights 20,-6,3,-1 on symmetric pairs of src[0..8]; positions left of src[0] are replaced by src[0], src[1], src[2] */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2285 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2286 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2287 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]); /* temp[3] and temp[4] need no edge substitution */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2288 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2289 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]); /* from here positions right of src[8] are replaced by src[8], src[7], src[6] */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2290 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2291 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2292 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2293 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2294 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2295 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2296 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2297 "psraw $5, %%mm0 \n\t" /* arithmetic shift right by 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2298 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2299 "packuswb %%mm1, %%mm0 \n\t" /* pack the 8 words to unsigned bytes with saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2300 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* NOTE(review): OP_3DNOW is defined outside this view; presumably writes mm0 to (dst) with mm1 as scratch - confirm */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2301 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0 = temp, %1 = dst, %2 = ROUNDER */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2302 :"memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2303 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2304 dst+=dstStride; /* advance both pointers to the next row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2305 src+=srcStride;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2306 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2307 }
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2308
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2309 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2310 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2311 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical lowpass for a 16-pixel-wide block: pass 1 widens 17 source rows to 16-bit words in temp, pass 2 runs QPEL_V_LOW over them and writes to dst */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2312 uint64_t temp[17*4]; /* 4 column groups of 17 entries; each 8-byte entry holds 4 pixels widened to words */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2313 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2314 int count= 17; /* number of source rows read by pass 1 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2315 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2316 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2317 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2318 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, interleaved as the high byte when unpacking */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2319 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2320 "movq (%0), %%mm0 \n\t" /* src bytes 0..7, loaded twice for the low and high unpack */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2321 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2322 "movq 8(%0), %%mm2 \n\t" /* src bytes 8..15, likewise */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2323 "movq 8(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2324 "punpcklbw %%mm7, %%mm0 \n\t" /* bytes 0..3 -> words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2325 "punpckhbw %%mm7, %%mm1 \n\t" /* bytes 4..7 -> words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2326 "punpcklbw %%mm7, %%mm2 \n\t" /* bytes 8..11 -> words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2327 "punpckhbw %%mm7, %%mm3 \n\t" /* bytes 12..15 -> words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2328 "movq %%mm0, (%1) \n\t" /* column group 0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2329 "movq %%mm1, 17*8(%1) \n\t" /* each further group starts 17 entries (17*8 bytes) later */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2330 "movq %%mm2, 2*17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2331 "movq %%mm3, 3*17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2332 "add $8, %1 \n\t" /* next entry in every group */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2333 "add %3, %0 \n\t" /* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2334 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2335 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2336 : "+r" (src), "+r" (temp_ptr), "+r"(count) /* %0, %1, %2 */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2337 : "r" ((long)srcStride) /* %3 */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2338 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2339 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2340 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2341 temp_ptr= temp; /* rewind to the start of temp for pass 2 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2342 count=4; /* one pass-2 iteration per 4-pixel column group */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2343 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2344 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2345 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2346 /*"pxor %%mm7, %%mm7 \n\t"*/ /* NOTE(review): left disabled; if QPEL_V_LOW needs mm7 == 0 it relies on the value set in the first asm statement - confirm */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2347 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2348 "movq (%0), %%mm0 \n\t" /* preload the first four entries of this column group into mm0..mm3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2349 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2350 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2351 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2352 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) /* NOTE(review): QPEL_V_LOW is defined outside this view; its last memory operand alternates (%1) and (%1, %3), presumably the two output rows of each pair - confirm */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2353 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2354 "add %4, %1 \n\t" /* dst += 2*dstStride */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2355 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2356 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2357 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2358 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2359 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2360 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2361 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2362 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2363 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2364 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2365 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2366 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2367 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2368 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2369 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2370 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2371 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2372 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2373 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP) /* the fourth memory operand has reached 128(%0), the 17th and last entry of the group; the remaining calls step it back (120, 112) */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2374 "add %4, %1 \n\t" \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2375 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2376 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2377 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2378 "add $136, %0 \n\t" /* 136 = 17*8 bytes: start of the next column group in temp */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2379 "add %6, %1 \n\t" /* dst += 4-14*dstStride: undoes the seven 2*dstStride steps above and moves 4 bytes right */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2380 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2381 " jnz 1b \n\t"\
958
9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents: 954
diff changeset
2382 \
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
2383 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0, %1, %2 */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2384 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride) /* %3, %4, %5, %6 */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2385 :"memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2386 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2387 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2388 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
2389 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical lowpass for an 8-pixel-wide block: pass 1 widens 9 source rows to 16-bit words in temp, pass 2 runs QPEL_V_LOW over them and writes to dst */\
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
2390 uint64_t temp[9*2]; /* 2 column groups of 9 entries; each 8-byte entry holds 4 pixels widened to words */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2391 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2392 int count= 9; /* number of source rows read by pass 1 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2393 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2394 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2395 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2396 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, interleaved as the high byte when unpacking */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2397 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2398 "movq (%0), %%mm0 \n\t" /* src bytes 0..7, loaded twice for the low and high unpack */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2399 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2400 "punpcklbw %%mm7, %%mm0 \n\t" /* bytes 0..3 -> words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2401 "punpckhbw %%mm7, %%mm1 \n\t" /* bytes 4..7 -> words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2402 "movq %%mm0, (%1) \n\t" /* column group 0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2403 "movq %%mm1, 9*8(%1) \n\t" /* column group 1 starts 9 entries (9*8 bytes) later */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2404 "add $8, %1 \n\t" /* next entry in both groups */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2405 "add %3, %0 \n\t" /* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2406 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2407 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2408 : "+r" (src), "+r" (temp_ptr), "+r"(count) /* %0, %1, %2 */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2409 : "r" ((long)srcStride) /* %3 */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2410 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2411 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2412 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2413 temp_ptr= temp; /* rewind to the start of temp for pass 2 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2414 count=2; /* one pass-2 iteration per 4-pixel column group */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2415 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2416 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2417 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2418 /*"pxor %%mm7, %%mm7 \n\t"*/ /* NOTE(review): left disabled; if QPEL_V_LOW needs mm7 == 0 it relies on the value set in the first asm statement - confirm */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2419 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2420 "movq (%0), %%mm0 \n\t" /* preload the first four entries of this column group into mm0..mm3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2421 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2422 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2423 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2424 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) /* NOTE(review): QPEL_V_LOW is defined outside this view; its last memory operand alternates (%1) and (%1, %3), presumably the two output rows of each pair - confirm */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2425 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2426 "add %4, %1 \n\t" /* dst += 2*dstStride */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2427 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2428 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2429 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2430 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2431 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2432 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2433 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) /* the fourth memory operand has reached 64(%0), the 9th and last entry of the group; the remaining calls step it back (56, 48) */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2434 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2435 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2436 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2437 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2438 "add $72, %0 \n\t" /* 72 = 9*8 bytes: start of the next column group in temp */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2439 "add %6, %1 \n\t" /* dst += 4-6*dstStride: undoes the three 2*dstStride steps above and moves 4 bytes right */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2440 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2441 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2442 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2443 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0, %1, %2 */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2444 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride) /* %3, %4, %5, %6 */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2445 : "memory"\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2446 );\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2447 }\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2448 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2449 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){ /* no filtering here: forwards dst, src and stride to the OPNAME pixels8 routine of this MMX flavour with a height of 8 */\
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
2450 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2451 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2452 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2453 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* runs the horizontal lowpass into a local buffer, then passes src and that buffer to pixels8_l2 to produce dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2454 uint64_t temp[8]; /* 64 bytes: 8 filtered rows of 8 bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2455 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2456 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8); /* 8 rows, output stride 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2457 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); /* NOTE(review): pixels8_l2 is defined outside this view; presumably averages its two sources - confirm */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2458 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2459 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2460 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass of 8 rows written straight to dst; stride is used for both dst and src */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2461 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2462 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2463 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2464 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* same steps as the mc10 variant, but pixels8_l2 is given src+1 instead of src */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2465 uint64_t temp[8]; /* 64 bytes: 8 filtered rows of 8 bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2466 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2467 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8); /* 8 rows, output stride 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2468 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8); /* NOTE(review): pixels8_l2 is defined outside this view; presumably averages its two sources - confirm */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2469 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2470 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2471 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* runs the vertical lowpass into a local buffer, then passes src and that buffer to pixels8_l2 to produce dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2472 uint64_t temp[8]; /* 64 bytes of filtered output, written with stride 8 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2473 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2474 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); /* output stride 8, source stride = stride */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2475 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); /* NOTE(review): pixels8_l2 is defined outside this view; presumably averages its two sources - confirm */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2476 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2477 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2478 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass written straight to dst; stride is used for both dst and src */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2479 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2480 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2481 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2482 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* same steps as the mc01 variant, but pixels8_l2 is given src+stride (one row down) instead of src */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2483 uint64_t temp[8]; /* 64 bytes of filtered output, written with stride 8 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2484 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2485 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); /* output stride 8, source stride = stride */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2486 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8); /* NOTE(review): pixels8_l2 is defined outside this view; presumably averages its two sources - confirm */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2487 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2488 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* chains four steps: horizontal lowpass of 9 rows, pixels8_l2 with src, vertical lowpass of that result, then pixels8_l2 of both intermediates into dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2489 uint64_t half[8 + 9]; /* 136 bytes: 64 for halfHV followed by 72 (9 rows of 8) for halfH */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2490 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2491 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2492 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, the count the 8-wide vertical lowpass reads */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2493 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); /* halfH is both an input and the output; NOTE(review): pixels8_l2 is defined outside this view, presumably an average - confirm */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2494 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical lowpass of halfH into halfHV, both with stride 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2495 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2496 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2497 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* same chain as the mc11 variant, but the first pixels8_l2 is given src+1 instead of src */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2498 uint64_t half[8 + 9]; /* 136 bytes: 64 for halfHV followed by 72 (9 rows of 8) for halfH */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2499 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2500 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2501 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, the count the 8-wide vertical lowpass reads */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2502 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9); /* halfH is both an input and the output; NOTE(review): pixels8_l2 is defined outside this view, presumably an average - confirm */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2503 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical lowpass of halfH into halfHV, both with stride 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2504 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2505 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2506 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* same chain as the mc11 variant, but the final pixels8_l2 reads halfH+8 instead of halfH */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2507 uint64_t half[8 + 9]; /* 136 bytes: 64 for halfHV followed by 72 (9 rows of 8) for halfH */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2508 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2509 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2510 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, the count the 8-wide vertical lowpass reads */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2511 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); /* halfH is both an input and the output; NOTE(review): pixels8_l2 is defined outside this view, presumably an average - confirm */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2512 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical lowpass of halfH into halfHV, both with stride 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2513 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8); /* halfH+8 skips the first 8-byte row of halfH */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2514 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2515 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2516 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2517 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2518 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2519 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2520 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2521 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2522 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2523 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2524 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2525 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2526 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2527 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2528 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2529 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2530 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2531 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2532 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2533 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2534 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2535 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2536 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2537 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2538 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2539 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2540 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2541 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2542 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2543 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2544 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2545 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2546 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2547 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2548 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2549 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2550 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2551 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2552 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2553 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2554 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2555 uint64_t half[9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2556 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2557 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2558 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2559 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2560 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
2561 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2562 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2563 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2564 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2565 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2566 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2567 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2568 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2569 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2570 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2571 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2572 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2573 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2574 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2575 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2576 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2577 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2578 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2579 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2580 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2581 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2582 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2583 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2584 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2585 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2586 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2587 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2588 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2589 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2590 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2591 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2592 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2593 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2594 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2595 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2596 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2597 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2598 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2599 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2600 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2601 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2602 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2603 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2604 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2605 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2606 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2607 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2608 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2609 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2610 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2611 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2612 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2613 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2614 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2615 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2616 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2617 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2618 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2619 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2620 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2621 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2622 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2623 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2624 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2625 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2626 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2627 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2628 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2629 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2630 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2631 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2632 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2633 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2634 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2635 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2636 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2637 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2638 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2639 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2640 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2641 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2642 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2643 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2644 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2645 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2646 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2647 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2648 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2649 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2650 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2651 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2652 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2653 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2654 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2655 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2656 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2657 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2658 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2659 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2660 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2661 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2662 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2663 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2664 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2665 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2666 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2667 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2668 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2669 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2670 }
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2671
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* PUT_OP: plain store. Expands to the single asm string "mov<size> <a>, <b>"
 * built by stringifying size, a and b. The temp argument is accepted so the
 * macro has the same parameter list as the AVG_*_OP macros, but it is not used. */
2672 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* AVG_3DNOW_OP: averaging store using the 3DNow! pavgusb instruction.
 * Expands to three asm instructions (AT&T operand order, source first --
 * assumed from the GCC inline-asm usage in this file):
 *   mov<size> b, temp   - fetch the current destination value into temp
 *   pavgusb   temp, a   - average it into a
 *   mov<size> a, b      - write the averaged value back to b
 * Same parameter list as PUT_OP so the two are interchangeable as OP arguments. */
2673 #define AVG_3DNOW_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2674 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2675 "pavgusb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2676 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* AVG_MMX2_OP: averaging store using the pavgb instruction.
 * Identical in shape to AVG_3DNOW_OP except for the averaging instruction:
 *   mov<size> b, temp   - fetch the current destination value into temp
 *   pavgb     temp, a   - average it into a
 *   mov<size> a, b      - write the averaged value back to b
 * (AT&T operand order, source first -- assumed from the GCC inline-asm usage
 * in this file.) */
2677 #define AVG_MMX2_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2678 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2679 "pavgb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2680 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2681
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Instantiate the qpel helpers.
 * QPEL_BASE is expanded once per operation (put_, avg_, put_no_rnd_) and takes
 * two OP macros; the avg_ row passes AVG_MMX2_OP and AVG_3DNOW_OP, the other
 * rows pass PUT_OP twice.
 * QPEL_OP is expanded once per operation for each of the 3dnow and mmx2
 * suffixes, with the matching AVG_*_OP for the avg_ rows.
 * ff_pw_16 is passed for the put_/avg_ variants and ff_pw_15 for put_no_rnd_. */
2682 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2683 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2684 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2685 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2686 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2687 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2688 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2689 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2690 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2691
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2692 /***********************************/
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2693 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2694
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* QPEL_2TAP_XY: define the function
 *   OPNAME 2tap_qpel SIZE _mc XY _ MMX (dst, src, stride)
 * as a thin wrapper that forwards to OPNAME pixels SIZE HPEL, passing SIZE as
 * the fourth argument. HPEL is the full suffix of the target function
 * (the visible callers pass e.g. _x2_ ## MMX, _y2_ ## MMX or _xy2_mmx). */
2695 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2696 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2697 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2698 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* QPEL_2TAP_L3: define the function
 *   OPNAME 2tap_qpel SIZE _mc XY _ MMX (dst, src, stride)
 * as a thin wrapper around OPNAME 2tap_qpel SIZE _l3_ MMX.
 * S0 is added to src before the call; S1 and S2 are passed through unchanged
 * as the last two arguments, after stride and SIZE. S0/S1/S2 are expanded
 * inside the generated function, so they may refer to its stride parameter. */
2699 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2700 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2701 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2702 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2703
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2704 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2705 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2706 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2707 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2708 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2709 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2710 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2711 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2712 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2713 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2714 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2715 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2716 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2717 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2718 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2719 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2720 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2721 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2722 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2723 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2724 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2725 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2726 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2727 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2728
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2729 QPEL_2TAP(put_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2730 QPEL_2TAP(avg_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2731 QPEL_2TAP(put_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2732 QPEL_2TAP(avg_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2733 QPEL_2TAP(put_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2734 QPEL_2TAP(avg_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2735 QPEL_2TAP(put_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2736 QPEL_2TAP(avg_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2737
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2738
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
2739 #if 0
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2740 static void just_return() { return; }
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
2741 #endif
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2742
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2743 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2744 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2745 const int w = 8;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2746 const int ix = ox>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2747 const int iy = oy>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2748 const int oxs = ox>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2749 const int oys = oy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2750 const int dxxs = dxx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2751 const int dxys = dxy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2752 const int dyxs = dyx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2753 const int dyys = dyy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2754 const uint16_t r4[4] = {r,r,r,r};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2755 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2756 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2757 const uint64_t shift2 = 2*shift;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2758 uint8_t edge_buf[(h+1)*stride];
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2759 int x, y;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2760
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2761 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2762 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2763 const int dxh = dxy*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2764 const int dyw = dyx*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2765 if( // non-constant fullpel offset (3% of blocks)
6196
166bef5cad01 add parenthesis, fix warning: i386/dsputil_mmx.c:2618: warning: suggest parentheses around arithmetic in operand of |
bcoudurier
parents: 6195
diff changeset
2766 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
166bef5cad01 add parenthesis, fix warning: i386/dsputil_mmx.c:2618: warning: suggest parentheses around arithmetic in operand of |
bcoudurier
parents: 6195
diff changeset
2767 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2768 // uses more than 16 bits of subpel mv (only at huge resolution)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2769 || (dxx|dxy|dyx|dyy)&15 )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2770 {
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2771 //FIXME could still use mmx for some of the rows
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2772 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2773 return;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2774 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2775
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2776 src += ix + iy*stride;
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2777 if( (unsigned)ix >= width-w ||
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2778 (unsigned)iy >= height-h )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2779 {
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2780 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2781 src = edge_buf;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2782 }
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2783
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2784 asm volatile(
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2785 "movd %0, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2786 "pxor %%mm7, %%mm7 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2787 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2788 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2789 :: "r"(1<<shift)
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2790 );
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2791
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2792 for(x=0; x<w; x+=4){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2793 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2794 oxs - dxys + dxxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2795 oxs - dxys + dxxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2796 oxs - dxys + dxxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2797 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2798 oys - dyys + dyxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2799 oys - dyys + dyxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2800 oys - dyys + dyxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2801
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2802 for(y=0; y<h; y++){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2803 asm volatile(
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2804 "movq %0, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2805 "movq %1, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2806 "paddw %2, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2807 "paddw %3, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2808 "movq %%mm4, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2809 "movq %%mm5, %1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2810 "psrlw $12, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2811 "psrlw $12, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2812 : "+m"(*dx4), "+m"(*dy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2813 : "m"(*dxy4), "m"(*dyy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2814 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2815
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2816 asm volatile(
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2817 "movq %%mm6, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2818 "movq %%mm6, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2819 "psubw %%mm4, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2820 "psubw %%mm5, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2821 "movq %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2822 "movq %%mm4, %%mm3 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2823 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2824 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2825 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2826 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2827
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2828 "movd %4, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2829 "movd %3, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2830 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2831 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2832 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2833 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2834
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2835 "movd %2, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2836 "movd %1, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2837 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2838 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2839 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2840 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2841 "paddw %5, %%mm1 \n\t"
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2842 "paddw %%mm3, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2843 "paddw %%mm1, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2844 "paddw %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2845
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2846 "psrlw %6, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2847 "packuswb %%mm0, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2848 "movd %%mm0, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2849
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2850 : "=m"(dst[x+y*stride])
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2851 : "m"(src[0]), "m"(src[1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2852 "m"(src[stride]), "m"(src[stride+1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2853 "m"(*r4), "m"(shift2)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2854 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2855 src += stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2856 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2857 src += 4-h*stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2858 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2859 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2860
3777
20545fbb6f7c add some #ifdef CONFIG_ENCODERS/DECODERS
mru
parents: 3721
diff changeset
2861 #ifdef CONFIG_ENCODERS
5024
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2862
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2863 #define PHADDD(a, t)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2864 "movq "#a", "#t" \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2865 "psrlq $32, "#a" \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2866 "paddd "#t", "#a" \n\t"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2867 /*
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2868 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2869 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2870 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2871 */
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2872 #define PMULHRW(x, y, s, o)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2873 "pmulhw " #s ", "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2874 "pmulhw " #s ", "#y " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2875 "paddw " #o ", "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2876 "paddw " #o ", "#y " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2877 "psraw $1, "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2878 "psraw $1, "#y " \n\t"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2879 #define DEF(x) x ## _mmx
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2880 #define SET_RND MOVQ_WONE
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2881 #define SCALE_OFFSET 1
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2882
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2883 #include "dsputil_mmx_qns.h"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2884
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2885 #undef DEF
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2886 #undef SET_RND
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2887 #undef SCALE_OFFSET
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2888 #undef PMULHRW
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2889
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2890 #define DEF(x) x ## _3dnow
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2891 #define SET_RND(x)
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2892 #define SCALE_OFFSET 0
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2893 #define PMULHRW(x, y, s, o)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2894 "pmulhrw " #s ", "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2895 "pmulhrw " #s ", "#y " \n\t"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2896
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2897 #include "dsputil_mmx_qns.h"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2898
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2899 #undef DEF
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2900 #undef SET_RND
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2901 #undef SCALE_OFFSET
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2902 #undef PMULHRW
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2903
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2904 #ifdef HAVE_SSSE3
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2905 #undef PHADDD
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2906 #define DEF(x) x ## _ssse3
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2907 #define SET_RND(x)
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2908 #define SCALE_OFFSET -1
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2909 #define PHADDD(a, t)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2910 "pshufw $0x0E, "#a", "#t" \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2911 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2912 #define PMULHRW(x, y, s, o)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2913 "pmulhrsw " #s ", "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2914 "pmulhrsw " #s ", "#y " \n\t"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2915
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2916 #include "dsputil_mmx_qns.h"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2917
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2918 #undef DEF
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2919 #undef SET_RND
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2920 #undef SCALE_OFFSET
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2921 #undef PMULHRW
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2922 #undef PHADDD
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2923 #endif //HAVE_SSSE3
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2924
3777
20545fbb6f7c add some #ifdef CONFIG_ENCODERS/DECODERS
mru
parents: 3721
diff changeset
2925 #endif /* CONFIG_ENCODERS */
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents: 2753
diff changeset
2926
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2927 #define PREFETCH(name, op) \
4172
608e2dfcb86e adding more static keywords
mru
parents: 4127
diff changeset
2928 static void name(void *mem, int stride, int h){\
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2929 const uint8_t *p= mem;\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2930 do{\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2931 asm volatile(#op" %0" :: "m"(*p));\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2932 p+= stride;\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2933 }while(--h);\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2934 }
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2935 PREFETCH(prefetch_mmx2, prefetcht0)
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2936 PREFETCH(prefetch_3dnow, prefetch)
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2937 #undef PREFETCH
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2938
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents: 2753
diff changeset
2939 #include "h264dsp_mmx.c"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2940
6009
ecfdc0bfb233 typo/clarification
diego
parents: 5963
diff changeset
2941 /* CAVS specific */
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2942 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2943
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel8 mc00 "put" entry point.
 * Forwards dst, src and stride unchanged to put_pixels8_mmx(), with 8 as the
 * final argument.
 */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel8 mc00 "avg" entry point.
 * Forwards dst, src and stride unchanged to avg_pixels8_mmx(), with 8 as the
 * final argument.
 */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel16 mc00 "put" entry point.
 * Forwards dst, src and stride unchanged to put_pixels16_mmx(), with 16 as
 * the final argument.
 */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel16 mc00 "avg" entry point.
 * Forwards dst, src and stride unchanged to avg_pixels16_mmx(), with 16 as
 * the final argument.
 */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2956
6030
fb99890ee609 move FLAC mmx dsp to its own file
aurel
parents: 6009
diff changeset
2957 /* FLAC specific */
fb99890ee609 move FLAC mmx dsp to its own file
aurel
parents: 6009
diff changeset
2958 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
fb99890ee609 move FLAC mmx dsp to its own file
aurel
parents: 6009
diff changeset
2959 double *autoc);
fb99890ee609 move FLAC mmx dsp to its own file
aurel
parents: 6009
diff changeset
2960
5948
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
2961 /* VC1 specific */
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
2962 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
2963
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
/**
 * VC-1 mspel mc00 "put" entry point.
 * Forwards dst, src and stride to put_pixels8_mmx(), with 8 as the final
 * argument. The rnd parameter is part of the signature but is not read here.
 */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
2967
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2968 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2969 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2970 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2971
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2972 /* XXX: those functions should be suppressed ASAP when all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2973 converted */
4020
723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents: 4001
diff changeset
2974 #ifdef CONFIG_GPL
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2975 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2976 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2977 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2978 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2979 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2980 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2981 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2982 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2983 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2984 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2985 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2986 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2987 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2988 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2989 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2990 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2991 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2992 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2993 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2994 }
4020
723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents: 4001
diff changeset
2995 #endif
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2996 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2997 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2998 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2999 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3000 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3001 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3002 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3003 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3004 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3005 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3006 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3007 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3008 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3009 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3010 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3011 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3012 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3013 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3014 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3015 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3016
3541
3fbddeb13686 10l, vorbis_inverse_coupling_sse() was really 3dnow
lorenm
parents: 3536
diff changeset
3017 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3018 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3019 int i;
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3020 asm volatile("pxor %%mm7, %%mm7":);
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3021 for(i=0; i<blocksize; i+=2) {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3022 asm volatile(
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3023 "movq %0, %%mm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3024 "movq %1, %%mm1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3025 "movq %%mm0, %%mm2 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3026 "movq %%mm1, %%mm3 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3027 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3028 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3029 "pslld $31, %%mm2 \n\t" // keep only the sign bit
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3030 "pxor %%mm2, %%mm1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3031 "movq %%mm3, %%mm4 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3032 "pand %%mm1, %%mm3 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3033 "pandn %%mm1, %%mm4 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3034 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3035 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3036 "movq %%mm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3037 "movq %%mm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3038 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3039 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3040 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3041 }
3561
97325fecd35a emms -> femms
lorenm
parents: 3557
diff changeset
3042 asm volatile("femms");
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3043 }
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3044 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3045 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3046 int i;
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3047
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3048 asm volatile(
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3049 "movaps %0, %%xmm5 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3050 ::"m"(ff_pdw_80000000[0])
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3051 );
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3052 for(i=0; i<blocksize; i+=4) {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3053 asm volatile(
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3054 "movaps %0, %%xmm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3055 "movaps %1, %%xmm1 \n\t"
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3056 "xorps %%xmm2, %%xmm2 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3057 "xorps %%xmm3, %%xmm3 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3058 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3059 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3060 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3061 "xorps %%xmm2, %%xmm1 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3062 "movaps %%xmm3, %%xmm4 \n\t"
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3063 "andps %%xmm1, %%xmm3 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3064 "andnps %%xmm1, %%xmm4 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3065 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3066 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3067 "movaps %%xmm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3068 "movaps %%xmm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3069 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3070 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3071 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3072 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3073 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3074
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3075 static void vector_fmul_3dnow(float *dst, const float *src, int len){
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3076 long i = (len-4)*4;
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3077 asm volatile(
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3078 "1: \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3079 "movq (%1,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3080 "movq 8(%1,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3081 "pfmul (%2,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3082 "pfmul 8(%2,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3083 "movq %%mm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3084 "movq %%mm1, 8(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3085 "sub $16, %0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3086 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3087 "femms \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3088 :"+r"(i)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3089 :"r"(dst), "r"(src)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3090 :"memory"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3091 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3092 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3093 static void vector_fmul_sse(float *dst, const float *src, int len){
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3094 long i = (len-8)*4;
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3095 asm volatile(
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3096 "1: \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3097 "movaps (%1,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3098 "movaps 16(%1,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3099 "mulps (%2,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3100 "mulps 16(%2,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3101 "movaps %%xmm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3102 "movaps %%xmm1, 16(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3103 "sub $32, %0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3104 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3105 :"+r"(i)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3106 :"r"(dst), "r"(src)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3107 :"memory"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3108 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3109 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3110
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3111 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3112 long i = len*4-16;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3113 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3114 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3115 "pswapd 8(%1), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3116 "pswapd (%1), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3117 "pfmul (%3,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3118 "pfmul 8(%3,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3119 "movq %%mm0, (%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3120 "movq %%mm1, 8(%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3121 "add $16, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3122 "sub $16, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3123 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3124 :"+r"(i), "+r"(src1)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3125 :"r"(dst), "r"(src0)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3126 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3127 asm volatile("femms");
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3128 }
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3129 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3130 long i = len*4-32;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3131 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3132 "1: \n\t"
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3133 "movaps 16(%1), %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3134 "movaps (%1), %%xmm1 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3135 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3136 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3137 "mulps (%3,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3138 "mulps 16(%3,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3139 "movaps %%xmm0, (%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3140 "movaps %%xmm1, 16(%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3141 "add $32, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3142 "sub $32, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3143 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3144 :"+r"(i), "+r"(src1)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3145 :"r"(dst), "r"(src0)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3146 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3147 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3148
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3149 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3150 const float *src2, int src3, int len, int step){
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3151 long i = (len-4)*4;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3152 if(step == 2 && src3 == 0){
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3153 dst += (len-4)*2;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3154 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3155 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3156 "movq (%2,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3157 "movq 8(%2,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3158 "pfmul (%3,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3159 "pfmul 8(%3,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3160 "pfadd (%4,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3161 "pfadd 8(%4,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3162 "movd %%mm0, (%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3163 "movd %%mm1, 16(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3164 "psrlq $32, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3165 "psrlq $32, %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3166 "movd %%mm0, 8(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3167 "movd %%mm1, 24(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3168 "sub $32, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3169 "sub $16, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3170 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3171 :"+r"(i), "+r"(dst)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3172 :"r"(src0), "r"(src1), "r"(src2)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3173 :"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3174 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3175 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3176 else if(step == 1 && src3 == 0){
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3177 asm volatile(
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3178 "1: \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3179 "movq (%2,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3180 "movq 8(%2,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3181 "pfmul (%3,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3182 "pfmul 8(%3,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3183 "pfadd (%4,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3184 "pfadd 8(%4,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3185 "movq %%mm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3186 "movq %%mm1, 8(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3187 "sub $16, %0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3188 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3189 :"+r"(i)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3190 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3191 :"memory"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3192 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3193 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3194 else
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3195 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3196 asm volatile("femms");
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3197 }
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3198 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3199 const float *src2, int src3, int len, int step){
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3200 long i = (len-8)*4;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3201 if(step == 2 && src3 == 0){
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3202 dst += (len-8)*2;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3203 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3204 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3205 "movaps (%2,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3206 "movaps 16(%2,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3207 "mulps (%3,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3208 "mulps 16(%3,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3209 "addps (%4,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3210 "addps 16(%4,%0), %%xmm1 \n\t"
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3211 "movss %%xmm0, (%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3212 "movss %%xmm1, 32(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3213 "movhlps %%xmm0, %%xmm2 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3214 "movhlps %%xmm1, %%xmm3 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3215 "movss %%xmm2, 16(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3216 "movss %%xmm3, 48(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3217 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3218 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3219 "movss %%xmm0, 8(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3220 "movss %%xmm1, 40(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3221 "movhlps %%xmm0, %%xmm2 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3222 "movhlps %%xmm1, %%xmm3 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3223 "movss %%xmm2, 24(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3224 "movss %%xmm3, 56(%1) \n\t"
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3225 "sub $64, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3226 "sub $32, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3227 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3228 :"+r"(i), "+r"(dst)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3229 :"r"(src0), "r"(src1), "r"(src2)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3230 :"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3231 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3232 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3233 else if(step == 1 && src3 == 0){
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3234 asm volatile(
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3235 "1: \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3236 "movaps (%2,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3237 "movaps 16(%2,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3238 "mulps (%3,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3239 "mulps 16(%3,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3240 "addps (%4,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3241 "addps 16(%4,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3242 "movaps %%xmm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3243 "movaps %%xmm1, 16(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3244 "sub $32, %0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3245 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3246 :"+r"(i)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3247 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3248 :"memory"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3249 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3250 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3251 else
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3252 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3253 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3254
4172
608e2dfcb86e adding more static keywords
mru
parents: 4127
diff changeset
3255 static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3256 // not bit-exact: pf2id uses different rounding than C and SSE
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3257 int i;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3258 for(i=0; i<len; i+=4) {
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3259 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3260 "pf2id %1, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3261 "pf2id %2, %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3262 "packssdw %%mm1, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3263 "movq %%mm0, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3264 :"=m"(dst[i])
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3265 :"m"(src[i]), "m"(src[i+2])
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3266 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3267 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3268 asm volatile("femms");
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3269 }
4172
608e2dfcb86e adding more static keywords
mru
parents: 4127
diff changeset
/**
 * Convert floats to signed 16-bit integers using SSE (cvtps2pi) and MMX.
 *
 * Each iteration converts src[i..i+3] (two cvtps2pi, two floats each), packs
 * the four 32-bit results to 16 bits with signed saturation (packssdw) and
 * stores them to dst[i..i+3] with a single 8-byte movq.
 *
 * @param dst output samples; dst[i..i+3] is written for every i = 0, 4, 8, ... < len
 * @param src input samples;  src[i..i+3] is read for the same i
 * @param len number of samples; if it is not a multiple of 4 the last
 *            iteration still touches a full group of 4 elements
 */
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi %1, %%mm0 \n\t"
            "cvtps2pi %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq %%mm0, %0 \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            /* The operands only describe dst[i], src[i] and src[i+2], but the
             * instructions access 8 bytes at each address (dst[i..i+3],
             * src[i..i+3]); the clobber tells the compiler about the rest. */
            :"memory"
        );
    }
    /* leave MMX state so that x87 floating point code can run afterwards */
    asm volatile("emms");
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3284
6195
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type
bcoudurier
parents: 6181
diff changeset
3285 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type
bcoudurier
parents: 6181
diff changeset
3286 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type
bcoudurier
parents: 6181
diff changeset
3287 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type
bcoudurier
parents: 6181
diff changeset
3288 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
4436
d3e389536b0a Add the const specifier as needed to reduce the number of warnings.
takis
parents: 4197
diff changeset
3289 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
3290 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
4436
d3e389536b0a Add the const specifier as needed to reduce the number of warnings.
takis
parents: 4197
diff changeset
3291 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
3292 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3293
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3294 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3295 {
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3296 mm_flags = mm_support();
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
3297
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
3298 if (avctx->dsp_mask) {
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3299 if (avctx->dsp_mask & FF_MM_FORCE)
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3300 mm_flags |= (avctx->dsp_mask & 0xffff);
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3301 else
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3302 mm_flags &= ~(avctx->dsp_mask & 0xffff);
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
3303 }
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
3304
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
3305 #if 0
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3306 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3307 if (mm_flags & MM_MMX)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3308 av_log(avctx, AV_LOG_INFO, " mmx");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3309 if (mm_flags & MM_MMXEXT)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3310 av_log(avctx, AV_LOG_INFO, " mmxext");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3311 if (mm_flags & MM_3DNOW)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3312 av_log(avctx, AV_LOG_INFO, " 3dnow");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3313 if (mm_flags & MM_SSE)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3314 av_log(avctx, AV_LOG_INFO, " sse");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3315 if (mm_flags & MM_SSE2)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3316 av_log(avctx, AV_LOG_INFO, " sse2");
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3317 av_log(avctx, AV_LOG_INFO, "\n");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3318 #endif
986e461dc072 Initial revision
glantau
parents:
diff changeset
3319
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3320 if (mm_flags & MM_MMX) {
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3321 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3322
1232
e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents: 1186
diff changeset
3323 #ifdef CONFIG_ENCODERS
2024
f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents: 1985
diff changeset
3324 const int dct_algo = avctx->dct_algo;
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3325 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3326 if(mm_flags & MM_SSE2){
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
3327 c->fdct = ff_fdct_sse2;
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3328 }else if(mm_flags & MM_MMXEXT){
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3329 c->fdct = ff_fdct_mmx2;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3330 }else{
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3331 c->fdct = ff_fdct_mmx;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3332 }
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3333 }
1232
e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents: 1186
diff changeset
3334 #endif //CONFIG_ENCODERS
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3335 if(avctx->lowres==0){
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3336 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3337 c->idct_put= ff_simple_idct_put_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3338 c->idct_add= ff_simple_idct_add_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3339 c->idct = ff_simple_idct_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3340 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3717
ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents: 3712
diff changeset
3341 #ifdef CONFIG_GPL
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3342 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3343 if(mm_flags & MM_MMXEXT){
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3344 c->idct_put= ff_libmpeg2mmx2_idct_put;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3345 c->idct_add= ff_libmpeg2mmx2_idct_add;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3346 c->idct = ff_mmxext_idct;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3347 }else{
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3348 c->idct_put= ff_libmpeg2mmx_idct_put;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3349 c->idct_add= ff_libmpeg2mmx_idct_add;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3350 c->idct = ff_mmx_idct;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3351 }
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3352 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3717
ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents: 3712
diff changeset
3353 #endif
5007
f7edc4fe94db Make vp3dsp*.c compilation optional.
takis
parents: 4988
diff changeset
3354 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
f7edc4fe94db Make vp3dsp*.c compilation optional.
takis
parents: 4988
diff changeset
3355 idct_algo==FF_IDCT_VP3 &&
3721
2000e401593d disable vp3 mmx idct for theora files to avoid artifacts
aurel
parents: 3717
diff changeset
3356 avctx->codec->id!=CODEC_ID_THEORA &&
3712
f7f75f718efb Enables back the mmx/sse optimized version of the vp3 idct.
aurel
parents: 3666
diff changeset
3357 !(avctx->flags & CODEC_FLAG_BITEXACT)){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3358 if(mm_flags & MM_SSE2){
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3359 c->idct_put= ff_vp3_idct_put_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3360 c->idct_add= ff_vp3_idct_add_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3361 c->idct = ff_vp3_idct_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3362 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3363 }else{
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3364 ff_vp3_dsp_init_mmx();
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3365 c->idct_put= ff_vp3_idct_put_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3366 c->idct_add= ff_vp3_idct_add_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3367 c->idct = ff_vp3_idct_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3368 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3369 }
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
3370 }else if(idct_algo==FF_IDCT_CAVS){
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
3371 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3372 }else if(idct_algo==FF_IDCT_XVIDMMX){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3373 if(mm_flags & MM_MMXEXT){
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3374 c->idct_put= ff_idct_xvid_mmx2_put;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3375 c->idct_add= ff_idct_xvid_mmx2_add;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3376 c->idct = ff_idct_xvid_mmx2;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3377 }else{
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3378 c->idct_put= ff_idct_xvid_mmx_put;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3379 c->idct_add= ff_idct_xvid_mmx_add;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3380 c->idct = ff_idct_xvid_mmx;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3381 }
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3382 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3383 }
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3384
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3385 #ifdef CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3386 c->get_pixels = get_pixels_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3387 c->diff_pixels = diff_pixels_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3388 #endif //CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3389 c->put_pixels_clamped = put_pixels_clamped_mmx;
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
3390 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3391 c->add_pixels_clamped = add_pixels_clamped_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3392 c->clear_blocks = clear_blocks_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3393 #ifdef CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3394 c->pix_sum = pix_sum16_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3395 #endif //CONFIG_ENCODERS
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
3396
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3397 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3398 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3399 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3400 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3401 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3402
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3403 SET_HPEL_FUNCS(put, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3404 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3405 SET_HPEL_FUNCS(avg, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3406 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3407 SET_HPEL_FUNCS(put, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3408 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3409 SET_HPEL_FUNCS(avg, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3410 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3411
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
3412 c->gmc= gmc_mmx;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
3413
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
3414 c->add_bytes= add_bytes_mmx;
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
3415 c->add_bytes_l2= add_bytes_l2_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3416 #ifdef CONFIG_ENCODERS
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
3417 c->diff_bytes= diff_bytes_mmx;
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
3418 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3419
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
3420 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
3421 c->hadamard8_diff[1]= hadamard8_diff_mmx;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3422
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3423 c->pix_norm1 = pix_norm1_mmx;
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3424 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3425 c->sse[1] = sse8_mmx;
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3426 c->vsad[4]= vsad_intra16_mmx;
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3427
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3428 c->nsse[0] = nsse16_mmx;
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3429 c->nsse[1] = nsse8_mmx;
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3430 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3431 c->vsad[0] = vsad16_mmx;
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3432 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3433
1784
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
3434 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
3435 c->try_8x8basis= try_8x8basis_mmx;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
3436 }
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
3437 c->add_8x8basis= add_8x8basis_mmx;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3438
4749
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
3439 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
3440
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3441 #endif //CONFIG_ENCODERS
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
3442
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
3443 if (ENABLE_ANY_H263) {
5278
ef85411bb7e8 cosmetics: indentation
aurel
parents: 5277
diff changeset
3444 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
ef85411bb7e8 cosmetics: indentation
aurel
parents: 5277
diff changeset
3445 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
3446 }
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
3447 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
3448 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
3449 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3450
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
3451 c->h264_idct_dc_add=
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
3452 c->h264_idct_add= ff_h264_idct_add_mmx;
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
3453 c->h264_idct8_dc_add=
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
3454 c->h264_idct8_add= ff_h264_idct8_add_mmx;
6320
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6196
diff changeset
3455 if (mm_flags & MM_SSE2)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6196
diff changeset
3456 c->h264_idct8_add= ff_h264_idct8_add_sse2;
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
3457
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3458 if (mm_flags & MM_MMXEXT) {
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
3459 c->prefetch = prefetch_mmx2;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
3460
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3461 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3462 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3463
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3464 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3465 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3466 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
3467
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3468 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3469 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3470
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3471 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3472 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3473 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3474
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3475 #ifdef CONFIG_ENCODERS
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
3476 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
3477 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
3478 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3479 c->vsad[4]= vsad_intra16_mmx2;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3480 #endif //CONFIG_ENCODERS
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
3481
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3089
diff changeset
3482 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3089
diff changeset
3483 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
2745
42d3e9068e32 MMX for H.264 iDCT (adapted from x264)
lorenm
parents: 2732
diff changeset
3484
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3485 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3486 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3487 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3488 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3489 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3490 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3491 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1772
8cd5257195c9 vsad16_mmx2 only applies if encoders are turned on
melanson
parents: 1765
diff changeset
3492 #ifdef CONFIG_ENCODERS
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3493 c->vsad[0] = vsad16_mmx2;
1772
8cd5257195c9 vsad16_mmx2 only applies if encoders are turned on
melanson
parents: 1765
diff changeset
3494 #endif //CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3495 }
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
3496
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
/*
 * SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU)
 *
 * Fills all 16 entries of c->PFX_pixels_tab[IDX] with the functions
 * PFX<SIZE>_mc<x><y>_<CPU>, names being built with the ## operator.
 * Entry number x + 4*y receives the mc<x><y> variant for x, y in 0..3
 * (entry 1 is mc10, entry 4 is mc01, entry 15 is mc33).
 *
 * A variable named `c` must be in scope where the macro is expanded.
 * The last assignment carries no trailing ';', so every use of the macro
 * has to be followed by one.
 */
3497 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3498 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3499 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3500 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3501 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3502 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3503 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3504 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3505 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3506 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3507 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3508 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3509 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3510 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3511 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3512 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3513 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3514
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3515 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3516 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3517 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3518 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3519 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3520 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3521
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3522 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3523 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3524 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3525 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3526 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3527 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3528
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3529 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3530 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3531 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3532 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3533
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
3534 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
3535 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3213
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3211
diff changeset
3536 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3211
diff changeset
3537 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3538 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3539 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3540 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3541 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
2707
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
3542 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
3543 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3576
diff changeset
3544 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3545
2902
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3546 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3547 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3548 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3549 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3550 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3551 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3552 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3553 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3554
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3555 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3556 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3557 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3558 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3559 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3560 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3561 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3562 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3563
5949
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
3564 if (ENABLE_CAVS_DECODER)
5950
e419e6d4e7eb cosmetics: indentation
aurel
parents: 5949
diff changeset
3565 ff_cavsdsp_init_mmx2(c, avctx);
5949
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
3566
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
3567 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
5950
e419e6d4e7eb cosmetics: indentation
aurel
parents: 5949
diff changeset
3568 ff_vc1dsp_init_mmx(c, avctx);
5933
6ce8f15fc02b add VC-1 MMX DSP functions, under MIT license.
gpoirier
parents: 5912
diff changeset
3569
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
3570 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
1686
68abbec33289 Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents: 1648
diff changeset
3571 #ifdef CONFIG_ENCODERS
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
3572 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1686
68abbec33289 Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents: 1648
diff changeset
3573 #endif //CONFIG_ENCODERS
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3574 } else if (mm_flags & MM_3DNOW) {
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
3575 c->prefetch = prefetch_3dnow;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
3576
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3577 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3578 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
3579
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3580 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3581 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3582 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3583
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3584 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3585 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3586
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3587 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3588 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3589 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3590
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3591 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3592 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3593 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3594 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3595 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3596 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3597 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3598 }
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
3599
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3600 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3601 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3602 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3603 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3604 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3605 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3606
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3607 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3608 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3609 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3610 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3611 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3612 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3613
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3614 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3615 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3616 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
3617 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3618
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
3619 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
3620 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3621 }
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3622
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3623
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
/*
 * H264_QPEL_FUNCS(x, y, CPU)
 *
 * Sets entry x+y*4 in four tables to the matching mc<x><y>_<CPU> function:
 *   c->put_h264_qpel_pixels_tab[0] <- put_h264_qpel16_mc<x><y>_<CPU>
 *   c->put_h264_qpel_pixels_tab[1] <- put_h264_qpel8_mc<x><y>_<CPU>
 *   c->avg_h264_qpel_pixels_tab[0] <- avg_h264_qpel16_mc<x><y>_<CPU>
 *   c->avg_h264_qpel_pixels_tab[1] <- avg_h264_qpel8_mc<x><y>_<CPU>
 *
 * x and y are both pasted into the function name with ## and used in the
 * index expression x+y*4, so they must be written as plain digits.
 * A variable named `c` must be in scope where the macro is expanded.
 * Unlike a bare expression macro, the expansion already ends in ';', so
 * writing `H264_QPEL_FUNCS(...);` leaves an extra empty statement.
 */
3624 #define H264_QPEL_FUNCS(x, y, CPU)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3625 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3626 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3627 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3628 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3629 if((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)){
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3630 // these functions are slower than mmx on AMD, but faster on Intel
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3631 /* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3632 c->put_pixels_tab[0][0] = put_pixels16_sse2;
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3633 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3634 */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3635 H264_QPEL_FUNCS(0, 0, sse2);
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3636 }
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3637 if(mm_flags & MM_SSE2){
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3638 H264_QPEL_FUNCS(0, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3639 H264_QPEL_FUNCS(0, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3640 H264_QPEL_FUNCS(0, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3641 H264_QPEL_FUNCS(1, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3642 H264_QPEL_FUNCS(1, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3643 H264_QPEL_FUNCS(1, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3644 H264_QPEL_FUNCS(2, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3645 H264_QPEL_FUNCS(2, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3646 H264_QPEL_FUNCS(2, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3647 H264_QPEL_FUNCS(3, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3648 H264_QPEL_FUNCS(3, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3649 H264_QPEL_FUNCS(3, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3650 }
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3651 #ifdef HAVE_SSSE3
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3652 if(mm_flags & MM_SSSE3){
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3653 H264_QPEL_FUNCS(1, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3654 H264_QPEL_FUNCS(1, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3655 H264_QPEL_FUNCS(1, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3656 H264_QPEL_FUNCS(1, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3657 H264_QPEL_FUNCS(2, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3658 H264_QPEL_FUNCS(2, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3659 H264_QPEL_FUNCS(2, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3660 H264_QPEL_FUNCS(2, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3661 H264_QPEL_FUNCS(3, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3662 H264_QPEL_FUNCS(3, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3663 H264_QPEL_FUNCS(3, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
3664 H264_QPEL_FUNCS(3, 3, ssse3);
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
3665 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3666 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3667 #endif
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
3668
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3669 #ifdef CONFIG_ENCODERS
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3670 if(mm_flags & MM_SSE2){
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
3671 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3672 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3673 c->hadamard8_diff[1]= hadamard8_diff_sse2;
6030
fb99890ee609 move FLAC mmx dsp to its own file
aurel
parents: 6009
diff changeset
3674 if (ENABLE_FLAC_ENCODER)
fb99890ee609 move FLAC mmx dsp to its own file
aurel
parents: 6009
diff changeset
3675 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3676 }
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3677
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3678 #ifdef HAVE_SSSE3
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3679 if(mm_flags & MM_SSSE3){
5024
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3680 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3681 c->try_8x8basis= try_8x8basis_ssse3;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3682 }
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3683 c->add_8x8basis= add_8x8basis_ssse3;
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
3684 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3685 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3686 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3687 }
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3688 #endif
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3689 #endif
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3690
4589
30261f4ed12d Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated.
diego
parents: 4436
diff changeset
3691 #ifdef CONFIG_SNOW_DECODER
5591
642588a60570 update mmx code to latest snow changes
michael
parents: 5587
diff changeset
3692 if(mm_flags & MM_SSE2 & 0){
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
3693 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
5602
3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS
ramiro
parents: 5601
diff changeset
3694 #ifdef HAVE_7REGS
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3695 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
5601
b26025b9586d workaround gcc bug, untested as my gcc is not complaining
michael
parents: 5594
diff changeset
3696 #endif
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
3697 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3698 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3699 else{
5594
384629ebcb93 avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum
michael
parents: 5591
diff changeset
3700 if(mm_flags & MM_MMXEXT){
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
3701 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
5602
3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS
ramiro
parents: 5601
diff changeset
3702 #ifdef HAVE_7REGS
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3703 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
5601
b26025b9586d workaround gcc bug, untested as my gcc is not complaining
michael
parents: 5594
diff changeset
3704 #endif
5594
384629ebcb93 avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum
michael
parents: 5591
diff changeset
3705 }
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
3706 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3707 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3708 #endif
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3709
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3710 if(mm_flags & MM_3DNOW){
5024
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3711 #ifdef CONFIG_ENCODERS
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3712 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3713 c->try_8x8basis= try_8x8basis_3dnow;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3714 }
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3715 c->add_8x8basis= add_8x8basis_3dnow;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3716 #endif //CONFIG_ENCODERS
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3717 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3718 c->vector_fmul = vector_fmul_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3719 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3720 c->float_to_int16 = float_to_int16_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3721 }
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3722 if(mm_flags & MM_3DNOWEXT)
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3723 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3724 if(mm_flags & MM_SSE){
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3725 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3726 c->vector_fmul = vector_fmul_sse;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3727 c->float_to_int16 = float_to_int16_sse;
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3728 c->vector_fmul_reverse = vector_fmul_reverse_sse;
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3729 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3730 }
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3731 if(mm_flags & MM_3DNOW)
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3732 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3733 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3734
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3735 #ifdef CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3736 dsputil_init_pix_mmx(c, avctx);
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3737 #endif //CONFIG_ENCODERS
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3738 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3739 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3740 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3741 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3742 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3743
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3744 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3745 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3746 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3747 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3748
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3749 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3750 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3751 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3752 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3753
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3754 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3755 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3756 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3757 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3758
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3759 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3760 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3761 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3762 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3763
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3764 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3765 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3766 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3767 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3768
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3769 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3770 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3771 #endif
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3772 }