annotate i386/dsputil_mmx.c @ 6403:9a736918fd90 libavcodec

split encoding part of dsputil_mmx into its own file
author aurel
date Mon, 25 Feb 2008 23:14:22 +0000
parents 3dc36ec2dcad
children 5154ab444372
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1 /*
986e461dc072 Initial revision
glantau
parents:
diff changeset
2 * MMX optimized DSP utils
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739
07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents: 1729
diff changeset
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
5 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
8 * FFmpeg is free software; you can redistribute it and/or
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
9 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
10 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
11 * version 2.1 of the License, or (at your option) any later version.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
12 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
13 * FFmpeg is distributed in the hope that it will be useful,
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
16 * Lesser General Public License for more details.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
17 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
18 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
19 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2979
diff changeset
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
21 *
986e461dc072 Initial revision
glantau
parents:
diff changeset
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision
glantau
parents:
diff changeset
23 */
986e461dc072 Initial revision
glantau
parents:
diff changeset
24
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
25 #include "dsputil.h"
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
26 #include "dsputil_mmx.h"
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
27 #include "simple_idct.h"
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
28 #include "mpegvideo.h"
3398
e0927bc44a10 Move REG_* macros from libavcodec/i386/mmx.h to libavutil/x86_cpu.h
lucabe
parents: 3250
diff changeset
29 #include "x86_cpu.h"
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
30 #include "mmx.h"
5014
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
31 #include "vp3dsp_mmx.h"
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
32 #include "vp3dsp_sse2.h"
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
33 #include "h263.h"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
34
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
35 //#undef NDEBUG
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
36 //#include <assert.h>
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
37
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
38 extern void ff_idct_xvid_mmx(short *block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
39 extern void ff_idct_xvid_mmx2(short *block);
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
40
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
41 int mm_flags; /* multimedia extension flags */
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
42
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
43 /* pixel operations */
5947
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
44 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
45 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
46
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
47 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
48 {0x8000000080000000ULL, 0x8000000080000000ULL};
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
49
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
50 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
51 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
52 DECLARE_ALIGNED_16(const xmm_t, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
53 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
55 DECLARE_ALIGNED_16(const xmm_t, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
56 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
6329
5969caa9190d clean up an ugliness introduced in r11826. this syntax will require fewer changes when adding future sse2 code.
lorenm
parents: 6327
diff changeset
57 DECLARE_ALIGNED_16(const xmm_t, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
6333
beb52d4a5efe constant was excessively aligned
lorenm
parents: 6331
diff changeset
61 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
62 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
63
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
69 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
70
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
71 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
72 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
5737
efa3c1f9259a sse2 version of compute_autocorr().
lorenm
parents: 5602
diff changeset
73
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
74 #define JUMPALIGN() asm volatile (ASMALIGN(3)::)
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
75 #define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
76
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
77 #define MOVQ_BFE(regd) \
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
78 asm volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
79 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
80 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
81
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
82 #ifndef PIC
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
83 #define MOVQ_BONE(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
84 #define MOVQ_WTWO(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
85 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
86 // for shared library it's better to use this way for accessing constants
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
87 // pcmpeqd -> -1
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
88 #define MOVQ_BONE(regd) \
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
89 asm volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
90 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
91 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
92 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
93
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
94 #define MOVQ_WTWO(regd) \
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
95 asm volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
96 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
97 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
98 "psllw $1, %%" #regd " \n\t"::)
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
99
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
100 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
101
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
102 // using regr as temporary and for the output result
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
103 // first argument is unmodified and second is trashed
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
104 // regfe is supposed to contain 0xfefefefefefefefe
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
105 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
106 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
107 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
108 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
109 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
110 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
111 "paddb " #regb ", " #regr " \n\t"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
112
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
113 #define PAVGB_MMX(rega, regb, regr, regfe) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
114 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
115 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
116 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
117 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
118 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
119 "psubb " #regb ", " #regr " \n\t"
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
120
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
121 // mm6 is supposed to contain 0xfefefefefefefefe
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
122 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
123 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
124 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
125 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
126 "pand " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
127 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
128 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
129 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
130 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
131 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
132 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
133 "paddb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
134 "paddb " #regd ", " #regp " \n\t"
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
135
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
136 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
137 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
138 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
139 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
140 "por " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
141 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
142 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
143 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
144 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
145 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
146 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
147 "psubb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
148 "psubb " #regd ", " #regp " \n\t"
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
149
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
150 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
151 /* MMX no rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
152 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
153 #define SET_RND MOVQ_WONE
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
154 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
155 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
156
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
157 #include "dsputil_mmx_rnd.h"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
158
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
159 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
160 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
161 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
162 #undef PAVGB
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
163 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
164 /* MMX rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
165
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
166 #define DEF(x, y) x ## _ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
167 #define SET_RND MOVQ_WTWO
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
168 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
169 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
170
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
171 #include "dsputil_mmx_rnd.h"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
172
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
173 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
174 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
175 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
176 #undef PAVGB
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
177
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
178 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
179 /* 3Dnow specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
180
986e461dc072 Initial revision
glantau
parents:
diff changeset
181 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision
glantau
parents:
diff changeset
182 #define PAVGB "pavgusb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
183
986e461dc072 Initial revision
glantau
parents:
diff changeset
184 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
185
986e461dc072 Initial revision
glantau
parents:
diff changeset
186 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
187 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
188
986e461dc072 Initial revision
glantau
parents:
diff changeset
189 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
190 /* MMX2 specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
191
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
192 #define DEF(x) x ## _mmx2
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
193
986e461dc072 Initial revision
glantau
parents:
diff changeset
194 /* Introduced only in MMX2 set */
986e461dc072 Initial revision
glantau
parents:
diff changeset
195 #define PAVGB "pavgb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
196
986e461dc072 Initial revision
glantau
parents:
diff changeset
197 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
198
986e461dc072 Initial revision
glantau
parents:
diff changeset
199 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
200 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
201
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
202 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
203 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
204 #define put_pixels16_mmx2 put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
205 #define put_pixels8_mmx2 put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
206 #define put_pixels4_mmx2 put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
207 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
208 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
209 #define put_pixels16_3dnow put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
210 #define put_pixels8_3dnow put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
211 #define put_pixels4_3dnow put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
212 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
213 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
214
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
215 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
216 /* standard MMX */
986e461dc072 Initial revision
glantau
parents:
diff changeset
217
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
218 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
219 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
220 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
221 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
222
986e461dc072 Initial revision
glantau
parents:
diff changeset
223 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
224 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
225 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
226 /* unrolled loop */
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
227 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
228 "movq %3, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
229 "movq 8%3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
230 "movq 16%3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
231 "movq 24%3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
232 "movq 32%3, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
233 "movq 40%3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
234 "movq 48%3, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
235 "movq 56%3, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
236 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
237 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
238 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
239 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
240 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
241 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
242 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
243 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
244 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
245 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
246 pix += line_size*4;
986e461dc072 Initial revision
glantau
parents:
diff changeset
247 p += 32;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
248
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
249 // if here would be an exact copy of the code above
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
250 // compiler would generate some very strange code
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
251 // thus using "r"
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
252 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
253 "movq (%3), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
254 "movq 8(%3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
255 "movq 16(%3), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
256 "movq 24(%3), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
257 "movq 32(%3), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
258 "movq 40(%3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
259 "movq 48(%3), %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
260 "movq 56(%3), %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
261 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
262 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
263 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
264 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
265 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
266 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
267 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
268 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
269 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
270 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
271 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
272
3089
072dbc669253 MSVC-compatible __align8/__align16 declaration
diego
parents: 3036
diff changeset
273 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
1985
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
274 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
275
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
276 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
277 {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
278 int i;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
279
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
280 movq_m2r(*vector128, mm1);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
281 for (i = 0; i < 8; i++) {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
282 movq_m2r(*(block), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
283 packsswb_m2r(*(block + 4), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
284 block += 8;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
285 paddb_r2r(mm1, mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
286 movq_r2m(mm0, *pixels);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
287 pixels += line_size;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
288 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
289 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
290
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
291 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
292 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
293 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
294 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
295 int i;
986e461dc072 Initial revision
glantau
parents:
diff changeset
296
986e461dc072 Initial revision
glantau
parents:
diff changeset
297 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
298 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
299 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
300 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
301 i = 4;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
302 do {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
303 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
304 "movq (%2), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
305 "movq 8(%2), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
306 "movq 16(%2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
307 "movq 24(%2), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
308 "movq %0, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
309 "movq %1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
310 "movq %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
311 "punpcklbw %%mm7, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
312 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
313 "paddsw %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
314 "paddsw %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
315 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
316 "punpcklbw %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
317 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
318 "paddsw %%mm6, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
319 "paddsw %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
320 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
321 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
322 "movq %%mm0, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
323 "movq %%mm2, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
324 :"+m"(*pix), "+m"(*(pix+line_size))
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
325 :"r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
326 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
327 pix += line_size*2;
986e461dc072 Initial revision
glantau
parents:
diff changeset
328 p += 16;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
329 } while (--i);
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
330 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
331
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
332 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
333 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
334 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
335 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
336 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
337 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
338 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
339 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
340 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
341 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
342 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
343 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
344 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
345 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
346 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
347 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
348 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
349 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
350 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
351 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
352 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
353 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
354 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
355 );
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
356 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
357
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
358 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
359 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
360 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
361 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
362 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
363 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
364 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
365 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
366 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
367 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
368 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
369 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
370 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
371 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
372 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
373 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
374 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
375 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
376 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
377 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
378 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
379 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
380 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
381 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
382 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
383
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
384 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
385 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
386 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
387 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
388 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
389 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
390 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
391 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
392 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
393 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
394 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
395 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
396 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
397 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
398 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
399 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
400 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
401 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
402 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
403 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
404 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
405 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
406 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
407 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
408 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
409 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
410 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
411 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
412 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
413 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
414 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
415 );
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
416 }
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
417
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
/**
 * Copy a 16-byte-wide column of h rows from pixels to block using SSE2.
 *
 * Source rows are read with unaligned loads (movdqu); destination rows are
 * written with aligned stores (movdqa), so block and line_size must keep
 * every destination row 16-byte aligned. Four rows are copied per pass and
 * the loop exits only when the row counter reaches exactly zero, so h is
 * expected to be a positive multiple of 4.
 *
 * @param block     destination of the first row (16-byte aligned)
 * @param pixels    source of the first row (any alignment)
 * @param line_size distance in bytes between rows, shared by source and destination
 * @param h         number of rows to copy
 */
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const long stride  = line_size;
    const long stride3 = 3L*line_size;  /* offset of the 4th row of each group */

    asm volatile(
        "1:                                         \n\t"
        "movdqu (%[src]), %%xmm0                    \n\t"
        "movdqu (%[src],%[stride]), %%xmm1          \n\t"
        "movdqu (%[src],%[stride],2), %%xmm2        \n\t"
        "movdqu (%[src],%[stride3]), %%xmm3         \n\t"
        "movdqa %%xmm0, (%[dst])                    \n\t"
        "movdqa %%xmm1, (%[dst],%[stride])          \n\t"
        "movdqa %%xmm2, (%[dst],%[stride],2)        \n\t"
        "movdqa %%xmm3, (%[dst],%[stride3])         \n\t"
        /* lea leaves the flags from subl intact for the jnz below */
        "subl $4, %[rows]                           \n\t"
        "lea (%[src],%[stride],4), %[src]           \n\t"
        "lea (%[dst],%[stride],4), %[dst]           \n\t"
        "jnz 1b                                     \n\t"
        : [rows] "+g"(h), [src] "+r"(pixels), [dst] "+r"(block)
        : [stride] "r"(stride), [stride3] "r"(stride3)
        : "memory"
    );
}
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
439
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
/**
 * Average a 16-byte-wide column of h rows from pixels into block using SSE2.
 *
 * Each destination byte becomes pavgb(source, destination), i.e. the
 * rounded-up mean (a + b + 1) >> 1. Source rows are read with unaligned
 * loads; destination rows are used as aligned memory operands and written
 * with movdqa, so block and line_size must keep every destination row
 * 16-byte aligned. Four rows are processed per pass and the loop exits only
 * when the row counter reaches exactly zero, so h is expected to be a
 * positive multiple of 4.
 *
 * @param block     destination of the first row (16-byte aligned), read and rewritten
 * @param pixels    source of the first row (any alignment)
 * @param line_size distance in bytes between rows, shared by source and destination
 * @param h         number of rows to process
 */
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    const long stride  = line_size;
    const long stride3 = 3L*line_size;  /* offset of the 4th row of each group */

    asm volatile(
        "1:                                         \n\t"
        "movdqu (%[src]), %%xmm0                    \n\t"
        "movdqu (%[src],%[stride]), %%xmm1          \n\t"
        "movdqu (%[src],%[stride],2), %%xmm2        \n\t"
        "movdqu (%[src],%[stride3]), %%xmm3         \n\t"
        "pavgb  (%[dst]), %%xmm0                    \n\t"
        "pavgb  (%[dst],%[stride]), %%xmm1          \n\t"
        "pavgb  (%[dst],%[stride],2), %%xmm2        \n\t"
        "pavgb  (%[dst],%[stride3]), %%xmm3         \n\t"
        "movdqa %%xmm0, (%[dst])                    \n\t"
        "movdqa %%xmm1, (%[dst],%[stride])          \n\t"
        "movdqa %%xmm2, (%[dst],%[stride],2)        \n\t"
        "movdqa %%xmm3, (%[dst],%[stride3])         \n\t"
        /* lea leaves the flags from subl intact for the jnz below */
        "subl $4, %[rows]                           \n\t"
        "lea (%[src],%[stride],4), %[src]           \n\t"
        "lea (%[dst],%[stride],4), %[dst]           \n\t"
        "jnz 1b                                     \n\t"
        : [rows] "+g"(h), [src] "+r"(pixels), [dst] "+r"(block)
        : [stride] "r"(stride), [stride3] "r"(stride3)
        : "memory"
    );
}
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
465
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
466 static void clear_blocks_mmx(DCTELEM *blocks)
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
467 {
6391
3dc36ec2dcad __asm __volatile -> asm volatile, improves code consistency and works
reimar
parents: 6384
diff changeset
468 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
469 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
470 "mov $-128*6, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
471 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
472 "movq %%mm7, (%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
473 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
474 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
475 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
476 "add $32, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
477 " js 1b \n\t"
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
478 : : "r" (((uint8_t *)blocks)+128*6)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
479 : "%"REG_a
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
480 );
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
481 }
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
482
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
/**
 * Add src to dst byte by byte, with wrap-around: dst[i] += src[i] for
 * every i in 0..w-1.
 *
 * The MMX loop handles 16 bytes per pass while at least 16 bytes remain;
 * the scalar loop finishes the tail. The loop condition is evaluated
 * BEFORE the first pass and with a signed comparison, so rows shorter than
 * 16 bytes (including w <= 0) are left entirely to the scalar loop instead
 * of being over-run by the vector code.
 *
 * @param dst destination, read and rewritten
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "jmp 2f                         \n\t" /* test the bound first */
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "2:                             \n\t"
        "cmp %3, %0                     \n\t"
        /* signed: w-15 is negative for short rows and must not be
         * treated as a huge unsigned bound */
        " jl 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
        /* dst is written through a pointer input */
        : "memory"
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
504
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/**
 * Byte-wise sum of two rows: dst[i] = src1[i] + src2[i] for 0 <= i < w,
 * wrapping modulo 256.
 *
 * The MMX loop handles 16 bytes per iteration; the scalar loop afterwards
 * finishes the remaining (w - i) bytes.
 *
 * The loop is entered at its condition ("jmp 2f") and uses a signed
 * comparison, so rows shorter than 16 bytes (including w <= 0) never touch
 * memory outside [0, w).  A bottom-tested loop with an unsigned "jb" would
 * process 16 bytes unconditionally and, for w < 15, compare against a
 * negative bound reinterpreted as a huge unsigned value.
 *
 * MMX registers mm0/mm1 are overwritten; no emms is issued here.
 *
 * @param dst  destination row, at least w bytes
 * @param src1 first source row, at least w bytes
 * @param src2 second source row, at least w bytes
 * @param w    number of bytes to process
 */
static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    __asm__ volatile(
        "jmp 2f \n\t"                   /* test the bound before the first pass */
        "1: \n\t"
        "movq (%2, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb (%3, %0), %%mm0 \n\t"
        "paddb 8(%3, %0), %%mm1 \n\t"
        "movq %%mm0, (%1, %0) \n\t"
        "movq %%mm1, 8(%1, %0) \n\t"
        "add $16, %0 \n\t"
        "2: \n\t"
        "cmp %4, %0 \n\t"
        " jl 1b \n\t"                   /* signed: continue while i < w-15 */
        : "+r" (i)
        : "r"(dst), "r"(src1), "r"(src2), "r"((long)w-15)
        : "memory"                      /* dst is written through a register pointer */
    );
    /* scalar tail: the last w%16 bytes, or the whole row when w < 16 */
    for(; i<w; i++)
        dst[i] = src1[i] + src2[i];
}
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
524
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/*
 * Shared MMX core of the H.263 loop filter.  It is pasted at the start of an
 * asm statement whose operands are:
 *   %0 %1 %2 %3  four 8-byte lines crossing the edge, in order (L0..L3)
 *   %4           2*strength (only the low byte is used)
 *   %5           ff_pb_FC (presumably 0xFC in every byte -- defined elsewhere)
 * Nothing is stored here.  On exit the filtered lines are held in
 *   mm5 -> L0,  mm3 -> L1,  mm4 -> L2,  mm6 -> L3
 * and all of mm0..mm7 have been overwritten.
 */
#define H263_LOOP_FILTER \
    /* mm0:mm1 = L0 - L3 as 16-bit words (low half : high half) */\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq %0, %%mm0 \n\t"\
    "movq %0, %%mm1 \n\t"\
    "movq %3, %%mm2 \n\t"\
    "movq %3, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "psubw %%mm2, %%mm0 \n\t"\
    "psubw %%mm3, %%mm1 \n\t"\
    /* mm4:mm5 = 4*(L2 - L1) + (L0 - L3) */\
    "movq %1, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "movq %2, %%mm5 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm7, %%mm5 \n\t"\
    "psubw %%mm2, %%mm4 \n\t"\
    "psubw %%mm3, %%mm5 \n\t"\
    "psllw $2, %%mm4 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    /* split into sign mask (mm6:mm7) and magnitude, magnitude >>= 3, */\
    /* then pack: mm4 = magnitudes as bytes, mm6 = sign masks as bytes */\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm4, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "pxor %%mm7, %%mm5 \n\t"\
    "psubw %%mm6, %%mm4 \n\t"\
    "psubw %%mm7, %%mm5 \n\t"\
    "psrlw $3, %%mm4 \n\t"\
    "psrlw $3, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm4 \n\t"\
    "packsswb %%mm7, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    /* broadcast the low byte of %4 to all 8 bytes of mm2, then */\
    /* mm2 = min(max(%4 - mag, 0), mag)  (ramps up, then back down to 0) */\
    "movd %4, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "psubusb %%mm4, %%mm2 \n\t"\
    "movq %%mm2, %%mm3 \n\t"\
    "psubusb %%mm4, %%mm3 \n\t"\
    "psubb %%mm3, %%mm2 \n\t"\
    /* mm3 = L1 + delta, mm4 = L2 - delta with saturation; XORing with the */\
    /* sign mask before and after flips the direction where delta is negative */\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm3 \n\t"\
    "psubusb %%mm2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    /* mm1 = sign(L0-L3) * (min(|L0 - L3|, 2*|delta|) >> 2), per byte; */\
    /* the pand with %5 presumably keeps the 16-bit shift from leaking bits */\
    /* between neighbouring bytes */\
    "paddusb %%mm2, %%mm2 \n\t"\
    "packsswb %%mm1, %%mm0 \n\t"\
    "pcmpgtb %%mm0, %%mm7 \n\t"\
    "pxor %%mm7, %%mm0 \n\t"\
    "psubb %%mm7, %%mm0 \n\t"\
    "movq %%mm0, %%mm1 \n\t"\
    "psubusb %%mm2, %%mm0 \n\t"\
    "psubb %%mm0, %%mm1 \n\t"\
    "pand %5, %%mm1 \n\t"\
    "psrlw $2, %%mm1 \n\t"\
    "pxor %%mm7, %%mm1 \n\t"\
    "psubb %%mm7, %%mm1 \n\t"\
    /* mm5 = L0 - mm1, mm6 = L3 + mm1 */\
    "movq %0, %%mm5 \n\t"\
    "movq %3, %%mm6 \n\t"\
    "psubb %%mm1, %%mm5 \n\t"\
    "paddb %%mm1, %%mm6 \n\t"
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
595
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
596 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
597 if(ENABLE_ANY_H263) {
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
598 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
599
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
600 asm volatile(
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
601
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
602 H263_LOOP_FILTER
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
603
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
604 "movq %%mm3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
605 "movq %%mm4, %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
606 "movq %%mm5, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
607 "movq %%mm6, %3 \n\t"
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
608 : "+m" (*(uint64_t*)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
609 "+m" (*(uint64_t*)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
610 "+m" (*(uint64_t*)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
611 "+m" (*(uint64_t*)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
612 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
613 );
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
614 }
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
615 }
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
616
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/**
 * Transpose a 4x4 block of bytes:
 * dst[x*dst_stride + y] = src[y*src_stride + x] for 0 <= x, y < 4.
 *
 * Overwrites mm0..mm3; no emms is issued here.
 *
 * @param dst        destination block (4 rows of 4 bytes)
 * @param src        source block (4 rows of 4 bytes)
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    uint8_t *const in0  = src + 0*src_stride;
    uint8_t *const in1  = src + 1*src_stride;
    uint8_t *const in2  = src + 2*src_stride;
    uint8_t *const in3  = src + 3*src_stride;
    uint8_t *const out0 = dst + 0*dst_stride;
    uint8_t *const out1 = dst + 1*dst_stride;
    uint8_t *const out2 = dst + 2*dst_stride;
    uint8_t *const out3 = dst + 3*dst_stride;

    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        /* load the four source rows */
        "movd %4, %%mm0 \n\t"
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        /* interleave bytes of rows 0/1 and 2/3, then words of the two pairs */
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        /* mm0 = output rows 0,1   mm1 = output rows 2,3 */
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"

        : "=m" (*(uint32_t*)out0),
          "=m" (*(uint32_t*)out1),
          "=m" (*(uint32_t*)out2),
          "=m" (*(uint32_t*)out3)
        : "m" (*(uint32_t*)in0),
          "m" (*(uint32_t*)in1),
          "m" (*(uint32_t*)in2),
          "m" (*(uint32_t*)in3)
    );
}
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
645
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
646 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
647 if(ENABLE_ANY_H263) {
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
648 const int strength= ff_h263_loop_filter_strength[qscale];
6181
f3da7b2592aa Use DECLARE_ALIGNED
reimar
parents: 6135
diff changeset
649 DECLARE_ALIGNED(8, uint64_t, temp[4]);
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
650 uint8_t *btemp= (uint8_t*)temp;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
651
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
652 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
653
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
654 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
655 transpose4x4(btemp+4, src + 4*stride, 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
656 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
657 H263_LOOP_FILTER // 5 3 4 6
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
658
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
659 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
660 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
661 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
662 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
663 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
664 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
665
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
666 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
667 "movq %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
668 "movq %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
669 "punpcklbw %%mm3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
670 "punpcklbw %%mm6, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
671 "punpckhbw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
672 "punpckhbw %%mm6, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
673 "movq %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
674 "movq %%mm1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
675 "punpcklwd %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
676 "punpcklwd %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
677 "punpckhwd %%mm4, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
678 "punpckhwd %%mm0, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
679 "movd %%mm5, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
680 "punpckhdq %%mm5, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
681 "movd %%mm5, (%0,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
682 "movd %%mm3, (%0,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
683 "punpckhdq %%mm3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
684 "movd %%mm3, (%0,%3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
685 "movd %%mm1, (%1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
686 "punpckhdq %%mm1, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
687 "movd %%mm1, (%1,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
688 "movd %%mm6, (%1,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
689 "punpckhdq %%mm6, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
690 "movd %%mm6, (%1,%3) \n\t"
2505
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
691 :: "r" (src),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
692 "r" (src + 4*stride),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
693 "r" ((long) stride ),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
694 "r" ((long)(3*stride))
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
695 );
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
696 }
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
697 }
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
698
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * Generates add_png_paeth_prediction_<cpu>(): for each pixel, adds the Paeth
 * predictor to src and stores the result in dst, four bytes per iteration,
 * advancing by bpp bytes.
 *
 * With a = dst[i-bpp] (left), b = top[i] (above), c = top[i-bpp] (above-left)
 * the loop computes pa = |b-c|, pb = |a-c|, pc = |a+b-2c| and selects
 *   a  if pa <= min(pb, pc),  else b  if pb <= pc,  else c.
 *
 * abs3 must be asm text that replaces mm3, mm4 and mm5 by their absolute
 * values (16-bit words) and leaves mm7 zero on exit.
 *
 * The loop is bottom-tested, so the first four bytes are always processed.
 * The first iteration also reads dst and top at offset -bpp.
 *
 * asm operands: %0 = i, %1 = dst, %2 = top, %3 = src, %4 = bpp,
 *               %5 = w-3 (last offset still processed), %6 = ff_pw_255
 */
#define PAETH(cpu, abs3)\
void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
{\
    long i = -bpp;\
    long last = w-3;\
    asm volatile(\
        /* prime: mm0 = a, mm1 = c for the first pixel, widened to words */\
        "pxor %%mm7, %%mm7 \n"\
        "movd (%1,%0), %%mm0 \n"\
        "movd (%2,%0), %%mm1 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "add %4, %0 \n"\
        "1: \n"\
        /* mm2 = c (previous b), mm1 = b */\
        "movq %%mm1, %%mm2 \n"\
        "movd (%2,%0), %%mm1 \n"\
        "movq %%mm2, %%mm3 \n"\
        "punpcklbw %%mm7, %%mm1 \n"\
        "movq %%mm2, %%mm4 \n"\
        /* mm3 = c-b, mm4 = c-a, mm5 = 2c-a-b; abs3 makes them pa, pb, pc */\
        "psubw %%mm1, %%mm3 \n"\
        "psubw %%mm0, %%mm4 \n"\
        "movq %%mm3, %%mm5 \n"\
        "paddw %%mm4, %%mm5 \n"\
        abs3\
        /* build the selection masks: mm3 = "not a", mm4 = "c", mm6 = "b" */\
        "movq %%mm4, %%mm6 \n"\
        "pminsw %%mm5, %%mm6 \n"\
        "pcmpgtw %%mm6, %%mm3 \n"\
        "pcmpgtw %%mm5, %%mm4 \n"\
        "movq %%mm4, %%mm6 \n"\
        "pand %%mm3, %%mm4 \n"\
        "pandn %%mm3, %%mm6 \n"\
        "pandn %%mm0, %%mm3 \n"\
        /* mm0 = src pixel + selected predictor, masked with %6 */\
        "movd (%3,%0), %%mm0 \n"\
        "pand %%mm1, %%mm6 \n"\
        "pand %%mm4, %%mm2 \n"\
        "punpcklbw %%mm7, %%mm0 \n"\
        "movq %6, %%mm5 \n"\
        "paddw %%mm6, %%mm0 \n"\
        "paddw %%mm2, %%mm3 \n"\
        "paddw %%mm3, %%mm0 \n"\
        "pand %%mm5, %%mm0 \n"\
        /* store 4 bytes; mm0 stays in word form as the next iteration's a */\
        "movq %%mm0, %%mm3 \n"\
        "packuswb %%mm3, %%mm3 \n"\
        "movd %%mm3, (%1,%0) \n"\
        "add %4, %0 \n"\
        "cmp %5, %0 \n"\
        "jle 1b \n"\
        :"+r"(i)\
        :"r"(dst), "r"(top), "r"(src), "r"((long)bpp), "g"(last),\
         "m"(ff_pw_255)\
        :"memory"\
    );\
}
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
751
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * abs3 argument of PAETH for MMX2: replaces the 16-bit words of mm5, mm3 and
 * mm4 by their absolute values, computed as max(x, 0 - x).
 * Expects mm7 == 0 on entry; uses mm6 and mm7 as scratch and leaves mm7
 * zeroed again on exit.
 */
#define ABS3_MMX2\
    /* mm5 = max(mm5, -mm5); mm7 is known to be zero here */\
    "psubw %%mm5, %%mm7 \n"\
    "pmaxsw %%mm7, %%mm5 \n"\
    /* mm3 = max(mm3, -mm3), mm4 = max(mm4, -mm4) */\
    "pxor %%mm6, %%mm6 \n"\
    "pxor %%mm7, %%mm7 \n"\
    "psubw %%mm3, %%mm6 \n"\
    "psubw %%mm4, %%mm7 \n"\
    "pmaxsw %%mm6, %%mm3 \n"\
    "pmaxsw %%mm7, %%mm4 \n"\
    /* restore the zero register */\
    "pxor %%mm7, %%mm7 \n"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
762
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * abs3 argument of PAETH for SSSE3: one pabsw per register replaces the
 * 16-bit words of mm3, mm4 and mm5 by their absolute values.  No scratch
 * registers are used and mm7 is left untouched.
 */
#define ABS3_SSSE3\
    "pabsw %%mm3, %%mm3 \n"\
    "pabsw %%mm4, %%mm4 \n"\
    "pabsw %%mm5, %%mm5 \n"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
767
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
768 PAETH(mmx2, ABS3_MMX2)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
769 #ifdef HAVE_SSSE3
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
770 PAETH(ssse3, ABS3_SSSE3)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
771 #endif
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
772
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/*
 * One output step of the vertical quarter-pel lowpass filter.
 *
 * Computes (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 on 16-bit words, packs the
 * result to unsigned bytes in mm5 and hands it to OP(%%mm5, out, %%mm7, d).
 *
 *   x1 = m3 + m4    (m3 is then reloaded with in7)
 *   x2 = in2 + m5
 *   x3 = in1 + m6
 *   x4 = in0 + in7
 *
 * mm4, mm5 and mm6 are used as scratch.  The constants are read from
 * ff_pw_20 and ff_pw_3 directly; the pw_20 and pw_3 macro parameters are not
 * referenced in the body.
 */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" /* m3 = x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* mm4 = 20 */\
    "pmullw " #m3 ", %%mm4 \n\t" /* mm4 = 20*x1 */\
    "movq "#in7", " #m3 " \n\t" /* m3 = in7 */\
    "movq "#in0", %%mm5 \n\t" /* mm5 = in0 */\
    "paddw " #m3 ", %%mm5 \n\t" /* mm5 = x4 */\
    "psubw %%mm5, %%mm4 \n\t" /* mm4 = 20*x1 - x4 */\
    "movq "#in1", %%mm5 \n\t" /* mm5 = in1 */\
    "movq "#in2", %%mm6 \n\t" /* mm6 = in2 */\
    "paddw " #m6 ", %%mm5 \n\t" /* mm5 = x3 */\
    "paddw " #m5 ", %%mm6 \n\t" /* mm6 = x2 */\
    "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*x2 */\
    "psubw %%mm6, %%mm5 \n\t" /* mm5 = x3 - 2*x2 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* mm5 = 3*x3 - 6*x2 */\
    "paddw " #rnd ", %%mm4 \n\t" /* add the rounding term */\
    "paddw %%mm4, %%mm5 \n\t" /* mm5 = 20*x1 - 6*x2 + 3*x3 - x4 + rnd */\
    "psraw $5, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
793
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
794 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
795 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
796 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
797 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
798 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
799 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
800 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
801 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
802 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
803 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
804 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
805 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
806 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
807 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
808 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
809 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
810 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
811 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
812 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
813 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
814 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
815 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
816 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
817 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
818 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
819 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
820 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
821 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
822 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
823 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
824 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
825 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
826 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
827 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
828 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
829 "movq %%mm0, %5 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
830 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
831 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
832 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
833 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
834 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
835 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
836 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
837 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
838 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
839 "paddw %%mm0, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
840 "paddw %%mm5, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
841 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
842 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
843 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
844 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
845 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
846 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
847 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
848 "paddw %%mm2, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
849 "paddw %%mm6, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
850 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
851 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
852 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
853 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
854 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
855 "movq %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
856 "packuswb %%mm3, %%mm1 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
857 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
858 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
859 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
860 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
861 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
862 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
863 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
864 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
865 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
866 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
867 "paddw %%mm1, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
868 "paddw %%mm4, %%mm0 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
869 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
870 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
871 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
872 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
873 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
874 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
875 "paddw %%mm3, %%mm2 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
876 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
877 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
878 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
879 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
880 "paddw %%mm2, %%mm6 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
881 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
882 "paddw %6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
883 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
884 "psraw $5, %%mm0 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
885 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
886 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
887 "paddw %%mm5, %%mm3 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
888 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
889 "paddw %%mm4, %%mm6 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
890 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
891 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
892 "paddw %%mm1, %%mm4 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
893 "paddw %%mm2, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
894 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
895 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
896 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
897 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
898 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
899 "paddw %6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
900 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
901 "psraw $5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
902 "packuswb %%mm4, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
903 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
904 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
905 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
906 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
907 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
908 " jnz 1b \n\t"\
6335
950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster.
lorenm
parents: 6333
diff changeset
909 : "+a"(src), "+c"(dst), "+g"(h)\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
910 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
911 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
912 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
913 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
914 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
915 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
916 int i;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
917 int16_t temp[16];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
918 /* quick HACK, XXX FIXME MUST be optimized */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
919 for(i=0; i<h; i++)\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
920 {\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
921 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
922 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
923 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
924 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
925 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
926 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
927 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
928 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
929 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
930 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
931 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
932 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
933 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
934 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
935 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
936 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
937 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
938 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
939 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
940 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
941 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
942 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
943 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
944 "packuswb %%mm1, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
945 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
946 "movq 16(%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
947 "movq 24(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
948 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
949 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
950 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
951 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
952 "packuswb %%mm1, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
953 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
954 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
955 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
956 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
957 dst+=dstStride;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
958 src+=srcStride;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
959 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
960 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
961 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
962 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
963 uint64_t temp;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
964 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
965 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
966 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
967 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
968 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
969 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
970 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
971 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
972 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
973 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
974 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
975 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
976 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
977 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
978 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
979 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
980 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
981 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
982 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
983 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
984 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
985 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
986 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
987 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
988 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
989 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
990 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
991 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
992 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
993 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
994 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
995 "psraw $5, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
996 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
997 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
998 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
999 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1000 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1001 "paddw %%mm5, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1002 "paddw %%mm6, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1003 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1004 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1005 "paddw %%mm6, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1006 "paddw %%mm5, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1007 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1008 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1009 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1010 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1011 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1012 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1013 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1014 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1015 "packuswb %%mm3, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1016 OP_MMX2(%%mm0, (%1), %%mm4, q)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1017 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1018 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1019 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1020 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1021 " jnz 1b \n\t"\
6335
950811a14eb3 put loop counter in a register if possible. makes some of the qpel functions 3% faster.
lorenm
parents: 6333
diff changeset
1022 : "+a"(src), "+c"(dst), "+g"(h)\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1023 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1024 : "memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1025 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1026 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1027 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1028 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* horizontal qpel lowpass, 8 pixels wide, h rows: filter sums computed in C, then rounded, shifted, packed and stored by the asm */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1029 int i;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1030 int16_t temp[8]; /* unrounded 16-bit filter sums for the current row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1031 /* quick HACK, XXX FIXME MUST be optimized */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1032 for(i=0; i<h; i++) /* one iteration per row; each row reads src[0..8] */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1033 {\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1034 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]); /* temp[0..2]: in-range samples stand in where the regular pattern would index below src[0] */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1035 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1036 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1037 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]); /* temp[3], temp[4]: regular pattern, symmetric sample pairs weighted 20, -6, 3, -1 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1038 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1039 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]); /* temp[5..7]: in-range samples stand in where the regular pattern would index above src[8] */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1040 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1041 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1042 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1043 "movq (%0), %%mm0 \n\t" /* mm0 = temp[0..3] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1044 "movq 8(%0), %%mm1 \n\t" /* mm1 = temp[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1045 "paddw %2, %%mm0 \n\t" /* add ROUNDER to each 16-bit word */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1046 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1047 "psraw $5, %%mm0 \n\t" /* arithmetic shift right by 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1048 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1049 "packuswb %%mm1, %%mm0 \n\t" /* mm0 = the 8 results as unsigned-saturated bytes */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1050 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* OP_3DNOW is defined outside this block; it is handed the packed bytes in mm0 and the destination (%1) */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1051 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0 = temp, %1 = dst, %2 = ROUNDER */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1052 :"memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1053 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1054 dst+=dstStride; /* step both pointers to the next row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1055 src+=srcStride;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1056 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1057 }
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1058
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1059 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1060 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1061 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical qpel lowpass, 16 pixels wide: pass 1 widens 17 source rows into temp, pass 2 filters one 4-column plane of temp at a time */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1062 uint64_t temp[17*4]; /* 4 planes of 17 quadwords; each quadword holds 4 pixels widened to 16-bit words */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1063 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1064 int count= 17; /* pass 1 loop count: 17 source rows */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1065 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1066 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1067 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1068 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, the zero operand for the byte-to-word unpacks */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1069 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1070 "movq (%0), %%mm0 \n\t" /* source bytes 0-7, loaded twice for the low/high unpack */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1071 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1072 "movq 8(%0), %%mm2 \n\t" /* source bytes 8-15, loaded twice likewise */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1073 "movq 8(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1074 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1075 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1076 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1077 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1078 "movq %%mm0, (%1) \n\t" /* plane 0: columns 0-3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1079 "movq %%mm1, 17*8(%1) \n\t" /* plane 1: columns 4-7 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1080 "movq %%mm2, 2*17*8(%1) \n\t" /* plane 2: columns 8-11 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1081 "movq %%mm3, 3*17*8(%1) \n\t" /* plane 3: columns 12-15 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1082 "add $8, %1 \n\t" /* next row slot in every plane */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1083 "add %3, %0 \n\t" /* next source row */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1084 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1085 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1086 : "+r" (src), "+r" (temp_ptr), "+r"(count) /* %0 = src, %1 = temp_ptr, %2 = count */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1087 : "r" ((long)srcStride) /* %3 = srcStride */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1088 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1089 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1090 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1091 temp_ptr= temp; /* rewind to plane 0 for pass 2 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1092 count=4; /* pass 2 loop count: one iteration per plane */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1093 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1094 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1095 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1096 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1097 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1098 "movq (%0), %%mm0 \n\t" /* rows 0-3 of the current plane */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1099 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1100 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1101 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1102 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) /* QPEL_V_LOW is defined outside this block; 16 invocations per plane, alternating destination operands (%1) and (%1, %3) */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1103 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1104 "add %4, %1 \n\t" /* dst += 2*dstStride; this step occurs 7 times per plane */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1105 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1106 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1107 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1108 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1109 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1110 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1111 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1112 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1113 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1114 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1115 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1116 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1117 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1118 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1119 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1120 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1121 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1122 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1123 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP) /* from here the fourth offset stops rising: 128, 120, 112 re-read rows 16, 15, 14 of the plane */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1124 "add %4, %1 \n\t" \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1125 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1126 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1127 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1128 "add $136, %0 \n\t" /* temp_ptr to the next plane: 17 rows * 8 bytes */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1129 "add %6, %1 \n\t" /* 4-14*dstStride: undoes the seven 2*dstStride steps and moves dst 4 bytes to the right */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1130 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1131 " jnz 1b \n\t"\
958
9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents: 954
diff changeset
1132 \
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1133 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0 = temp_ptr, %1 = dst, %2 = count */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1134 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride) /* %3 = dstStride, %4 = 2*dstStride, %5 = ROUNDER, %6 = 4-14*dstStride */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1135 :"memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1136 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1137 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1138 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1139 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical qpel lowpass, 8 pixels wide: pass 1 widens 9 source rows into temp, pass 2 filters one 4-column plane of temp at a time */\
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
1140 uint64_t temp[9*2]; /* 2 planes of 9 quadwords; each quadword holds 4 pixels widened to 16-bit words */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1141 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1142 int count= 9; /* pass 1 loop count: 9 source rows */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1143 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1144 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1145 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1146 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, the zero operand for the byte-to-word unpacks */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1147 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1148 "movq (%0), %%mm0 \n\t" /* source bytes 0-7, loaded twice for the low/high unpack */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1149 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1150 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1151 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1152 "movq %%mm0, (%1) \n\t" /* plane 0: columns 0-3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1153 "movq %%mm1, 9*8(%1) \n\t" /* plane 1: columns 4-7 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1154 "add $8, %1 \n\t" /* next row slot in both planes */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1155 "add %3, %0 \n\t" /* next source row */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1156 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1157 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1158 : "+r" (src), "+r" (temp_ptr), "+r"(count) /* %0 = src, %1 = temp_ptr, %2 = count */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1159 : "r" ((long)srcStride) /* %3 = srcStride */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1160 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1161 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1162 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1163 temp_ptr= temp; /* rewind to plane 0 for pass 2 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1164 count=2; /* pass 2 loop count: one iteration per plane */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1165 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1166 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1167 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1168 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1169 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1170 "movq (%0), %%mm0 \n\t" /* rows 0-3 of the current plane */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1171 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1172 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1173 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1174 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) /* QPEL_V_LOW is defined outside this block; 8 invocations per plane, alternating destination operands (%1) and (%1, %3) */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1175 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1176 "add %4, %1 \n\t" /* dst += 2*dstStride; this step occurs 3 times per plane */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1177 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1178 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1179 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1180 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1181 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1182 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1183 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) /* from here the fourth offset stops rising: 64, 56, 48 re-read rows 8, 7, 6 of the plane */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1184 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1185 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1186 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1187 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1188 "add $72, %0 \n\t" /* temp_ptr to the next plane: 9 rows * 8 bytes */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1189 "add %6, %1 \n\t" /* 4-6*dstStride: undoes the three 2*dstStride steps and moves dst 4 bytes to the right */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1190 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1191 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1192 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1193 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0 = temp_ptr, %1 = dst, %2 = count */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1194 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride) /* %3 = dstStride, %4 = 2*dstStride, %5 = ROUNDER, %6 = 4-6*dstStride */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1195 : "memory"\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1196 );\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1197 }\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1198 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1199 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){ /* no filtering here: forwards straight to OPNAME ## pixels8_ ## MMX, which is defined outside this block */\
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
1200 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); /* NOTE(review): the trailing 8 is presumably the row count - confirm against the callee */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1201 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1202 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1203 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass into a scratch block, then OPNAME pixels8_l2 of src and that block into dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1204 uint64_t temp[8]; /* 64 bytes of scratch: 8 rows of 8 bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1205 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1206 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8); /* always the put ## RND variant, written to half with a row stride of 8, 8 rows */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1207 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); /* NOTE(review): pixels8_l2 is defined outside this view; presumably it averages its two sources - confirm */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1208 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1209 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1210 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass written directly to dst, no scratch buffer */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1211 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8); /* stride serves as both dstStride and srcStride; 8 rows */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1212 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1213 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1214 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass into a scratch block, then OPNAME pixels8_l2 of src+1 and that block into dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1215 uint64_t temp[8]; /* 64 bytes of scratch: 8 rows of 8 bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1216 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1217 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8); /* always the put ## RND variant, written to half with a row stride of 8, 8 rows */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1218 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8); /* first source is src shifted one byte to the right; pixels8_l2 is defined outside this view */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1219 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1220 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1221 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass into a scratch block, then OPNAME pixels8_l2 of src and that block into dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1222 uint64_t temp[8]; /* 64 bytes of scratch: 8 rows of 8 bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1223 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1224 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); /* always the put ## RND variant, written to half with a row stride of 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1225 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); /* NOTE(review): pixels8_l2 is defined outside this view; presumably it averages its two sources - confirm */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1226 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1227 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1228 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass written directly to dst, no scratch buffer */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1229 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); /* stride serves as both dstStride and srcStride */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1230 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1231 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1232 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass into a scratch block, then OPNAME pixels8_l2 of src+stride and that block into dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1233 uint64_t temp[8]; /* 64 bytes of scratch: 8 rows of 8 bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1234 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1235 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); /* always the put ## RND variant, written to half with a row stride of 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1236 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8); /* first source is src moved down one row; pixels8_l2 is defined outside this view */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1237 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1238 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* four steps: h lowpass (9 rows) -> combine with src -> v lowpass -> OPNAME combine of halfH and halfHV into dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1239 uint64_t half[8 + 9]; /* 136 bytes: 64 for halfHV followed by 72 (9 rows of 8) for halfH */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1240 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1241 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1242 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, one more than the 8 output rows, feed the vertical pass below */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1243 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); /* halfH is both an input and the output here; pixels8_l2 is defined outside this view */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1244 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical pass over the combined rows, both strides 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1245 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); /* final combine starts at the first row of halfH */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1246 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1247 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* four steps: h lowpass (9 rows) -> combine with src+1 -> v lowpass -> OPNAME combine of halfH and halfHV into dst */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1248 uint64_t half[8 + 9]; /* 136 bytes: 64 for halfHV followed by 72 (9 rows of 8) for halfH */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1249 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1250 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1251 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, one more than the 8 output rows, feed the vertical pass below */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1252 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9); /* combines with src shifted one byte to the right; halfH is both an input and the output */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1253 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical pass over the combined rows, both strides 8 */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1254 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); /* final combine starts at the first row of halfH */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1255 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1256 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1257 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1258 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1259 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1260 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1261 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1262 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1263 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1264 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1265 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1266 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1267 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1268 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1269 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1270 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1271 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1272 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1273 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1274 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1275 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1276 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1277 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1278 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1279 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1280 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1281 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1282 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1283 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1284 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1285 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1286 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1287 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1288 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1289 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1290 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1291 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1292 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1293 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1294 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1295 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1296 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1297 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1298 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1299 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1300 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1301 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1302 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1303 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1304 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1305 uint64_t half[9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1306 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1307 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1308 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1309 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1310 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
1311 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1312 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1313 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1314 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1315 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1316 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1317 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1318 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1319 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1320 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1321 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1322 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1323 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1324 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1325 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1326 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1327 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1328 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1329 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1330 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1331 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1332 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1333 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1334 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1335 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1336 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1337 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1338 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1339 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1340 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1341 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1342 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1343 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1344 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1345 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1346 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1347 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1348 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1349 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1350 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1351 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1352 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1353 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1354 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1355 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1356 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1357 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1358 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1359 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1360 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1361 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1362 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1363 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1364 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1365 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1366 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1367 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1368 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1369 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1370 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1371 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1372 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1373 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1374 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1375 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1376 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1377 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1378 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1379 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1380 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1381 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1382 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1383 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1384 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1385 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1386 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1387 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1388 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1389 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1390 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1391 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1392 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1393 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1394 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1395 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1396 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1397 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1398 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1399 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1400 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1401 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1402 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1403 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1404 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1405 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1406 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1407 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1408 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1409 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1410 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1411 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1412 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1413 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1414 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1415 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1416 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1417 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1418 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1419 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1420 }
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1421
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* PUT_OP: expands to one inline-asm text line, "mov<size> <a>, <b>", built by
 * stringizing the arguments. The temp parameter is accepted but not used; it
 * keeps the argument list parallel to the AVG_*_OP macros. */
1422 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* AVG_3DNOW_OP: expands to three inline-asm text lines, in this order:
 *   mov<size> <b>, <temp>
 *   pavgusb   <temp>, <a>
 *   mov<size> <a>, <b>
 * NOTE(review): assuming AT&T source,destination operand order, this loads the
 * current contents of b into temp, averages it into a with the 3DNow! pavgusb
 * instruction, and writes the result back to b -- confirm against the asm
 * blocks that use this macro. */
1423 #define AVG_3DNOW_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1424 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1425 "pavgusb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1426 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* AVG_MMX2_OP: same three-line expansion as AVG_3DNOW_OP, but the averaging
 * instruction emitted is pavgb instead of pavgusb:
 *   mov<size> <b>, <temp>
 *   pavgb     <temp>, <a>
 *   mov<size> <a>, <b>
 * NOTE(review): operand roles (b read into temp, averaged into a, result
 * stored to b) assume AT&T source,destination order -- confirm at call sites. */
1427 #define AVG_MMX2_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1428 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1429 "pavgb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1430 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1431
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Instantiate the qpel function families.
 * QPEL_BASE is expanded three times (put_, avg_, put_no_rnd_) and takes two
 * store-operation macros; QPEL_OP is expanded for each of those three prefixes
 * with the 3dnow suffix and again with the mmx2 suffix.
 * The put_ and put_no_rnd_ expansions pass PUT_OP as the store operation; the
 * avg_ expansions pass AVG_3DNOW_OP or AVG_MMX2_OP. The put_ and avg_
 * expansions pass ff_pw_16 and "_" ; the put_no_rnd_ expansions pass ff_pw_15
 * and "_no_rnd_".
 * NOTE(review): the parameter lists of QPEL_BASE and QPEL_OP are defined
 * earlier in the file and not shown here -- presumably
 * (OPNAME, ROUNDER, RND, <store op(s)>, [MMX suffix]); confirm there. */
1432 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1433 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1434 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1435 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1436 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1437 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1438 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1439 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1440 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1441
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1442 /***********************************/
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1443 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1444
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* QPEL_2TAP_XY: defines the static function
 *   <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX>(dst, src, stride)
 * as a thin wrapper that calls <OPNAME>pixels<SIZE><HPEL>(dst, src, stride, SIZE).
 * HPEL is the full name suffix of the pixels function to forward to, passed
 * in by the caller of this macro. */
1445 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1446 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1447 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1448 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* QPEL_2TAP_L3: defines the static function
 *   <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX>(dst, src, stride)
 * as a wrapper that calls
 *   <OPNAME>2tap_qpel<SIZE>_l3_<MMX>(dst, src+S0, stride, SIZE, S1, S2).
 * S0 is added to src before the call; S1 and S2 are forwarded unchanged.
 * NOTE(review): S1/S2 are presumably offsets of the two additional source
 * taps used by the _l3_ helper -- confirm against that helper's definition. */
1449 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1450 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1451 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1452 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1453
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/*
 * QPEL_2TAP(OPNAME, SIZE, MMX)
 *
 * Emits the sixteen OPNAME##2tap_qpel##SIZE##_mcXY_##MMX entry points
 * (XY = 00 .. 33) of the approximate quarter-pel motion compensation for
 * one operation prefix (e.g. put_ / avg_), one block size and one
 * instruction-set suffix.
 *
 * QPEL_2TAP_XY and QPEL_2TAP_L3 are defined earlier in this file; they are
 * expected to generate the _mcXY_ function named by their fourth argument
 * (see those macros for the meaning of the remaining arguments).
 */
#define QPEL_2TAP(OPNAME, SIZE, MMX) \
    /* positions 20, 02 and 22; note that 22 always uses the plain */ \
    /* _xy2_mmx routine, independent of the MMX suffix             */ \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
    /* mc00 reuses the regular qpel mc00; mc21 and mc12 are aliases */ \
    /* of mc20 and mc02 respectively                                */ \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
    /* mc32: the _y2_ routine applied one pixel to the right */ \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE); \
} \
    /* mc23: the _x2_ routine applied one row further down */ \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE); \
} \
    /* remaining eight positions, built by QPEL_2TAP_L3 */ \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,        1,       0) \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,      0) \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,        stride,  0) \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride, 0) \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,        stride,  1) \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,        stride,  -1) \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride, 1) \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1478
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1479 QPEL_2TAP(put_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1480 QPEL_2TAP(avg_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1481 QPEL_2TAP(put_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1482 QPEL_2TAP(avg_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1483 QPEL_2TAP(put_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1484 QPEL_2TAP(avg_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1485 QPEL_2TAP(put_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1486 QPEL_2TAP(avg_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1487
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1488
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
#if 0
/* Disabled stub that does nothing; excluded from compilation by #if 0. */
static void just_return()
{
}
#endif
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
1492
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/**
 * MMX global motion compensation for a block 8 pixels wide and h rows high.
 *
 * Falls back to ff_gmc_c() with the unchanged arguments whenever the
 * position (ox,oy) shifted right by 16+shift is not the same for every
 * corner of the block, or when any of dxx/dxy/dyx/dyy has one of its low
 * four bits set (the MMX path works on the values shifted right by 4, in
 * 16-bit lanes).
 *
 * Otherwise every output pixel is a weighted sum of the four neighbouring
 * source pixels, plus r, shifted right by 2*shift.
 *
 * If the (w+1)x(h+1) source area does not lie completely inside the
 * width x height picture, the area is first copied into a local buffer by
 * ff_emulated_edge_mc() and read from there.
 *
 * @param dst     destination block, rows stride bytes apart
 * @param src     source picture origin, rows stride bytes apart
 * @param stride  distance in bytes between rows of dst and of src
 * @param h       number of rows to produce
 * @param ox,oy   position of the first sample, in units of 1/(1<<(16+shift))
 * @param dxx,dxy,dyx,dyy  position increments per column / per row
 * @param shift   precision of the interpolation weights
 * @param r       value added to the weighted sum before the final shift
 * @param width,height  picture dimensions used for the edge test
 */
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);

    /* The unsigned comparisons catch both negative ix/iy and positions too
     * close to the right/bottom border.  They are only meaningful while
     * width-w and height-h are non-negative: a negative difference would be
     * converted to a huge unsigned value and hide the need for emulation,
     * so pictures smaller than the block are tested for explicitly. */
    const int need_emu = width  < w || (unsigned)ix >= (unsigned)(width-w) ||
                         height < h || (unsigned)iy >= (unsigned)(height-h);

    if( // non-constant fullpel offset (3% of blocks)
        ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
         (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( need_emu )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    /* mm6 = (1<<shift) replicated into all four words ("s" below), mm7 = 0.
     * Both registers are expected to keep their value across the asm
     * statements in the loops that follow. */
    __asm__ volatile(
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    /* two passes of four columns each */
    for(x=0; x<w; x+=4){
        /* Per-column position accumulators.  One row step (dxys/dyys) is
         * subtracted up front because the inner loop adds it before the
         * first use. */
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            /* advance the accumulators by one row, store them back and
             * leave the top four bits of each in mm4 (dx) / mm5 (dy) */
            __asm__ volatile(
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            /* weight the four neighbours, add r4, shift right by 2*shift,
             * pack to bytes and store four output pixels */
            __asm__ volatile(
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        /* back to the first row, four columns further right */
        src += 4-h*stride;
    }
}
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1610
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
/*
 * PREFETCH(name, op) defines
 *     static void name(void *mem, int stride, int h)
 * which issues the prefetch instruction `op` on the first byte of each of
 * h rows, starting at mem and advancing by stride bytes per row.
 *
 * Nothing is prefetched when h <= 0.  (A do/while(--h) loop would instead
 * keep running until h wraps around to zero.)
 */
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    for(; h > 0; h--){\
        __asm__ volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }\
}
PREFETCH(prefetch_mmx2, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
1622
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents: 2753
diff changeset
1623 #include "h264dsp_mmx.c"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1624
6009
ecfdc0bfb233 typo/clarification
diego
parents: 5963
diff changeset
1625 /* CAVS specific */
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1626 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1627
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1628 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1629 put_pixels8_mmx(dst, src, stride, 8);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1630 }
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel8 mc00 "avg": forwards to avg_pixels8_mmx() with h = 8.
 */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel16 mc00 "put": forwards to put_pixels16_mmx() with h = 16.
 */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel16 mc00 "avg": forwards to avg_pixels16_mmx() with h = 16.
 */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1640
5948
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1641 /* VC1 specific */
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1642 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1643
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1644 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1645 put_pixels8_mmx(dst, src, stride, 8);
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1646 }
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1647
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1648 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1649 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1650 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1651
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1652 /* XXX: those functions should be suppressed ASAP when all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1653 converted */
4020
723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents: 4001
diff changeset
1654 #ifdef CONFIG_GPL
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1655 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1656 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1657 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1658 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1659 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1660 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1661 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1662 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1663 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1664 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1665 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1666 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1667 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1668 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1669 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1670 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1671 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1672 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1673 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1674 }
4020
723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents: 4001
diff changeset
1675 #endif
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1676 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1677 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1678 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1679 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1680 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1681 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1682 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1683 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1684 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1685 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1686 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1687 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1688 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1689 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1690 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1691 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1692 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1693 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1694 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1695 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1696
3541
3fbddeb13686 10l, vorbis_inverse_coupling_sse() was really 3dnow
lorenm
parents: 3536
diff changeset
/**
 * Vorbis inverse channel coupling, 3DNow! version.
 *
 * Rewrites mag[] and ang[] in place, two floats of each per iteration.
 * With m = mag[i] and a = ang[i] as loaded, and a' = (m >= 0) ? -a : a:
 *
 *     ang[i] = (a >= 0) ? m + a' : m
 *     mag[i] = (a >= 0) ? m      : m - a'
 *
 * Each iteration loads and stores 8 bytes at &mag[i] and &ang[i], i.e.
 * elements i and i+1, so an odd blocksize touches one element past the end.
 * NOTE(review): callers presumably always pass an even blocksize -- confirm.
 *
 * __asm__ is used instead of asm so the file also builds under ISO -std modes.
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;

    /* mm7 = 0.0 in both lanes. The loop body is a separate asm statement and
     * relies on mm7 still being zero when it runs. */
    __asm__ volatile("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile(
            "movq    %0,    %%mm0 \n\t" // mm0 = m
            "movq    %1,    %%mm1 \n\t" // mm1 = a
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // mm2 = (m >= 0.0) ? all ones : 0
            "pfcmpge %%mm7, %%mm3 \n\t" // mm3 = (a >= 0.0) ? all ones : 0
            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit: set where m >= 0.0
            "pxor    %%mm2, %%mm1 \n\t" // mm1 = a' (a with its sign flipped where m >= 0.0)
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t" // mm3 = a' where a >= 0.0, else 0
            "pandn   %%mm1, %%mm4 \n\t" // mm4 = a' where the a >= 0.0 test failed, else 0
            "pfadd   %%mm0, %%mm3 \n\t" // new ang = m + mm3
            "pfsub   %%mm4, %%mm0 \n\t" // new mag = m - mm4
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    /* leave MMX state so later x87 code sees an empty register stack */
    __asm__ volatile("femms");
}
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1724 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1725 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1726 int i;
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1727
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1728 asm volatile(
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1729 "movaps %0, %%xmm5 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1730 ::"m"(ff_pdw_80000000[0])
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1731 );
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1732 for(i=0; i<blocksize; i+=4) {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1733 asm volatile(
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1734 "movaps %0, %%xmm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1735 "movaps %1, %%xmm1 \n\t"
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1736 "xorps %%xmm2, %%xmm2 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1737 "xorps %%xmm3, %%xmm3 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1738 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1739 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1740 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1741 "xorps %%xmm2, %%xmm1 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1742 "movaps %%xmm3, %%xmm4 \n\t"
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1743 "andps %%xmm1, %%xmm3 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1744 "andnps %%xmm1, %%xmm4 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1745 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1746 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1747 "movaps %%xmm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1748 "movaps %%xmm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1749 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1750 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1751 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1752 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1753 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1754
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
/**
 * In-place element-wise product, 3DNow! version: dst[k] *= src[k].
 *
 * The asm loop walks from the tail towards the head, 4 floats (16 bytes) per
 * pass, starting at byte offset (len-4)*4 and stopping once the offset goes
 * negative. len is therefore expected to be a positive multiple of 4; other
 * positive lengths either leave leading elements unprocessed or make the last
 * pass start before the buffers.
 * NOTE(review): confirm that all callers honour this.
 *
 * The loop body always executes at least once, so a non-positive len is
 * rejected up front instead of touching memory in front of dst/src.
 */
static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;

    if (len <= 0)
        return; /* nothing to multiply, and no MMX state was touched */

    __asm__ volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t" // dst[k], dst[k+1]
        "movq   8(%1,%0), %%mm1 \n\t" // dst[k+2], dst[k+3]
        "pfmul   (%2,%0), %%mm0 \n\t" // *= src[k], src[k+1]
        "pfmul  8(%2,%0), %%mm1 \n\t" // *= src[k+2], src[k+3]
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"           // step back 4 floats
        "jge 1b \n\t"
        "femms  \n\t"                 // leave MMX state
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
/**
 * In-place element-wise product, SSE version: dst[k] *= src[k].
 *
 * The asm loop walks from the tail towards the head, 8 floats (32 bytes) per
 * pass, starting at byte offset (len-8)*4 and stopping once the offset goes
 * negative. len is therefore expected to be a positive multiple of 8; other
 * positive lengths either leave leading elements unprocessed or make the last
 * pass start before the buffers. movaps / mulps-with-memory-operand require
 * dst and src to be 16-byte aligned.
 * NOTE(review): confirm that all callers honour these requirements.
 *
 * The loop body always executes at least once, so a non-positive len is
 * rejected up front instead of touching memory in front of dst/src.
 */
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;

    if (len <= 0)
        return; /* nothing to multiply */

    __asm__ volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t" // dst[k..k+3]
        "movaps  16(%1,%0), %%xmm1 \n\t" // dst[k+4..k+7]
        "mulps     (%2,%0), %%xmm0 \n\t" // *= src[k..k+3]
        "mulps   16(%2,%0), %%xmm1 \n\t" // *= src[k+4..k+7]
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"              // step back 8 floats
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
1790
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
/**
 * Product with one operand reversed, extended-3DNow! version:
 * dst[k] = src0[k] * src1[len-1-k].
 *
 * src1 is read forwards (the local pointer is advanced by the asm) with each
 * pair swapped by pswapd, while src0/dst are indexed from the tail towards
 * the head, 4 floats (16 bytes) per pass. len is therefore expected to be a
 * positive multiple of 4.
 * NOTE(review): confirm that all callers honour this.
 *
 * The asm writes through dst, which is only passed as a plain register
 * input, so a "memory" clobber is declared to tell the compiler that memory
 * is modified. The loop body always executes at least once, so a
 * non-positive len is rejected up front.
 */
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;

    if (len <= 0)
        return; /* nothing to do, and no MMX state was touched */

    __asm__ volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"  // {src1[j+3], src1[j+2]}
        "pswapd    (%1), %%mm1 \n\t"  // {src1[j+1], src1[j]}
        "pfmul  (%3,%0), %%mm0 \n\t"  // * src0[k], src0[k+1]
        "pfmul 8(%3,%0), %%mm1 \n\t"  // * src0[k+2], src0[k+3]
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"          // src1 moves forwards ...
        "sub   $16, %0 \n\t"          // ... while the dst/src0 offset moves backwards
        "jge   1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory"
    );
    /* leave MMX state so later x87 code sees an empty register stack */
    __asm__ volatile("femms");
}
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
/**
 * Product with one operand reversed, SSE version:
 * dst[k] = src0[k] * src1[len-1-k].
 *
 * src1 is read forwards (the local pointer is advanced by the asm) and each
 * group of four is reversed with shufps $0x1b, while src0/dst are indexed
 * from the tail towards the head, 8 floats (32 bytes) per pass. len is
 * therefore expected to be a positive multiple of 8, and the movaps / mulps
 * memory operands require dst, src0 and src1 to be 16-byte aligned.
 * NOTE(review): confirm that all callers honour these requirements.
 *
 * The asm writes through dst, which is only passed as a plain register
 * input, so a "memory" clobber is declared to tell the compiler that memory
 * is modified. The loop body always executes at least once, so a
 * non-positive len is rejected up front.
 */
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;

    if (len <= 0)
        return; /* nothing to do */

    __asm__ volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t" // src1[j+4..j+7]
        "movaps          (%1), %%xmm1 \n\t" // src1[j..j+3]
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t" // reverse the four lanes
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t" // * src0[k..k+3]
        "mulps      16(%3,%0), %%xmm1 \n\t" // * src0[k+4..k+7]
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add    $32, %1 \n\t"               // src1 moves forwards ...
        "sub    $32, %0 \n\t"               // ... while the dst/src0 offset moves backwards
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory"
    );
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
1828
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
/**
 * Multiply-accumulate, 3DNow! version:
 * dst[k*step] = src0[k] * src1[k] + src2[k]   for step 1 or 2 and src3 == 0.
 *
 * Every other combination of step/src3 is forwarded unchanged to
 * ff_vector_fmul_add_add_c(). Both asm loops run from the tail towards the
 * head, 4 input floats (16 bytes) per pass, so len is expected to be a
 * positive multiple of 4.
 * NOTE(review): confirm that all callers honour this.
 *
 * The asm loops always execute at least once; a non-positive len is
 * therefore sent to the C fallback instead of letting the asm touch memory
 * in front of the buffers.
 */
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    const int use_asm = len > 0 && src3 == 0;

    if(use_asm && step == 2){
        /* interleaved output: start at the last group of 4 results, which
         * lives 2*(len-4) floats into dst */
        dst += (len-4)*2;
        __asm__ volatile(
            "1: \n\t"
            "movq   (%2,%0), %%mm0 \n\t" // src0[k], src0[k+1]
            "movq  8(%2,%0), %%mm1 \n\t" // src0[k+2], src0[k+3]
            "pfmul  (%3,%0), %%mm0 \n\t" // * src1
            "pfmul 8(%3,%0), %%mm1 \n\t"
            "pfadd  (%4,%0), %%mm0 \n\t" // + src2
            "pfadd 8(%4,%0), %%mm1 \n\t"
            "movd  %%mm0,   (%1) \n\t"   // result k   -> dst slot 0
            "movd  %%mm1, 16(%1) \n\t"   // result k+2 -> dst slot 4
            "psrlq  $32, %%mm0 \n\t"     // bring the upper results down
            "psrlq  $32, %%mm1 \n\t"
            "movd  %%mm0,  8(%1) \n\t"   // result k+1 -> dst slot 2
            "movd  %%mm1, 24(%1) \n\t"   // result k+3 -> dst slot 6
            "sub  $32, %1 \n\t"          // 8 output floats back
            "sub  $16, %0 \n\t"          // 4 input floats back
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(use_asm && step == 1){
        __asm__ volatile(
            "1: \n\t"
            "movq   (%2,%0), %%mm0 \n\t" // src0[k], src0[k+1]
            "movq  8(%2,%0), %%mm1 \n\t" // src0[k+2], src0[k+3]
            "pfmul  (%3,%0), %%mm0 \n\t" // * src1
            "pfmul 8(%3,%0), %%mm1 \n\t"
            "pfadd  (%4,%0), %%mm0 \n\t" // + src2
            "pfadd 8(%4,%0), %%mm1 \n\t"
            "movq  %%mm0,  (%1,%0) \n\t"
            "movq  %%mm1, 8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"          // 4 floats back
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else {
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    }
    /* executed on every path, including the C fallback, as before */
    __asm__ volatile("femms");
}
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
/**
 * Multiply-accumulate, SSE version:
 * dst[k*step] = src0[k] * src1[k] + src2[k]   for step 1 or 2 and src3 == 0.
 *
 * Every other combination of step/src3 is forwarded unchanged to
 * ff_vector_fmul_add_add_c(). Both asm loops run from the tail towards the
 * head, 8 input floats (32 bytes) per pass, so len is expected to be a
 * positive multiple of 8; the movaps / mulps / addps memory operands require
 * src0, src1, src2 (and dst in the step == 1 path) to be 16-byte aligned.
 * NOTE(review): confirm that all callers honour these requirements.
 *
 * The asm loops always execute at least once; a non-positive len is
 * therefore sent to the C fallback instead of letting the asm touch memory
 * in front of the buffers.
 */
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    const int use_asm = len > 0 && src3 == 0;

    if(use_asm && step == 2){
        /* interleaved output: start at the last group of 8 results, which
         * lives 2*(len-8) floats into dst */
        dst += (len-8)*2;
        __asm__ volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t" // src0[k..k+3]
            "movaps 16(%2,%0), %%xmm1 \n\t" // src0[k+4..k+7]
            "mulps    (%3,%0), %%xmm0 \n\t" // * src1
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t" // + src2
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss  %%xmm0,   (%1) \n\t"    // result k   -> dst slot 0
            "movss  %%xmm1, 32(%1) \n\t"    // result k+4 -> dst slot 8
            "movhlps %%xmm0, %%xmm2 \n\t"   // upper halves into xmm2/xmm3
            "movhlps %%xmm1, %%xmm3 \n\t"
            "movss  %%xmm2, 16(%1) \n\t"    // result k+2 -> dst slot 4
            "movss  %%xmm3, 48(%1) \n\t"    // result k+6 -> dst slot 12
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t" // swap lanes within each pair
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss  %%xmm0,  8(%1) \n\t"    // result k+1 -> dst slot 2
            "movss  %%xmm1, 40(%1) \n\t"    // result k+5 -> dst slot 10
            "movhlps %%xmm0, %%xmm2 \n\t"
            "movhlps %%xmm1, %%xmm3 \n\t"
            "movss  %%xmm2, 24(%1) \n\t"    // result k+3 -> dst slot 6
            "movss  %%xmm3, 56(%1) \n\t"    // result k+7 -> dst slot 14
            "sub  $64, %1 \n\t"             // 16 output floats back
            "sub  $32, %0 \n\t"             // 8 input floats back
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(use_asm && step == 1){
        __asm__ volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t" // src0[k..k+3]
            "movaps 16(%2,%0), %%xmm1 \n\t" // src0[k+4..k+7]
            "mulps    (%3,%0), %%xmm0 \n\t" // * src1
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t" // + src2
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"             // 8 floats back
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else {
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    }
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
1934
4172
608e2dfcb86e adding more static keywords
mru
parents: 4127
diff changeset
/**
 * Convert floats to signed 16-bit integers using 3DNow! (pf2id).
 *
 * Four samples are handled per loop iteration: each pf2id loads two floats
 * (8 bytes) and converts them to 32-bit integers, packssdw narrows the four
 * results to int16_t with signed saturation, and movq stores all four
 * (8 bytes) at once.
 *
 * @param dst output samples, written in groups of four int16_t
 * @param src input samples, read in groups of four floats
 * @param len number of samples to convert; the loop always processes whole
 *            groups of four, so a len that is not a multiple of 4 accesses
 *            up to three elements past len in both buffers
 */
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            /* The operands only describe dst[i], src[i] and src[i+2], while
             * the instructions access 8 bytes at each of those addresses.
             * The memory clobber tells the compiler about the rest. */
            :"memory"
        );
    }
    // mm0/mm1 were used above: reset the MMX state before returning
    __asm__ volatile("femms");
}
4172
608e2dfcb86e adding more static keywords
mru
parents: 4127
diff changeset
/**
 * Convert floats to signed 16-bit integers using SSE (cvtps2pi).
 *
 * Four samples are handled per loop iteration: each cvtps2pi loads two
 * floats (8 bytes) and converts them to 32-bit integers in an MMX register,
 * packssdw narrows the four results to int16_t with signed saturation, and
 * movq stores all four (8 bytes) at once.
 *
 * @param dst output samples, written in groups of four int16_t
 * @param src input samples, read in groups of four floats
 * @param len number of samples to convert; the loop always processes whole
 *            groups of four, so a len that is not a multiple of 4 accesses
 *            up to three elements past len in both buffers
 */
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            /* The operands only describe dst[i], src[i] and src[i+2], while
             * the instructions access 8 bytes at each of those addresses.
             * The memory clobber tells the compiler about the rest. */
            :"memory"
        );
    }
    // mm0/mm1 were used above: reset the MMX state before returning
    __asm__ volatile("emms");
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
1964
6195
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type
bcoudurier
parents: 6181
diff changeset
1965 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type
bcoudurier
parents: 6181
diff changeset
1966 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type
bcoudurier
parents: 6181
diff changeset
1967 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
5f704e9cb518 fix prototypes, remove warning: i386/dsputil_mmx.c:3594: warning: assignment from incompatible pointer type
bcoudurier
parents: 6181
diff changeset
1968 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
4436
d3e389536b0a Add the const specifier as needed to reduce the number of warnings.
takis
parents: 4197
diff changeset
1969 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
1970 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
4436
d3e389536b0a Add the const specifier as needed to reduce the number of warnings.
takis
parents: 4197
diff changeset
1971 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
1972 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
1973
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1974 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1975 {
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
1976 mm_flags = mm_support();
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
1977
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
1978 if (avctx->dsp_mask) {
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1979 if (avctx->dsp_mask & FF_MM_FORCE)
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
1980 mm_flags |= (avctx->dsp_mask & 0xffff);
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1981 else
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
1982 mm_flags &= ~(avctx->dsp_mask & 0xffff);
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
1983 }
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
1984
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
1985 #if 0
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
1986 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
1987 if (mm_flags & MM_MMX)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
1988 av_log(avctx, AV_LOG_INFO, " mmx");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
1989 if (mm_flags & MM_MMXEXT)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
1990 av_log(avctx, AV_LOG_INFO, " mmxext");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
1991 if (mm_flags & MM_3DNOW)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
1992 av_log(avctx, AV_LOG_INFO, " 3dnow");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
1993 if (mm_flags & MM_SSE)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
1994 av_log(avctx, AV_LOG_INFO, " sse");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
1995 if (mm_flags & MM_SSE2)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
1996 av_log(avctx, AV_LOG_INFO, " sse2");
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
1997 av_log(avctx, AV_LOG_INFO, "\n");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1998 #endif
986e461dc072 Initial revision
glantau
parents:
diff changeset
1999
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2000 if (mm_flags & MM_MMX) {
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2001 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2002
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2003 if(avctx->lowres==0){
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2004 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2005 c->idct_put= ff_simple_idct_put_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2006 c->idct_add= ff_simple_idct_add_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2007 c->idct = ff_simple_idct_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2008 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3717
ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents: 3712
diff changeset
2009 #ifdef CONFIG_GPL
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2010 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2011 if(mm_flags & MM_MMXEXT){
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2012 c->idct_put= ff_libmpeg2mmx2_idct_put;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2013 c->idct_add= ff_libmpeg2mmx2_idct_add;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2014 c->idct = ff_mmxext_idct;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2015 }else{
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2016 c->idct_put= ff_libmpeg2mmx_idct_put;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2017 c->idct_add= ff_libmpeg2mmx_idct_add;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2018 c->idct = ff_mmx_idct;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2019 }
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2020 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3717
ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents: 3712
diff changeset
2021 #endif
5007
f7edc4fe94db Make vp3dsp*.c compilation optional.
takis
parents: 4988
diff changeset
2022 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
f7edc4fe94db Make vp3dsp*.c compilation optional.
takis
parents: 4988
diff changeset
2023 idct_algo==FF_IDCT_VP3 &&
3721
2000e401593d disable vp3 mmx idct for theora files to avoid artifacts
aurel
parents: 3717
diff changeset
2024 avctx->codec->id!=CODEC_ID_THEORA &&
3712
f7f75f718efb Enables back the mmx/sse optimized version of the vp3 idct.
aurel
parents: 3666
diff changeset
2025 !(avctx->flags & CODEC_FLAG_BITEXACT)){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2026 if(mm_flags & MM_SSE2){
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2027 c->idct_put= ff_vp3_idct_put_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2028 c->idct_add= ff_vp3_idct_add_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2029 c->idct = ff_vp3_idct_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2030 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2031 }else{
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2032 ff_vp3_dsp_init_mmx();
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2033 c->idct_put= ff_vp3_idct_put_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2034 c->idct_add= ff_vp3_idct_add_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2035 c->idct = ff_vp3_idct_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2036 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2037 }
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2038 }else if(idct_algo==FF_IDCT_CAVS){
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2039 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2040 }else if(idct_algo==FF_IDCT_XVIDMMX){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2041 if(mm_flags & MM_MMXEXT){
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2042 c->idct_put= ff_idct_xvid_mmx2_put;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2043 c->idct_add= ff_idct_xvid_mmx2_add;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2044 c->idct = ff_idct_xvid_mmx2;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2045 }else{
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2046 c->idct_put= ff_idct_xvid_mmx_put;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2047 c->idct_add= ff_idct_xvid_mmx_add;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2048 c->idct = ff_idct_xvid_mmx;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2049 }
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2050 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2051 }
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2052
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2053 c->put_pixels_clamped = put_pixels_clamped_mmx;
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
2054 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2055 c->add_pixels_clamped = add_pixels_clamped_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2056 c->clear_blocks = clear_blocks_mmx;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
2057
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2058 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2059 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2060 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2061 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2062 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2063
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2064 SET_HPEL_FUNCS(put, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2065 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2066 SET_HPEL_FUNCS(avg, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2067 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2068 SET_HPEL_FUNCS(put, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2069 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2070 SET_HPEL_FUNCS(avg, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2071 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2072
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2073 c->gmc= gmc_mmx;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2074
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
2075 c->add_bytes= add_bytes_mmx;
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
2076 c->add_bytes_l2= add_bytes_l2_mmx;
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
2077
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
2078 if (ENABLE_ANY_H263) {
5278
ef85411bb7e8 cosmetics: indentation
aurel
parents: 5277
diff changeset
2079 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
ef85411bb7e8 cosmetics: indentation
aurel
parents: 5277
diff changeset
2080 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
2081 }
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
2082 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2083 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
2084 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2085
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
2086 c->h264_idct_dc_add=
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
2087 c->h264_idct_add= ff_h264_idct_add_mmx;
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
2088 c->h264_idct8_dc_add=
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
2089 c->h264_idct8_add= ff_h264_idct8_add_mmx;
6320
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6196
diff changeset
2090 if (mm_flags & MM_SSE2)
ffb2a7b80d6d ff_h264_idct8_add_sse2.
lorenm
parents: 6196
diff changeset
2091 c->h264_idct8_add= ff_h264_idct8_add_sse2;
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
2092
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2093 if (mm_flags & MM_MMXEXT) {
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2094 c->prefetch = prefetch_mmx2;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2095
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2096 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2097 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2098
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2099 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2100 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2101 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
2102
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2103 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2104 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2105
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2106 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2107 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2108 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2109
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3089
diff changeset
2110 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3089
diff changeset
2111 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
2745
42d3e9068e32 MMX for H.264 iDCT (adapted from x264)
lorenm
parents: 2732
diff changeset
2112
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2113 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2114 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2115 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2116 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2117 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2118 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2119 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2120 }
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2121
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2122 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2123 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2124 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2125 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2126 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2127 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2128 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2129 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2130 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2131 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2132 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2133 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2134 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2135 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2136 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2137 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2138 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2139
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2140 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2141 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2142 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2143 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2144 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2145 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2146
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2147 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2148 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2149 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2150 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2151 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2152 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2153
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2154 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2155 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2156 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2157 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
2158
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
2159 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2160 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3213
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3211
diff changeset
2161 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3211
diff changeset
2162 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2163 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2164 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2165 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2166 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
2707
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
2167 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
2168 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3576
diff changeset
2169 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2170
2902
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2171 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2172 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2173 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2174 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2175 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2176 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2177 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2178 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2179
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2180 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2181 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2182 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2183 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2184 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2185 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2186 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2187 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2188
5949
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
2189 if (ENABLE_CAVS_DECODER)
5950
e419e6d4e7eb cosmetics: indentation
aurel
parents: 5949
diff changeset
2190 ff_cavsdsp_init_mmx2(c, avctx);
5949
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
2191
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
2192 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
5950
e419e6d4e7eb cosmetics: indentation
aurel
parents: 5949
diff changeset
2193 ff_vc1dsp_init_mmx(c, avctx);
5933
6ce8f15fc02b add VC-1 MMX DSP functions, under MIT license.
gpoirier
parents: 5912
diff changeset
2194
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
2195 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2196 } else if (mm_flags & MM_3DNOW) {
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2197 c->prefetch = prefetch_3dnow;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2198
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2199 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2200 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
2201
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2202 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2203 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2204 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2205
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2206 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2207 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2208
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2209 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2210 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2211 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2212
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2213 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2214 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2215 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2216 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2217 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2218 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2219 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2220 }
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2221
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2222 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2223 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2224 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2225 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2226 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2227 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2228
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2229 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2230 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2231 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2232 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2233 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2234 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2235
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2236 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2237 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2238 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2239 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2240
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
2241 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2242 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2243 }
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2244
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2245
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2246 #define H264_QPEL_FUNCS(x, y, CPU)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2247 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2248 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2249 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2250 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2251 if((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)){
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2252 // these functions are slower than mmx on AMD, but faster on Intel
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2253 /* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2254 c->put_pixels_tab[0][0] = put_pixels16_sse2;
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2255 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2256 */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2257 H264_QPEL_FUNCS(0, 0, sse2);
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2258 }
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2259 if(mm_flags & MM_SSE2){
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2260 H264_QPEL_FUNCS(0, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2261 H264_QPEL_FUNCS(0, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2262 H264_QPEL_FUNCS(0, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2263 H264_QPEL_FUNCS(1, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2264 H264_QPEL_FUNCS(1, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2265 H264_QPEL_FUNCS(1, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2266 H264_QPEL_FUNCS(2, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2267 H264_QPEL_FUNCS(2, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2268 H264_QPEL_FUNCS(2, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2269 H264_QPEL_FUNCS(3, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2270 H264_QPEL_FUNCS(3, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2271 H264_QPEL_FUNCS(3, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2272 }
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2273 #ifdef HAVE_SSSE3
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2274 if(mm_flags & MM_SSSE3){
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2275 H264_QPEL_FUNCS(1, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2276 H264_QPEL_FUNCS(1, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2277 H264_QPEL_FUNCS(1, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2278 H264_QPEL_FUNCS(1, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2279 H264_QPEL_FUNCS(2, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2280 H264_QPEL_FUNCS(2, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2281 H264_QPEL_FUNCS(2, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2282 H264_QPEL_FUNCS(2, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2283 H264_QPEL_FUNCS(3, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2284 H264_QPEL_FUNCS(3, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2285 H264_QPEL_FUNCS(3, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2286 H264_QPEL_FUNCS(3, 3, ssse3);
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
2287 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2288 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2289 #endif
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2290
4589
30261f4ed12d Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated.
diego
parents: 4436
diff changeset
2291 #ifdef CONFIG_SNOW_DECODER
5591
642588a60570 update mmx code to latest snow changes
michael
parents: 5587
diff changeset
2292 if(mm_flags & MM_SSE2 & 0){
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
2293 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
5602
3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS
ramiro
parents: 5601
diff changeset
2294 #ifdef HAVE_7REGS
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2295 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
5601
b26025b9586d workaround gcc bug, untested as my gcc is not complaining
michael
parents: 5594
diff changeset
2296 #endif
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
2297 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2298 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2299 else{
5594
384629ebcb93 avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum
michael
parents: 5591
diff changeset
2300 if(mm_flags & MM_MMXEXT){
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
2301 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
5602
3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS
ramiro
parents: 5601
diff changeset
2302 #ifdef HAVE_7REGS
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2303 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
5601
b26025b9586d workaround gcc bug, untested as my gcc is not complaining
michael
parents: 5594
diff changeset
2304 #endif
5594
384629ebcb93 avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum
michael
parents: 5591
diff changeset
2305 }
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
2306 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2307 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2308 #endif
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2309
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2310 if(mm_flags & MM_3DNOW){
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2311 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2312 c->vector_fmul = vector_fmul_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2313 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2314 c->float_to_int16 = float_to_int16_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2315 }
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2316 if(mm_flags & MM_3DNOWEXT)
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2317 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2318 if(mm_flags & MM_SSE){
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2319 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2320 c->vector_fmul = vector_fmul_sse;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2321 c->float_to_int16 = float_to_int16_sse;
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2322 c->vector_fmul_reverse = vector_fmul_reverse_sse;
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2323 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2324 }
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2325 if(mm_flags & MM_3DNOW)
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2326 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2327 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2328
6403
9a736918fd90 split encoding part of dsputil_mmx into its own file
aurel
parents: 6391
diff changeset
2329 if (ENABLE_ENCODERS)
9a736918fd90 split encoding part of dsputil_mmx into its own file
aurel
parents: 6391
diff changeset
2330 dsputilenc_init_mmx(c, avctx);
9a736918fd90 split encoding part of dsputil_mmx into its own file
aurel
parents: 6391
diff changeset
2331
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2332 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2333 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2334 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2335 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2336 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2337
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2338 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2339 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2340 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2341 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2342
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2343 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2344 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2345 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2346 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2347
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2348 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2349 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2350 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2351 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2352
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2353 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2354 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2355 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2356 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2357
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2358 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2359 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2360 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2361 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2362
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2363 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2364 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2365 #endif
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2366 }