annotate i386/dsputil_mmx.c @ 8375:de2509cf3c44 libavcodec

H.264 idct functions that include the chroma, inter luma and intra16 luma loops thus avoiding the calling overhead. New functions are not yet used.
author michael
date Thu, 18 Dec 2008 02:36:48 +0000
parents 08b0f63a91c5
children 60b6a780100b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1 /*
986e461dc072 Initial revision
glantau
parents:
diff changeset
2 * MMX optimized DSP utils
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739
07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents: 1729
diff changeset
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
5 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
8 * FFmpeg is free software; you can redistribute it and/or
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
9 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
10 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
11 * version 2.1 of the License, or (at your option) any later version.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
12 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
13 * FFmpeg is distributed in the hope that it will be useful,
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
16 * Lesser General Public License for more details.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
17 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
18 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
19 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2979
diff changeset
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
21 *
986e461dc072 Initial revision
glantau
parents:
diff changeset
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision
glantau
parents:
diff changeset
23 */
986e461dc072 Initial revision
glantau
parents:
diff changeset
24
6763
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
25 #include "libavutil/x86_cpu.h"
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
26 #include "libavcodec/dsputil.h"
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
27 #include "libavcodec/h263.h"
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
28 #include "libavcodec/mpegvideo.h"
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
29 #include "libavcodec/simple_idct.h"
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
30 #include "dsputil_mmx.h"
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
31 #include "mmx.h"
5014
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
32 #include "vp3dsp_mmx.h"
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
33 #include "vp3dsp_sse2.h"
6585
0ec61ed36c29 Add a header file to declare Xvid IDCT functions.
diego
parents: 6557
diff changeset
34 #include "idct_xvid.h"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
35
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
36 //#undef NDEBUG
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
37 //#include <assert.h>
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
38
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
39 int mm_flags; /* multimedia extension flags */
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
40
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
41 /* pixel operations */
5947
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
42 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
37a03989871b use ff_ prefix for extern vars
aurel
parents: 5946
diff changeset
43 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
44
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
45 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
46 {0x8000000080000000ULL, 0x8000000080000000ULL};
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
47
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
48 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
49 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
8317
08b0f63a91c5 avoid POSIX reserved _t suffix
aurel
parents: 8288
diff changeset
50 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
08b0f63a91c5 avoid POSIX reserved _t suffix
aurel
parents: 8288
diff changeset
51 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
52 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
8317
08b0f63a91c5 avoid POSIX reserved _t suffix
aurel
parents: 8288
diff changeset
53 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
8317
08b0f63a91c5 avoid POSIX reserved _t suffix
aurel
parents: 8288
diff changeset
55 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
08b0f63a91c5 avoid POSIX reserved _t suffix
aurel
parents: 8288
diff changeset
56 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
57 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
58 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
6333
beb52d4a5efe constant was excessively aligned
lorenm
parents: 6331
diff changeset
60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
61 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
62
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
63 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
8032
0839f325edb5 MMX VP3 Loop Filter
conrad
parents: 8031
diff changeset
66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
8032
0839f325edb5 MMX VP3 Loop Filter
conrad
parents: 8031
diff changeset
68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
5946
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
69 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
70 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
71
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
72 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
55251379b5b1 make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents: 5933
diff changeset
73 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
5737
efa3c1f9259a sse2 version of compute_autocorr().
lorenm
parents: 5602
diff changeset
74
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
75 #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
76 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
77
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
78 #define MOVQ_BFE(regd) \
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
79 __asm__ volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
80 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
81 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
82
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
83 #ifndef PIC
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
84 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
85 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
86 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
87 // for shared libraries it's better to access constants this way
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
88 // pcmpeqd -> -1
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
89 #define MOVQ_BONE(regd) \
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
90 __asm__ volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
91 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
92 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
93 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
94
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
95 #define MOVQ_WTWO(regd) \
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
96 __asm__ volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
97 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
98 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
99 "psllw $1, %%" #regd " \n\t"::)
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
100
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
101 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
102
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
103 // using regr as temporary and for the output result
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
104 // first argument is unmodified and second is trashed
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
105 // regfe is supposed to contain 0xfefefefefefefefe
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
106 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
107 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
108 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
109 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
110 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
111 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
112 "paddb " #regb ", " #regr " \n\t"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
113
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
114 #define PAVGB_MMX(rega, regb, regr, regfe) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
115 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
116 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
117 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
118 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
119 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
120 "psubb " #regb ", " #regr " \n\t"
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
121
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
122 // mm6 is supposed to contain 0xfefefefefefefefe
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
123 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
124 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
125 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
126 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
127 "pand " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
128 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
129 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
130 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
131 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
132 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
133 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
134 "paddb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
135 "paddb " #regd ", " #regp " \n\t"
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
136
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
137 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
138 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
139 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
140 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
141 "por " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
142 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
143 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
144 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
145 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
146 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
147 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
148 "psubb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
149 "psubb " #regd ", " #regp " \n\t"
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
150
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
151 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
152 /* MMX no rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
153 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
154 #define SET_RND MOVQ_WONE
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
155 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
156 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
157
8073
915bc657348f Rename template included sources from .h to _template.c.
flameeyes
parents: 8041
diff changeset
158 #include "dsputil_mmx_rnd_template.c"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
159
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
160 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
161 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
162 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
163 #undef PAVGB
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
164 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
165 /* MMX rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
166
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
167 #define DEF(x, y) x ## _ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
168 #define SET_RND MOVQ_WTWO
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
169 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
170 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
171
8073
915bc657348f Rename template included sources from .h to _template.c.
flameeyes
parents: 8041
diff changeset
172 #include "dsputil_mmx_rnd_template.c"
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
173
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
174 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
175 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
176 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
177 #undef PAVGB
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
178
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
179 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
180 /* 3Dnow specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
181
986e461dc072 Initial revision
glantau
parents:
diff changeset
182 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision
glantau
parents:
diff changeset
183 #define PAVGB "pavgusb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
184
8073
915bc657348f Rename template included sources from .h to _template.c.
flameeyes
parents: 8041
diff changeset
185 #include "dsputil_mmx_avg_template.c"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
186
986e461dc072 Initial revision
glantau
parents:
diff changeset
187 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
188 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
189
986e461dc072 Initial revision
glantau
parents:
diff changeset
190 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
191 /* MMX2 specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
192
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
193 #define DEF(x) x ## _mmx2
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
194
986e461dc072 Initial revision
glantau
parents:
diff changeset
195 /* Introduced only in MMX2 set */
986e461dc072 Initial revision
glantau
parents:
diff changeset
196 #define PAVGB "pavgb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
197
8073
915bc657348f Rename template included sources from .h to _template.c.
flameeyes
parents: 8041
diff changeset
198 #include "dsputil_mmx_avg_template.c"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
199
986e461dc072 Initial revision
glantau
parents:
diff changeset
200 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
201 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
202
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
203 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
204 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
205 #define put_pixels16_mmx2 put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
206 #define put_pixels8_mmx2 put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
207 #define put_pixels4_mmx2 put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
208 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
209 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
210 #define put_pixels16_3dnow put_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
211 #define put_pixels8_3dnow put_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
212 #define put_pixels4_3dnow put_pixels4_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
213 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
214 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
215
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
216 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
217 /* standard MMX */
986e461dc072 Initial revision
glantau
parents:
diff changeset
218
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
219 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
220 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
221 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
222 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
223
986e461dc072 Initial revision
glantau
parents:
diff changeset
224 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
225 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
226 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
227 /* unrolled loop */
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
228 __asm__ volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
229 "movq %3, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
230 "movq 8%3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
231 "movq 16%3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
232 "movq 24%3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
233 "movq 32%3, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
234 "movq 40%3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
235 "movq 48%3, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
236 "movq 56%3, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
237 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
238 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
239 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
240 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
241 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
242 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
243 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
244 "movq %%mm6, (%0, %2) \n\t"
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
245 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
246 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
247 pix += line_size*4;
986e461dc072 Initial revision
glantau
parents:
diff changeset
248 p += 32;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
249
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
250 // if here would be an exact copy of the code above
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
251 // compiler would generate some very strange code
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
252 // thus using "r"
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
253 __asm__ volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
254 "movq (%3), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
255 "movq 8(%3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
256 "movq 16(%3), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
257 "movq 24(%3), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
258 "movq 32(%3), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
259 "movq 40(%3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
260 "movq 48(%3), %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
261 "movq 56(%3), %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
262 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
263 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
264 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
265 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
266 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
267 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
268 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
269 "movq %%mm6, (%0, %2) \n\t"
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
270 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
271 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
272 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
273
3089
072dbc669253 MSVC-compatible __align8/__align16 declaration
diego
parents: 3036
diff changeset
274 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
1985
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
275 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
276
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
277 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
278 {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
279 int i;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
280
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
281 movq_m2r(*vector128, mm1);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
282 for (i = 0; i < 8; i++) {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
283 movq_m2r(*(block), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
284 packsswb_m2r(*(block + 4), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
285 block += 8;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
286 paddb_r2r(mm1, mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
287 movq_r2m(mm0, *pixels);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
288 pixels += line_size;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
289 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
290 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
291
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
292 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
293 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
294 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
295 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
296 int i;
986e461dc072 Initial revision
glantau
parents:
diff changeset
297
986e461dc072 Initial revision
glantau
parents:
diff changeset
298 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
299 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
300 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
301 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
302 i = 4;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
303 do {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
304 __asm__ volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
305 "movq (%2), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
306 "movq 8(%2), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
307 "movq 16(%2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
308 "movq 24(%2), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
309 "movq %0, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
310 "movq %1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
311 "movq %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
312 "punpcklbw %%mm7, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
313 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
314 "paddsw %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
315 "paddsw %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
316 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
317 "punpcklbw %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
318 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
319 "paddsw %%mm6, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
320 "paddsw %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
321 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
322 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
323 "movq %%mm0, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
324 "movq %%mm2, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
325 :"+m"(*pix), "+m"(*(pix+line_size))
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
326 :"r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
327 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
328 pix += line_size*2;
986e461dc072 Initial revision
glantau
parents:
diff changeset
329 p += 16;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
330 } while (--i);
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
331 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
332
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
333 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
334 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
335 __asm__ volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
336 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
337 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
338 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
339 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
340 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
341 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
342 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
343 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
344 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
345 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
346 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
347 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
348 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
349 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
350 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
351 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
352 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
353 : "+g"(h), "+r" (pixels), "+r" (block)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
354 : "r"((x86_reg)line_size)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
355 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
356 );
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
357 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
358
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
359 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
360 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
361 __asm__ volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
362 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
363 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
364 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
365 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
366 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
367 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
368 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
369 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
370 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
371 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
372 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
373 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
374 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
375 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
376 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
377 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
378 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
379 : "+g"(h), "+r" (pixels), "+r" (block)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
380 : "r"((x86_reg)line_size)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
381 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
382 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
383 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
384
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
385 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
386 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
387 __asm__ volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
388 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
389 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
390 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
391 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
392 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
393 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
394 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
395 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
396 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
397 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
398 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
399 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
400 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
401 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
402 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
403 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
404 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
405 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
406 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
407 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
408 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
409 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
410 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
411 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
412 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
413 : "+g"(h), "+r" (pixels), "+r" (block)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
414 : "r"((x86_reg)line_size)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
415 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
416 );
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
417 }
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
418
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
419 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
420 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
421 __asm__ volatile(
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
422 "1: \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
423 "movdqu (%1), %%xmm0 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
424 "movdqu (%1,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
425 "movdqu (%1,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
426 "movdqu (%1,%4), %%xmm3 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
427 "movdqa %%xmm0, (%2) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
428 "movdqa %%xmm1, (%2,%3) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
429 "movdqa %%xmm2, (%2,%3,2) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
430 "movdqa %%xmm3, (%2,%4) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
431 "subl $4, %0 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
432 "lea (%1,%3,4), %1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
433 "lea (%2,%3,4), %2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
434 "jnz 1b \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
435 : "+g"(h), "+r" (pixels), "+r" (block)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
436 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
437 : "memory"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
438 );
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
439 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
440
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
441 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
442 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
443 __asm__ volatile(
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
444 "1: \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
445 "movdqu (%1), %%xmm0 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
446 "movdqu (%1,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
447 "movdqu (%1,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
448 "movdqu (%1,%4), %%xmm3 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
449 "pavgb (%2), %%xmm0 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
450 "pavgb (%2,%3), %%xmm1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
451 "pavgb (%2,%3,2), %%xmm2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
452 "pavgb (%2,%4), %%xmm3 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
453 "movdqa %%xmm0, (%2) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
454 "movdqa %%xmm1, (%2,%3) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
455 "movdqa %%xmm2, (%2,%3,2) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
456 "movdqa %%xmm3, (%2,%4) \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
457 "subl $4, %0 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
458 "lea (%1,%3,4), %1 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
459 "lea (%2,%3,4), %2 \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
460 "jnz 1b \n\t"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
461 : "+g"(h), "+r" (pixels), "+r" (block)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
462 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
463 : "memory"
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
464 );
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
465 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
466
8288
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
467 #define CLEAR_BLOCKS(name,n) \
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
468 static void name(DCTELEM *blocks)\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
469 {\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
470 __asm__ volatile(\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
471 "pxor %%mm7, %%mm7 \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
472 "mov %1, %%"REG_a" \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
473 "1: \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
474 "movq %%mm7, (%0, %%"REG_a") \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
475 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
476 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
477 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
478 "add $32, %%"REG_a" \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
479 " js 1b \n\t"\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
480 : : "r" (((uint8_t *)blocks)+128*n),\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
481 "i" (-128*n)\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
482 : "%"REG_a\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
483 );\
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
484 }
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
485 CLEAR_BLOCKS(clear_blocks_mmx, 6)
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
486 CLEAR_BLOCKS(clear_block_mmx, 1)
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
487
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
488 static void clear_block_sse(DCTELEM *block)
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
489 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
490 __asm__ volatile(
8288
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
491 "xorps %%xmm0, %%xmm0 \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
492 "movaps %%xmm0, (%0) \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
493 "movaps %%xmm0, 16(%0) \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
494 "movaps %%xmm0, 32(%0) \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
495 "movaps %%xmm0, 48(%0) \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
496 "movaps %%xmm0, 64(%0) \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
497 "movaps %%xmm0, 80(%0) \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
498 "movaps %%xmm0, 96(%0) \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
499 "movaps %%xmm0, 112(%0) \n"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
500 :: "r"(block)
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
501 : "memory"
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
502 );
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
503 }
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
504
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
505 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
506 x86_reg i=0;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
507 __asm__ volatile(
7087
9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16
reimar
parents: 6763
diff changeset
508 "jmp 2f \n\t"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
509 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
510 "movq (%1, %0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
511 "movq (%2, %0), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
512 "paddb %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
513 "movq %%mm1, (%2, %0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
514 "movq 8(%1, %0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
515 "movq 8(%2, %0), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
516 "paddb %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
517 "movq %%mm1, 8(%2, %0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
518 "add $16, %0 \n\t"
7087
9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16
reimar
parents: 6763
diff changeset
519 "2: \n\t"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
520 "cmp %3, %0 \n\t"
7087
9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16
reimar
parents: 6763
diff changeset
521 " js 1b \n\t"
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
522 : "+r" (i)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
523 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
524 );
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
525 for(; i<w; i++)
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
526 dst[i+0] += src[i+0];
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
527 }
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
528
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
529 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
530 x86_reg i=0;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
531 __asm__ volatile(
7087
9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16
reimar
parents: 6763
diff changeset
532 "jmp 2f \n\t"
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
533 "1: \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
534 "movq (%2, %0), %%mm0 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
535 "movq 8(%2, %0), %%mm1 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
536 "paddb (%3, %0), %%mm0 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
537 "paddb 8(%3, %0), %%mm1 \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
538 "movq %%mm0, (%1, %0) \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
539 "movq %%mm1, 8(%1, %0) \n\t"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
540 "add $16, %0 \n\t"
7087
9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16
reimar
parents: 6763
diff changeset
541 "2: \n\t"
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
542 "cmp %4, %0 \n\t"
7087
9c0f579ebb6b Fix add_bytes_mmx and add_bytes_l2_mmx for w < 16
reimar
parents: 6763
diff changeset
543 " js 1b \n\t"
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
544 : "+r" (i)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
545 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
546 );
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
547 for(; i<w; i++)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
548 dst[i] = src1[i] + src2[i];
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
549 }
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
550
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/*
 * MMX instruction sequence shared by the H.263 loop filter functions.
 * It expands to asm template text only and must be used inside an asm
 * statement that provides these operands:
 *   %0..%3  four 8-byte memory operands p0..p3 (only read here)
 *   %4      filter strength operand; its low byte is replicated to 8 bytes
 *   %5      8-byte mask applied before the final ">> 2" (callers in this
 *           file pass ff_pb_FC; presumably 0xFC in every byte -- confirm)
 * Results are left in registers, nothing is stored:
 *   mm5 = new p0, mm3 = new p1, mm4 = new p2, mm6 = new p3
 * All of mm0..mm7 are overwritten.
 */
#define H263_LOOP_FILTER \
        /* mm0:mm1 = p0 - p3 as 16-bit words (mm7 = 0 is the unpack filler) */\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq %0, %%mm0 \n\t"\
        "movq %0, %%mm1 \n\t"\
        "movq %3, %%mm2 \n\t"\
        "movq %3, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm3, %%mm1 \n\t"\
        /* mm4:mm5 = 4*(p2 - p1) + (p0 - p3) as 16-bit words */\
        "movq %1, %%mm2 \n\t"\
        "movq %1, %%mm3 \n\t"\
        "movq %2, %%mm4 \n\t"\
        "movq %2, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        "psubw %%mm3, %%mm5 \n\t"\
        "psllw $2, %%mm4 \n\t"\
        "psllw $2, %%mm5 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm1, %%mm5 \n\t"\
        /* split into sign masks (mm6:mm7) and magnitude >> 3 (mm4:mm5) */\
        "pxor %%mm6, %%mm6 \n\t"\
        "pcmpgtw %%mm4, %%mm6 \n\t"\
        "pcmpgtw %%mm5, %%mm7 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        "pxor %%mm7, %%mm5 \n\t"\
        "psubw %%mm6, %%mm4 \n\t"\
        "psubw %%mm7, %%mm5 \n\t"\
        "psrlw $3, %%mm4 \n\t"\
        "psrlw $3, %%mm5 \n\t"\
        /* back to bytes: mm4 = magnitudes, mm6 = per-byte sign mask */\
        "packuswb %%mm5, %%mm4 \n\t"\
        "packsswb %%mm7, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        /* mm2 = low byte of %4 replicated into all 8 bytes */\
        "movd %4, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        "punpcklbw %%mm2, %%mm2 \n\t"\
        /* mm2 = min(magnitude, sat(%4 - magnitude)): rises, then falls to 0 */\
        "psubusb %%mm4, %%mm2 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "psubusb %%mm4, %%mm3 \n\t"\
        "psubb %%mm3, %%mm2 \n\t"\
        /* mm3 = p1 + signed correction, mm4 = p2 - signed correction, both  */\
        /* saturated; inverting with the sign mask around the unsigned       */\
        /* add/sub flips the direction where the sign mask is set            */\
        "movq %1, %%mm3 \n\t"\
        "movq %2, %%mm4 \n\t"\
        "pxor %%mm6, %%mm3 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        "paddusb %%mm2, %%mm3 \n\t"\
        "psubusb %%mm2, %%mm4 \n\t"\
        "pxor %%mm6, %%mm3 \n\t"\
        "pxor %%mm6, %%mm4 \n\t"\
        /* mm2 = twice the correction magnitude (saturating) */\
        "paddusb %%mm2, %%mm2 \n\t"\
        /* mm0 = |p0 - p3| as bytes, mm7 = its per-byte sign mask */\
        "packsswb %%mm1, %%mm0 \n\t"\
        "pcmpgtb %%mm0, %%mm7 \n\t"\
        "pxor %%mm7, %%mm0 \n\t"\
        "psubb %%mm7, %%mm0 \n\t"\
        /* mm1 = min(|p0 - p3|, mm2) */\
        "movq %%mm0, %%mm1 \n\t"\
        "psubusb %%mm2, %%mm0 \n\t"\
        "psubb %%mm0, %%mm1 \n\t"\
        /* per-byte >> 2 done as mask (%5) + word shift, then sign restored */\
        "pand %5, %%mm1 \n\t"\
        "psrlw $2, %%mm1 \n\t"\
        "pxor %%mm7, %%mm1 \n\t"\
        "psubb %%mm7, %%mm1 \n\t"\
        /* mm5 = p0 - outer correction, mm6 = p3 + outer correction */\
        "movq %0, %%mm5 \n\t"\
        "movq %3, %%mm6 \n\t"\
        "psubb %%mm1, %%mm5 \n\t"\
        "paddb %%mm1, %%mm6 \n\t"
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
621
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
622 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
623 if(ENABLE_ANY_H263) {
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
624 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
625
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
626 __asm__ volatile(
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
627
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
628 H263_LOOP_FILTER
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
629
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
630 "movq %%mm3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
631 "movq %%mm4, %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
632 "movq %%mm5, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
633 "movq %%mm6, %3 \n\t"
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
634 : "+m" (*(uint64_t*)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
635 "+m" (*(uint64_t*)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
636 "+m" (*(uint64_t*)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
637 "+m" (*(uint64_t*)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
638 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
639 );
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
640 }
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
641 }
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
642
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/**
 * Transpose a 4x4 block of bytes:
 *   dst[x*dst_stride + y] = src[y*src_stride + x]   for 0 <= x, y < 4.
 *
 * @param dst        top-left byte of the output block
 * @param src        top-left byte of the input block
 * @param dst_stride distance in bytes between output rows
 * @param src_stride distance in bytes between input rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    /* every 4-byte row is handed to the asm as one 32-bit memory operand */
    uint32_t *const out_row0 = (uint32_t*)(dst + 0*dst_stride);
    uint32_t *const out_row1 = (uint32_t*)(dst + 1*dst_stride);
    uint32_t *const out_row2 = (uint32_t*)(dst + 2*dst_stride);
    uint32_t *const out_row3 = (uint32_t*)(dst + 3*dst_stride);
    uint32_t *const in_row0  = (uint32_t*)(src + 0*src_stride);
    uint32_t *const in_row1  = (uint32_t*)(src + 1*src_stride);
    uint32_t *const in_row2  = (uint32_t*)(src + 2*src_stride);
    uint32_t *const in_row3  = (uint32_t*)(src + 3*src_stride);

    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        /* load the four input rows */
        "movd %4, %%mm0 \n\t"
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        /* interleave bytes of rows 0/1 and of rows 2/3 */
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        /* interleave the byte pairs: mm0 = columns 0,1  mm1 = columns 2,3 */
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        /* store each column as an output row */
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"

        : "=m" (*out_row0),
          "=m" (*out_row1),
          "=m" (*out_row2),
          "=m" (*out_row3)
        : "m" (*in_row0),
          "m" (*in_row1),
          "m" (*in_row2),
          "m" (*in_row3)
    );
}
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
671
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
672 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
673 if(ENABLE_ANY_H263) {
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
674 const int strength= ff_h263_loop_filter_strength[qscale];
6181
f3da7b2592aa Use DECLARE_ALIGNED
reimar
parents: 6135
diff changeset
675 DECLARE_ALIGNED(8, uint64_t, temp[4]);
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
676 uint8_t *btemp= (uint8_t*)temp;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
677
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
678 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
679
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
680 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
681 transpose4x4(btemp+4, src + 4*stride, 8, stride);
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
682 __asm__ volatile(
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
683 H263_LOOP_FILTER // 5 3 4 6
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
684
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
685 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
686 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
687 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
688 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
689 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
690 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
691
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
692 __asm__ volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
693 "movq %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
694 "movq %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
695 "punpcklbw %%mm3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
696 "punpcklbw %%mm6, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
697 "punpckhbw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
698 "punpckhbw %%mm6, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
699 "movq %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
700 "movq %%mm1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
701 "punpcklwd %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
702 "punpcklwd %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
703 "punpckhwd %%mm4, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
704 "punpckhwd %%mm0, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
705 "movd %%mm5, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
706 "punpckhdq %%mm5, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
707 "movd %%mm5, (%0,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
708 "movd %%mm3, (%0,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
709 "punpckhdq %%mm3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
710 "movd %%mm3, (%0,%3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
711 "movd %%mm1, (%1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
712 "punpckhdq %%mm1, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
713 "movd %%mm1, (%1,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
714 "movd %%mm6, (%1,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
715 "punpckhdq %%mm6, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
716 "movd %%mm6, (%1,%3) \n\t"
2505
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
717 :: "r" (src),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
718 "r" (src + 4*stride),
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
719 "r" ((x86_reg) stride ),
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
720 "r" ((x86_reg)(3*stride))
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
721 );
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
722 }
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
723 }
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
724
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
725 /* draw the edges of width 'w' of an image of size width, height
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
726 this mmx version can only handle w==8 || w==16 */
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
727 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
728 {
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
729 uint8_t *ptr, *last_line;
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
730 int i;
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
731
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
732 last_line = buf + (height - 1) * wrap;
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
733 /* left and right */
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
734 ptr = buf;
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
735 if(w==8)
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
736 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
737 __asm__ volatile(
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
738 "1: \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
739 "movd (%0), %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
740 "punpcklbw %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
741 "punpcklwd %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
742 "punpckldq %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
743 "movq %%mm0, -8(%0) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
744 "movq -8(%0, %2), %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
745 "punpckhbw %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
746 "punpckhwd %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
747 "punpckhdq %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
748 "movq %%mm1, (%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
749 "add %1, %0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
750 "cmp %3, %0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
751 " jb 1b \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
752 : "+r" (ptr)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
753 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
754 );
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
755 }
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
756 else
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
757 {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
758 __asm__ volatile(
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
759 "1: \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
760 "movd (%0), %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
761 "punpcklbw %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
762 "punpcklwd %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
763 "punpckldq %%mm0, %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
764 "movq %%mm0, -8(%0) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
765 "movq %%mm0, -16(%0) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
766 "movq -8(%0, %2), %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
767 "punpckhbw %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
768 "punpckhwd %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
769 "punpckhdq %%mm1, %%mm1 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
770 "movq %%mm1, (%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
771 "movq %%mm1, 8(%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
772 "add %1, %0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
773 "cmp %3, %0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
774 " jb 1b \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
775 : "+r" (ptr)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
776 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
777 );
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
778 }
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
779
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
780 for(i=0;i<w;i+=4) {
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
781 /* top and bottom (and hopefully also the corners) */
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
782 ptr= buf - (i + 1) * wrap - w;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
783 __asm__ volatile(
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
784 "1: \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
785 "movq (%1, %0), %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
786 "movq %%mm0, (%0) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
787 "movq %%mm0, (%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
788 "movq %%mm0, (%0, %2, 2) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
789 "movq %%mm0, (%0, %3) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
790 "add $8, %0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
791 "cmp %4, %0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
792 " jb 1b \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
793 : "+r" (ptr)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
794 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
795 );
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
796 ptr= last_line + (i + 1) * wrap - w;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
797 __asm__ volatile(
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
798 "1: \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
799 "movq (%1, %0), %%mm0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
800 "movq %%mm0, (%0) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
801 "movq %%mm0, (%0, %2) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
802 "movq %%mm0, (%0, %2, 2) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
803 "movq %%mm0, (%0, %3) \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
804 "add $8, %0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
805 "cmp %4, %0 \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
806 " jb 1b \n\t"
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
807 : "+r" (ptr)
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
808 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
809 );
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
810 }
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
811 }
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
812
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * PAETH(cpu, abs3) defines
 *   static void add_png_paeth_prediction_<cpu>(dst, src, top, w, bpp)
 *
 * i is initialised to -bpp so the first two loads fetch the 4 bytes at
 * dst[-bpp] and top[-bpp]; it is then advanced by bpp before the loop.
 * Each pass loads 4 bytes (movd) from top[i] and src[i], picks a predictor
 * per byte, adds it to src, masks with ff_pw_255 and stores 4 bytes to
 * dst[i]. i advances by bpp and the loop repeats while i <= w-3.
 *
 * Per byte, with a = value at dst[i-bpp] (kept in mm0 between passes),
 * b = top[i], c = top[i-bpp] (kept in mm1 between passes):
 *   pa = |c-b|, pb = |c-a|, pc = |2c-a-b|
 *   predictor = a if !(pa > min(pb,pc)), else c if pb > pc, else b.
 *
 * abs3 is an asm string fragment that has to turn mm3, mm4 and mm5 into
 * their absolute values; the code after it keeps using mm7 as the zero
 * register for unpacking, so abs3 has to leave mm7 zero.
 *
 * asm operands: %0=i, %1=dst, %2=top, %3=src, %4=bpp, %5=end, %6=ff_pw_255.
 */
813 #define PAETH(cpu, abs3)\
7460
2ced44037814 Mark add_png_paeth_prediction_* functions which are only used within this file
diego
parents: 7286
diff changeset
814 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
815 {\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
816 x86_reg i = -bpp;\
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
817 x86_reg end = w-3;\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
818 __asm__ volatile(\
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
819 "pxor %%mm7, %%mm7 \n" /* mm7 = 0, used to widen bytes to words */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
820 "movd (%1,%0), %%mm0 \n" /* a: 4 bytes at dst[-bpp] */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
821 "movd (%2,%0), %%mm1 \n" /* 4 bytes at top[-bpp], becomes c in the loop */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
822 "punpcklbw %%mm7, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
823 "punpcklbw %%mm7, %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
824 "add %4, %0 \n" /* i += bpp, i.e. i = 0 */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
825 "1: \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
826 "movq %%mm1, %%mm2 \n" /* mm2 = c (previous pass's top) */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
827 "movd (%2,%0), %%mm1 \n" /* mm1 = b = top[i] */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
828 "movq %%mm2, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
829 "punpcklbw %%mm7, %%mm1 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
830 "movq %%mm2, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
831 "psubw %%mm1, %%mm3 \n" /* mm3 = c - b */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
832 "psubw %%mm0, %%mm4 \n" /* mm4 = c - a */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
833 "movq %%mm3, %%mm5 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
834 "paddw %%mm4, %%mm5 \n" /* mm5 = 2c - a - b */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
835 abs3\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
836 "movq %%mm4, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
837 "pminsw %%mm5, %%mm6 \n" /* mm6 = min(pb, pc) */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
838 "pcmpgtw %%mm6, %%mm3 \n" /* mm3 = mask: pa > min(pb, pc) */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
839 "pcmpgtw %%mm5, %%mm4 \n" /* mm4 = mask: pb > pc */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
840 "movq %%mm4, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
841 "pand %%mm3, %%mm4 \n" /* mm4 = mask selecting c */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
842 "pandn %%mm3, %%mm6 \n" /* mm6 = mask selecting b */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
843 "pandn %%mm0, %%mm3 \n" /* mm3 = a where a is selected, else 0 */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
844 "movd (%3,%0), %%mm0 \n" /* mm0 = src[i] */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
845 "pand %%mm1, %%mm6 \n" /* mm6 = b where selected */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
846 "pand %%mm4, %%mm2 \n" /* mm2 = c where selected */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
847 "punpcklbw %%mm7, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
848 "movq %6, %%mm5 \n" /* mm5 = ff_pw_255 */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
849 "paddw %%mm6, %%mm0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
850 "paddw %%mm2, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
851 "paddw %%mm3, %%mm0 \n" /* mm0 = src + selected predictor */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
852 "pand %%mm5, %%mm0 \n" /* mask each word with ff_pw_255; mm0 is a for the next pass */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
853 "movq %%mm0, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
854 "packuswb %%mm3, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
855 "movd %%mm3, (%1,%0) \n" /* store 4 bytes to dst[i] */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
856 "add %4, %0 \n" /* i += bpp */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
857 "cmp %5, %0 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
858 "jle 1b \n" /* repeat while i <= end */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
859 :"+r"(i)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
860 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
861 "m"(ff_pw_255)\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
862 :"memory"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
863 );\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
864 }
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
865
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * ABS3_MMX2: asm fragment replacing the signed words in mm3, mm4 and mm5
 * with their absolute values, computed as max(x, 0 - x) with pmaxsw.
 * The first psubw subtracts mm5 from mm7 without clearing mm7 first, so
 * the fragment relies on mm7 being zero on entry. mm6 is overwritten and
 * mm7 is cleared again by the last instruction.
 */
866 #define ABS3_MMX2\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
867 "psubw %%mm5, %%mm7 \n" /* mm7 = -mm5 (given mm7 == 0 on entry) */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
868 "pmaxsw %%mm7, %%mm5 \n" /* mm5 = |mm5| */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
869 "pxor %%mm6, %%mm6 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
870 "pxor %%mm7, %%mm7 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
871 "psubw %%mm3, %%mm6 \n" /* mm6 = -mm3 */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
872 "psubw %%mm4, %%mm7 \n" /* mm7 = -mm4 */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
873 "pmaxsw %%mm6, %%mm3 \n" /* mm3 = |mm3| */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
874 "pmaxsw %%mm7, %%mm4 \n" /* mm4 = |mm4| */\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
875 "pxor %%mm7, %%mm7 \n" /* leave mm7 zero again */
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
876
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * ABS3_SSSE3: asm fragment replacing the signed words in mm3, mm4 and mm5
 * with their absolute values using pabsw. No other register is touched.
 */
877 #define ABS3_SSSE3\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
878 "pabsw %%mm3, %%mm3 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
879 "pabsw %%mm4, %%mm4 \n"\
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
880 "pabsw %%mm5, %%mm5 \n"
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
881
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
/*
 * Instantiate add_png_paeth_prediction_mmx2 with the pmaxsw-based abs
 * fragment and, only when HAVE_SSSE3 is defined,
 * add_png_paeth_prediction_ssse3 with the pabsw-based one.
 */
882 PAETH(mmx2, ABS3_MMX2)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
883 #ifdef HAVE_SSSE3
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
884 PAETH(ssse3, ABS3_SSSE3)
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
885 #endif
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
886
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/*
 * QPEL_V_LOW: asm fragment producing one packed output of the 4-pair
 * lowpass filter
 *     (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5
 * on 16-bit words, where
 *     x1 = m3 + m4, x2 = in2 + m5, x3 = in1 + m6, x4 = in0 + in7.
 * m3..m6 are register-name tokens and in0/in1/in2/in7/rnd/out are operand
 * tokens; all are stringified into the asm text. m3 is overwritten with
 * in7, and mm4, mm5 and mm6 are hard-coded as scratch registers.
 * The pw_20 and pw_3 parameters are accepted but never referenced: the
 * constants are loaded through MANGLE(ff_pw_20) and MANGLE(ff_pw_3).
 * The result is packed to unsigned bytes in mm5 and handed to
 * OP(%%mm5, out, %%mm7, d).
 */
887 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
888 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
889 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
890 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
891 "movq "#in7", " #m3 " \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
892 "movq "#in0", %%mm5 \n\t" /* D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
893 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
894 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
895 "movq "#in1", %%mm5 \n\t" /* C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
896 "movq "#in2", %%mm6 \n\t" /* B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
897 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
898 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
899 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
900 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
901 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
902 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
903 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
904 "psraw $5, %%mm5 \n\t" /* arithmetic shift right by 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
905 "packuswb %%mm5, %%mm5 \n\t" /* saturate words to unsigned bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
906 OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
907
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
908 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
909 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
910 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
911 \
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
912 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
913 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
914 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
915 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
916 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
917 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
918 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
919 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
920 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
921 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
922 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
923 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
924 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
925 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
926 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
927 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
928 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
929 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
930 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
931 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
932 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
933 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
934 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
935 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
936 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
937 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
938 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
939 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
940 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
941 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
942 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
943 "movq %%mm0, %5 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
944 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
945 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
946 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
947 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
948 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
949 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
950 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
951 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
952 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
953 "paddw %%mm0, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
954 "paddw %%mm5, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
955 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
956 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
957 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
958 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
959 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
960 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
961 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
962 "paddw %%mm2, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
963 "paddw %%mm6, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
964 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
965 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
966 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
967 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
968 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
969 "movq %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
970 "packuswb %%mm3, %%mm1 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
971 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
972 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
973 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
974 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
975 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
976 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
977 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
978 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
979 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
980 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
981 "paddw %%mm1, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
982 "paddw %%mm4, %%mm0 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
983 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
984 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
985 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
986 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
987 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
988 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
989 "paddw %%mm3, %%mm2 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
990 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
991 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
992 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
993 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
994 "paddw %%mm2, %%mm6 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
995 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
996 "paddw %6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
997 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
998 "psraw $5, %%mm0 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
999 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1000 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1001 "paddw %%mm5, %%mm3 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1002 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1003 "paddw %%mm4, %%mm6 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1004 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1005 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1006 "paddw %%mm1, %%mm4 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1007 "paddw %%mm2, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1008 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1009 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1010 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1011 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1012 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1013 "paddw %6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1014 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1015 "psraw $5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1016 "packuswb %%mm4, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1017 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1018 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1019 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1020 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1021 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1022 " jnz 1b \n\t"\
6513
713c4fd84e0b Hardcode register to prevent apparent miscompilation.
michael
parents: 6512
diff changeset
1023 : "+a"(src), "+c"(dst), "+D"(h)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
1024 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1025 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1026 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1027 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1028 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1029 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* Horizontal lowpass, 16 outputs per row: for each of h rows, filters src[0..16] in C, then rounds, shifts and packs the sums in asm and hands them to OP_3DNOW together with the dst row. */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1030 int i;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1031 int16_t temp[16]; /* unrounded 16-bit filter sums for the current row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1032 /* quick HACK, XXX FIXME MUST be optimized */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1033 for(i=0; i<h; i++) /* one row per iteration */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1034 {\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1035 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]); /* weights 20,-6,3,-1 on symmetric sample pairs; in temp[0..2] the taps that would fall left of src[0] are mirrored back inside */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1036 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1037 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1038 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]); /* first output whose eight taps all lie inside the row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1039 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1040 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1041 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1042 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1043 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1044 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1045 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1046 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1047 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1048 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]); /* from here on, taps that would fall right of src[16] are mirrored back inside */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1049 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1050 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1051 __asm__ volatile( /* operands: %0=temp, %1=dst, %2=ROUNDER (memory) */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1052 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1053 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1054 "paddw %2, %%mm0 \n\t" /* add the rounding constant */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1055 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1056 "psraw $5, %%mm0 \n\t" /* arithmetic >>5: the eight tap weights sum to 32 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1057 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1058 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1059 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* OP_3DNOW is defined outside this chunk; presumably stores or blends mm0 into dst[0..7] using mm1 as scratch - confirm against its definition */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1060 "movq 16(%0), %%mm0 \n\t" /* temp[8..11] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1061 "movq 24(%0), %%mm1 \n\t" /* temp[12..15] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1062 "paddw %2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1063 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1064 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1065 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1066 "packuswb %%mm1, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1067 OP_3DNOW(%%mm0, 8(%1), %%mm1, q) /* same operation on the second 8 bytes of the dst row */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1068 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* inputs only; ROUNDER comes from outside this function */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1069 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1070 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1071 dst+=dstStride; /* advance both pointers to the next row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1072 src+=srcStride;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1073 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1074 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1075 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1076 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* Horizontal lowpass, 8 outputs per row, done in a single inline-asm loop over h rows. In the comments below the letters A..I name the bytes src[0..8] of the current row, and a,b,c,d are the pair sums weighted 20,-6,3,-1. */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1077 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1078 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, interleaved with bytes to widen them to words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1079 "1: \n\t" /* top of the per-row loop */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1080 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1081 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1082 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1083 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1084 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1085 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1086 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1087 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1088 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1089 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1090 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1091 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1092 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1093 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1094 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1095 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1096 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1097 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1098 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1099 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1100 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1101 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1102 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1103 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1104 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
6512
33ac9c5524cc remove unused temp
michael
parents: 6437
diff changeset
1105 "paddw %5, %%mm6 \n\t" /* add the rounding constant (operand %5 = ROUNDER) */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1106 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1107 "psraw $5, %%mm0 \n\t" /* arithmetic >>5; mm0 now holds the first four outputs as words */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1108 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1109 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1110 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1111 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1112 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1113 "paddw %%mm5, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1114 "paddw %%mm6, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1115 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1116 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1117 "paddw %%mm6, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1118 "paddw %%mm5, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1119 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1120 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1121 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1122 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1123 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
6512
33ac9c5524cc remove unused temp
michael
parents: 6437
diff changeset
1124 "paddw %5, %%mm1 \n\t" /* add the rounding constant (operand %5 = ROUNDER) */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1125 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1126 "psraw $5, %%mm3 \n\t" /* arithmetic >>5; mm3 now holds the last four outputs as words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1127 "packuswb %%mm3, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1128 OP_MMX2(%%mm0, (%1), %%mm4, q) /* OP_MMX2 is defined outside this chunk; presumably stores or blends mm0 into the 8 bytes at dst using mm4 as scratch - confirm against its definition */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1129 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1130 "add %3, %0 \n\t" /* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1131 "add %4, %1 \n\t" /* dst += dstStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1132 "decl %2 \n\t" /* h-- */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1133 " jnz 1b \n\t" /* loop while h != 0; the test is at the bottom, so the body runs once before h is checked and h must be at least 1 on entry */\
6513
713c4fd84e0b Hardcode register to prevent aparent miscompilation.
michael
parents: 6512
diff changeset
1134 : "+a"(src), "+c"(dst), "+d"(h) /* %0=src, %1=dst, %2=h, each pinned to a fixed register (a, c, d) and updated by the asm */\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
1135 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER) /* %3=srcStride (si), %4=dstStride (di), %5=ROUNDER in memory; the two constants are reached through MANGLE() instead of operands */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1136 : "memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1137 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1138 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1139 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1140 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* Horizontal lowpass, 8 outputs per row: for each of h rows, filters src[0..8] in C, then rounds, shifts and packs the sums in asm and hands them to OP_3DNOW together with the dst row. */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1141 int i;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1142 int16_t temp[8]; /* unrounded 16-bit filter sums for the current row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1143 /* quick HACK, XXX FIXME MUST be optimized */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1144 for(i=0; i<h; i++) /* one row per iteration */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1145 {\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1146 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]); /* weights 20,-6,3,-1 on symmetric sample pairs; in temp[0..2] the taps that would fall left of src[0] are mirrored back inside */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1147 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1148 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1149 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]); /* temp[3] and temp[4] have all eight taps inside the row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1150 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1151 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]); /* from here on, taps that would fall right of src[8] are mirrored back inside */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1152 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1153 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1154 __asm__ volatile( /* operands: %0=temp, %1=dst, %2=ROUNDER (memory) */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1155 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1156 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1157 "paddw %2, %%mm0 \n\t" /* add the rounding constant */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1158 "paddw %2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1159 "psraw $5, %%mm0 \n\t" /* arithmetic >>5: the eight tap weights sum to 32 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1160 "psraw $5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1161 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1162 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* OP_3DNOW is defined outside this chunk; presumably stores or blends mm0 into dst[0..7] using mm1 as scratch - confirm against its definition */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1163 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* inputs only; ROUNDER comes from outside this function */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1164 :"memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1165 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1166 dst+=dstStride; /* advance both pointers to the next row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1167 src+=srcStride;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1168 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1169 }
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1170
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1171 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1172 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1173 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* Vertical lowpass over a 16-byte-wide block in two asm passes: pass 1 widens 17 source rows of 16 bytes to 16-bit words in temp; pass 2 walks each 4-pixel-wide column group of temp through QPEL_V_LOW and writes to dst. */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1174 uint64_t temp[17*4]; /* 4 column groups of 17 rows; each entry is one qword holding 4 pixels as words */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1175 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1176 int count= 17; /* number of source rows unpacked by pass 1 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1177 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1178 /*FIXME unroll */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1179 __asm__ volatile( /* pass 1, bytes -> words; operands: %0=src, %1=temp_ptr, %2=count, %3=srcStride */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1180 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, interleaved with bytes to widen them to words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1181 "1: \n\t" /* one source row per iteration */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1182 "movq (%0), %%mm0 \n\t" /* row bytes 0..7, loaded twice (mm0 and mm1) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1183 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1184 "movq 8(%0), %%mm2 \n\t" /* row bytes 8..15, loaded twice (mm2 and mm3) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1185 "movq 8(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1186 "punpcklbw %%mm7, %%mm0 \n\t" /* pixels 0..3 as words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1187 "punpckhbw %%mm7, %%mm1 \n\t" /* pixels 4..7 as words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1188 "punpcklbw %%mm7, %%mm2 \n\t" /* pixels 8..11 as words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1189 "punpckhbw %%mm7, %%mm3 \n\t" /* pixels 12..15 as words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1190 "movq %%mm0, (%1) \n\t" /* column group 0, current row */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1191 "movq %%mm1, 17*8(%1) \n\t" /* column group 1 starts 17 qwords after group 0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1192 "movq %%mm2, 2*17*8(%1) \n\t" /* column group 2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1193 "movq %%mm3, 3*17*8(%1) \n\t" /* column group 3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1194 "add $8, %1 \n\t" /* next row slot in temp */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1195 "add %3, %0 \n\t" /* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1196 "decl %2 \n\t" /* count-- */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1197 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1198 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
1199 : "r" ((x86_reg)srcStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1200 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1201 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1202 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1203 temp_ptr= temp; /* rewind to the start of temp for pass 2 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1204 count=4; /* one pass-2 iteration per 4-pixel-wide column group */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1205 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1206 /*FIXME reorder for speed */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1207 __asm__ volatile( /* pass 2; operands: %0=temp_ptr, %1=dst, %2=count, %3=dstStride, %4=2*dstStride, %5=ROUNDER, %6=4-14*dstStride */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1208 /*"pxor %%mm7, %%mm7 \n\t"*/ /* mm7 is not re-zeroed in this asm statement */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1209 "1: \n\t" /* one column group per iteration */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1210 "movq (%0), %%mm0 \n\t" /* preload rows 0..3 of this column group into mm0..mm3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1211 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1212 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1213 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1214 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) /* QPEL_V_LOW is defined outside this chunk; 16 invocations follow with the destination alternating between (%1) and (%1, %3), presumably one per output row - confirm against the macro */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1215 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1216 "add %4, %1 \n\t" /* dst += 2*dstStride */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1217 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1218 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1219 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1220 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1221 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1222 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1223 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1224 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1225 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1226 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1227 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1228 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1229 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1230 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1231 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1232 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1233 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1234 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1235 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP) /* the last three invocations pass 128, 120, 112 (rows 16, 15, 14 of temp) as their fourth row operand, in descending order */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1236 "add %4, %1 \n\t" \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1237 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1238 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1239 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1240 "add $136, %0 \n\t" /* 136 = 17*8 bytes: step to the next column group in temp */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1241 "add %6, %1 \n\t" /* %6 = 4-14*dstStride: undo the seven 2*dstStride steps above and move 4 bytes to the right */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1242 "decl %2 \n\t" /* count-- */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1243 " jnz 1b \n\t"\
958
9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents: 954
diff changeset
1244 \
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
1245 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0, %1, %2; count may be placed in a register or in memory */\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
1246 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride) /* %3, %4, %5, %6. NOTE(review): with the two constant operands commented out, the "%5, %6, %5" arguments passed to QPEL_V_LOW now name ROUNDER and the dst rewind offset; how the macro uses them is not visible here - confirm */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1247 :"memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1248 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1249 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1250 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1251 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
1252 uint64_t temp[9*2];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1253 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1254 int count= 9;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1255 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1256 /*FIXME unroll */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1257 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1258 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1259 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1260 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1261 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1262 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1263 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1264 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1265 "movq %%mm1, 9*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1266 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1267 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1268 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1269 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1270 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
1271 : "r" ((x86_reg)srcStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1272 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1273 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1274 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1275 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1276 count=2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1277 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1278 /*FIXME reorder for speed */\
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1279 __asm__ volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1280 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1281 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1282 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1283 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1284 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1285 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1286 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1287 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1288 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1289 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1290 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1291 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1292 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1293 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1294 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1295 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1296 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1297 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1298 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1299 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1300 "add $72, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1301 "add %6, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1302 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1303 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1304 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
1305 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
1306 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1307 : "memory"\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
1308 );\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1309 }\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1310 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1311 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
1312 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1313 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1314 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1315 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1316 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1317 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1318 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1319 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1320 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1321 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1322 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1323 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1324 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1325 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1326 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1327 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1328 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1329 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1330 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1331 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1332 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1333 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1334 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1335 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1336 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1337 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1338 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1339 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1340 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1341 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1342 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1343 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1344 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1345 uint64_t temp[8];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1346 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1347 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1348 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1349 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1350 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1351 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1352 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1353 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1354 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1355 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1356 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1357 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1358 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1359 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1360 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1361 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1362 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1363 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1364 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1365 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1366 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1367 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1368 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1369 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1370 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1371 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1372 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1373 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1374 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1375 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1376 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1377 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1378 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1379 uint8_t * const halfH= ((uint8_t*)half) + 64;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1380 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1381 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1382 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1383 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1384 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1385 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1386 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1387 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1388 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1389 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1390 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1391 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1392 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1393 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1394 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1395 uint64_t half[8 + 9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1396 uint8_t * const halfH= ((uint8_t*)half) + 64;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1397 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1398 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1399 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1400 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1401 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1402 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1403 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1404 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1405 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1406 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1407 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1408 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1409 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1410 uint64_t half[8 + 9];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1411 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1412 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1413 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1414 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1415 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1416 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1417 uint64_t half[9];\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1418 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1419 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1420 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1421 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1422 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6321
57bd93f81a14 use mmx2/3dnow avg functions in avg_qpel*_mc00
lorenm
parents: 6320
diff changeset
1423 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1424 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1425 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1426 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1427 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1428 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1429 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1430 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1431 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1432 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1433 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1434 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1435 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1436 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1437 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1438 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1439 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1440 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1441 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1442 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1443 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1444 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1445 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1446 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1447 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1448 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1449 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1450 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1451 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1452 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1453 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1454 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1455 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1456 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1457 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1458 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1459 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1460 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1461 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1462 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1463 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1464 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1465 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1466 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1467 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1468 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1469 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1470 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1471 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1472 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1473 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1474 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1475 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1476 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1477 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1478 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1479 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1480 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1481 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1482 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1483 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1484 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1485 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1486 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1487 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1488 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1489 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1490 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1491 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1492 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1493 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1494 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1495 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1496 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1497 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1498 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1499 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1500 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1501 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1502 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1503 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1504 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1505 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1506 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1507 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1508 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1509 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1510 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and committed a long time ago but apparently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1511 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1512 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1513 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1514 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1515 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1516 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and committed a long time ago but apparently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1517 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1518 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1519 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1520 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1521 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1522 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1523 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and committed a long time ago but apparently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
1524 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
1525 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1526 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
1527 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1528 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1529 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1530 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1531 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1532 }
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1533
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* Plain store op for the qpel asm templates: expands to the single
 * instruction string "mov<size> a, b". The temp argument is accepted but
 * not used. */
1534 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Averaging store op (3DNow! flavour). Expands to three instruction strings:
 * "mov<size> b, temp", "pavgusb temp, a", "mov<size> a, b" -- i.e. (AT&T
 * operand order, as used by the asm in this file) load b into temp, average
 * it into a with pavgusb, then write a back to b. */
1535 #define AVG_3DNOW_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1536 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1537 "pavgusb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1538 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Averaging store op (MMX2 flavour). Same three-instruction sequence as
 * AVG_3DNOW_OP, but the averaging instruction is pavgb instead of pavgusb:
 * "mov<size> b, temp", "pavgb temp, a", "mov<size> a, b". */
1539 #define AVG_MMX2_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1540 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1541 "pavgb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1542 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1543
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Instantiate the qpel code: QPEL_BASE once for each of put_, avg_ and
 * put_no_rnd_, then QPEL_OP for the same three operations with the 3dnow and
 * the mmx2 suffix. The avg_ variants pass the AVG_*_OP store macros, the
 * others pass PUT_OP; the no_rnd variants pass ff_pw_15 where the others
 * pass ff_pw_16.
 * NOTE(review): QPEL_BASE and the head of QPEL_OP are defined outside this
 * view, so the exact role of each argument should be confirmed there. */
1544 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1545 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1546 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1547 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1548 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1549 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1550 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1551 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1552 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1553
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1554 /***********************************/
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1555 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1556
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* Defines the function <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX>(dst, src, stride)
 * as a thin wrapper that forwards to <OPNAME>pixels<SIZE><HPEL> with SIZE
 * passed as the last argument. HPEL is the full suffix of the target
 * function (e.g. _x2_mmx2), chosen by the caller of this macro. */
1557 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1558 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1559 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1560 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* Defines the function <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX>(dst, src, stride)
 * as a wrapper around <OPNAME>2tap_qpel<SIZE>_l3_<MMX>, called with the
 * source pointer advanced by S0 and with S1 and S2 as the two trailing
 * arguments. S0/S1/S2 may reference the wrapper's own `stride` parameter.
 * NOTE(review): the _l3_ helper is defined outside this view; S1/S2 look
 * like offsets of two further source samples -- confirm there. */
1561 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1562 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1563 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1564 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1565
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* Generates the 2tap_qpel function set for one (OPNAME, SIZE, MMX) triple:
 *  - mc20 / mc02 forward to the _x2_ / _y2_ pixel functions of the same MMX
 *    flavour; mc22 forwards to the _xy2_mmx function regardless of MMX.
 *  - mc00 is an alias of the regular <OPNAME>qpel<SIZE>_mc00_<MMX>;
 *    mc21 is an alias of mc20 and mc12 an alias of mc02.
 *  - mc32 calls the _y2_ function on src+1, mc23 the _x2_ function on
 *    src+stride.
 *  - the remaining positions (10, 30, 01, 03, 11, 31, 13, 33) are built
 *    with QPEL_2TAP_L3 using the offsets listed on each line.
 * The macro body ends at the blank line following the last continuation. */
1566 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1567 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1568 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1569 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1570 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1571 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1572 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1573 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1574 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1575 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1576 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1577 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1578 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1579 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1580 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1581 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1582 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1583 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1584 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1585 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1586 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1587 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1588 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1589 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1590
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* Instantiate the 2tap_qpel function sets: put_ and avg_, block sizes 16 and
 * 8, once with the mmx2 suffix and once with the 3dnow suffix. */
1591 QPEL_2TAP(put_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1592 QPEL_2TAP(avg_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1593 QPEL_2TAP(put_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1594 QPEL_2TAP(avg_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1595 QPEL_2TAP(put_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1596 QPEL_2TAP(avg_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1597 QPEL_2TAP(put_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1598 QPEL_2TAP(avg_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1599
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
1600
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
/* Disabled code: the #if 0 guard means just_return() (an empty function
 * that returns immediately) is never compiled. */
1601 #if 0
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
1602 static void just_return() { return; }
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
1603 #endif
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
1604
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/*
 * MMX version of gmc for a block that is w = 8 pixels wide and h rows high.
 *
 * Falls back to ff_gmc_c() with the unmodified arguments when the full-pel
 * offset is not constant over the block, or when any of dxx/dxy/dyx/dyy has
 * one of its low 4 bits set. If ix/iy fail the range check against
 * width-w / height-h, the source is first copied into the local edge_buf
 * through ff_emulated_edge_mc() and read from there.
 *
 * The MMX path writes 4 output pixels per inner iteration to
 * dst[x + y*stride]. It relies on mm4..mm7 keeping their values between the
 * separate asm statements below (no clobbers are declared).
 *
 * NOTE(review): the exact fixed-point meaning of ox/oy/dxx/dxy/dyx/dyy,
 * shift and r is set by the callers and by ff_gmc_c(), both outside this
 * view -- confirm there before changing any of the shifts.
 */
1605 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1606 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1607 const int w = 8;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* full-pel part of the starting offset */
1608 const int ix = ox>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1609 const int iy = oy>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* offsets and deltas with the low 4 bits dropped; these are the values that
 * get stored into the 16-bit lanes below (see the &15 check further down) */
1610 const int oxs = ox>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1611 const int oys = oy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1612 const int dxxs = dxx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1613 const int dxys = dxy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1614 const int dyxs = dyx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1615 const int dyys = dyy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* constants replicated four times, one copy per 16-bit MMX lane */
1616 const uint16_t r4[4] = {r,r,r,r};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1617 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1618 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* 64-bit so it can be used as a memory operand of psrlw in the asm below */
1619 const uint64_t shift2 = 2*shift;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* scratch buffer for ff_emulated_edge_mc(); a variable-length array whose
 * size depends on the h and stride arguments */
1620 uint8_t edge_buf[(h+1)*stride];
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1621 int x, y;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1622
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* accumulated offset change between the first and the last column (w-1
 * steps) or row (h-1 steps); for dxx/dyy the change is taken relative to
 * 1<<(16+shift) per step */
1623 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1624 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1625 const int dxh = dxy*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1626 const int dyw = dyx*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* The XOR terms compare the starting offset with the offset at the other
 * three block corners; any difference at or above bit (16+shift) makes the
 * shifted result non-zero and selects the C fallback. */
1627 if( // non-constant fullpel offset (3% of blocks)
6196
166bef5cad01 add parenthesis, fix warning: i386/dsputil_mmx.c:2618: warning: suggest parentheses around arithmetic in operand of |
bcoudurier
parents: 6195
diff changeset
1628 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
166bef5cad01 add parenthesis, fix warning: i386/dsputil_mmx.c:2618: warning: suggest parentheses around arithmetic in operand of |
bcoudurier
parents: 6195
diff changeset
1629 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1630 // uses more than 16 bits of subpel mv (only at huge resolution)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1631 || (dxx|dxy|dyx|dyy)&15 )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1632 {
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1633 //FIXME could still use mmx for some of the rows
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1634 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1635 return;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1636 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1637
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
/* move src to the (constant) full-pel position of the block */
1638 src += ix + iy*stride;
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* The unsigned casts turn a negative ix/iy into a large value, so (for
 * non-negative width-w / height-h) negative positions take this branch
 * too. A (w+1) x (h+1) area is fetched because the filter below also reads
 * src[1], src[stride] and src[stride+1]. */
1639 if( (unsigned)ix >= width-w ||
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1640 (unsigned)iy >= height-h )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1641 {
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1642 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1643 src = edge_buf;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1644 }
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1645
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
/* Set up the loop-invariant registers: mm6 = low 16 bits of (1<<shift)
 * replicated into all four word lanes ("s" in the comments below), mm7 = 0
 * (used for byte->word unpacking). */
1646 __asm__ volatile(
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1647 "movd %0, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1648 "pxor %%mm7, %%mm7 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1649 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1650 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1651 :: "r"(1<<shift)
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1652 );
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1653
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* the block is processed in vertical strips of 4 columns */
1654 for(x=0; x<w; x+=4){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* Per-column 16-bit position accumulators for this strip. They start one
 * row step (dxys / dyys) early because the row loop adds dxy4 / dyy4 before
 * it uses them. */
1655 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1656 oxs - dxys + dxxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1657 oxs - dxys + dxxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1658 oxs - dxys + dxxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1659 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1660 oys - dyys + dyxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1661 oys - dyys + dyxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1662 oys - dyys + dyxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1663
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1664 for(y=0; y<h; y++){
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
/* Advance dx4/dy4 by one row (add dxy4/dyy4) and store them back, then
 * keep only the top 4 bits of every 16-bit lane: mm4 = dx, mm5 = dy for
 * the four pixels of this row. */
1665 __asm__ volatile(
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1666 "movq %0, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1667 "movq %1, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1668 "paddw %2, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1669 "paddw %3, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1670 "movq %%mm4, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1671 "movq %%mm5, %1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1672 "psrlw $12, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1673 "psrlw $12, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1674 : "+m"(*dx4), "+m"(*dy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1675 : "m"(*dxy4), "m"(*dyy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1676 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1677
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
/* Bilinear blend of the four neighbours src[0], src[1], src[stride] and
 * src[stride+1] (operands %1..%4) with the weights built from mm4/mm5 and
 * s = mm6. Then add r4 (%5), shift right by shift2 = 2*shift (%6), pack to
 * bytes with unsigned saturation and store 4 pixels to dst (%0). */
1678 __asm__ volatile(
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1679 "movq %%mm6, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1680 "movq %%mm6, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1681 "psubw %%mm4, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1682 "psubw %%mm5, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1683 "movq %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1684 "movq %%mm4, %%mm3 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1685 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1686 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1687 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1688 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1689
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1690 "movd %4, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1691 "movd %3, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1692 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1693 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1694 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1695 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1696
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1697 "movd %2, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1698 "movd %1, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1699 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1700 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1701 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1702 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
1703 "paddw %5, %%mm1 \n\t"
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1704 "paddw %%mm3, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1705 "paddw %%mm1, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1706 "paddw %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1707
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1708 "psrlw %6, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1709 "packuswb %%mm0, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1710 "movd %%mm0, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1711
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1712 : "=m"(dst[x+y*stride])
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1713 : "m"(src[0]), "m"(src[1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1714 "m"(src[stride]), "m"(src[stride+1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1715 "m"(*r4), "m"(shift2)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1716 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* next source row */
1717 src += stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1718 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
/* rewind the h rows just consumed and step 4 columns to the right */
1719 src += 4-h*stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1720 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1721 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
1722
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
/**
 * PREFETCH(name, op) defines a static helper
 *     void name(void *mem, int stride, int h)
 * that executes the instruction `op` on one byte per row: first on the byte
 * at mem, then on each address a further `stride` bytes along.
 *
 * The row counter is decremented and tested only after a row has been
 * handled, so the first row is always touched; h is expected to be >= 1
 * (h == 0 would not stop the loop at the first test).
 */
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h)\
{\
    const uint8_t *row = mem;\
    for (;;) {\
        __asm__ volatile(#op" %0" :: "m"(*row));\
        row += stride;\
        if (!--h)\
            break;\
    }\
}
/* One helper per prefetch mnemonic. */
PREFETCH(prefetch_mmx2, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
1734
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents: 2753
diff changeset
1735 #include "h264dsp_mmx.c"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1736
6009
ecfdc0bfb233 typo/clarification
diego
parents: 5963
diff changeset
1737 /* CAVS specific */
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1738 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
6522
dfa76e0734e5 Add missed call to ff_cavsdsp_init_3dnow() in dsputil_init_mmx()
zuxy
parents: 6513
diff changeset
1739 void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx);
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1740
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel8 mc00 "put" entry point.
 * Forwards dst, src and stride unchanged to put_pixels8_mmx(); the final
 * argument is the constant 8 (presumably the number of rows -- confirm
 * against put_pixels8_mmx()).
 */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_mmx(dst, src, stride, rows);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel8 mc00 "avg" entry point.
 * Forwards dst, src and stride unchanged to avg_pixels8_mmx(); the final
 * argument is the constant 8 (presumably the number of rows -- confirm
 * against avg_pixels8_mmx()).
 */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_mmx(dst, src, stride, rows);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel16 mc00 "put" entry point.
 * Forwards dst, src and stride unchanged to put_pixels16_mmx(); the final
 * argument is the constant 16 (presumably the number of rows -- confirm
 * against put_pixels16_mmx()).
 */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_mmx(dst, src, stride, rows);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
/**
 * CAVS qpel16 mc00 "avg" entry point.
 * Forwards dst, src and stride unchanged to avg_pixels16_mmx(); the final
 * argument is the constant 16 (presumably the number of rows -- confirm
 * against avg_pixels16_mmx()).
 */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_mmx(dst, src, stride, rows);
}
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
1753
5948
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1754 /* VC1 specific */
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1755 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1756
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
/**
 * VC-1 mspel mc00 "put" entry point.
 * Forwards dst, src and stride to put_pixels8_mmx() with a final argument
 * of 8 (presumably the number of rows -- confirm against put_pixels8_mmx()).
 *
 * @param rnd  accepted for signature compatibility only; it is not read.
 */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd)
{
    const int rows = 8;

    put_pixels8_mmx(dst, src, stride, rows);
}
db875a610973 build vc1dsp_mmx.c in its own compilation unit
aurel
parents: 5947
diff changeset
1760
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1761 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1762 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1763 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1764
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1765 /* XXX: those functions should be suppressed ASAP when all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1766 converted */
4020
723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents: 4001
diff changeset
1767 #ifdef CONFIG_GPL
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1768 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1769 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1770 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1771 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1772 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1773 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1774 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1775 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1776 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1777 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1778 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1779 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1780 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1781 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1782 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1783 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1784 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1785 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1786 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
1787 }
4020
723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents: 4001
diff changeset
1788 #endif
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1789 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1790 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1791 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1792 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1793 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1794 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1795 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1796 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1797 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1798 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1799 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1800 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1801 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1802 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1803 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1804 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1805 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1806 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1807 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
1808 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1809
3541
3fbddeb13686 10l, vorbis_inverse_coupling_sse() was really 3dnow
lorenm
parents: 3536
diff changeset
/**
 * Vorbis inverse channel coupling, 3DNow! version.
 *
 * Rewrites mag[] and ang[] in place, two floats from each array per loop
 * iteration (one 64-bit MMX register each). For every element, with
 * m = mag[k] and a = ang[k] as loaded:
 *
 *     a'     = a with its sign bit flipped where m >= 0, else a
 *     ang[k] = m + ((a >= 0) ? a' : 0)
 *     mag[k] = m - ((a >= 0) ? 0  : a')
 *
 * The index advances by 2 per iteration, so an odd blocksize would touch
 * one element past blocksize-1 -- callers presumably pass an even count
 * (not checked here).
 *
 * mm7 is zeroed in a separate asm statement and read inside the loop; no
 * clobber list announces this to the compiler.
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int pos = 0;

    /* mm7 = 0.0, the reference value for both compares below */
    __asm__ volatile("pxor %%mm7, %%mm7":);

    while (pos < blocksize) {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"          /* mm0 = m                          */
            "movq %1, %%mm1 \n\t"          /* mm1 = a                          */
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t"    /* mm2 = (m >= 0.0) ? ~0 : 0        */
            "pfcmpge %%mm7, %%mm3 \n\t"    /* mm3 = (a >= 0.0) ? ~0 : 0        */
            "pslld $31, %%mm2 \n\t"        /* keep only the sign-bit position  */
            "pxor %%mm2, %%mm1 \n\t"       /* mm1 = a' (sign flipped if m>=0)  */
            "movq %%mm3, %%mm4 \n\t"
            "pand %%mm1, %%mm3 \n\t"       /* mm3 = a' where a >= 0, else 0    */
            "pandn %%mm1, %%mm4 \n\t"      /* mm4 = a' where a <  0, else 0    */
            "pfadd %%mm0, %%mm3 \n\t"      /* new ang = m + mm3                */
            "pfsub %%mm4, %%mm0 \n\t"      /* new mag = m - mm4                */
            "movq %%mm3, %1 \n\t"
            "movq %%mm0, %0 \n\t"
            :"+m"(mag[pos]), "+m"(ang[pos])
            ::"memory"
        );
        pos += 2;
    }

    /* leave MMX/3DNow! state */
    __asm__ volatile("femms");
}
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1837 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1838 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1839 int i;
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1840
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1841 __asm__ volatile(
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1842 "movaps %0, %%xmm5 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1843 ::"m"(ff_pdw_80000000[0])
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1844 );
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1845 for(i=0; i<blocksize; i+=4) {
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1846 __asm__ volatile(
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1847 "movaps %0, %%xmm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1848 "movaps %1, %%xmm1 \n\t"
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1849 "xorps %%xmm2, %%xmm2 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1850 "xorps %%xmm3, %%xmm3 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1851 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1852 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1853 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1854 "xorps %%xmm2, %%xmm1 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1855 "movaps %%xmm3, %%xmm4 \n\t"
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1856 "andps %%xmm1, %%xmm3 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
1857 "andnps %%xmm1, %%xmm4 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1858 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1859 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1860 "movaps %%xmm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1861 "movaps %%xmm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1862 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1863 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1864 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1865 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1866 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
1867
7563
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
/*
 * Compile-time selectors for optional fragments inside the downmix macros:
 * IF1(code) expands to its argument, IF0(code) expands to nothing.
 */
#define IF1(code) code
#define IF0(code)
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1870
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
/**
 * MIX5(mono, stereo): SSE downmix kernel for five input channels.
 * `mono` and `stereo` are each IF1 or IF0 and select the optional lines.
 *
 * Expands to one asm statement that uses the names i, samples, len and
 * matrix from the expansion site:
 *   %0 = i                 running byte offset; 16 is added per iteration
 *                          and the loop repeats while the result is negative
 *   %1 = samples[0]+len    base address, channels lie 0x400 bytes apart
 *   %2 = matrix            coefficients are read at byte offsets 0, 8 and 24
 *
 * The three coefficients are broadcast into xmm5, xmm6 and xmm7. Per group
 * of four samples:
 *   xmm0 = ch0*xmm5, xmm1 = ch1*xmm6, xmm2 = ch2*xmm5,
 *   xmm3 = ch3*xmm7, xmm4 = ch4*xmm7
 *   stereo: out0 = xmm0+xmm1+xmm3 (stored at ch0),
 *           out1 = xmm2+xmm1+xmm4 (stored at ch1)
 *   mono:   out0 = xmm0+xmm1+xmm2+xmm3+xmm4 (stored at ch0)
 *
 * movaps is used throughout, so the sample addresses must be 16-byte
 * aligned. The xmm registers are not listed as clobbers.
 */
#define MIX5(mono,stereo)\
    __asm__ volatile(\
        "movss 0(%2), %%xmm5 \n"\
        "movss 8(%2), %%xmm6 \n"\
        "movss 24(%2), %%xmm7 \n"\
        "shufps $0, %%xmm5, %%xmm5 \n"\
        "shufps $0, %%xmm6, %%xmm6 \n"\
        "shufps $0, %%xmm7, %%xmm7 \n"\
        "1: \n"\
        "movaps (%0,%1), %%xmm0 \n"\
        "movaps 0x400(%0,%1), %%xmm1 \n"\
        "movaps 0x800(%0,%1), %%xmm2 \n"\
        "movaps 0xc00(%0,%1), %%xmm3 \n"\
        "movaps 0x1000(%0,%1), %%xmm4 \n"\
        "mulps %%xmm5, %%xmm0 \n"\
        "mulps %%xmm6, %%xmm1 \n"\
        "mulps %%xmm5, %%xmm2 \n"\
        "mulps %%xmm7, %%xmm3 \n"\
        "mulps %%xmm7, %%xmm4 \n"\
 stereo("addps %%xmm1, %%xmm0 \n")\
        "addps %%xmm1, %%xmm2 \n"\
        "addps %%xmm3, %%xmm0 \n"\
        "addps %%xmm4, %%xmm2 \n"\
   mono("addps %%xmm2, %%xmm0 \n")\
        "movaps %%xmm0, (%0,%1) \n"\
 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i)\
        :"r"(samples[0]+len), "r"(matrix)\
        :"memory"\
    );
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1903
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
/**
 * MIX_MISC(stereo): generic SSE downmix kernel for any channel count.
 * `stereo` is IF1 (two outputs) or IF0 (one output).
 *
 * Expands to one asm statement that uses the names i, j, k, samples, len,
 * matrix_simd and in_ch from the expansion site:
 *   %0 = i                      running byte offset; +16 per outer pass,
 *                               repeats while the result is negative
 *   %1 = j                      scratch: address of the current extra channel
 *   %2 = k                      scratch: coefficient offset, starts at %5 and
 *                               rises by 32 until it is no longer negative
 *   %3 = samples[0]+len         base address, channels lie 1024 bytes apart
 *   %4 = matrix_simd+in_ch      one past the coefficient table, indexed
 *                               with the negative offset in %2
 *   %5 = -32*(in_ch-1)
 *
 * xmm6 (and xmm7 for stereo) are NOT loaded here: they must already hold the
 * coefficients for channel 0, left behind by an earlier asm statement.
 * Channel 0 is scaled by them; each further channel is scaled by the 16-byte
 * entries at (%4,%2) and 16(%4,%2) and accumulated. Results are stored at
 * channel 0 and, for stereo, at channel 1.
 *
 * The inner loop body runs before its counter is tested, so in_ch is
 * presumably >= 2 here (with in_ch == 1 it would read past matrix_simd).
 */
#define MIX_MISC(stereo)\
    __asm__ volatile(\
        "1: \n"\
        "movaps (%3,%0), %%xmm0 \n"\
 stereo("movaps %%xmm0, %%xmm1 \n")\
        "mulps %%xmm6, %%xmm0 \n"\
 stereo("mulps %%xmm7, %%xmm1 \n")\
        "lea 1024(%3,%0), %1 \n"\
        "mov %5, %2 \n"\
        "2: \n"\
        "movaps (%1), %%xmm2 \n"\
 stereo("movaps %%xmm2, %%xmm3 \n")\
        "mulps (%4,%2), %%xmm2 \n"\
 stereo("mulps 16(%4,%2), %%xmm3 \n")\
        "addps %%xmm2, %%xmm0 \n"\
 stereo("addps %%xmm3, %%xmm1 \n")\
        "add $1024, %1 \n"\
        "add $32, %2 \n"\
        "jl 2b \n"\
        "movaps %%xmm0, (%3,%0) \n"\
 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i), "=&r"(j), "=&r"(k)\
        :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
        :"memory"\
    );
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1931
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1932 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1933 {
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1934 int (*matrix_cmp)[2] = (int(*)[2])matrix;
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1935 intptr_t i,j,k;
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1936
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1937 i = -len*sizeof(float);
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1938 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1939 MIX5(IF0,IF1);
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1940 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1941 MIX5(IF1,IF0);
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1942 } else {
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1943 DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]);
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1944 j = 2*in_ch*sizeof(float);
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1945 __asm__ volatile(
7563
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1946 "1: \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1947 "sub $8, %0 \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1948 "movss (%2,%0), %%xmm6 \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1949 "movss 4(%2,%0), %%xmm7 \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1950 "shufps $0, %%xmm6, %%xmm6 \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1951 "shufps $0, %%xmm7, %%xmm7 \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1952 "movaps %%xmm6, (%1,%0,4) \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1953 "movaps %%xmm7, 16(%1,%0,4) \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1954 "jg 1b \n"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1955 :"+&r"(j)
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1956 :"r"(matrix_simd), "r"(matrix)
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1957 :"memory"
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1958 );
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1959 if(out_ch == 2) {
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1960 MIX_MISC(IF1);
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1961 } else {
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1962 MIX_MISC(IF0);
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1963 }
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1964 }
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1965 }
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
1966
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
1967 static void vector_fmul_3dnow(float *dst, const float *src, int len){
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
1968 x86_reg i = (len-4)*4;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1969 __asm__ volatile(
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1970 "1: \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1971 "movq (%1,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1972 "movq 8(%1,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1973 "pfmul (%2,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1974 "pfmul 8(%2,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1975 "movq %%mm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1976 "movq %%mm1, 8(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1977 "sub $16, %0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1978 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1979 "femms \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1980 :"+r"(i)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1981 :"r"(dst), "r"(src)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1982 :"memory"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1983 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
1984 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
1985 static void vector_fmul_sse(float *dst, const float *src, int len){
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
1986 x86_reg i = (len-8)*4;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
1987 __asm__ volatile(
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1988 "1: \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1989 "movaps (%1,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1990 "movaps 16(%1,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1991 "mulps (%2,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1992 "mulps 16(%2,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1993 "movaps %%xmm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1994 "movaps %%xmm1, 16(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1995 "sub $32, %0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1996 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1997 :"+r"(i)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1998 :"r"(dst), "r"(src)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
1999 :"memory"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2000 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2001 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2002
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2003 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
2004 x86_reg i = len*4-16;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2005 __asm__ volatile(
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2006 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2007 "pswapd 8(%1), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2008 "pswapd (%1), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2009 "pfmul (%3,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2010 "pfmul 8(%3,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2011 "movq %%mm0, (%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2012 "movq %%mm1, 8(%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2013 "add $16, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2014 "sub $16, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2015 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2016 :"+r"(i), "+r"(src1)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2017 :"r"(dst), "r"(src0)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2018 );
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2019 __asm__ volatile("femms");
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2020 }
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2021 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
2022 x86_reg i = len*4-32;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2023 __asm__ volatile(
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2024 "1: \n\t"
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2025 "movaps 16(%1), %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2026 "movaps (%1), %%xmm1 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2027 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2028 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2029 "mulps (%3,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2030 "mulps 16(%3,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2031 "movaps %%xmm0, (%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2032 "movaps %%xmm1, 16(%2,%0) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2033 "add $32, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2034 "sub $32, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2035 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2036 :"+r"(i), "+r"(src1)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2037 :"r"(dst), "r"(src0)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2038 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2039 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2040
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2041 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2042 const float *src2, int src3, int len, int step){
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
2043 x86_reg i = (len-4)*4;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2044 if(step == 2 && src3 == 0){
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2045 dst += (len-4)*2;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2046 __asm__ volatile(
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2047 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2048 "movq (%2,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2049 "movq 8(%2,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2050 "pfmul (%3,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2051 "pfmul 8(%3,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2052 "pfadd (%4,%0), %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2053 "pfadd 8(%4,%0), %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2054 "movd %%mm0, (%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2055 "movd %%mm1, 16(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2056 "psrlq $32, %%mm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2057 "psrlq $32, %%mm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2058 "movd %%mm0, 8(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2059 "movd %%mm1, 24(%1) \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2060 "sub $32, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2061 "sub $16, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2062 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2063 :"+r"(i), "+r"(dst)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2064 :"r"(src0), "r"(src1), "r"(src2)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2065 :"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2066 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2067 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2068 else if(step == 1 && src3 == 0){
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2069 __asm__ volatile(
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2070 "1: \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2071 "movq (%2,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2072 "movq 8(%2,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2073 "pfmul (%3,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2074 "pfmul 8(%3,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2075 "pfadd (%4,%0), %%mm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2076 "pfadd 8(%4,%0), %%mm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2077 "movq %%mm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2078 "movq %%mm1, 8(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2079 "sub $16, %0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2080 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2081 :"+r"(i)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2082 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2083 :"memory"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2084 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2085 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2086 else
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2087 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2088 __asm__ volatile("femms");
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2089 }
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2090 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2091 const float *src2, int src3, int len, int step){
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 6601
diff changeset
2092 x86_reg i = (len-8)*4;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2093 if(step == 2 && src3 == 0){
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2094 dst += (len-8)*2;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2095 __asm__ volatile(
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2096 "1: \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2097 "movaps (%2,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2098 "movaps 16(%2,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2099 "mulps (%3,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2100 "mulps 16(%3,%0), %%xmm1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2101 "addps (%4,%0), %%xmm0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2102 "addps 16(%4,%0), %%xmm1 \n\t"
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2103 "movss %%xmm0, (%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2104 "movss %%xmm1, 32(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2105 "movhlps %%xmm0, %%xmm2 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2106 "movhlps %%xmm1, %%xmm3 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2107 "movss %%xmm2, 16(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2108 "movss %%xmm3, 48(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2109 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2110 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2111 "movss %%xmm0, 8(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2112 "movss %%xmm1, 40(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2113 "movhlps %%xmm0, %%xmm2 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2114 "movhlps %%xmm1, %%xmm3 \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2115 "movss %%xmm2, 24(%1) \n\t"
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2116 "movss %%xmm3, 56(%1) \n\t"
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2117 "sub $64, %1 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2118 "sub $32, %0 \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2119 "jge 1b \n\t"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2120 :"+r"(i), "+r"(dst)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2121 :"r"(src0), "r"(src1), "r"(src2)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2122 :"memory"
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2123 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2124 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2125 else if(step == 1 && src3 == 0){
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2126 __asm__ volatile(
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2127 "1: \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2128 "movaps (%2,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2129 "movaps 16(%2,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2130 "mulps (%3,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2131 "mulps 16(%3,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2132 "addps (%4,%0), %%xmm0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2133 "addps 16(%4,%0), %%xmm1 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2134 "movaps %%xmm0, (%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2135 "movaps %%xmm1, 16(%1,%0) \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2136 "sub $32, %0 \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2137 "jge 1b \n\t"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2138 :"+r"(i)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2139 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2140 :"memory"
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2141 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2142 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2143 else
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2144 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2145 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2146
7263
fc843d00867c exploit mdct symmetry
lorenm
parents: 7261
diff changeset
/**
 * Windowed overlap of two blocks, using 3DNow! plus the pswapd extension.
 *
 * With i running from -len up to -1 and j = -1-i, the asm path computes
 *   dst[len+i] = src0[len+i]*win[len+j] - src1[j]*win[len+i]
 *   dst[len+j] = src0[len+i]*win[len+i] + src1[j]*win[len+j]
 * i.e. it reads len floats from src0 and src1, 2*len floats from win, and
 * writes 2*len floats to dst.  Two values of i are handled per iteration,
 * so len has to be a multiple of 2.
 *
 * The asm path is used only when HAVE_6REGS is defined and add_bias == 0;
 * every other case is handed to ff_vector_fmul_window_c().
 *
 * @param dst      output, 2*len floats
 * @param src0     first input block, len floats
 * @param src1     second input block, len floats
 * @param win      window, 2*len floats
 * @param add_bias bias; must be 0 for the asm path
 * @param len      half the window length, in floats
 */
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
                                      const float *win, float add_bias, int len){
#ifdef HAVE_6REGS
    if(add_bias == 0){
        x86_reg i = -len*4;  /* ascending byte offset, relative to the midpoint */
        x86_reg j = len*4-8; /* descending byte offset of the mirrored pair */
        __asm__ volatile(
            "1:                             \n"
            "pswapd  (%5,%1), %%mm1         \n"
            "movq    (%5,%0), %%mm0         \n"
            "pswapd  (%4,%1), %%mm5         \n"
            "movq    (%3,%0), %%mm4         \n"
            "movq      %%mm0, %%mm2         \n"
            "movq      %%mm1, %%mm3         \n"
            "pfmul     %%mm4, %%mm2         \n" // src0[len+i]*win[len+i]
            "pfmul     %%mm5, %%mm3         \n" // src1[    j]*win[len+j]
            "pfmul     %%mm4, %%mm1         \n" // src0[len+i]*win[len+j]
            "pfmul     %%mm5, %%mm0         \n" // src1[    j]*win[len+i]
            "pfadd     %%mm3, %%mm2         \n"
            "pfsub     %%mm0, %%mm1         \n"
            "pswapd    %%mm2, %%mm2         \n"
            "movq      %%mm1, (%2,%0)       \n"
            "movq      %%mm2, (%2,%1)       \n"
            "sub $8, %1                     \n"
            "add $8, %0                     \n"
            "jl 1b                          \n"
            "femms                          \n"
            :"+r"(i), "+r"(j)
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
            /* The asm reads the inputs and stores to dst through plain
             * register operands; without this clobber the compiler is free
             * to assume memory is untouched and to cache or reorder
             * accesses around it. */
            :"memory"
        );
    }else
#endif
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
}
fc843d00867c exploit mdct symmetry
lorenm
parents: 7261
diff changeset
2181
7261
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
/**
 * Windowed overlap of two blocks, using SSE.
 *
 * With i running from -len up to -1 and j = -1-i, the asm path computes
 *   dst[len+i] = src0[len+i]*win[len+j] - src1[j]*win[len+i]
 *   dst[len+j] = src0[len+i]*win[len+i] + src1[j]*win[len+j]
 * i.e. it reads len floats from src0 and src1, 2*len floats from win, and
 * writes 2*len floats to dst.  Four values of i are handled per iteration,
 * so len has to be a multiple of 4.  All accesses use movaps, which
 * requires 16-byte aligned addresses.
 *
 * The asm path is used only when HAVE_6REGS is defined and add_bias == 0;
 * every other case is handed to ff_vector_fmul_window_c().
 *
 * @param dst      output, 2*len floats
 * @param src0     first input block, len floats
 * @param src1     second input block, len floats
 * @param win      window, 2*len floats
 * @param add_bias bias; must be 0 for the asm path
 * @param len      half the window length, in floats
 */
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                   const float *win, float add_bias, int len){
#ifdef HAVE_6REGS
    if(add_bias == 0){
        x86_reg i = -len*4;   /* ascending byte offset, relative to the midpoint */
        x86_reg j = len*4-16; /* descending byte offset of the mirrored group */
        __asm__ volatile(
            "1:                             \n"
            "movaps       (%5,%1), %%xmm1   \n"
            "movaps       (%5,%0), %%xmm0   \n"
            "movaps       (%4,%1), %%xmm5   \n"
            "movaps       (%3,%0), %%xmm4   \n"
            "shufps $0x1b, %%xmm1, %%xmm1   \n"
            "shufps $0x1b, %%xmm5, %%xmm5   \n"
            "movaps        %%xmm0, %%xmm2   \n"
            "movaps        %%xmm1, %%xmm3   \n"
            "mulps         %%xmm4, %%xmm2   \n" // src0[len+i]*win[len+i]
            "mulps         %%xmm5, %%xmm3   \n" // src1[    j]*win[len+j]
            "mulps         %%xmm4, %%xmm1   \n" // src0[len+i]*win[len+j]
            "mulps         %%xmm5, %%xmm0   \n" // src1[    j]*win[len+i]
            "addps         %%xmm3, %%xmm2   \n"
            "subps         %%xmm0, %%xmm1   \n"
            "shufps $0x1b, %%xmm2, %%xmm2   \n"
            "movaps        %%xmm1, (%2,%0)  \n"
            "movaps        %%xmm2, (%2,%1)  \n"
            "sub $16, %1                    \n"
            "add $16, %0                    \n"
            "jl 1b                          \n"
            :"+r"(i), "+r"(j)
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
            /* The asm reads the inputs and stores to dst through plain
             * register operands; without this clobber the compiler is free
             * to assume memory is untouched and to cache or reorder
             * accesses around it. */
            :"memory"
        );
    }else
#endif
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
}
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2217
7564
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2218 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2219 {
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2220 x86_reg i = -4*len;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2221 __asm__ volatile(
7564
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2222 "movss %3, %%xmm4 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2223 "shufps $0, %%xmm4, %%xmm4 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2224 "1: \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2225 "cvtpi2ps (%2,%0), %%xmm0 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2226 "cvtpi2ps 8(%2,%0), %%xmm1 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2227 "cvtpi2ps 16(%2,%0), %%xmm2 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2228 "cvtpi2ps 24(%2,%0), %%xmm3 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2229 "movlhps %%xmm1, %%xmm0 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2230 "movlhps %%xmm3, %%xmm2 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2231 "mulps %%xmm4, %%xmm0 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2232 "mulps %%xmm4, %%xmm2 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2233 "movaps %%xmm0, (%1,%0) \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2234 "movaps %%xmm2, 16(%1,%0) \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2235 "add $32, %0 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2236 "jl 1b \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2237 :"+r"(i)
7567
75841957d08b gcc chokes on xmm constraints, so pessimize int32_to_float_fmul_scalar_sse a little
lorenm
parents: 7565
diff changeset
2238 :"r"(dst+len), "r"(src+len), "m"(mul)
7564
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2239 );
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2240 }
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2241
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2242 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2243 {
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2244 x86_reg i = -4*len;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2245 __asm__ volatile(
7564
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2246 "movss %3, %%xmm4 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2247 "shufps $0, %%xmm4, %%xmm4 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2248 "1: \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2249 "cvtdq2ps (%2,%0), %%xmm0 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2250 "cvtdq2ps 16(%2,%0), %%xmm1 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2251 "mulps %%xmm4, %%xmm0 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2252 "mulps %%xmm4, %%xmm1 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2253 "movaps %%xmm0, (%1,%0) \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2254 "movaps %%xmm1, 16(%1,%0) \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2255 "add $32, %0 \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2256 "jl 1b \n"
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2257 :"+r"(i)
7567
75841957d08b gcc chokes on xmm constraints, so pessimize int32_to_float_fmul_scalar_sse a little
lorenm
parents: 7565
diff changeset
2258 :"r"(dst+len), "r"(src+len), "m"(mul)
7564
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2259 );
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2260 }
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2261
7218
7f3d6509628b Fix x86-64
michael
parents: 7217
diff changeset
2262 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
8041
24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions.
reimar
parents: 8035
diff changeset
2263 x86_reg reglen = len;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2264 // not bit-exact: pf2id uses different rounding than C and SSE
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2265 __asm__ volatile(
7217
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2266 "add %0 , %0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2267 "lea (%2,%0,2) , %2 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2268 "add %0 , %1 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2269 "neg %0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2270 "1: \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2271 "pf2id (%2,%0,2) , %%mm0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2272 "pf2id 8(%2,%0,2) , %%mm1 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2273 "pf2id 16(%2,%0,2) , %%mm2 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2274 "pf2id 24(%2,%0,2) , %%mm3 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2275 "packssdw %%mm1 , %%mm0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2276 "packssdw %%mm3 , %%mm2 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2277 "movq %%mm0 , (%1,%0) \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2278 "movq %%mm2 , 8(%1,%0) \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2279 "add $16 , %0 \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2280 " js 1b \n\t"
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2281 "femms \n\t"
8041
24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions.
reimar
parents: 8035
diff changeset
2282 :"+r"(reglen), "+r"(dst), "+r"(src)
7217
726770da3234 dont use C-asm loops and unroll once float_to_int16_3dnow()
michael
parents: 7087
diff changeset
2283 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2284 }
7218
7f3d6509628b Fix x86-64
michael
parents: 7217
diff changeset
2285 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
8041
24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions.
reimar
parents: 8035
diff changeset
2286 x86_reg reglen = len;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2287 __asm__ volatile(
7219
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2288 "add %0 , %0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2289 "lea (%2,%0,2) , %2 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2290 "add %0 , %1 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2291 "neg %0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2292 "1: \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2293 "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2294 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2295 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2296 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2297 "packssdw %%mm1 , %%mm0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2298 "packssdw %%mm3 , %%mm2 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2299 "movq %%mm0 , (%1,%0) \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2300 "movq %%mm2 , 8(%1,%0) \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2301 "add $16 , %0 \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2302 " js 1b \n\t"
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2303 "emms \n\t"
8041
24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions.
reimar
parents: 8035
diff changeset
2304 :"+r"(reglen), "+r"(dst), "+r"(src)
7219
f72ef5b28253 1 c-asm loop less and 1x unroll of float_to_int16_sse()
michael
parents: 7218
diff changeset
2305 );
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2306 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2307
7226
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2308 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
8041
24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions.
reimar
parents: 8035
diff changeset
2309 x86_reg reglen = len;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2310 __asm__ volatile(
7226
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2311 "add %0 , %0 \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2312 "lea (%2,%0,2) , %2 \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2313 "add %0 , %1 \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2314 "neg %0 \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2315 "1: \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2316 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2317 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2318 "packssdw %%xmm1 , %%xmm0 \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2319 "movdqa %%xmm0 , (%1,%0) \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2320 "add $16 , %0 \n\t"
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2321 " js 1b \n\t"
8041
24761747ac3d Use x86_reg type instead of long in float_to_int16 MMX/SSE functions.
reimar
parents: 8035
diff changeset
2322 :"+r"(reglen), "+r"(dst), "+r"(src)
7226
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2323 );
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2324 }
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2325
7568
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm
lorenm
parents: 7567
diff changeset
/*
 * 6-channel interleaving converters.
 * With yasm available they are external symbols; otherwise the names are
 * mapped onto the generic per-channel fallback with channels fixed to 6.
 */
#if defined(HAVE_YASM)
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
#else
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
/* The SSE2 name is an alias of the SSE one in both configurations. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
7565
474c7ae4b431 special case 6 channel version of float_to_int16_interleave
lorenm
parents: 7564
diff changeset
2336
7261
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
/**
 * Generate the interleaving float -> int16 converters for one CPU flavour.
 *
 * Expands to two functions:
 *  - float_to_int16_interleave_misc_<cpu>(): generic path; converts each
 *    channel into a temporary buffer with float_to_int16_<cpu>() and then
 *    scatters it into dst with a stride of `channels`.
 *  - float_to_int16_interleave_<cpu>(): dispatcher; mono goes straight to
 *    float_to_int16_<cpu>(), stereo uses the inline asm loop given by
 *    `body`, 6 channels use ff_float_to_int16_interleave6_<cpu>(), and
 *    everything else uses the generic path.
 *
 * @param cpu  suffix selecting the float_to_int16_<cpu> /
 *             ff_float_to_int16_interleave6_<cpu> variants
 * @param body asm text of the stereo loop. Operands: %0 = negative byte
 *             offset counted up to zero, %1 = end of dst, %2 = end of
 *             src[0], %3 = end of src[1]. The offset is len*4 bytes and is
 *             applied unchanged to all three pointers.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED_16(int16_t, tmp[len]);\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        /* convert one planar channel, then scatter it into its slots */\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"  /* reglen = 4*len bytes */\
            "add %0, %1 \n"  /* dst  -> end of output */\
            "add %0, %2 \n"  /* src0 -> end of input  */\
            "add %0, %3 \n"  /* src1 -> end of input  */\
            "neg %0 \n"      /* negative offset counted up to zero */\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
            :\
            /* dst is stored to without being a memory operand */\
            :"memory"\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2370
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2371 FLOAT_TO_INT16_INTERLEAVE(3dnow,
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2372 "1: \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2373 "pf2id (%2,%0), %%mm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2374 "pf2id 8(%2,%0), %%mm1 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2375 "pf2id (%3,%0), %%mm2 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2376 "pf2id 8(%3,%0), %%mm3 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2377 "packssdw %%mm1, %%mm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2378 "packssdw %%mm3, %%mm2 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2379 "movq %%mm0, %%mm1 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2380 "punpcklwd %%mm2, %%mm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2381 "punpckhwd %%mm2, %%mm1 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2382 "movq %%mm0, (%1,%0)\n"
7278
6c140c15ee8c 10l, float_to_int16_interleave_sse/3dnow wrote the wrong samples
lorenm
parents: 7263
diff changeset
2383 "movq %%mm1, 8(%1,%0)\n"
7261
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2384 "add $16, %0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2385 "js 1b \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2386 "femms \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2387 )
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2388
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2389 FLOAT_TO_INT16_INTERLEAVE(sse,
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2390 "1: \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2391 "cvtps2pi (%2,%0), %%mm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2392 "cvtps2pi 8(%2,%0), %%mm1 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2393 "cvtps2pi (%3,%0), %%mm2 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2394 "cvtps2pi 8(%3,%0), %%mm3 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2395 "packssdw %%mm1, %%mm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2396 "packssdw %%mm3, %%mm2 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2397 "movq %%mm0, %%mm1 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2398 "punpcklwd %%mm2, %%mm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2399 "punpckhwd %%mm2, %%mm1 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2400 "movq %%mm0, (%1,%0)\n"
7278
6c140c15ee8c 10l, float_to_int16_interleave_sse/3dnow wrote the wrong samples
lorenm
parents: 7263
diff changeset
2401 "movq %%mm1, 8(%1,%0)\n"
7261
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2402 "add $16, %0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2403 "js 1b \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2404 "emms \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2405 )
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2406
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2407 FLOAT_TO_INT16_INTERLEAVE(sse2,
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2408 "1: \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2409 "cvtps2dq (%2,%0), %%xmm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2410 "cvtps2dq (%3,%0), %%xmm1 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2411 "packssdw %%xmm1, %%xmm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2412 "movhlps %%xmm0, %%xmm1 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2413 "punpcklwd %%xmm1, %%xmm0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2414 "movdqa %%xmm0, (%1,%0) \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2415 "add $16, %0 \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2416 "js 1b \n"
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2417 )
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2418
7568
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm
lorenm
parents: 7567
diff changeset
/**
 * Interleaving float -> int16 conversion for the "3dn2" CPU flavour.
 *
 * Only the 6-channel case has a dedicated routine; every other channel
 * count is forwarded to the plain 3DNow! implementation.
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels!=6){
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm
lorenm
parents: 7567
diff changeset
2425
7261
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2426
8250
cf4d575b1982 Delete unnecessary 'extern' keywords.
diego
parents: 8104
diff changeset
2427 void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
cf4d575b1982 Delete unnecessary 'extern' keywords.
diego
parents: 8104
diff changeset
2428 void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
cf4d575b1982 Delete unnecessary 'extern' keywords.
diego
parents: 8104
diff changeset
2429 void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
cf4d575b1982 Delete unnecessary 'extern' keywords.
diego
parents: 8104
diff changeset
2430 void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
cf4d575b1982 Delete unnecessary 'extern' keywords.
diego
parents: 8104
diff changeset
2431 void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
cf4d575b1982 Delete unnecessary 'extern' keywords.
diego
parents: 8104
diff changeset
2432 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
cf4d575b1982 Delete unnecessary 'extern' keywords.
diego
parents: 8104
diff changeset
2433 void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
cf4d575b1982 Delete unnecessary 'extern' keywords.
diego
parents: 8104
diff changeset
2434 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2435
7238
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2436
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2437 static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2438 {
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2439 x86_reg o = -(order << 1);
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2440 v1 += order;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2441 v2 += order;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2442 __asm__ volatile(
7238
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2443 "1: \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2444 "movdqu (%1,%2), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2445 "movdqu 16(%1,%2), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2446 "paddw (%0,%2), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2447 "paddw 16(%0,%2), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2448 "movdqa %%xmm0, (%0,%2) \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2449 "movdqa %%xmm1, 16(%0,%2) \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2450 "add $32, %2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2451 "js 1b \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2452 : "+r"(v1), "+r"(v2), "+r"(o)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2453 );
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2454 }
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2455
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2456 static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2457 {
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2458 x86_reg o = -(order << 1);
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2459 v1 += order;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2460 v2 += order;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2461 __asm__ volatile(
7238
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2462 "1: \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2463 "movdqa (%0,%2), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2464 "movdqa 16(%0,%2), %%xmm2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2465 "movdqu (%1,%2), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2466 "movdqu 16(%1,%2), %%xmm3 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2467 "psubw %%xmm1, %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2468 "psubw %%xmm3, %%xmm2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2469 "movdqa %%xmm0, (%0,%2) \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2470 "movdqa %%xmm2, 16(%0,%2) \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2471 "add $32, %2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2472 "js 1b \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2473 : "+r"(v1), "+r"(v2), "+r"(o)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2474 );
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2475 }
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2476
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2477 static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2478 {
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2479 int res = 0;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2480 DECLARE_ALIGNED_16(int64_t, sh);
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2481 x86_reg o = -(order << 1);
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2482
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2483 v1 += order;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2484 v2 += order;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2485 sh = shift;
8031
eebc7209c47f Convert asm keyword into __asm__.
flameeyes
parents: 7880
diff changeset
2486 __asm__ volatile(
7238
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2487 "pxor %%xmm7, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2488 "1: \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2489 "movdqu (%0,%3), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2490 "movdqu 16(%0,%3), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2491 "pmaddwd (%1,%3), %%xmm0 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2492 "pmaddwd 16(%1,%3), %%xmm1 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2493 "paddd %%xmm0, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2494 "paddd %%xmm1, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2495 "add $32, %3 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2496 "js 1b \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2497 "movhlps %%xmm7, %%xmm2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2498 "paddd %%xmm2, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2499 "psrad %4, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2500 "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2501 "paddd %%xmm2, %%xmm7 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2502 "movd %%xmm7, %2 \n\t"
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2503 : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2504 : "m"(sh)
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2505 );
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2506 return res;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2507 }
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2508
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2509 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2510 {
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2511 mm_flags = mm_support();
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
2512
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
2513 if (avctx->dsp_mask) {
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2514 if (avctx->dsp_mask & FF_MM_FORCE)
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2515 mm_flags |= (avctx->dsp_mask & 0xffff);
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2516 else
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
2517 mm_flags &= ~(avctx->dsp_mask & 0xffff);
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
2518 }
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
2519
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
2520 #if 0
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2521 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2522 if (mm_flags & FF_MM_MMX)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2523 av_log(avctx, AV_LOG_INFO, " mmx");
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2524 if (mm_flags & FF_MM_MMXEXT)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2525 av_log(avctx, AV_LOG_INFO, " mmxext");
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2526 if (mm_flags & FF_MM_3DNOW)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2527 av_log(avctx, AV_LOG_INFO, " 3dnow");
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2528 if (mm_flags & FF_MM_SSE)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2529 av_log(avctx, AV_LOG_INFO, " sse");
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2530 if (mm_flags & FF_MM_SSE2)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2531 av_log(avctx, AV_LOG_INFO, " sse2");
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2532 av_log(avctx, AV_LOG_INFO, "\n");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2533 #endif
986e461dc072 Initial revision
glantau
parents:
diff changeset
2534
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2535 if (mm_flags & FF_MM_MMX) {
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2536 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2537
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2538 if(avctx->lowres==0){
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2539 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2540 c->idct_put= ff_simple_idct_put_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2541 c->idct_add= ff_simple_idct_add_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2542 c->idct = ff_simple_idct_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2543 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3717
ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents: 3712
diff changeset
2544 #ifdef CONFIG_GPL
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2545 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2546 if(mm_flags & FF_MM_MMXEXT){
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2547 c->idct_put= ff_libmpeg2mmx2_idct_put;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2548 c->idct_add= ff_libmpeg2mmx2_idct_add;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2549 c->idct = ff_mmxext_idct;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2550 }else{
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2551 c->idct_put= ff_libmpeg2mmx_idct_put;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2552 c->idct_add= ff_libmpeg2mmx_idct_add;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2553 c->idct = ff_mmx_idct;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2554 }
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
2555 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3717
ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents: 3712
diff changeset
2556 #endif
7880
2d3d9b4181d7 Ensure MMX/SSE2 VP3 IDCT selection isn't disabled when only Theora is enabled
conrad
parents: 7876
diff changeset
2557 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER) &&
7876
3fd591f125b5 MMX/SSE2 VP3 IDCT are bitexact now that the dequantization matrices are permutated correctly
conrad
parents: 7759
diff changeset
2558 idct_algo==FF_IDCT_VP3){
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2559 if(mm_flags & FF_MM_SSE2){
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2560 c->idct_put= ff_vp3_idct_put_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2561 c->idct_add= ff_vp3_idct_add_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2562 c->idct = ff_vp3_idct_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2563 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2564 }else{
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2565 c->idct_put= ff_vp3_idct_put_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2566 c->idct_add= ff_vp3_idct_add_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2567 c->idct = ff_vp3_idct_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2568 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
2569 }
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2570 }else if(idct_algo==FF_IDCT_CAVS){
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2571 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2572 }else if(idct_algo==FF_IDCT_XVIDMMX){
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2573 if(mm_flags & FF_MM_SSE2){
6601
f76581c16848 Add a new xvid-style IDCT using SSE2.
astrange
parents: 6585
diff changeset
2574 c->idct_put= ff_idct_xvid_sse2_put;
f76581c16848 Add a new xvid-style IDCT using SSE2.
astrange
parents: 6585
diff changeset
2575 c->idct_add= ff_idct_xvid_sse2_add;
f76581c16848 Add a new xvid-style IDCT using SSE2.
astrange
parents: 6585
diff changeset
2576 c->idct = ff_idct_xvid_sse2;
f76581c16848 Add a new xvid-style IDCT using SSE2.
astrange
parents: 6585
diff changeset
2577 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2578 }else if(mm_flags & FF_MM_MMXEXT){
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2579 c->idct_put= ff_idct_xvid_mmx2_put;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2580 c->idct_add= ff_idct_xvid_mmx2_add;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2581 c->idct = ff_idct_xvid_mmx2;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2582 }else{
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2583 c->idct_put= ff_idct_xvid_mmx_put;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2584 c->idct_add= ff_idct_xvid_mmx_add;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2585 c->idct = ff_idct_xvid_mmx;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2586 }
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2587 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2588 }
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
2589
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2590 c->put_pixels_clamped = put_pixels_clamped_mmx;
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
2591 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2592 c->add_pixels_clamped = add_pixels_clamped_mmx;
8288
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
2593 c->clear_block = clear_block_mmx;
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2594 c->clear_blocks = clear_blocks_mmx;
8288
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
2595 if (mm_flags & FF_MM_SSE)
800444234375 clear_block mmx
lorenm
parents: 8250
diff changeset
2596 c->clear_block = clear_block_sse;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
2597
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2598 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2599 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2600 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2601 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2602 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2603
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2604 SET_HPEL_FUNCS(put, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2605 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2606 SET_HPEL_FUNCS(avg, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2607 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2608 SET_HPEL_FUNCS(put, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2609 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2610 SET_HPEL_FUNCS(avg, 1, 8, mmx);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2611 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2612
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2613 c->gmc= gmc_mmx;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2614
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
2615 c->add_bytes= add_bytes_mmx;
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
2616 c->add_bytes_l2= add_bytes_l2_mmx;
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
2617
6437
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
2618 c->draw_edges = draw_edges_mmx;
5154ab444372 move draw_edges() into dsputil
aurel
parents: 6403
diff changeset
2619
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
2620 if (ENABLE_ANY_H263) {
5278
ef85411bb7e8 cosmetics: indentation
aurel
parents: 5277
diff changeset
2621 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
ef85411bb7e8 cosmetics: indentation
aurel
parents: 5277
diff changeset
2622 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
2623 }
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
2624 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2625 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
2626 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2627
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
2628 c->h264_idct_dc_add=
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
2629 c->h264_idct_add= ff_h264_idct_add_mmx;
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
2630 c->h264_idct8_dc_add=
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
2631 c->h264_idct8_add= ff_h264_idct8_add_mmx;
8375
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2632
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2633 c->h264_idct_add16 = ff_h264_idct_add16_mmx;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2634 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2635 c->h264_idct_add8 = ff_h264_idct_add8_mmx;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2636 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
2637
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2638 if (mm_flags & FF_MM_MMXEXT) {
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2639 c->prefetch = prefetch_mmx2;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2640
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2641 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2642 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2643
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2644 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2645 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2646 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
2647
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2648 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2649 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2650
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2651 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2652 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2653 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2654
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3089
diff changeset
2655 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3089
diff changeset
2656 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
8375
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2657 c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2658 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2659 c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2660 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
2745
42d3e9068e32 MMX for H.264 iDCT (adapted from x264)
lorenm
parents: 2732
diff changeset
2661
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2662 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2663 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2664 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2665 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2666 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2667 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2668 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2669
8035
56f973432109 Cosmetics: reindent
conrad
parents: 8034
diff changeset
2670 if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
56f973432109 Cosmetics: reindent
conrad
parents: 8034
diff changeset
2671 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
56f973432109 Cosmetics: reindent
conrad
parents: 8034
diff changeset
2672 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
8034
9b690041e298 Combine non-bitexact sections
conrad
parents: 8033
diff changeset
2673 }
8033
b661cf8690a1 VP3 loop filter is mmx2 not mmx
conrad
parents: 8032
diff changeset
2674 }
b661cf8690a1 VP3 loop filter is mmx2 not mmx
conrad
parents: 8032
diff changeset
2675
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2676 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2677 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2678 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2679 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2680 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2681 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2682 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2683 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2684 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2685 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2686 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2687 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2688 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2689 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2690 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2691 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2692 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2693
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2694 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2695 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2696 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2697 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2698 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2699 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2700
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2701 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2702 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2703 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2704 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2705 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2706 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2707
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2708 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2709 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2710 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2711 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
2712
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
2713 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2714 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3213
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3211
diff changeset
2715 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3211
diff changeset
2716 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2717 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2718 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2719 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2720 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
2707
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
2721 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
2722 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3576
diff changeset
2723 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
2724
2902
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2725 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2726 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2727 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2728 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2729 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2730 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2731 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2732 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2733
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2734 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2735 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2736 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2737 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2738 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2739 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2740 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2741 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
2742
5949
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
2743 if (ENABLE_CAVS_DECODER)
5950
e419e6d4e7eb cosmetics: indentation
aurel
parents: 5949
diff changeset
2744 ff_cavsdsp_init_mmx2(c, avctx);
5949
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
2745
d7ad613197d2 convert some #ifdef CONFIG_ to if(ENABLE_
aurel
parents: 5948
diff changeset
2746 if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
5950
e419e6d4e7eb cosmetics: indentation
aurel
parents: 5949
diff changeset
2747 ff_vc1dsp_init_mmx(c, avctx);
5933
6ce8f15fc02b add VC-1 MMX DSP functions, under MIT license.
gpoirier
parents: 5912
diff changeset
2748
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
2749 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2750 } else if (mm_flags & FF_MM_3DNOW) {
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2751 c->prefetch = prefetch_3dnow;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2752
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2753 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2754 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
2755
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2756 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2757 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2758 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
2759
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2760 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2761 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2762
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2763 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2764 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
2765 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2766
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2767 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2768 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2769 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2770 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2771 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2772 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2773 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2774 }
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2775
6327
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2776 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2777 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2778 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2779 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2780 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2781 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2782
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2783 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2784 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2785 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2786 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2787 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2788 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2789
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2790 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2791 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2792 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
5b3e72f4bd4e reduce code duplication
lorenm
parents: 6322
diff changeset
2793 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2794
6057
03febc8f506f add MMX version for put_no_rnd_h264_chroma_mc8_c, used in VC-1 decoding.
gpoirier
parents: 6056
diff changeset
2795 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
2796 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
6522
dfa76e0734e5 Add missed call to ff_cavsdsp_init_3dnow() in dsputil_init_mmx()
zuxy
parents: 6513
diff changeset
2797
dfa76e0734e5 Add missed call to ff_cavsdsp_init_3dnow() in dsputil_init_mmx()
zuxy
parents: 6513
diff changeset
2798 if (ENABLE_CAVS_DECODER)
dfa76e0734e5 Add missed call to ff_cavsdsp_init_3dnow() in dsputil_init_mmx()
zuxy
parents: 6513
diff changeset
2799 ff_cavsdsp_init_3dnow(c, avctx);
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2800 }
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2801
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2802
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2803 #define H264_QPEL_FUNCS(x, y, CPU)\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2804 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2805 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2806 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2807 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2808 if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2809 // these functions are slower than mmx on AMD, but faster on Intel
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2810 /* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2811 c->put_pixels_tab[0][0] = put_pixels16_sse2;
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2812 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2813 */
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2814 H264_QPEL_FUNCS(0, 0, sse2);
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2815 }
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2816 if(mm_flags & FF_MM_SSE2){
8375
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2817 c->h264_idct8_add = ff_h264_idct8_add_sse2;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2818 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
de2509cf3c44 H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents: 8317
diff changeset
2819
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2820 H264_QPEL_FUNCS(0, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2821 H264_QPEL_FUNCS(0, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2822 H264_QPEL_FUNCS(0, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2823 H264_QPEL_FUNCS(1, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2824 H264_QPEL_FUNCS(1, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2825 H264_QPEL_FUNCS(1, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2826 H264_QPEL_FUNCS(2, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2827 H264_QPEL_FUNCS(2, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2828 H264_QPEL_FUNCS(2, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2829 H264_QPEL_FUNCS(3, 1, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2830 H264_QPEL_FUNCS(3, 2, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2831 H264_QPEL_FUNCS(3, 3, sse2);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2832 }
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2833 #ifdef HAVE_SSSE3
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2834 if(mm_flags & FF_MM_SSSE3){
6336
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2835 H264_QPEL_FUNCS(1, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2836 H264_QPEL_FUNCS(1, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2837 H264_QPEL_FUNCS(1, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2838 H264_QPEL_FUNCS(1, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2839 H264_QPEL_FUNCS(2, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2840 H264_QPEL_FUNCS(2, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2841 H264_QPEL_FUNCS(2, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2842 H264_QPEL_FUNCS(2, 3, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2843 H264_QPEL_FUNCS(3, 0, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2844 H264_QPEL_FUNCS(3, 1, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2845 H264_QPEL_FUNCS(3, 2, ssse3);
ef3fb5a7e275 sse2 h264 motion compensation. not new code, just separate out the cases that didn't need ssse3.
lorenm
parents: 6335
diff changeset
2846 H264_QPEL_FUNCS(3, 3, ssse3);
6557
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6522
diff changeset
2847 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd;
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6522
diff changeset
2848 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6522
diff changeset
2849 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6522
diff changeset
2850 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
e1208c4f8898 h264 chroma mc ssse3
lorenm
parents: 6522
diff changeset
2851 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
6384
0a403ade8c81 simd and unroll png_filter_row
lorenm
parents: 6336
diff changeset
2852 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
6331
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2853 }
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2854 #endif
c57670e07668 ssse3 h264 motion compensation.
lorenm
parents: 6329
diff changeset
2855
4589
30261f4ed12d Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated.
diego
parents: 4436
diff changeset
2856 #ifdef CONFIG_SNOW_DECODER
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2857 if(mm_flags & FF_MM_SSE2 & 0){
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
2858 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
5602
3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS
ramiro
parents: 5601
diff changeset
2859 #ifdef HAVE_7REGS
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2860 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
5601
b26025b9586d workaround gcc bug, untested as my gcc is not complaining
michael
parents: 5594
diff changeset
2861 #endif
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
2862 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2863 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2864 else{
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2865 if(mm_flags & FF_MM_MMXEXT){
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
2866 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
5602
3b21f3268707 CONFIG_7REGS has been renamed to HAVE_7REGS
ramiro
parents: 5601
diff changeset
2867 #ifdef HAVE_7REGS
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2868 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
5601
b26025b9586d workaround gcc bug, untested as my gcc is not complaining
michael
parents: 5594
diff changeset
2869 #endif
5594
384629ebcb93 avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum
michael
parents: 5591
diff changeset
2870 }
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
2871 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2872 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
2873 #endif
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2874
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2875 if(mm_flags & FF_MM_3DNOW){
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2876 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2877 c->vector_fmul = vector_fmul_3dnow;
7261
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2878 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2879 c->float_to_int16 = float_to_int16_3dnow;
7261
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2880 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2881 }
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2882 }
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2883 if(mm_flags & FF_MM_3DNOWEXT){
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2884 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
7263
fc843d00867c exploit mdct symmetry
lorenm
parents: 7261
diff changeset
2885 c->vector_fmul_window = vector_fmul_window_3dnow2;
7568
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm
lorenm
parents: 7567
diff changeset
2886 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm
lorenm
parents: 7567
diff changeset
2887 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
bb76ecde9318 gcc chokes on the 7 registers needed for float_to_int16_interleave6 (even inside HAVE_7REGS), so write it in yasm
lorenm
parents: 7567
diff changeset
2888 }
7263
fc843d00867c exploit mdct symmetry
lorenm
parents: 7261
diff changeset
2889 }
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2890 if(mm_flags & FF_MM_SSE){
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2891 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
7563
8390efaa0c03 simd downmix
lorenm
parents: 7548
diff changeset
2892 c->ac3_downmix = ac3_downmix_sse;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2893 c->vector_fmul = vector_fmul_sse;
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2894 c->vector_fmul_reverse = vector_fmul_reverse_sse;
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
2895 c->vector_fmul_add_add = vector_fmul_add_add_sse;
7261
032a49f033e8 simplify vorbis windowing
lorenm
parents: 7238
diff changeset
2896 c->vector_fmul_window = vector_fmul_window_sse;
7564
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2897 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
7548
3d1b177a1b8c cosmetics in dsp init
lorenm
parents: 7460
diff changeset
2898 c->float_to_int16 = float_to_int16_sse;
3d1b177a1b8c cosmetics in dsp init
lorenm
parents: 7460
diff changeset
2899 c->float_to_int16_interleave = float_to_int16_interleave_sse;
7226
e707d79a5ffd float_to_int16_sse2()
michael
parents: 7219
diff changeset
2900 }
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2901 if(mm_flags & FF_MM_3DNOW)
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
2902 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
8104
0d108ec85620 Remove duplicated MM_* macros for CPU capabilities from dsputil.h.
rathann
parents: 8073
diff changeset
2903 if(mm_flags & FF_MM_SSE2){
7564
7cf793954871 simd int->float
lorenm
parents: 7563
diff changeset
2904 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
7548
3d1b177a1b8c cosmetics in dsp init
lorenm
parents: 7460
diff changeset
2905 c->float_to_int16 = float_to_int16_sse2;
3d1b177a1b8c cosmetics in dsp init
lorenm
parents: 7460
diff changeset
2906 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
7238
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2907 c->add_int16 = add_int16_sse2;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2908 c->sub_int16 = sub_int16_sse2;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2909 c->scalarproduct_int16 = scalarproduct_int16_sse2;
08cc6e202aa6 SSE2 optimizations for Monkey's Audio decoder vector functions
kostya
parents: 7226
diff changeset
2910 }
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2911 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2912
6403
9a736918fd90 split encoding part of dsputil_mmx into its own file
aurel
parents: 6391
diff changeset
2913 if (ENABLE_ENCODERS)
9a736918fd90 split encoding part of dsputil_mmx into its own file
aurel
parents: 6391
diff changeset
2914 dsputilenc_init_mmx(c, avctx);
9a736918fd90 split encoding part of dsputil_mmx into its own file
aurel
parents: 6391
diff changeset
2915
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2916 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2917 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2918 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2919 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2920 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2921
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2922 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2923 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2924 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2925 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2926
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2927 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2928 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2929 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2930 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2931
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2932 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2933 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2934 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2935 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2936
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2937 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2938 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2939 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2940 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2941
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2942 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2943 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2944 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2945 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2946
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2947 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2948 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2949 #endif
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
2950 }