annotate i386/dsputil_mmx.c @ 5591:642588a60570 libavcodec

update mmx code to latest snow changes note, the code likely can overflow and thus needs some more changes sse2 updated too but disabled as it is untested
author michael
date Sat, 25 Aug 2007 15:20:56 +0000
parents 3ae03eacbe9f
children 384629ebcb93
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
1 /*
986e461dc072 Initial revision
glantau
parents:
diff changeset
2 * MMX optimized DSP utils
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
1739
07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents: 1729
diff changeset
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
5 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
8 * FFmpeg is free software; you can redistribute it and/or
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
9 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
10 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
11 * version 2.1 of the License, or (at your option) any later version.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
12 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
13 * FFmpeg is distributed in the hope that it will be useful,
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
16 * Lesser General Public License for more details.
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
17 *
429
718a22dc121f license/copyright change
glantau
parents: 422
diff changeset
18 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3932
diff changeset
19 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2979
diff changeset
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
21 *
986e461dc072 Initial revision
glantau
parents:
diff changeset
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
986e461dc072 Initial revision
glantau
parents:
diff changeset
23 */
986e461dc072 Initial revision
glantau
parents:
diff changeset
24
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
25 #include "dsputil.h"
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
26 #include "simple_idct.h"
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 5007
diff changeset
27 #include "mpegvideo.h"
3398
e0927bc44a10 Move REG_* macros from libavcodec/i386/mmx.h to libavutil/x86_cpu.h
lucabe
parents: 3250
diff changeset
28 #include "x86_cpu.h"
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
29 #include "mmx.h"
5014
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
30 #include "vp3dsp_mmx.h"
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
31 #include "vp3dsp_sse2.h"
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
32 #include "h263.h"
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
33
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
34 //#undef NDEBUG
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
35 //#include <assert.h>
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
36
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
37 extern void ff_idct_xvid_mmx(short *block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
38 extern void ff_idct_xvid_mmx2(short *block);
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
39
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
40 int mm_flags; /* multimedia extension flags */
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
41
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
42 /* pixel operations */
1845
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
43 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
44 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
45 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
46
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
47 static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
48 {0x8000000080000000ULL, 0x8000000080000000ULL};
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
49
1845
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
50 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
51 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
52 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
53 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
54 static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
1845
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
55 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
56 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents: 2753
diff changeset
57 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
1845
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
58 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
59
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3576
diff changeset
60 static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3576
diff changeset
61 static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3576
diff changeset
62 static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
2707
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
63 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
4127
d011a097bb85 optimize H264_DEBLOCK_P0_Q0
michael
parents: 4020
diff changeset
64 static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
d011a097bb85 optimize H264_DEBLOCK_P0_Q0
michael
parents: 4020
diff changeset
65 static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
1845
3054613980a8 attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents: 1784
diff changeset
66 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
67
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
68 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
69 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
70
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
71 #define MOVQ_WONE(regd) \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
72 __asm __volatile ( \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
73 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
74 "psrlw $15, %%" #regd ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
75
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
76 #define MOVQ_BFE(regd) \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
77 __asm __volatile ( \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
78 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
79 "paddb %%" #regd ", %%" #regd " \n\t" ::)
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
80
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
81 #ifndef PIC
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
82 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
83 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
84 #else
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
85 // for a shared library (PIC) it's better to generate the constants in registers than to load them from memory
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
86 // pcmpeqd -> -1
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
87 #define MOVQ_BONE(regd) \
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
88 __asm __volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
89 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
90 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
91 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
92
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
93 #define MOVQ_WTWO(regd) \
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
94 __asm __volatile ( \
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
95 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
96 "psrlw $15, %%" #regd " \n\t" \
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
97 "psllw $1, %%" #regd " \n\t"::)
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
98
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
99 #endif
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
100
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
101 // regr is used as a temporary and holds the output result
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
102 // first argument is unmodified and second is trashed
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
103 // regfe is supposed to contain 0xfefefefefefefefe
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
104 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
105 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
106 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
107 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
108 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
109 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
110 "paddb " #regb ", " #regr " \n\t"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
111
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
112 #define PAVGB_MMX(rega, regb, regr, regfe) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
113 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
114 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
115 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
116 "pand " #regfe "," #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
117 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
118 "psubb " #regb ", " #regr " \n\t"
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
119
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
120 // mm6 is supposed to contain 0xfefefefefefefefe
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
121 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
122 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
123 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
124 "pand " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
125 "pand " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
126 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
127 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
128 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
129 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
130 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
131 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
132 "paddb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
133 "paddb " #regd ", " #regp " \n\t"
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
134
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
135 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
136 "movq " #rega ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
137 "movq " #regc ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
138 "por " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
139 "por " #regd ", " #regp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
140 "pxor " #rega ", " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
141 "pxor " #regc ", " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
142 "pand %%mm6, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
143 "pand %%mm6, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
144 "psrlq $1, " #regd " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
145 "psrlq $1, " #regb " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
146 "psubb " #regb ", " #regr " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
147 "psubb " #regd ", " #regp " \n\t"
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
148
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
149 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
150 /* MMX no rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
151 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
152 #define SET_RND MOVQ_WONE
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
153 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
154 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
155
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
156 #include "dsputil_mmx_rnd.h"
444
a5edef76dac6 * new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents: 438
diff changeset
157
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
158 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
159 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
160 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
161 #undef PAVGB
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
162 /***********************************/
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
163 /* MMX rounding */
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
164
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
165 #define DEF(x, y) x ## _ ## y ##_mmx
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
166 #define SET_RND MOVQ_WTWO
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
167 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
168 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
445
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
169
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
170 #include "dsputil_mmx_rnd.h"
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
171
62c01dbdc1e0 * code with new PAVGB for MMX only CPU splited into separate file
kabi
parents: 444
diff changeset
172 #undef DEF
448
e8c8ca9106aa * removed MANGLE from macros for setting constants
kabi
parents: 446
diff changeset
173 #undef SET_RND
446
efe0c0d40577 * reenabled original xy2 put routine - rounding error is really bad with
kabi
parents: 445
diff changeset
174 #undef PAVGBP
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
175 #undef PAVGB
387
b8f3affeb8e1 shared lib support (req by kabi) ...
michaelni
parents: 386
diff changeset
176
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
177 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
178 /* 3Dnow specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
179
986e461dc072 Initial revision
glantau
parents:
diff changeset
180 #define DEF(x) x ## _3dnow
986e461dc072 Initial revision
glantau
parents:
diff changeset
181 #define PAVGB "pavgusb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
182
986e461dc072 Initial revision
glantau
parents:
diff changeset
183 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
184
986e461dc072 Initial revision
glantau
parents:
diff changeset
185 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
186 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
187
986e461dc072 Initial revision
glantau
parents:
diff changeset
188 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
189 /* MMX2 specific */
986e461dc072 Initial revision
glantau
parents:
diff changeset
190
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
191 #define DEF(x) x ## _mmx2
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
192
986e461dc072 Initial revision
glantau
parents:
diff changeset
193 /* Introduced only in MMX2 set */
986e461dc072 Initial revision
glantau
parents:
diff changeset
194 #define PAVGB "pavgb"
986e461dc072 Initial revision
glantau
parents:
diff changeset
195
986e461dc072 Initial revision
glantau
parents:
diff changeset
196 #include "dsputil_mmx_avg.h"
986e461dc072 Initial revision
glantau
parents:
diff changeset
197
986e461dc072 Initial revision
glantau
parents:
diff changeset
198 #undef DEF
986e461dc072 Initial revision
glantau
parents:
diff changeset
199 #undef PAVGB
986e461dc072 Initial revision
glantau
parents:
diff changeset
200
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
201 #define SBUTTERFLY(a,b,t,n,m)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
202 "mov" #m " " #a ", " #t " \n\t" /* abcd */\
3416
fb9d94637fb2 #define SBUTTERFLY outside CONFIG_ENCODERS
mru
parents: 3398
diff changeset
203 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
fb9d94637fb2 #define SBUTTERFLY outside CONFIG_ENCODERS
mru
parents: 3398
diff changeset
204 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
fb9d94637fb2 #define SBUTTERFLY outside CONFIG_ENCODERS
mru
parents: 3398
diff changeset
205
4939
3409dc0e7797 cosmetics: remove duplicate transpose macro
lorenm
parents: 4796
diff changeset
206 #define TRANSPOSE4(a,b,c,d,t)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
207 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
208 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
209 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
210 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
4939
3409dc0e7797 cosmetics: remove duplicate transpose macro
lorenm
parents: 4796
diff changeset
211
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
212 /***********************************/
986e461dc072 Initial revision
glantau
parents:
diff changeset
213 /* standard MMX */
986e461dc072 Initial revision
glantau
parents:
diff changeset
214
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
215 #ifdef CONFIG_ENCODERS
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
216 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
217 {
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
218 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
219 "mov $-128, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
220 "pxor %%mm7, %%mm7 \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
221 ASMALIGN(4)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
222 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
223 "movq (%0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
224 "movq (%0, %2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
225 "movq %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
226 "movq %%mm2, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
227 "punpcklbw %%mm7, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
228 "punpckhbw %%mm7, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
229 "punpcklbw %%mm7, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
230 "punpckhbw %%mm7, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
231 "movq %%mm0, (%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
232 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
233 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
234 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
235 "add %3, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
236 "add $32, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
237 "js 1b \n\t"
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
238 : "+r" (pixels)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
239 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
240 : "%"REG_a
386
f49629bab18d hopefully faster mmx2&3dnow MC
michaelni
parents: 342
diff changeset
241 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
242 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
243
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
244 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
245 {
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
246 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
247 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
248 "mov $-128, %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
249 ASMALIGN(4)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
250 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
251 "movq (%0), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
252 "movq (%1), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
253 "movq %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
254 "movq %%mm2, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
255 "punpcklbw %%mm7, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
256 "punpckhbw %%mm7, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
257 "punpcklbw %%mm7, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
258 "punpckhbw %%mm7, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
259 "psubw %%mm2, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
260 "psubw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
261 "movq %%mm0, (%2, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
262 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
263 "add %3, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
264 "add %3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
265 "add $16, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
266 "jnz 1b \n\t"
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
267 : "+r" (s1), "+r" (s2)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
268 : "r" (block+64), "r" ((long)stride)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
269 : "%"REG_a
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
270 );
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
271 }
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
272 #endif //CONFIG_ENCODERS
324
9c6f056f0e41 fixed mpeg4 time stuff on encoding
michaelni
parents: 296
diff changeset
273
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
274 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
275 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
276 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
277 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
278
986e461dc072 Initial revision
glantau
parents:
diff changeset
279 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
280 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
281 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
282 /* unrolled loop */
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
283 __asm __volatile(
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
284 "movq %3, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
285 "movq 8%3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
286 "movq 16%3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
287 "movq 24%3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
288 "movq 32%3, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
289 "movq 40%3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
290 "movq 48%3, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
291 "movq 56%3, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
292 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
293 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
294 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
295 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
296 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
297 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
298 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
299 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
300 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
301 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
302 pix += line_size*4;
986e461dc072 Initial revision
glantau
parents:
diff changeset
303 p += 32;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
304
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
305 // if here would be an exact copy of the code above
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
306 // compiler would generate some very strange code
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
307 // thus using "r"
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
308 __asm __volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
309 "movq (%3), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
310 "movq 8(%3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
311 "movq 16(%3), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
312 "movq 24(%3), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
313 "movq 32(%3), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
314 "movq 40(%3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
315 "movq 48(%3), %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
316 "movq 56(%3), %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
317 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
318 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
319 "packuswb %%mm5, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
320 "packuswb %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
321 "movq %%mm0, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
322 "movq %%mm2, (%0, %1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
323 "movq %%mm4, (%0, %1, 2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
324 "movq %%mm6, (%0, %2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
325 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
326 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
327 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
328
3089
072dbc669253 MSVC-compatible __align8/__align16 declaration
diego
parents: 3036
diff changeset
329 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
1985
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
330 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
b2bc62fdecc0 move the 0x80 vector outside of the function, thus saving the compiler
melanson
parents: 1984
diff changeset
331
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
332 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
333 {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
334 int i;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
335
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
336 movq_m2r(*vector128, mm1);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
337 for (i = 0; i < 8; i++) {
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
338 movq_m2r(*(block), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
339 packsswb_m2r(*(block + 4), mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
340 block += 8;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
341 paddb_r2r(mm1, mm0);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
342 movq_r2m(mm0, *pixels);
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
343 pixels += line_size;
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
344 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
345 }
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
346
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
347 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
348 {
986e461dc072 Initial revision
glantau
parents:
diff changeset
349 const DCTELEM *p;
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
350 uint8_t *pix;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
351 int i;
986e461dc072 Initial revision
glantau
parents:
diff changeset
352
986e461dc072 Initial revision
glantau
parents:
diff changeset
353 /* read the pixels */
986e461dc072 Initial revision
glantau
parents:
diff changeset
354 p = block;
986e461dc072 Initial revision
glantau
parents:
diff changeset
355 pix = pixels;
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
356 MOVQ_ZERO(mm7);
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
357 i = 4;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
358 do {
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
359 __asm __volatile(
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
360 "movq (%2), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
361 "movq 8(%2), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
362 "movq 16(%2), %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
363 "movq 24(%2), %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
364 "movq %0, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
365 "movq %1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
366 "movq %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
367 "punpcklbw %%mm7, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
368 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
369 "paddsw %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
370 "paddsw %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
371 "movq %%mm6, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
372 "punpcklbw %%mm7, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
373 "punpckhbw %%mm7, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
374 "paddsw %%mm6, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
375 "paddsw %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
376 "packuswb %%mm1, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
377 "packuswb %%mm3, %%mm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
378 "movq %%mm0, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
379 "movq %%mm2, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
380 :"+m"(*pix), "+m"(*(pix+line_size))
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
381 :"r"(p)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
382 :"memory");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
383 pix += line_size*2;
986e461dc072 Initial revision
glantau
parents:
diff changeset
384 p += 16;
342
8635a7036395 * fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents: 324
diff changeset
385 } while (--i);
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
386 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
387
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
388 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
389 {
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
390 __asm __volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
391 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
392 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
393 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
394 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
395 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
396 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
397 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
398 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
399 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
400 "movd (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
401 "movd (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
402 "movd %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
403 "movd %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
404 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
405 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
406 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
407 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
408 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
409 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
410 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
411 );
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
412 }
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
413
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
414 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
415 {
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
416 __asm __volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
417 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
418 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
419 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
420 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
421 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
422 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
423 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
424 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
425 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
426 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
427 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
428 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
429 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
430 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
431 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
432 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
433 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
434 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
435 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
436 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
437 );
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
438 }
986e461dc072 Initial revision
glantau
parents:
diff changeset
439
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
440 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
441 {
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
442 __asm __volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
443 "lea (%3, %3), %%"REG_a" \n\t"
3576
f7125bf10892 Support for MacIntel, last part: balign directives
gpoirier
parents: 3574
diff changeset
444 ASMALIGN(3)
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
445 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
446 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
447 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
448 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
449 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
450 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
451 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
452 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
453 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
454 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
455 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
456 "movq (%1), %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
457 "movq 8(%1), %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
458 "movq (%1, %3), %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
459 "movq 8(%1, %3), %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
460 "movq %%mm0, (%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
461 "movq %%mm4, 8(%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
462 "movq %%mm1, (%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
463 "movq %%mm5, 8(%2, %3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
464 "add %%"REG_a", %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
465 "add %%"REG_a", %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
466 "subl $4, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
467 "jnz 1b \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
468 : "+g"(h), "+r" (pixels), "+r" (block)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
469 : "r"((long)line_size)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
470 : "%"REG_a, "memory"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
471 );
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
472 }
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
473
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
474 static void clear_blocks_mmx(DCTELEM *blocks)
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
475 {
471
d7f65ea52aaa * reimplemented remaing avg_ pixel functions
kabi
parents: 448
diff changeset
476 __asm __volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
477 "pxor %%mm7, %%mm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
478 "mov $-128*6, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
479 "1: \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
480 "movq %%mm7, (%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
481 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
482 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
483 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
484 "add $32, %%"REG_a" \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
485 " js 1b \n\t"
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
486 : : "r" (((uint8_t *)blocks)+128*6)
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
487 : "%"REG_a
296
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
488 );
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
489 }
c1a8a1b4a24b sizeof(s->block) isnt 64*6*2 anymore bugfix
michaelni
parents: 294
diff changeset
490
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
491 #ifdef CONFIG_ENCODERS
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
/**
 * Sum all pixel values of a 16x16 block using MMX.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size distance in bytes between the starts of two consecutive rows
 * @return the sum of the 256 pixels, masked to 16 bits (the largest possible
 *         sum, 256*255 = 65280, still fits)
 *
 * The MMX registers mm0-mm3 and mm5-mm7 are overwritten and no emms is
 * executed here.
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    /* Negative byte offset that counts up to zero: the row pointer passed to
     * the asm is "pix - index" (one past the last row), so (%2, %1) starts at
     * the first row and the loop ends when the offset stops being negative. */
    long index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to zero-extend bytes to words */
        "pxor %%mm6, %%mm6 \n\t" /* mm6 = four 16-bit running sums */
        "1: \n\t"
        "movq (%2, %1), %%mm0 \n\t" /* pixels 0..7 of the row (twice) */
        "movq (%2, %1), %%mm1 \n\t"
        "movq 8(%2, %1), %%mm2 \n\t" /* pixels 8..15 of the row (twice) */
        "movq 8(%2, %1), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t" /* widen each half to 16-bit words */
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm1, %%mm3 \n\t" /* mm3 = per-lane sum of this row */
        "paddw %%mm3, %%mm6 \n\t"
        "add %3, %1 \n\t" /* next row */
        " js 1b \n\t"
        "movq %%mm6, %%mm5 \n\t" /* horizontal add of the four word lanes */
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        "andl $0xFFFF, %0 \n\t" /* only the low word holds the total */
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
        /* The block is read through a plain register pointer, so the compiler
         * must be told that memory is accessed; otherwise earlier stores to
         * the block may be reordered after, or dropped before, the asm. */
        : "memory"
    );

    return sum;
}
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
529 #endif //CONFIG_ENCODERS
688
894b61908734 pix_sum16_mmx()
michaelni
parents: 651
diff changeset
530
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
/**
 * Add src to dst byte by byte with wrap-around: dst[i] += src[i] for 0 <= i < w.
 *
 * @param dst destination buffer, updated in place (at least w bytes)
 * @param src source buffer (at least w bytes)
 * @param w   number of bytes to process; values <= 0 do nothing
 *
 * Blocks of 16 bytes are handled with MMX, the remainder with plain C.
 * The MMX registers mm0/mm1 are overwritten and no emms is executed here.
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    __asm__ volatile(
        /* Test the bound before the first iteration so that w < 16 never
         * touches a full 16-byte block. */
        "jmp 2f \n\t"
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "2: \n\t"
        "cmp %3, %0 \n\t"
        /* Signed compare: w-15 is negative for w < 15, and an unsigned
         * compare would see that as a huge bound. */
        " jl 1b \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
        /* dst is written and src read through register pointers only. */
        : "memory"
    );
    /* Scalar tail: the remaining 0..15 bytes. */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
552
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
553 #define H263_LOOP_FILTER \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
554 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
555 "movq %0, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
556 "movq %0, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
557 "movq %3, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
558 "movq %3, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
559 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
560 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
561 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
562 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
563 "psubw %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
564 "psubw %%mm3, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
565 "movq %1, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
566 "movq %1, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
567 "movq %2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
568 "movq %2, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
569 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
570 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
571 "punpcklbw %%mm7, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
572 "punpckhbw %%mm7, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
573 "psubw %%mm2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
574 "psubw %%mm3, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
575 "psllw $2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
576 "psllw $2, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
577 "paddw %%mm0, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
578 "paddw %%mm1, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
579 "pxor %%mm6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
580 "pcmpgtw %%mm4, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
581 "pcmpgtw %%mm5, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
582 "pxor %%mm6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
583 "pxor %%mm7, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
584 "psubw %%mm6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
585 "psubw %%mm7, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
586 "psrlw $3, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
587 "psrlw $3, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
588 "packuswb %%mm5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
589 "packsswb %%mm7, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
590 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
591 "movd %4, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
592 "punpcklbw %%mm2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
593 "punpcklbw %%mm2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
594 "punpcklbw %%mm2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
595 "psubusb %%mm4, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
596 "movq %%mm2, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
597 "psubusb %%mm4, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
598 "psubb %%mm3, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
599 "movq %1, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
600 "movq %2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
601 "pxor %%mm6, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
602 "pxor %%mm6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
603 "paddusb %%mm2, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
604 "psubusb %%mm2, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
605 "pxor %%mm6, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
606 "pxor %%mm6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
607 "paddusb %%mm2, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
608 "packsswb %%mm1, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
609 "pcmpgtb %%mm0, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
610 "pxor %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
611 "psubb %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
612 "movq %%mm0, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
613 "psubusb %%mm2, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
614 "psubb %%mm0, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
615 "pand %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
616 "psrlw $2, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
617 "pxor %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
618 "psubb %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
619 "movq %0, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
620 "movq %3, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
621 "psubb %%mm1, %%mm5 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
622 "paddb %%mm1, %%mm6 \n\t"
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
623
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
624 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
625 if(ENABLE_ANY_H263) {
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
626 const int strength= ff_h263_loop_filter_strength[qscale];
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
627
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
628 asm volatile(
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
629
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
630 H263_LOOP_FILTER
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
631
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
632 "movq %%mm3, %1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
633 "movq %%mm4, %2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
634 "movq %%mm5, %0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
635 "movq %%mm6, %3 \n\t"
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
636 : "+m" (*(uint64_t*)(src - 2*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
637 "+m" (*(uint64_t*)(src - 1*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
638 "+m" (*(uint64_t*)(src + 0*stride)),
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
639 "+m" (*(uint64_t*)(src + 1*stride))
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
640 : "g" (2*strength), "m"(ff_pb_FC)
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
641 );
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
642 }
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
643 }
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
644
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
/**
 * Transpose a 4x4 block of bytes: dst[x*dst_stride + y] = src[y*src_stride + x]
 * for 0 <= x, y < 4.
 *
 * @param dst        destination block (4 rows of 4 bytes are written)
 * @param src        source block (4 rows of 4 bytes are read)
 * @param dst_stride distance in bytes between two destination rows
 * @param src_stride distance in bytes between two source rows
 *
 * Overwrites mm0..mm3; no emms is executed here.
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    uint32_t *const out0= (uint32_t*)(dst + 0*dst_stride);
    uint32_t *const out1= (uint32_t*)(dst + 1*dst_stride);
    uint32_t *const out2= (uint32_t*)(dst + 2*dst_stride);
    uint32_t *const out3= (uint32_t*)(dst + 3*dst_stride);
    uint32_t *const in0 = (uint32_t*)(src + 0*src_stride);
    uint32_t *const in1 = (uint32_t*)(src + 1*src_stride);
    uint32_t *const in2 = (uint32_t*)(src + 2*src_stride);
    uint32_t *const in3 = (uint32_t*)(src + 3*src_stride);

    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd %4, %%mm0 \n\t" /* load the four source rows */
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t" /* interleave bytes of rows 0/1 */
        "punpcklbw %%mm3, %%mm2 \n\t" /* interleave bytes of rows 2/3 */
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t" /* mm0 = source columns 0 and 1 */
        "punpckhwd %%mm2, %%mm1 \n\t" /* mm1 = source columns 2 and 3 */
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"

        : "=m" (*out0),
          "=m" (*out1),
          "=m" (*out2),
          "=m" (*out3)
        : "m" (*in0),
          "m" (*in1),
          "m" (*in2),
          "m" (*in3)
    );
}
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
673
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
674 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
675 if(ENABLE_ANY_H263) {
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
676 const int strength= ff_h263_loop_filter_strength[qscale];
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
677 uint64_t temp[4] __attribute__ ((aligned(8)));
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
678 uint8_t *btemp= (uint8_t*)temp;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
679
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
680 src -= 2;
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
681
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
682 transpose4x4(btemp , src , 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
683 transpose4x4(btemp+4, src + 4*stride, 8, stride);
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
684 asm volatile(
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
685 H263_LOOP_FILTER // 5 3 4 6
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
686
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
687 : "+m" (temp[0]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
688 "+m" (temp[1]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
689 "+m" (temp[2]),
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
690 "+m" (temp[3])
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
691 : "g" (2*strength), "m"(ff_pb_FC)
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
692 );
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
693
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
694 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
695 "movq %%mm5, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
696 "movq %%mm4, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
697 "punpcklbw %%mm3, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
698 "punpcklbw %%mm6, %%mm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
699 "punpckhbw %%mm3, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
700 "punpckhbw %%mm6, %%mm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
701 "movq %%mm5, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
702 "movq %%mm1, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
703 "punpcklwd %%mm4, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
704 "punpcklwd %%mm0, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
705 "punpckhwd %%mm4, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
706 "punpckhwd %%mm0, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
707 "movd %%mm5, (%0) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
708 "punpckhdq %%mm5, %%mm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
709 "movd %%mm5, (%0,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
710 "movd %%mm3, (%0,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
711 "punpckhdq %%mm3, %%mm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
712 "movd %%mm3, (%0,%3) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
713 "movd %%mm1, (%1) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
714 "punpckhdq %%mm1, %%mm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
715 "movd %%mm1, (%1,%2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
716 "movd %%mm6, (%1,%2,2) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
717 "punpckhdq %%mm6, %%mm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
718 "movd %%mm6, (%1,%3) \n\t"
2505
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
719 :: "r" (src),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
720 "r" (src + 4*stride),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
721 "r" ((long) stride ),
86e2b1424801 optimization and gcc 4.0 bug workaround patch by (Martin Drab >drab kepler.fjfi.cvut cz<)
michael
parents: 2293
diff changeset
722 "r" ((long)(3*stride))
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
723 );
5394
e9a6215f4e3a help some gcc version to optimize out those functions
aurel
parents: 5278
diff changeset
724 }
1648
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
725 }
de28264c3dc3 h263_h_loop_filter_mmx
michael
parents: 1647
diff changeset
726
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
727 #ifdef CONFIG_ENCODERS
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
728 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
729 int tmp;
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
730 asm volatile (
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
731 "movl $16,%%ecx\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
732 "pxor %%mm0,%%mm0\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
733 "pxor %%mm7,%%mm7\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
734 "1:\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
735 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
736 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
737
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
738 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
739
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
740 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
741 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
742
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
743 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
744 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
745 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
746
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
747 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
748 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
749
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
750 "pmaddwd %%mm3,%%mm3\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
751 "pmaddwd %%mm4,%%mm4\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
752
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
753 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
754 pix2^2+pix3^2+pix6^2+pix7^2) */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
755 "paddd %%mm3,%%mm4\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
756 "paddd %%mm2,%%mm7\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
757
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
758 "add %2, %0\n"
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
759 "paddd %%mm4,%%mm7\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
760 "dec %%ecx\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
761 "jnz 1b\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
762
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
763 "movq %%mm7,%%mm1\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
764 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
765 "paddd %%mm7,%%mm1\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
766 "movd %%mm1,%1\n"
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
767 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
768 return tmp;
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
769 }
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
770
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
771 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
772 int tmp;
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
773 asm volatile (
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
774 "movl %4,%%ecx\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
775 "shr $1,%%ecx\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
776 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
777 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
778 "1:\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
779 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
780 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
781 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
782 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
783
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
784 /* todo: mm1-mm2, mm3-mm4 */
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
785 /* algo: subtract mm1 from mm2 with saturation and vice versa */
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
786 /* OR the results to get absolute difference */
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
787 "movq %%mm1,%%mm5\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
788 "movq %%mm3,%%mm6\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
789 "psubusb %%mm2,%%mm1\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
790 "psubusb %%mm4,%%mm3\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
791 "psubusb %%mm5,%%mm2\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
792 "psubusb %%mm6,%%mm4\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
793
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
794 "por %%mm1,%%mm2\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
795 "por %%mm3,%%mm4\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
796
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
797 /* now convert to 16-bit vectors so we can square them */
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
798 "movq %%mm2,%%mm1\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
799 "movq %%mm4,%%mm3\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
800
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
801 "punpckhbw %%mm0,%%mm2\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
802 "punpckhbw %%mm0,%%mm4\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
803 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
804 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
805
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
806 "pmaddwd %%mm2,%%mm2\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
807 "pmaddwd %%mm4,%%mm4\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
808 "pmaddwd %%mm1,%%mm1\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
809 "pmaddwd %%mm3,%%mm3\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
810
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
811 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
812 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
813
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
814 "paddd %%mm2,%%mm1\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
815 "paddd %%mm4,%%mm3\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
816 "paddd %%mm1,%%mm7\n"
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
817 "paddd %%mm3,%%mm7\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
818
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
819 "decl %%ecx\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
820 "jnz 1b\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
821
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
822 "movq %%mm7,%%mm1\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
823 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
824 "paddd %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
825 "movd %%mm1,%2\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
826 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
827 : "r" ((long)line_size) , "m" (h)
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
828 : "%ecx");
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
829 return tmp;
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
830 }
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
831
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
832 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
833 int tmp;
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
834 asm volatile (
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
835 "movl %4,%%ecx\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
836 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
837 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
838 "1:\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
839 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
840 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
841 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
842 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
843
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
844 /* todo: mm1-mm2, mm3-mm4 */
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
845 /* algo: subtract mm1 from mm2 with saturation and vice versa */
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
846 /* OR the results to get absolute difference */
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
847 "movq %%mm1,%%mm5\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
848 "movq %%mm3,%%mm6\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
849 "psubusb %%mm2,%%mm1\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
850 "psubusb %%mm4,%%mm3\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
851 "psubusb %%mm5,%%mm2\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
852 "psubusb %%mm6,%%mm4\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
853
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
854 "por %%mm1,%%mm2\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
855 "por %%mm3,%%mm4\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
856
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
857 /* now convert to 16-bit vectors so we can square them */
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
858 "movq %%mm2,%%mm1\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
859 "movq %%mm4,%%mm3\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
860
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
861 "punpckhbw %%mm0,%%mm2\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
862 "punpckhbw %%mm0,%%mm4\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
863 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
864 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
865
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
866 "pmaddwd %%mm2,%%mm2\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
867 "pmaddwd %%mm4,%%mm4\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
868 "pmaddwd %%mm1,%%mm1\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
869 "pmaddwd %%mm3,%%mm3\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
870
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
871 "add %3,%0\n"
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
872 "add %3,%1\n"
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
873
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
874 "paddd %%mm2,%%mm1\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
875 "paddd %%mm4,%%mm3\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
876 "paddd %%mm1,%%mm7\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
877 "paddd %%mm3,%%mm7\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
878
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
879 "decl %%ecx\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
880 "jnz 1b\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
881
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
882 "movq %%mm7,%%mm1\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
883 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
884 "paddd %%mm7,%%mm1\n"
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
885 "movd %%mm1,%2\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
886 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
887 : "r" ((long)line_size) , "m" (h)
1708
dea5b2946999 interlaced motion estimation
michael
parents: 1686
diff changeset
888 : "%ecx");
997
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
889 return tmp;
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
890 }
4dfe15ae0078 sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents: 984
diff changeset
891
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
892 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
893 int tmp;
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
894 asm volatile (
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
895 "shr $1,%2\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
896 "pxor %%xmm0,%%xmm0\n" /* xmm0 = 0 */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
897 "pxor %%xmm7,%%xmm7\n" /* xmm7 holds the sum */
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
898 "1:\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
899 "movdqu (%0),%%xmm1\n" /* xmm1 = pix1[0][0-15] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
900 "movdqu (%1),%%xmm2\n" /* xmm2 = pix2[0][0-15] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
901 "movdqu (%0,%4),%%xmm3\n" /* xmm3 = pix1[1][0-15] */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
902 "movdqu (%1,%4),%%xmm4\n" /* xmm4 = pix2[1][0-15] */
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
903
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
904 /* todo: xmm1-xmm2, xmm3-xmm4 */
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
905 /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
906 /* OR the results to get absolute difference */
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
907 "movdqa %%xmm1,%%xmm5\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
908 "movdqa %%xmm3,%%xmm6\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
909 "psubusb %%xmm2,%%xmm1\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
910 "psubusb %%xmm4,%%xmm3\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
911 "psubusb %%xmm5,%%xmm2\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
912 "psubusb %%xmm6,%%xmm4\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
913
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
914 "por %%xmm1,%%xmm2\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
915 "por %%xmm3,%%xmm4\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
916
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
917 /* now convert to 16-bit vectors so we can square them */
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
918 "movdqa %%xmm2,%%xmm1\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
919 "movdqa %%xmm4,%%xmm3\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
920
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
921 "punpckhbw %%xmm0,%%xmm2\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
922 "punpckhbw %%xmm0,%%xmm4\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
923 "punpcklbw %%xmm0,%%xmm1\n" /* xmm1 now spread over (xmm1,xmm2) */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
924 "punpcklbw %%xmm0,%%xmm3\n" /* xmm4 now spread over (xmm3,xmm4) */
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
925
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
926 "pmaddwd %%xmm2,%%xmm2\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
927 "pmaddwd %%xmm4,%%xmm4\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
928 "pmaddwd %%xmm1,%%xmm1\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
929 "pmaddwd %%xmm3,%%xmm3\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
930
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
931 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
932 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
933
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
934 "paddd %%xmm2,%%xmm1\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
935 "paddd %%xmm4,%%xmm3\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
936 "paddd %%xmm1,%%xmm7\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
937 "paddd %%xmm3,%%xmm7\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
938
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
939 "decl %2\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
940 "jnz 1b\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
941
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
942 "movdqa %%xmm7,%%xmm1\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
943 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
944 "paddd %%xmm1,%%xmm7\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
945 "movdqa %%xmm7,%%xmm1\n"
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
946 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
947 "paddd %%xmm1,%%xmm7\n"
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
948 "movd %%xmm7,%3\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
949 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
2899
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
950 : "r" ((long)line_size));
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
951 return tmp;
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
952 }
d3a726717baf sse2 16x16 sum squared diff (306=>268 cycles on a K8)
lorenm
parents: 2892
diff changeset
953
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
954 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
955 int tmp;
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
956 asm volatile (
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
957 "movl %3,%%ecx\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
958 "pxor %%mm7,%%mm7\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
959 "pxor %%mm6,%%mm6\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
960
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
961 "movq (%0),%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
962 "movq %%mm0, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
963 "psllq $8, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
964 "psrlq $8, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
965 "psrlq $8, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
966 "movq %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
967 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
968 "punpcklbw %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
969 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
970 "punpckhbw %%mm7,%%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
971 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
972 "psubw %%mm1, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
973 "psubw %%mm3, %%mm2\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
974
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
975 "add %2,%0\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
976
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
977 "movq (%0),%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
978 "movq %%mm4, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
979 "psllq $8, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
980 "psrlq $8, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
981 "psrlq $8, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
982 "movq %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
983 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
984 "punpcklbw %%mm7,%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
985 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
986 "punpckhbw %%mm7,%%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
987 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
988 "psubw %%mm1, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
989 "psubw %%mm3, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
990 "psubw %%mm4, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
991 "psubw %%mm5, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
992 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
993 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
994 "pcmpgtw %%mm0, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
995 "pcmpgtw %%mm2, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
996 "pxor %%mm3, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
997 "pxor %%mm1, %%mm2\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
998 "psubw %%mm3, %%mm0\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
999 "psubw %%mm1, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1000 "paddw %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1001 "paddw %%mm2, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1002
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1003 "add %2,%0\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1004 "1:\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1005
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1006 "movq (%0),%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1007 "movq %%mm0, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1008 "psllq $8, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1009 "psrlq $8, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1010 "psrlq $8, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1011 "movq %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1012 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1013 "punpcklbw %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1014 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1015 "punpckhbw %%mm7,%%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1016 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1017 "psubw %%mm1, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1018 "psubw %%mm3, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1019 "psubw %%mm0, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1020 "psubw %%mm2, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1021 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1022 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1023 "pcmpgtw %%mm4, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1024 "pcmpgtw %%mm5, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1025 "pxor %%mm3, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1026 "pxor %%mm1, %%mm5\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1027 "psubw %%mm3, %%mm4\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1028 "psubw %%mm1, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1029 "paddw %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1030 "paddw %%mm5, %%mm6\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1031
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1032 "add %2,%0\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1033
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1034 "movq (%0),%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1035 "movq %%mm4, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1036 "psllq $8, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1037 "psrlq $8, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1038 "psrlq $8, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1039 "movq %%mm4, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1040 "movq %%mm1, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1041 "punpcklbw %%mm7,%%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1042 "punpcklbw %%mm7,%%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1043 "punpckhbw %%mm7,%%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1044 "punpckhbw %%mm7,%%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1045 "psubw %%mm1, %%mm4\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1046 "psubw %%mm3, %%mm5\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1047 "psubw %%mm4, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1048 "psubw %%mm5, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1049 "pxor %%mm3, %%mm3\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1050 "pxor %%mm1, %%mm1\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1051 "pcmpgtw %%mm0, %%mm3\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1052 "pcmpgtw %%mm2, %%mm1\n\t"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1053 "pxor %%mm3, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1054 "pxor %%mm1, %%mm2\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1055 "psubw %%mm3, %%mm0\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1056 "psubw %%mm1, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1057 "paddw %%mm0, %%mm2\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1058 "paddw %%mm2, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1059
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1060 "add %2,%0\n"
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1061 "subl $2, %%ecx\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1062 " jnz 1b\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1063
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1064 "movq %%mm6, %%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1065 "punpcklwd %%mm7,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1066 "punpckhwd %%mm7,%%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1067 "paddd %%mm0, %%mm6\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1068
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1069 "movq %%mm6,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1070 "psrlq $32, %%mm6\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1071 "paddd %%mm6,%%mm0\n"
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1072 "movd %%mm0,%1\n"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1073 : "+r" (pix1), "=r"(tmp)
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
1074 : "r" ((long)line_size) , "g" (h-2)
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1075 : "%ecx");
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1076 return tmp;
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1077 }
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1078
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
/**
 * High-frequency "noise" measure for a block that is 16 pixels wide.
 *
 * The inline assembly handles the left 8 columns: for every row it forms the
 * horizontal gradient p[x] - p[x+1] (x = 0..7), and accumulates the absolute
 * difference between the gradients of each pair of vertically adjacent rows.
 * The value returned by hf_noise8_mmx() for the block starting 8 bytes to the
 * right is then added to that sum.
 *
 * @param pix1      top-left pixel of the block; each row is read as 9 bytes
 *                  by the assembly (offsets 0 and 1, 8 bytes each)
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows; the loop counter starts at h-2 and is
 *                  decremented by 2 until it is exactly zero, so h has to be
 *                  even and greater than 2
 * @return accumulated sum for the left half plus hf_noise8_mmx(pix1+8, ...)
 *
 * No emms is issued here; the MMX state is left as-is on return.
 */
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;   /* the asm advances pix1, keep the original start */
    __asm__ volatile (
        "movl %3,%%ecx\n"              /* ecx = h-2: rows left for the loop */
        "pxor %%mm7,%%mm7\n"           /* mm7 = 0, used for byte->word unpacking */
        "pxor %%mm6,%%mm6\n"           /* mm6 = four 16-bit accumulators */

        /* row 0: mm0 (low 4) / mm2 (high 4) = p[x] - p[x+1] */
        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        /* row 1: gradient in mm4/mm5, then |row0 - row1| added to mm6 */
        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"     /* sign masks ... */
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"          /* ... abs(x) = (x ^ mask) - mask */
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        /* even row: gradient in mm0/mm2, previous row's gradient is in mm4/mm5 */
        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        /* odd row: gradient in mm4/mm5, previous row's gradient is in mm0/mm2 */
        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        /* widen the four word sums to dwords and add them together */
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        /* "memory": the pixel rows are read through %0, which the operand
         * list alone does not tell the compiler. */
        : "%ecx", "memory");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1192
2864
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents: 2754
diff changeset
1193 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents: 2754
diff changeset
1194 MpegEncContext *c = p;
2940
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1195 int score1, score2;
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1196
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1197 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1198 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
8aa244d7c274 use sse16_sse2() in nsse
lorenm
parents: 2922
diff changeset
1199 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1200
4001
34fdffe98bd0 Rename ABS macro to FFABS.
diego
parents: 3947
diff changeset
1201 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
34fdffe98bd0 Rename ABS macro to FFABS.
diego
parents: 3947
diff changeset
1202 else return score1 + FFABS(score2)*8;
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1203 }
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1204
2864
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents: 2754
diff changeset
1205 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
95bac7109ff0 Kill some compiler warnings. Compiled code verified identical after changes.
mru
parents: 2754
diff changeset
1206 MpegEncContext *c = p;
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1207 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1208 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1209
4001
34fdffe98bd0 Rename ABS macro to FFABS.
diego
parents: 3947
diff changeset
1210 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
34fdffe98bd0 Rename ABS macro to FFABS.
diego
parents: 3947
diff changeset
1211 else return score1 + FFABS(score2)*8;
2067
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1212 }
f37b6ffc81ed sse8 and nsse in mmx
michael
parents: 2024
diff changeset
1213
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical sum of absolute differences inside one 16-pixel-wide block:
 * the sum over all rows but the first of |row[y][x] - row[y-1][x]|, x = 0..15.
 *
 * @param v         unused
 * @param pix       top-left pixel of the block, must be 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between consecutive rows, multiple of 8
 * @param h         number of rows; the counter is reduced by 2 before the
 *                  loop and by 2 per iteration until it is exactly zero, so
 *                  h has to be even and at least 4
 * @return the sum, truncated to 16 bits (it is accumulated in 16-bit lanes)
 *
 * No emms is issued here; the MMX state is left as-is on return.
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    /* uintptr_t rather than int: a pointer does not fit in an int on LP64 */
    assert( (((uintptr_t)pix) & 7) == 0);
    assert((line_size &7) ==0);

/*
 * SUM(in0, in1, out0, out1): process one row.
 * On entry in0/in1 hold the 16 bytes of the previous row. The current row is
 * loaded (and %0 advanced to the next one), copied to out0/out1 for the next
 * invocation, and |current - previous| is formed from two saturating
 * subtractions OR'ed together. The bytes are widened to words with mm7 (zero)
 * and added into the accumulator mm6. in0/in1, mm2 and mm3 are overwritten.
 */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3,%%ecx\n"              /* ecx = h */
        "pxor %%mm6,%%mm6\n"           /* accumulator */
        "pxor %%mm7,%%mm7\n"           /* zero for unpacking */
        "movq (%0),%%mm0\n"            /* row 0 */
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"             /* row 0 and the row below are handled before the loop */
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        /* two rows per iteration, the register pairs swap roles */
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        /* fold the four 16-bit lanes of mm6 into the lowest lane */
        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* "memory": the pixel rows are read through %0, which the operand
         * list alone does not tell the compiler. */
        : "%ecx", "memory");
    /* only the lowest 16-bit lane holds the total */
    return tmp & 0xFFFF;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1275
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical sum of absolute differences inside one 16-pixel-wide block
 * (MMX2 version built on psadbw).
 *
 * Adds up |pix[y][x] - pix[y-1][x]| for x = 0..15 and y = 1..h-1.
 *
 * @param v         unused
 * @param pix       first pixel of the block, must be 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between two rows, multiple of 8
 * @param h         number of rows; the loop handles two rows per pass and
 *                  terminates on a counter reaching exactly zero, so h must
 *                  be even and at least 4
 * @return the accumulated sum as left in the low 32 bits of mm6 (the
 *         partial sums are added with 16-bit paddw)
 *
 * MMX state is left dirty; no emms is issued here.
 */
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    /* long instead of int: a pointer does not fit an int on x86_64 */
    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);
    /* h == 2 or odd h would make the "subl $2 / jnz" loop run away */
    assert(h >= 4 && (h & 1) == 0);

/* in0/in1 hold the previous row; out0/out1 receive the next row.
 * On exit in0/in1 are destroyed and mm6 has the row SAD added to it. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = running total */
      "pxor %%mm7,%%mm7\n"          /* cleared, but SUM above never reads mm7 */
      "movq (%0),%%mm0\n"           /* row 0 */
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"            /* rows 0 and 1 are consumed before the loop */
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixels are read through pix without being listed
       * as operands, so pending stores must be visible to the asm */
      : "%ecx", "memory");
    return tmp;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1316
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical sum of absolute differences of the residual pix1 - pix2 for a
 * 16-pixel-wide block (plain MMX version).
 *
 * For every row the byte-wise residual d = ((pix1 - pix2) mod 256) ^ 0x80
 * is formed; the function adds up |d[y][x] - d[y-1][x]| for x = 0..15 and
 * y = 1..h-1.
 *
 * @param v         unused
 * @param pix1      first block, must be 8-byte aligned
 * @param pix2      second block, must be 8-byte aligned
 * @param line_size distance in bytes between two rows (used for both
 *                  blocks), multiple of 8
 * @param h         number of rows; the loop handles two rows per pass and
 *                  terminates on a counter reaching exactly zero, so h must
 *                  be even and at least 4
 * @return the sum, accumulated in 16-bit words and masked with 0x7FFF
 *
 * NOTE(review): the intra variant earlier in this file masks with 0xFFFF
 * while this one keeps only 15 bits - confirm that this is intended.
 *
 * MMX state is left dirty; no emms is issued here.
 */
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    /* long instead of int: a pointer does not fit an int on x86_64 */
    assert( (((long)pix1) & 7) == 0);
    assert( (((long)pix2) & 7) == 0);
    assert((line_size &7) ==0);
    /* h == 2 or odd h would make the "subl $2 / jnz" loop run away */
    assert(h >= 4 && (h & 1) == 0);

/* in0/in1 hold the previous residual row; out0/out1 receive the next one.
 * |in - out| is built from two saturating subtractions OR-ed together.
 * mm7 is 0x80 in every byte: it is the xor bias and also ends up as the high
 * byte of each unpacked word (0x8000); four such words are added per lane,
 * so the bias cancels modulo 2^16 before the result reaches mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = four 16-bit running totals */
      "pcmpeqw %%mm7,%%mm7\n"       /* build 0x80 in every byte of mm7 */
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"           /* row 0 of both blocks */
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"            /* rows 0 and 1 are consumed before the loop */
      "psubb %%mm2, %%mm0\n"        /* residual of row 0 */
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      /* horizontal add of the four words; the total lands in the low word */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixels are read through pix1/pix2 without being
       * listed as operands, so pending stores must be visible to the asm */
      : "%ecx", "memory");
    return tmp & 0x7FFF;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1395
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
/**
 * Vertical sum of absolute differences of the residual pix1 - pix2 for a
 * 16-pixel-wide block (MMX2 version built on psadbw).
 *
 * For every row the byte-wise residual d = ((pix1 - pix2) mod 256) ^ 0x80
 * is formed; the function adds up |d[y][x] - d[y-1][x]| for x = 0..15 and
 * y = 1..h-1.
 *
 * @param v         unused
 * @param pix1      first block, must be 8-byte aligned
 * @param pix2      second block, must be 8-byte aligned
 * @param line_size distance in bytes between two rows (used for both
 *                  blocks), multiple of 8
 * @param h         number of rows; the loop handles two rows per pass and
 *                  terminates on a counter reaching exactly zero, so h must
 *                  be even and at least 4
 * @return the accumulated sum as left in the low 32 bits of mm6 (the
 *         partial sums are added with 16-bit paddw)
 *
 * MMX state is left dirty; no emms is issued here.
 */
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    /* long instead of int: a pointer does not fit an int on x86_64 */
    assert( (((long)pix1) & 7) == 0);
    assert( (((long)pix2) & 7) == 0);
    assert((line_size &7) ==0);
    /* h == 2 or odd h would make the "subl $2 / jnz" loop run away */
    assert(h >= 4 && (h & 1) == 0);

/* in0/in1 hold the previous residual row; out0/out1 receive the next one
 * (pix1 - pix2, xor-ed with the 0x80 bias in mm7). in0/in1 are destroyed
 * and mm6 has the row SAD added to it. mm2/mm3 are scratch. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* mm6 = running total */
      "pcmpeqw %%mm7,%%mm7\n"       /* build 0x80 in every byte of mm7 */
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"           /* row 0 of both blocks */
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"            /* rows 0 and 1 are consumed before the loop */
      "psubb %%mm2, %%mm0\n"        /* residual of row 0 */
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)

      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      /* "memory": the pixels are read through pix1/pix2 without being
       * listed as operands, so pending stores must be visible to the asm */
      : "%ecx", "memory");
    return tmp;
}
#undef SUM
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
1453
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
/**
 * Byte-wise difference of two buffers: dst[i] = src1[i] - src2[i]
 * (modulo 256) for 0 <= i < w.
 *
 * The MMX loop handles 16 bytes per pass while at least 16 bytes remain;
 * the scalar loop finishes the tail.
 *
 * @param dst  output buffer, w bytes
 * @param src1 minuend, w bytes
 * @param src2 subtrahend, w bytes
 * @param w    number of bytes
 *
 * MMX state is left dirty; no emms is issued here.
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;

    /* The asm loop tests its bound only after the first 16-byte pass, and
     * "jb" compares unsigned, so for w < 16 it would overrun the buffers
     * (and for w < 15 see w-15 as a huge bound). Skip it in that case. */
    if (w >= 16) {
        __asm__ volatile(
            "1: \n\t"
            "movq (%2, %0), %%mm0 \n\t"
            "movq (%1, %0), %%mm1 \n\t"
            "psubb %%mm0, %%mm1 \n\t"
            "movq %%mm1, (%3, %0) \n\t"
            "movq 8(%2, %0), %%mm0 \n\t"
            "movq 8(%1, %0), %%mm1 \n\t"
            "psubb %%mm0, %%mm1 \n\t"
            "movq %%mm1, 8(%3, %0) \n\t"
            "add $16, %0 \n\t"
            "cmp %4, %0 \n\t"
            " jb 1b \n\t"
            : "+r" (i)
            : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
            /* dst is written and src1/src2 are read behind the compiler's back */
            : "memory"
        );
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1475
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
/**
 * Subtract the median predictor from a row (MMX2 version using
 * pminub/pmaxub).
 *
 * For each position the prediction is the median of L, T and L + T - LT
 * (byte arithmetic), where T = src1[i], LT = src1[i-1], L = src2[i-1];
 * dst[i] = src2[i] - prediction.
 *
 * The asm loop works on 8 bytes per pass and tests its bound only after a
 * pass, so it reads src1[-1] and src2[-1] and processes a multiple of
 * 8 bytes, possibly beyond w. dst[0] is then recomputed in C from *left and
 * *left_top, which replace the out-of-row neighbours.
 * NOTE(review): callers presumably provide the necessary padding around the
 * buffers - confirm.
 *
 * @param dst      output residuals
 * @param src1     row above the current one
 * @param src2     current row
 * @param w        number of bytes in the row
 * @param left     in: left neighbour of src2[0]; out: src2[w-1]
 * @param left_top in: left neighbour of src1[0]; out: src1[w-1]
 *
 * MMX state is left dirty; no emms is issued here.
 */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1: \n\t"
        "movq -1(%1, %0), %%mm0 \n\t" // LT
        "movq (%1, %0), %%mm1 \n\t" // T
        "movq -1(%2, %0), %%mm2 \n\t" // L
        "movq (%2, %0), %%mm3 \n\t" // X
        "movq %%mm2, %%mm4 \n\t" // L
        "psubb %%mm0, %%mm2 \n\t"
        "paddb %%mm1, %%mm2 \n\t" // L + T - LT
        "movq %%mm4, %%mm5 \n\t" // L
        "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
        "pminub %%mm5, %%mm1 \n\t" // min(T, L)
        "pminub %%mm2, %%mm4 \n\t" // clamp the gradient from above ...
        "pmaxub %%mm1, %%mm4 \n\t" // ... and from below: median
        "psubb %%mm4, %%mm3 \n\t" // dst - pred
        "movq %%mm3, (%3, %0) \n\t"
        "add $8, %0 \n\t"
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
        /* dst is stored to by the asm; without this clobber the compiler
         * would be free to move the dst[0] store below ahead of it */
        : "memory"
    );

    l= *left;
    lt= *left_top;

    /* redo the first element with the real left neighbours */
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left = src2[w-1];
}
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
1511
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * Asm text computing one row of 16-bit differences acc = row(src1) - row(src2).
 *
 *   sfx     - suffix of the "mov" instruction used for the two loads
 *   acc     - register receiving the result
 *   scratch - register clobbered as temporary
 *   src1    - memory operand of the first row
 *   src2    - memory operand of the second row
 *
 * Both rows are widened by interleaving them with the src1 bytes as high
 * byte, so that the common high byte cancels in the word subtraction and
 * only src1 - src2 remains in every word.
 */
#define DIFF_PIXELS_1(sfx,acc,scratch,src1,src2)\
    "mov"#sfx" "#src1", "#acc" \n\t"\
    "mov"#sfx" "#src2", "#scratch" \n\t"\
    "punpcklbw "#acc", "#scratch" \n\t"\
    "punpcklbw "#acc", "#acc" \n\t"\
    "psubw "#scratch", "#acc" \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * Load eight rows of p1 and p2 and leave the eight rows of 16-bit
 * differences p1 - p2 in registers <mm>0 .. <mm>7 (row n in register n).
 *
 *   m0     - "mov" suffix used for the pixel loads
 *   m1     - "mov" suffix used to spill/reload a full register
 *   mm     - register name prefix (%%mm or %%xmm)
 *   p1, p2 - pixel pointers (not modified; local copies are advanced)
 *   stride - distance in bytes between rows, used for both pointers
 *   temp   - memory lvalue large enough for one register
 *
 * Rows 0-2 are addressed from the original pointers, then both pointers are
 * advanced by 3*stride so that rows 3-7 are reachable as offsets 0, stride,
 * 2*stride, 3*stride and 4*stride. Register 7 is the scratch register for
 * rows 0-6; for row 7 register 0 has to serve as scratch, so its result is
 * parked in temp and reloaded afterwards.
 *
 * The results live only in the SIMD registers, so the code consuming them
 * has to follow immediately.
 */
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=(p1), *p2b=(p2);\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1 \n\t"\
        "add %4, %2 \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0 \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0 \n\t"\
        : "=m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)(stride)), "r"((long)(stride)*3)\
        : "memory"\
    );\
}
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1538
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* MMX instantiation of DIFF_PIXELS_8: passes the move suffixes "d" and "q"
 * and the %%mm register prefix, forwarding p1, p2, stride and temp unchanged.
 * NOTE(review): the "4x8" name suggests a 4-pixel-wide, 8-row block; confirm
 * against DIFF_PIXELS_1, which is defined outside this excerpt. */
1539 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* SSE instantiation of DIFF_PIXELS_8: passes the move suffixes "q" and "dqa"
 * and the %%xmm register prefix, forwarding p1, p2, stride and temp unchanged.
 * The "dqa" suffix produces movdqa for the spill to temp, so temp has to be
 * 16-byte aligned. */
1540 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1541
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* TRANSPOSE8(a..h, t): transpose built from twelve SBUTTERFLY steps
 * (word, dword, then qword interleaves) over the eight registers a..h.
 * The results are left in permuted register order, as noted below.
 *
 * x86-64 variant: uses %%xmm8 as the extra scratch register; the t argument
 * is accepted for interface compatibility but never referenced. */
1542 #ifdef ARCH_X86_64
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1543 // permutes 01234567 -> 05736421
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1544 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1545 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1546 SBUTTERFLY(c,d,b,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1547 SBUTTERFLY(e,f,d,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1548 SBUTTERFLY(g,h,f,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1549 SBUTTERFLY(a,c,h,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1550 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1551 SBUTTERFLY(e,g,b,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1552 SBUTTERFLY(d,f,g,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1553 SBUTTERFLY(a,e,f,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1554 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1555 SBUTTERFLY(h,b,d,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1556 SBUTTERFLY(c,g,b,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1557 "movdqa %%xmm8, "#g" \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1558 #else
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* Variant without %%xmm8: same SBUTTERFLY sequence, but register h doubles as
 * the scratch register and its live value is spilled to memory. Two 16-byte
 * slots are used, at t and at 16+t ("16"#t is pasted textually), both accessed
 * with movdqa, so t must name 32 bytes of 16-byte-aligned memory.
 * NOTE(review): t is presumably a plain "(%reg)" operand so that "16"#t
 * assembles to 16(%reg); confirm at the call sites. */
1559 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1560 "movdqa "#h", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1561 SBUTTERFLY(a,b,h,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1562 "movdqa "#h", 16"#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1563 "movdqa "#t", "#h" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1564 SBUTTERFLY(c,d,b,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1565 SBUTTERFLY(e,f,d,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1566 SBUTTERFLY(g,h,f,wd,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1567 SBUTTERFLY(a,c,h,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1568 "movdqa "#h", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1569 "movdqa 16"#t", "#h" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1570 SBUTTERFLY(h,b,c,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1571 SBUTTERFLY(e,g,b,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1572 SBUTTERFLY(d,f,g,dq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1573 SBUTTERFLY(a,e,f,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1574 SBUTTERFLY(h,d,e,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1575 "movdqa "#h", 16"#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1576 "movdqa "#t", "#h" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1577 SBUTTERFLY(h,b,d,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1578 SBUTTERFLY(c,g,b,qdq,dqa)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1579 "movdqa 16"#t", "#g" \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1580 #endif
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1581
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/* Two interleaved in-place butterflies on packed 16-bit words.
 * For each pair (a, b) the sequence  a += b; b += b; b -= a  leaves
 *     a = a_old + b_old
 *     b = b_old - a_old
 * without needing a temporary register. paddw/psubw wrap on overflow (no
 * saturation). The two pairs are interleaved instruction by instruction. */
1582 #define LBUTTERFLY2(a1,b1,a2,b2)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1583 "paddw " #b1 ", " #a1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1584 "paddw " #b2 ", " #a2 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1585 "paddw " #b1 ", " #b1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1586 "paddw " #b2 ", " #b2 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1587 "psubw " #a1 ", " #b1 " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1588 "psubw " #a2 ", " #b2 " \n\t"
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
1589
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* Three butterfly stages over eight registers, applied per 16-bit lane:
 *   stage 1 pairs (m0,m1) (m2,m3) (m4,m5) (m6,m7)
 *   stage 2 pairs (m0,m2) (m1,m3) (m4,m6) (m5,m7)
 *   stage 3 pairs (m0,m4) (m1,m5) (m2,m6) (m3,m7)
 * Each pair is replaced by its sum and difference (see LBUTTERFLY2).
 * The last line ends with a line continuation, so the blank line following
 * the definition is part of the macro and must stay blank. */
1590 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1591 LBUTTERFLY2(m0, m1, m2, m3)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1592 LBUTTERFLY2(m4, m5, m6, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1593 LBUTTERFLY2(m0, m2, m1, m3)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1594 LBUTTERFLY2(m4, m6, m5, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1595 LBUTTERFLY2(m0, m4, m1, m5)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1596 LBUTTERFLY2(m2, m6, m3, m7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1597
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* HADAMARD8 applied to the eight MMX registers %%mm0..%%mm7. */
1598 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1599
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/* Per-word absolute value of a using plain MMX instructions; z is clobbered.
 * z is zeroed, then pcmpgtw turns it into an all-ones mask for every word
 * where 0 > a. XOR-ing a with the mask and subtracting the mask negates
 * exactly the negative words (two's-complement negate: ~a + 1). */
1600 #define MMABS_MMX(a,z)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1601 "pxor " #z ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1602 "pcmpgtw " #a ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1603 "pxor " #z ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1604 "psubw " #z ", " #a " \n\t"
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1605
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
/* Per-word absolute value of a using pmaxsw; z is clobbered.
 * z is set to 0 - a, then a becomes the signed per-word maximum of a and z,
 * i.e. max(a, -a). */
1606 #define MMABS_MMX2(a,z)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1607 "pxor " #z ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1608 "psubw " #a ", " #z " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1609 "pmaxsw " #z ", " #a " \n\t"
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1610
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* Per-word absolute value of a with the single pabsw instruction.
 * z is unused; it is accepted so all MMABS_* variants share one signature. */
1611 #define MMABS_SSSE3(a,z)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1612 "pabsw " #a ", " #a " \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1613
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* Replaces a with its per-word absolute value (via MMABS, with z as scratch),
 * then adds it into sum with unsigned saturation (paddusw).
 * MMABS is not defined in this part of the file; it must be defined to one of
 * the MMABS_* variants before this macro is expanded. */
1614 #define MMABS_SUM(a,z, sum)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1615 MMABS(a,z)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1616 "paddusw " #a ", " #sum " \n\t"
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1617
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* Sums the per-word absolute values of %%xmm0..%%xmm7 into %%xmm0 with
 * unsigned saturation. Even-numbered registers accumulate into %%xmm0 and
 * odd-numbered ones into %%xmm1 (two independent chains), and the two
 * accumulators are combined at the end. %%xmm8 and %%xmm9 are used as MMABS
 * scratch registers, so nothing needs to be spilled to memory. All of
 * %%xmm0..%%xmm9 are modified. */
1618 #define MMABS_SUM_8x8_NOSPILL\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1619 MMABS(%%xmm0, %%xmm8)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1620 MMABS(%%xmm1, %%xmm9)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1621 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1622 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1623 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1624 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1625 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1626 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1627 "paddusw %%xmm1, %%xmm0 \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1628
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* MMABS_SUM_8x8_SSE2: saturating sum of |%%xmm0|..|%%xmm7| into %%xmm0.
 * On x86-64 it is simply MMABS_SUM_8x8_NOSPILL (scratch in %%xmm8/%%xmm9).
 * Otherwise only %%xmm0..%%xmm7 are used: %%xmm7 is saved to (%1) with movdqa
 * so it can serve as the MMABS scratch register, and its saved value is
 * reloaded into %%xmm2 (already consumed by then) for the final accumulation.
 * Operand %1 must therefore point at 16 bytes of 16-byte-aligned scratch
 * memory in the enclosing asm statement. */
1629 #ifdef ARCH_X86_64
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1630 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1631 #else
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1632 #define MMABS_SUM_8x8_SSE2\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1633 "movdqa %%xmm7, (%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1634 MMABS(%%xmm0, %%xmm7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1635 MMABS(%%xmm1, %%xmm7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1636 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1637 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1638 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1639 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1640 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1641 "movdqa (%1), %%xmm2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1642 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1643 "paddusw %%xmm1, %%xmm0 \n\t"
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1644 #endif
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
1645
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/* Loads four consecutive quadwords from byte offsets o, o+8, o+16 and o+24
 * relative to asm operand %1 into registers a, b, c and d (movq).
 * The last line ends with a line continuation, so the blank line following
 * the definition is part of the macro and must stay blank. */
1646 #define LOAD4(o, a, b, c, d)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1647 "movq "#o"(%1), "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1648 "movq "#o"+8(%1), "#b" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1649 "movq "#o"+16(%1), "#c" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1650 "movq "#o"+24(%1), "#d" \n\t"\
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1651
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
/* Stores registers a, b, c and d as four consecutive quadwords at byte
 * offsets o, o+8, o+16 and o+24 relative to asm operand %1 (movq).
 * The last line ends with a line continuation, so the blank line following
 * the definition is part of the macro and must stay blank. */
1652 #define STORE4(o, a, b, c, d)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1653 "movq "#a", "#o"(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1654 "movq "#b", "#o"+8(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1655 "movq "#c", "#o"+16(%1) \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1656 "movq "#d", "#o"+24(%1) \n\t"\
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1657
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1658 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1659 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1660 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/* Horizontal sum of the four 16-bit words of MMX register a, using only
 * shifts and saturating adds; t is clobbered. After folding the upper half
 * onto the lower half (>>32) and then word 1 onto word 0 (>>16), the low word
 * of a holds the saturated total. movd copies the low 32 bits to dst, so only
 * the low 16 bits of dst are meaningful; the caller has to mask them.
 * The last line ends with a line continuation, so the blank line following
 * the definition is part of the macro and must stay blank. */
1661 #define HSUM_MMX(a, t, dst)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1662 "movq "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1663 "psrlq $32, "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1664 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1665 "movq "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1666 "psrlq $16, "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1667 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1668 "movd "#a", "#dst" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1669
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/* Horizontal sum of the four 16-bit words of MMX register a using pshufw;
 * t is clobbered. pshufw $0x0E brings words 2,3 down to positions 0,1 and
 * pshufw $0x01 brings word 1 down to position 0; each shuffle is followed by
 * a saturating add, so the low word of a ends up holding the total.
 * movd copies the low 32 bits to dst; only the low 16 bits are meaningful.
 * The last line ends with a line continuation, so the blank line following
 * the definition is part of the macro and must stay blank. */
1670 #define HSUM_MMX2(a, t, dst)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1671 "pshufw $0x0E, "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1672 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1673 "pshufw $0x01, "#a", "#t" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1674 "paddusw "#t", "#a" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1675 "movd "#a", "#dst" \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1676
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* Horizontal sum of the eight 16-bit words of XMM register a; t is clobbered.
 * movhlps folds the upper 64 bits onto the lower 64, then two pshuflw steps
 * ($0x0E, $0x01) fold the remaining four words down to word 0, with a
 * saturating add after each step. movd copies the low 32 bits to dst; only
 * the low 16 bits are meaningful.
 * The last line ends with a line continuation, so the blank line following
 * the definition is part of the macro and must stay blank. */
1677 #define HSUM_SSE2(a, t, dst)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1678 "movhlps "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1679 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1680 "pshuflw $0x0E, "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1681 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1682 "pshuflw $0x01, "#a", "#t" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1683 "paddusw "#t", "#a" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1684 "movd "#a", "#dst" \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1685
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/* HADAMARD8_DIFF_MMX(cpu): generates
 *     static int hadamard8_diff_<cpu>(void *s, uint8_t *src1, uint8_t *src2,
 *                                     int stride, int h)
 * and, through WARPER8_16_SQ, the companion hadamard8_diff16_<cpu>.
 * MMABS and HSUM must be defined to the variants matching <cpu> before the
 * macro is expanded (they are not defined in this part of the file).
 *
 * The generated function asserts h == 8, never uses s, and returns the low
 * 16 bits of the saturating sum of absolute values left in %%mm0 by the
 * steps below (HSUM_* saturates at 64k; see the FIXME above HSUM_MMX).
 *
 * Steps, as written in the body:
 *   1. DIFF_PIXELS_4x8 on src1/src2, then HADAMARD48 and two TRANSPOSE4 calls
 *      (transpose is done 4 registers at a time). Results are
 *      stored at temp bytes 0..31 (from mm0..3) and 64..95 (from mm4..7).
 *   2. The same for src1+4/src2+4; mm0..3 results go to temp bytes 32..63,
 *      the mm4..7 results stay in registers (the three movq shuffles put
 *      them back into mm4..mm7 order).
 *   3. LOAD4(64) + HADAMARD48, absolute values summed into %%mm0 and parked
 *      at 64(%1).
 *   4. LOAD4(0) + LOAD4(32) + HADAMARD48, absolute values summed, the parked
 *      partial sum added, and HSUM writes the total to sum.
 * %%mm7 is spilled to 96(%1), 64(%1) or (%1) whenever a scratch register is
 * needed; temp[13] (104 bytes) covers exactly offsets 0..103.
 *
 * NOTE(review): both asm statements store to temp through the "r"(temp)
 * pointer without a "memory" clobber, and neither lists the MMX registers as
 * clobbered; the first statement also declares "=r"(sum) but never writes %0.
 * The code relies on the compiler not reordering or caching around these
 * statements; confirm before changing optimisation settings or the operands. */
1686 #define HADAMARD8_DIFF_MMX(cpu) \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1687 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1688 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1689 int sum;\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1690 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1691 assert(h==8);\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1692 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1693 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1694 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1695 asm volatile(\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1696 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1697 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1698 "movq %%mm7, 96(%1) \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1699 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1700 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1701 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1702 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1703 "movq 96(%1), %%mm7 \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1704 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1705 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1706 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1707 : "=r" (sum)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1708 : "r"(temp)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1709 );\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1710 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1711 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1712 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1713 asm volatile(\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1714 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1715 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1716 "movq %%mm7, 96(%1) \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1717 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1718 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1719 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1720 \
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1721 "movq 96(%1), %%mm7 \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1722 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1723 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1724 "movq %%mm6, %%mm7 \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1725 "movq %%mm0, %%mm6 \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1726 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1727 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1728 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1729 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1730 "movq %%mm7, 64(%1) \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1731 MMABS(%%mm0, %%mm7)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1732 MMABS(%%mm1, %%mm7)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1733 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1734 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1735 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1736 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1737 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1738 "movq 64(%1), %%mm2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1739 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1740 "paddusw %%mm1, %%mm0 \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1741 "movq %%mm0, 64(%1) \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1742 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1743 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1744 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1745 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1746 HADAMARD48\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1747 "movq %%mm7, (%1) \n\t"\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1748 MMABS(%%mm0, %%mm7)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1749 MMABS(%%mm1, %%mm7)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1750 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1751 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1752 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1753 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1754 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1755 "movq (%1), %%mm2 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1756 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1757 "paddusw 64(%1), %%mm0 \n\t"\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1758 "paddusw %%mm1, %%mm0 \n\t"\
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1759 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1760 HSUM(%%mm0, %%mm1, %0)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1761 \
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1762 : "=r" (sum)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1763 : "r"(temp)\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1764 );\
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1765 return sum&0xFFFF;\
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1766 }\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1767 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1768
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/*
 * HADAMARD8_DIFF_SSE2(cpu): expands to static int hadamard8_diff_<cpu>(), written
 * with xmm registers, followed by a WARPER8_16_SQ() expansion that is handed the
 * names hadamard8_diff_<cpu> and hadamard8_diff16_<cpu>.
 *
 * Generated function:
 *   s            - not referenced in the visible body
 *   src1, src2   - pixel inputs passed to DIFF_PIXELS_8x8
 *   stride       - passed to DIFF_PIXELS_8x8
 *   h            - must be 8 (asserted)
 *   returns        the low 16 bits of the value produced by HSUM_SSE2
 *
 * The asm block runs HADAMARD8, TRANSPOSE8, HADAMARD8, MMABS_SUM_8x8, HSUM_SSE2.
 * MMABS_SUM_8x8 must be #defined at the point of expansion.
 * NOTE(review): the asm statement has no xmm inputs and no clobber list, so it
 * presumably relies on DIFF_PIXELS_8x8 leaving its result in xmm0-xmm7; the
 * helper macros are defined outside this section - confirm there.
 */
1769 #define HADAMARD8_DIFF_SSE2(cpu) \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1770 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1771 DECLARE_ALIGNED_16(uint64_t, temp[4]); /* scratch: given to DIFF_PIXELS_8x8 and, as (%1), to TRANSPOSE8 */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1772 int sum;\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1773 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1774 assert(h==8); /* only 8-row blocks are handled */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1775 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1776 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1777 \
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1778 asm volatile(\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1779 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1780 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1781 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1) /* register order differs from the first pass; presumably it follows where TRANSPOSE8 leaves the rows - confirm */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1782 MMABS_SUM_8x8\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1783 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1784 : "=r" (sum)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1785 : "r"(temp)\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1786 );\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1787 return sum&0xFFFF; /* keep only the low 16 bits */\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1788 }\
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1789 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
1790
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
/*
 * Instantiate the hadamard8_diff functions. MMABS, HSUM and MMABS_SUM_8x8 are
 * redefined before each expansion to select the per-CPU helper macros and are
 * undefined again afterwards.
 */
1791 #define MMABS(a,z) MMABS_MMX(a,z)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1792 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1793 HADAMARD8_DIFF_MMX(mmx) /* hadamard8_diff_mmx */
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1794 #undef MMABS
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1795 #undef HSUM
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1796
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1797 #define MMABS(a,z) MMABS_MMX2(a,z) /* in effect for both the mmx2 and the sse2 expansion below */
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1798 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 /* referenced by HADAMARD8_DIFF_SSE2 */
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1799 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1800 HADAMARD8_DIFF_MMX(mmx2) /* hadamard8_diff_mmx2 */
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1801 HADAMARD8_DIFF_SSE2(sse2) /* hadamard8_diff_sse2 */
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1802 #undef MMABS
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1803 #undef MMABS_SUM_8x8
4946
c1fb4544bd59 cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents: 4939
diff changeset
1804 #undef HSUM
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
1805
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
/* SSSE3 flavour of hadamard8_diff, only built when HAVE_SSSE3 is defined. */
1806 #ifdef HAVE_SSSE3
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1807 #define MMABS(a,z) MMABS_SSSE3(a,z)
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1808 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL /* a different summation macro than the sse2 instantiation uses */
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1809 HADAMARD8_DIFF_SSE2(ssse3) /* hadamard8_diff_ssse3 */
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1810 #undef MMABS
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1811 #undef MMABS_SUM_8x8
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
1812 #endif
4749
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1813
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * DCT_SAD4(m, mm, o): asm fragment that loads four vectors from (%1) at byte
 * offsets o+0, o+16, o+32 and o+48 into registers <mm>2..<mm>5 and feeds each
 * one to MMABS_SUM, alternating between accumulators <mm>0 and <mm>1.
 *   m  - suffix appended to "mov" to form the load instruction (q, dqa, ...)
 *   mm - register name prefix (%%mm or %%xmm)
 *   o  - base byte offset
 * Registers <mm>6 and <mm>7 are passed as the middle MMABS_SUM argument;
 * presumably a scratch register - MMABS_SUM is defined elsewhere.
 */
1814 #define DCT_SAD4(m,mm,o)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1815 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1816 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1817 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1818 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1819 MMABS_SUM(mm##2, mm##6, mm##0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1820 MMABS_SUM(mm##3, mm##7, mm##1)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1821 MMABS_SUM(mm##4, mm##6, mm##0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1822 MMABS_SUM(mm##5, mm##7, mm##1)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1823
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * DCT_SAD_MMX: asm body for the 64-bit register variant. Clears accumulators
 * mm0/mm1, runs DCT_SAD4 with 8-byte loads at base offsets 0, 8, 64 and 72
 * (together these cover bytes 0..127 at (%1)), merges the two accumulators
 * with a saturating 16-bit add and reduces mm0 into %0 via HSUM.
 */
1824 #define DCT_SAD_MMX\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1825 "pxor %%mm0, %%mm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1826 "pxor %%mm1, %%mm1 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1827 DCT_SAD4(q, %%mm, 0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1828 DCT_SAD4(q, %%mm, 8)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1829 DCT_SAD4(q, %%mm, 64)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1830 DCT_SAD4(q, %%mm, 72)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1831 "paddusw %%mm1, %%mm0 \n\t" /* saturating add of the two partial sums */\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1832 HSUM(%%mm0, %%mm1, %0)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1833
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * DCT_SAD_SSE2: asm body for the 128-bit register variant. Clears accumulators
 * xmm0/xmm1, runs DCT_SAD4 with movdqa (aligned 16-byte) loads at base offsets
 * 0 and 64 (together bytes 0..127 at (%1)), merges the accumulators with a
 * saturating 16-bit add and reduces xmm0 into %0 via HSUM.
 */
1834 #define DCT_SAD_SSE2\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1835 "pxor %%xmm0, %%xmm0 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1836 "pxor %%xmm1, %%xmm1 \n\t"\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1837 DCT_SAD4(dqa, %%xmm, 0)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1838 DCT_SAD4(dqa, %%xmm, 64)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1839 "paddusw %%xmm1, %%xmm0 \n\t" /* saturating add of the two partial sums */\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1840 HSUM(%%xmm0, %%xmm1, %0)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1841
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * DCT_SAD_FUNC(cpu): expands to static int sum_abs_dctelem_<cpu>(DCTELEM *block).
 * The function body is a single asm statement made of whatever DCT_SAD is
 * defined as at the point of expansion (DCT_SAD_MMX or DCT_SAD_SSE2 in this
 * file); those bodies also need HSUM and MMABS to be defined.
 *   block   - pointer passed to the asm as %1; the DCT_SAD_* bodies read
 *             128 bytes from it (the SSE2 body with aligned loads)
 *   returns   the low 16 bits of the value the asm stores in sum
 */
1842 #define DCT_SAD_FUNC(cpu) \
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1843 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1844 int sum;\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1845 asm volatile(\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1846 DCT_SAD\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1847 :"=r"(sum)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1848 :"r"(block)\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1849 );\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1850 return sum&0xFFFF; /* keep only the low 16 bits */\
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1851 }
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1852
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
/*
 * Instantiate sum_abs_dctelem_{mmx,mmx2,sse2,ssse3}. DCT_SAD, HSUM and MMABS
 * are switched between expansions; note which definitions carry over from
 * one expansion to the next.
 */
1853 #define DCT_SAD DCT_SAD_MMX /* stays in effect for the mmx and mmx2 expansions */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1854 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1855 #define MMABS(a,z) MMABS_MMX(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1856 DCT_SAD_FUNC(mmx) /* sum_abs_dctelem_mmx */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1857 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1858 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1859
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1860 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1861 #define MMABS(a,z) MMABS_MMX2(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1862 DCT_SAD_FUNC(mmx2) /* sum_abs_dctelem_mmx2 */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1863 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1864 #undef DCT_SAD /* MMABS is not undefined here: it is still MMABS_MMX2 for the sse2 expansion */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1865
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1866 #define DCT_SAD DCT_SAD_SSE2 /* stays in effect for the sse2 and ssse3 expansions */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1867 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1868 DCT_SAD_FUNC(sse2) /* sum_abs_dctelem_sse2 */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1869 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1870
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1871 #ifdef HAVE_SSSE3
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1872 #define MMABS(a,z) MMABS_SSSE3(a,z)
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1873 DCT_SAD_FUNC(ssse3) /* sum_abs_dctelem_ssse3; reuses DCT_SAD_SSE2 and HSUM_SSE2 */
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1874 #undef MMABS
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1875 #endif
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1876 #undef HSUM
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1877 #undef DCT_SAD
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
1878
5255
669a97223dc7 make arguments to ssd_int8_vs_int16() const
mru
parents: 5049
diff changeset
/*
 * Sum of squared differences between an int8_t array and an int16_t array,
 * computed with MMX.
 *
 *   pix1  - int8_t input
 *   pix2  - int16_t input
 *   size  - element count; the loop starts at index size-8 and steps down by 8
 *           (assumes size is a positive multiple of 8 - TODO confirm with callers)
 *   returns the sum of (pix2[k] - pix1[k])^2, with 16-bit differences and a
 *           32-bit accumulator
 *
 * NOTE(review): the asm statement declares no clobbers although it writes
 * mm0-mm4 and the flags.
 */
1879 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
4749
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1880 int sum;
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1881 long i=size; /* pointer-width index used in the address expressions below */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1882 asm volatile(
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1883 "pxor %%mm4, %%mm4 \n" /* mm4 = 0: two 32-bit partial sums */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1884 "1: \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1885 "sub $8, %0 \n" /* i -= 8; these flags feed the jg at the end of the loop */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1886 "movq (%2,%0), %%mm2 \n" /* pix1[i..i+7] (8 bytes) */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1887 "movq (%3,%0,2), %%mm0 \n" /* pix2[i..i+3] */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1888 "movq 8(%3,%0,2), %%mm1 \n" /* pix2[i+4..i+7] */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1889 "punpckhbw %%mm2, %%mm3 \n" /* pix1[i+4..i+7] into the upper byte of each word; lower bytes are stale mm3 data */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1890 "punpcklbw %%mm2, %%mm2 \n" /* pix1[i..i+3] duplicated into both bytes of each word */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1891 "psraw $8, %%mm3 \n" /* arithmetic shift: sign-extend to 16 bits, discarding the lower byte */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1892 "psraw $8, %%mm2 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1893 "psubw %%mm3, %%mm1 \n" /* pix2 - pix1 for elements i+4..i+7 */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1894 "psubw %%mm2, %%mm0 \n" /* pix2 - pix1 for elements i..i+3 */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1895 "pmaddwd %%mm1, %%mm1 \n" /* square each word and add adjacent pairs -> two 32-bit sums */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1896 "pmaddwd %%mm0, %%mm0 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1897 "paddd %%mm1, %%mm4 \n" /* accumulate */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1898 "paddd %%mm0, %%mm4 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1899 "jg 1b \n" /* loop while i > 0 (flags from the sub above) */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1900 "movq %%mm4, %%mm3 \n" /* horizontal add: upper dword + lower dword */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1901 "psrlq $32, %%mm3 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1902 "paddd %%mm3, %%mm4 \n"
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1903 "movd %%mm4, %1 \n" /* sum = low 32 bits of mm4 */
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1904 :"+r"(i), "=r"(sum)
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1905 :"r"(pix1), "r"(pix2)
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1906 );
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1907 return sum;
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1908 }
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
1909
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
1910 #endif //CONFIG_ENCODERS
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
1911
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* The put_no_rnd_pixels{8,16}_mmx names are plain aliases for put_pixels{8,16}_mmx. */
1912 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1913 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1914
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/*
 * QPEL_V_LOW: asm fragment computing one group of outputs of the filter
 *   (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5
 * on 16-bit lanes, packing the result to unsigned bytes and handing it to OP.
 *   m3..m6        - caller-chosen registers holding the current inputs
 *   pw_20, pw_3   - not referenced in the body; the constants are read through
 *                   MANGLE(ff_pw_20) and MANGLE(ff_pw_3) instead
 *   rnd           - operand added as the rounder before the shift
 *   in0,in1,in2,in7 - memory operands for the remaining taps
 *   out           - destination operand given to OP
 *   OP            - store macro, invoked as OP(%%mm5, out, %%mm7, d)
 * Besides the m3..m6 arguments the body uses %%mm4, %%mm5 and %%mm6 as fixed
 * temporaries, and m3 is overwritten with in7.
 */
1915 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1916 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1917 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1918 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1919 "movq "#in7", " #m3 " \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1920 "movq "#in0", %%mm5 \n\t" /* D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1921 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1922 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1923 "movq "#in1", %%mm5 \n\t" /* C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1924 "movq "#in2", %%mm6 \n\t" /* B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1925 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1926 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1927 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1928 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1929 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1930 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1931 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 + rnd */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1932 "psraw $5, %%mm5 \n\t" /* divide by 32 (arithmetic shift) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1933 "packuswb %%mm5, %%mm5 \n\t" /* saturate to unsigned bytes */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1934 OP(%%mm5, out, %%mm7, d)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1935
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1936 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
1937 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1938 uint64_t temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1939 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1940 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1941 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1942 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1943 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1944 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1945 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1946 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1947 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1948 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1949 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1950 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1951 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1952 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1953 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1954 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1955 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1956 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1957 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1958 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1959 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1960 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1961 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1962 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1963 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1964 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1965 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1966 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1967 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1968 "paddw %6, %%mm6 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1969 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1970 "psraw $5, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1971 "movq %%mm0, %5 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1972 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
1973 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1974 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1975 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1976 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1977 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1978 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1979 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1980 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1981 "paddw %%mm0, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1982 "paddw %%mm5, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1983 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1984 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1985 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1986 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1987 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1988 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1989 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1990 "paddw %%mm2, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1991 "paddw %%mm6, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1992 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1993 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1994 "paddw %6, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1995 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1996 "psraw $5, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1997 "movq %5, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
1998 "packuswb %%mm3, %%mm1 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
1999 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2000 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2001 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2002 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2003 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2004 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2005 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2006 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2007 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2008 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2009 "paddw %%mm1, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2010 "paddw %%mm4, %%mm0 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2011 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2012 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2013 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2014 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2015 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2016 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2017 "paddw %%mm3, %%mm2 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2018 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2019 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2020 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2021 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2022 "paddw %%mm2, %%mm6 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2023 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2024 "paddw %6, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2025 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2026 "psraw $5, %%mm0 \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2027 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2028 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2029 "paddw %%mm5, %%mm3 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2030 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2031 "paddw %%mm4, %%mm6 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2032 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2033 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2034 "paddw %%mm1, %%mm4 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2035 "paddw %%mm2, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2036 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2037 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2038 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2039 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2040 "psubw %%mm5, %%mm3 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2041 "paddw %6, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2042 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2043 "psraw $5, %%mm4 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2044 "packuswb %%mm4, %%mm0 \n\t"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2045 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2046 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2047 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2048 "add %4, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2049 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2050 " jnz 1b \n\t"\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
2051 : "+a"(src), "+c"(dst), "+m"(h)\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2052 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2053 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2054 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2055 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2056 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2057 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2058 int i;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2059 int16_t temp[16]; /* unrounded filter sums for the 16 outputs of the current row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2060 /* quick HACK, XXX FIXME MUST be optimized: horizontal 8-tap lowpass (weights 20, -6, 3, -1 on symmetric pixel pairs) for a 16-pixel-wide block. For each of the h rows the sums are computed in plain C into temp[]; only the rounding, the >>5 shift, the saturating pack and the OP_3DNOW write-out are done in the asm below. Each row reads src[0]..src[16]; taps that would index below 0 or above 16 use entries mirrored back into that range (see temp[0..2] and temp[13..15]). */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2061 for(i=0; i<h; i++)\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2062 {\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2063 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2064 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2065 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2066 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2067 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2068 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2069 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2070 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2071 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2072 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2073 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2074 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2075 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2076 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2077 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2078 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2079 asm volatile( /* round, shift, saturate and write out temp[0..15] */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2080 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2081 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2082 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2083 "paddw %2, %%mm1 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2084 "psraw $5, %%mm0 \n\t" /* arithmetic >> 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2085 "psraw $5, %%mm1 \n\t" /* arithmetic >> 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2086 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2087 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* outputs 0..7 go to dst; OP_3DNOW is defined outside this view (mm1 is its third argument, presumably scratch - TODO confirm) */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2088 "movq 16(%0), %%mm0 \n\t" /* temp[8..11] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2089 "movq 24(%0), %%mm1 \n\t" /* temp[12..15] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2090 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2091 "paddw %2, %%mm1 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2092 "psraw $5, %%mm0 \n\t" /* arithmetic >> 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2093 "psraw $5, %%mm1 \n\t" /* arithmetic >> 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2094 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2095 OP_3DNOW(%%mm0, 8(%1), %%mm1, q) /* outputs 8..15 go to dst+8 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2096 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0 = temp, %1 = dst, %2 = ROUNDER */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2097 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2098 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2099 dst+=dstStride; /* advance to the next output row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2100 src+=srcStride; /* advance to the next input row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2101 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2102 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2103 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
2104 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2105 uint64_t temp; /* NOTE(review): bound as operand %5 below but not referenced by any instruction visible in this function */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2106 /* Horizontal 8-tap lowpass for an 8-pixel-wide block. For every row it computes (20a - 6b + 3c - d + ROUNDER) >> 5 per output pixel, where a, b, c, d are the pixel-pair sums annotated on the instructions below, saturates the results to bytes and passes 8 bytes to OP_MMX2 at dst. Each row reads src[0]..src[8]; the pshufw shuffles repeat in-range pixels for taps that would fall outside that range. NOTE(review): the row counter is tested only after a row has been processed, so h is assumed to be >= 1. */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2107 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2108 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to zero-extend bytes to words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2109 "1: \n\t" /* start of the per-row loop */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2110 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2111 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2112 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2113 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2114 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2115 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2116 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2117 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2118 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2119 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2120 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2121 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2122 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2123 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2124 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2125 "paddw %%mm3, %%mm5 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2126 "paddw %%mm2, %%mm6 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2127 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2128 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2129 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2130 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2131 "paddw %%mm4, %%mm0 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2132 "paddw %%mm1, %%mm5 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2133 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2134 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2135 "paddw %6, %%mm6 \n\t" /* 3c - 6b + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2136 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2137 "psraw $5, %%mm0 \n\t" /* arithmetic >> 5: output pixels 0..3 as words */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2138 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2139 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2140 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2141 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2142 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2143 "paddw %%mm5, %%mm1 \n\t" /* a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2144 "paddw %%mm6, %%mm2 \n\t" /* b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2145 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2146 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2147 "paddw %%mm6, %%mm3 \n\t" /* c */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2148 "paddw %%mm5, %%mm4 \n\t" /* d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2149 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2150 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2151 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2152 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2153 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2154 "paddw %6, %%mm1 \n\t" /* 20a + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2155 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2156 "psraw $5, %%mm3 \n\t" /* arithmetic >> 5: output pixels 4..7 as words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2157 "packuswb %%mm3, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2158 OP_MMX2(%%mm0, (%1), %%mm4, q) /* hand the 8 result bytes to OP_MMX2 at dst; the macro is defined outside this view (mm4 is its third argument, presumably scratch - TODO confirm) */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2159 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2160 "add %3, %0 \n\t" /* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2161 "add %4, %1 \n\t" /* dst += dstStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2162 "decl %2 \n\t" /* h-- (h is a memory operand) */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2163 " jnz 1b \n\t" /* next row while h != 0 */\
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
2164 : "+a"(src), "+c"(dst), "+m"(h) /* %0 = src, %1 = dst, %2 = h */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2165 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER) /* %3 = srcStride, %4 = dstStride, %5 = temp, %6 = ROUNDER */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2166 : "memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2167 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2168 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2169 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2170 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2171 int i;\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2172 int16_t temp[8]; /* unrounded filter sums for the 8 outputs of the current row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2173 /* quick HACK, XXX FIXME MUST be optimized: horizontal 8-tap lowpass (weights 20, -6, 3, -1 on symmetric pixel pairs) for an 8-pixel-wide block. For each of the h rows the sums are computed in plain C into temp[]; only the rounding, the >>5 shift, the saturating pack and the OP_3DNOW write-out are done in the asm below. Each row reads src[0]..src[8]; taps that would index below 0 or above 8 use entries mirrored back into that range (see temp[0..2] and temp[5..7]). */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2174 for(i=0; i<h; i++)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2175 {\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2176 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2177 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2178 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2179 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2180 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2181 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2182 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2183 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2184 asm volatile( /* round, shift, saturate and write out temp[0..7] */\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2185 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2186 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2187 "paddw %2, %%mm0 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2188 "paddw %2, %%mm1 \n\t" /* + ROUNDER */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2189 "psraw $5, %%mm0 \n\t" /* arithmetic >> 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2190 "psraw $5, %%mm1 \n\t" /* arithmetic >> 5 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2191 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2192 OP_3DNOW(%%mm0, (%1), %%mm1, q) /* outputs 0..7 go to dst; OP_3DNOW is defined outside this view (mm1 is its third argument, presumably scratch - TODO confirm) */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2193 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0 = temp, %1 = dst, %2 = ROUNDER */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2194 :"memory"\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2195 );\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2196 dst+=dstStride; /* advance to the next output row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2197 src+=srcStride; /* advance to the next input row */\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2198 }\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2199 }
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2200
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2201 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2202 \
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2203 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2204 uint64_t temp[17*4];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2205 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2206 int count= 17;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2207 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2208 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2209 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2210 "pxor %%mm7, %%mm7 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2211 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2212 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2213 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2214 "movq 8(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2215 "movq 8(%0), %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2216 "punpcklbw %%mm7, %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2217 "punpckhbw %%mm7, %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2218 "punpcklbw %%mm7, %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2219 "punpckhbw %%mm7, %%mm3 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2220 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2221 "movq %%mm1, 17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2222 "movq %%mm2, 2*17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2223 "movq %%mm3, 3*17*8(%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2224 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2225 "add %3, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2226 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2227 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2228 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2229 : "r" ((long)srcStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2230 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2231 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2232 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2233 temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2234 count=4;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2235 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2236 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2237 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2238 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2239 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2240 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2241 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2242 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2243 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2244 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2245 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2246 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2247 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2248 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2249 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2250 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2251 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2252 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2253 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2254 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2255 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2256 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2258 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2259 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2260 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2261 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2262 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2263 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2264 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2265 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2266 "add %4, %1 \n\t" \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2267 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2268 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2269 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2270 "add $136, %0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2271 "add %6, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2272 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2273 " jnz 1b \n\t"\
958
9bb668034ecf slowdown / gcc 2.95.* bug workaround (this should be reversed as soon as gcc 2.95.* support is droped)
michaelni
parents: 954
diff changeset
2274 \
967
274b518c4ecb PIC / ebx fix
michaelni
parents: 966
diff changeset
2275 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2276 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2277 :"memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2278 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2279 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2280 \
1057
bb5de8a59da8 * static,const,compiler warning cleanup
kabi
parents: 997
diff changeset
2281 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical lowpass over an 8-byte-wide block in two asm passes: pass 1 widens 9 source rows to 16-bit words in temp, pass 2 feeds them to QPEL_V_LOW (defined outside this view) once per 4-column half. */\
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
2282 uint64_t temp[9*2];/* 18 entries: temp[0..8] take the low 4 columns of rows 0..8, temp[9..17] the high 4 columns */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2283 uint64_t *temp_ptr= temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2284 int count= 9;/* pass 1 runs once per source row */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2285 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2286 /*FIXME unroll */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2287 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2288 "pxor %%mm7, %%mm7 \n\t"/* mm7 = 0, used as the zero half when unpacking bytes to words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2289 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2290 "movq (%0), %%mm0 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2291 "movq (%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2292 "punpcklbw %%mm7, %%mm0 \n\t"/* low 4 bytes of the row -> four words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2293 "punpckhbw %%mm7, %%mm1 \n\t"/* high 4 bytes of the row -> four words */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2294 "movq %%mm0, (%1) \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2295 "movq %%mm1, 9*8(%1) \n\t"/* high half lands 9 entries (72 bytes) after the low half */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2296 "add $8, %1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2297 "add %3, %0 \n\t"/* src += srcStride */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2298 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2299 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2300 : "+r" (src), "+r" (temp_ptr), "+r"(count)/* pass 1 operands: %0=src %1=temp_ptr %2=count */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2301 : "r" ((long)srcStride)/* %3=srcStride */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2302 : "memory"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2303 );\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2304 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2305 temp_ptr= temp;/* rewind: pass 1 advanced temp_ptr through the asm output operand */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2306 count=2;/* pass 2 runs once per 4-column half */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2307 \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2308 /*FIXME reorder for speed */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2309 asm volatile(\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2310 /*"pxor %%mm7, %%mm7 \n\t"*/\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2311 "1: \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2312 "movq (%0), %%mm0 \n\t"/* preload the first four unpacked rows of this half into mm0..mm3 */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2313 "movq 8(%0), %%mm1 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2314 "movq 16(%0), %%mm2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2315 "movq 24(%0), %%mm3 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2316 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)/* eight QPEL_V_LOW steps follow, with (%1) and (%1, %3) alternating as the last memory argument; presumably one output row each - confirm against the macro */\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2317 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2318 "add %4, %1 \n\t"/* dst += 2*dstStride */\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2319 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2320 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2321 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2322 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2323 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2324 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2325 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2326 "add %4, %1 \n\t"\
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2327 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2328 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2329 \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2330 "add $72, %0 \n\t"/* temp_ptr += 9 entries: start of the other half */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2331 "add %6, %1 \n\t"/* the three adds of %4 moved dst by 6*dstStride; adding 4-6*dstStride nets +4 bytes from the starting row */\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2332 "decl %2 \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2333 " jnz 1b \n\t"\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2334 \
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
2335 : "+r"(temp_ptr), "+r"(dst), "+g"(count)/* pass 2 operands: %0=temp_ptr %1=dst %2=count */\
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2256
diff changeset
2336 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)/* %3=dstStride %4=2*dstStride %5=ROUNDER %6=4-6*dstStride */\
966
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2337 : "memory"\
7ef9226f430e more gcc bug workarounds
michaelni
parents: 961
diff changeset
2338 );\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2339 }\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2340 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2341 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){/* No filtering here: forwards dst, src and stride to OPNAME pixels8_mmx (defined outside this view) with a final argument of 8, presumably the row count. */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2342 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2343 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2344 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2345 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Horizontally lowpasses src into a scratch block, then passes src and that block to OPNAME pixels8_l2 (defined outside this view; presumably it blends its two sources) to produce dst. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2346 uint64_t temp[8];/* 64 bytes of scratch */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2347 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2348 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);/* always the put variant, whatever OPNAME is; scratch uses stride 8, final 8 is presumably the row count */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2349 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2350 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2351 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2352 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Runs the OPNAME horizontal lowpass straight from src into dst, passing stride for both stride arguments; final 8 is presumably the row count. */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2353 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2354 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2355 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2356 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Horizontally lowpasses src into a scratch block, then passes src+1 and that block to OPNAME pixels8_l2 (defined outside this view; presumably it blends its two sources) to produce dst. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2357 uint64_t temp[8];/* 64 bytes of scratch */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2358 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2359 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2360 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);/* first source starts one byte to the right of src */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2361 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2362 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2363 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Vertically lowpasses src into a scratch block, then passes src and that block to OPNAME pixels8_l2 (defined outside this view; presumably it blends its two sources) to produce dst. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2364 uint64_t temp[8];/* 64 bytes of scratch */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2365 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2366 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);/* dstStride 8 for the scratch block, srcStride = stride */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2367 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2368 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2369 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2370 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Runs the OPNAME vertical lowpass straight from src into dst, with stride used as both dstStride and srcStride. */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2371 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2372 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2373 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2374 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Vertically lowpasses src into a scratch block, then passes src+stride and that block to OPNAME pixels8_l2 (defined outside this view; presumably it blends its two sources) to produce dst. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2375 uint64_t temp[8];/* 64 bytes of scratch */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2376 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2377 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2378 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);/* first source starts one row below src */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2379 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2380 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Four steps: horizontal lowpass of src into halfH, in-place pixels8_l2 of halfH with src, vertical lowpass of halfH into halfHV, then OPNAME pixels8_l2 of halfH and halfHV into dst. pixels8_l2 is defined outside this view; presumably it blends its two sources. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2381 uint64_t half[8 + 9];/* 136 bytes shared by both intermediates */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2382 uint8_t * const halfH= ((uint8_t*)half) + 64;/* bytes 64..135 */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2383 uint8_t * const halfHV= ((uint8_t*)half);/* bytes 0..63 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2384 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);/* final 9 is presumably the row count; the vertical pass below reads 9 rows */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2385 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);/* halfH is both destination and second source */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2386 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2387 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2388 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2389 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Four steps: horizontal lowpass of src into halfH, in-place pixels8_l2 of halfH with src+1, vertical lowpass of halfH into halfHV, then OPNAME pixels8_l2 of halfH and halfHV into dst. pixels8_l2 is defined outside this view; presumably it blends its two sources. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2390 uint64_t half[8 + 9];/* 136 bytes shared by both intermediates */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2391 uint8_t * const halfH= ((uint8_t*)half) + 64;/* bytes 64..135 */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2392 uint8_t * const halfHV= ((uint8_t*)half);/* bytes 0..63 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2393 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2394 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);/* first source starts one byte to the right of src */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2395 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2396 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2397 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2398 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Four steps: horizontal lowpass of src into halfH, in-place pixels8_l2 of halfH with src, vertical lowpass of halfH into halfHV, then OPNAME pixels8_l2 of halfH+8 and halfHV into dst. pixels8_l2 is defined outside this view; presumably it blends its two sources. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2399 uint64_t half[8 + 9];/* 136 bytes shared by both intermediates */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2400 uint8_t * const halfH= ((uint8_t*)half) + 64;/* bytes 64..135 */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2401 uint8_t * const halfHV= ((uint8_t*)half);/* bytes 0..63 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2402 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2403 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2404 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2405 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);/* halfH+8 skips the first 8-byte row of halfH */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2406 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2407 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Four steps: horizontal lowpass of src into halfH, in-place pixels8_l2 of halfH with src+1, vertical lowpass of halfH into halfHV, then OPNAME pixels8_l2 of halfH+8 and halfHV into dst. pixels8_l2 is defined outside this view; presumably it blends its two sources. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2408 uint64_t half[8 + 9];/* 136 bytes shared by both intermediates */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2409 uint8_t * const halfH= ((uint8_t*)half) + 64;/* bytes 64..135 */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2410 uint8_t * const halfHV= ((uint8_t*)half);/* bytes 0..63 */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2411 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2412 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);/* first source starts one byte to the right of src */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2413 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2414 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);/* halfH+8 skips the first 8-byte row of halfH */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2415 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2416 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Three steps: horizontal lowpass of src into halfH, vertical lowpass of halfH into halfHV, then OPNAME pixels8_l2 of halfH and halfHV into dst. pixels8_l2 is defined outside this view; presumably it blends its two sources. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2417 uint64_t half[8 + 9];/* 136 bytes shared by both intermediates */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2418 uint8_t * const halfH= ((uint8_t*)half) + 64;/* bytes 64..135 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2419 uint8_t * const halfHV= ((uint8_t*)half);/* bytes 0..63 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2420 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);/* final 9 is presumably the row count; the vertical pass below reads 9 rows */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2421 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2422 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2423 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2424 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Three steps: horizontal lowpass of src into halfH, vertical lowpass of halfH into halfHV, then OPNAME pixels8_l2 of halfH+8 and halfHV into dst. pixels8_l2 is defined outside this view; presumably it blends its two sources. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2425 uint64_t half[8 + 9];/* 136 bytes shared by both intermediates */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2426 uint8_t * const halfH= ((uint8_t*)half) + 64;/* bytes 64..135 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2427 uint8_t * const halfHV= ((uint8_t*)half);/* bytes 0..63 */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2428 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2429 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2430 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);/* halfH+8 skips the first 8-byte row of halfH */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2431 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2432 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Three steps: horizontal lowpass of src into halfH, in-place pixels8_l2 of halfH with src, then the OPNAME vertical lowpass of halfH writes dst. pixels8_l2 is defined outside this view; presumably it blends its two sources. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2433 uint64_t half[8 + 9];/* NOTE(review): only halfH at offset 0 is referenced here, and the vertical pass reads 9 rows of 8 bytes; the extra 8 entries look unused - confirm */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2434 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2435 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2436 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);/* halfH is both destination and second source */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2437 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);/* dstStride = stride, srcStride = 8 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2438 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2439 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Three steps: horizontal lowpass of src into halfH, in-place pixels8_l2 of halfH with src+1, then the OPNAME vertical lowpass of halfH writes dst. pixels8_l2 is defined outside this view; presumably it blends its two sources. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2440 uint64_t half[8 + 9];/* NOTE(review): only halfH at offset 0 is referenced here, and the vertical pass reads 9 rows of 8 bytes; the extra 8 entries look unused - confirm */\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2441 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2442 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2443 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);/* first source starts one byte to the right of src */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2444 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);/* dstStride = stride, srcStride = 8 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2445 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2446 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Two steps: horizontal lowpass of src into halfH, then the OPNAME vertical lowpass of halfH writes dst. */\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2447 uint64_t half[9];/* 72 bytes: the vertical pass reads 9 rows of 8 bytes from it */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2448 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2449 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);/* final 9 is presumably the row count */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2450 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);/* dstStride = stride, srcStride = 8 */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2451 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2452 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){/* No filtering here: forwards dst, src and stride to OPNAME pixels16_mmx (defined outside this view) with a final argument of 16, presumably the row count. */\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2453 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2454 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2455 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2456 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Horizontally lowpasses src into a scratch block, then passes src and that block to OPNAME pixels16_l2 (defined outside this view; presumably it blends its two sources) to produce dst. */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2457 uint64_t temp[32];/* 256 bytes of scratch */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2458 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2459 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);/* always the put variant, whatever OPNAME is; scratch uses stride 16, final 16 is presumably the row count */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2460 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2461 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2462 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2463 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Runs the OPNAME 16-wide horizontal lowpass straight from src into dst, passing stride for both stride arguments; final 16 is presumably the row count. */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2464 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2465 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2466 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2467 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Horizontally lowpasses src into a scratch block, then passes src+1 and that block to OPNAME pixels16_l2 (defined outside this view; presumably it blends its two sources) to produce dst. */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2468 uint64_t temp[32];/* 256 bytes of scratch */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2469 uint8_t * const half= (uint8_t*)temp;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2470 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2471 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);/* first source starts one byte to the right of src */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2472 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2473 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2474 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* Vertically lowpasses src into a scratch block, then passes src and that block to OPNAME pixels16_l2 (defined outside this view; presumably it blends its two sources) to produce dst. */\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2475 uint64_t temp[32];/* 256 bytes of scratch */\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2476 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2477 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);/* scratch uses stride 16; src is read with stride */\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2478 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2479 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2480 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2481 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2482 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2483 }\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2484 \
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2485 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2486 uint64_t temp[32];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2487 uint8_t * const half= (uint8_t*)temp;\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2488 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2489 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2490 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2491 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2492 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2493 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2494 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2495 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2496 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2497 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2498 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2499 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2500 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2501 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2502 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2503 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2504 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2505 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2506 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2507 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2508 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2509 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2510 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2511 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2512 uint8_t * const halfHV= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2513 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2514 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2515 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2516 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2517 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2518 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2519 uint64_t half[16*2 + 17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2520 uint8_t * const halfH= ((uint8_t*)half) + 256;\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2521 uint8_t * const halfHV= ((uint8_t*)half);\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2522 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2523 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2524 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2525 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2526 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2527 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2528 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2529 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2530 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2531 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2532 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2533 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2534 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2535 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2536 uint64_t half[16*2 + 17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2537 uint8_t * const halfH= ((uint8_t*)half) + 256;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2538 uint8_t * const halfHV= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2539 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2540 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2541 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2542 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2543 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2544 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2545 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2546 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2547 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2548 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2549 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2550 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2551 uint64_t half[17*2];\
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2552 uint8_t * const halfH= ((uint8_t*)half);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2553 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207
22b768f1261a 10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents: 2067
diff changeset
2554 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
2555 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2556 }\
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1057
diff changeset
2557 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2558 uint64_t half[17*2];\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2559 uint8_t * const halfH= ((uint8_t*)half);\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2560 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2561 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2562 }
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2563
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* Store operation for the put variants: expands to a single asm line
 *     mov<size> <a>, <b>
 * i.e. a plain move of a into b (AT&T operand order, as in the other asm in
 * this file). The temp argument is accepted only so that PUT_OP has the same
 * parameter list as the AVG_*_OP macros; it is not used in the expansion. */
2564 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Store operation for the avg variants, 3DNow! flavour. Expands to three asm
 * lines:
 *     mov<size> <b>, <temp>     load the current contents of b into temp
 *     pavgusb   <temp>, <a>     a = packed unsigned byte average of a and temp
 *     mov<size> <a>, <b>        write the averaged value back to b
 * Both a and temp are overwritten by the expansion. */
2565 #define AVG_3DNOW_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2566 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2567 "pavgusb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2568 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Store operation for the avg variants, MMX2 flavour. Same three-step
 * sequence as AVG_3DNOW_OP, but the averaging instruction is pavgb:
 *     mov<size> <b>, <temp>     load the current contents of b into temp
 *     pavgb     <temp>, <a>     a = packed unsigned byte average of a and temp
 *     mov<size> <a>, <b>        write the averaged value back to b
 * Both a and temp are overwritten by the expansion. */
2569 #define AVG_MMX2_OP(a,b,temp, size) \
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2570 "mov" #size " " #b ", " #temp " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2571 "pavgb " #temp ", " #a " \n\t"\
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
2572 "mov" #size " " #a ", " #b " \n\t"
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2573
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
/* Instantiate the MPEG-4 qpel code for every (operation, CPU flavour) pair.
 * Argument 1 is the OPNAME prefix of the generated functions (put_, avg_,
 * put_no_rnd_), argument 3 is the RND infix pasted into helper names, and for
 * QPEL_OP argument 5 is the MMX suffix (3dnow or mmx2).
 * NOTE(review): argument 2 (ff_pw_16 / ff_pw_15) looks like the rounding
 * constant, and argument 4 the store op (PUT_OP or AVG_*_OP); for QPEL_BASE
 * arguments 4 and 5 appear to be the MMX2 and 3DNow! store ops respectively.
 * The macro heads are not visible here -- confirm against their definitions. */
2574 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2575 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2576 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2577 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2578 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2579 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2580 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
2581 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2582 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2583
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2584 /***********************************/
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2585 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2586
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* Define the static function <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX> as a thin
 * wrapper that forwards to <OPNAME>pixels<SIZE><HPEL>(dst, src, stride, SIZE).
 * HPEL is the complete name suffix of the pixels routine to call (for example
 * _x2_ followed by the CPU flavour); SIZE doubles as the row count passed as
 * the last argument. */
2587 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2588 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2589 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2590 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* Define the static function <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX> as a thin
 * wrapper around <OPNAME>2tap_qpel<SIZE>_l3_<MMX>(dst, src+S0, stride, SIZE,
 * S1, S2). S0, S1 and S2 are pasted in as expressions, so they may refer to
 * the stride parameter of the generated function.
 * NOTE(review): S1 and S2 look like byte offsets of two further source
 * positions relative to src+S0; the _l3_ helper is defined elsewhere in the
 * file -- confirm there. */
2591 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2592 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2593 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2594 }
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2595
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* Generate all sixteen <OPNAME>2tap_qpel<SIZE>_mcXY_<MMX> entry points for one
 * (operation, block size, CPU flavour) combination:
 *   - mc20 and mc02 forward to the _x2_ / _y2_ pixels routines of the given
 *     MMX flavour; mc22 forwards to the _xy2_mmx routine. The mmx suffix of
 *     mc22 is hard-coded and does not follow the MMX argument.
 *     NOTE(review): presumably no mmx2/3dnow xy2 routine is suitable here --
 *     confirm before changing it.
 *   - mc00, mc21 and mc12 are const function-pointer aliases of the regular
 *     qpel mc00 function and of the mc20 / mc02 functions generated above.
 *   - mc32 is the _y2_ routine applied at src+1; mc23 is the _x2_ routine
 *     applied at src+stride.
 *   - the remaining eight positions (10, 30, 01, 03, 11, 31, 13, 33) go
 *     through QPEL_2TAP_L3 with per-position (S0, S1, S2) arguments.
 * The last line of the definition still ends in a line continuation, so the
 * macro body formally extends onto the following empty line; keep that line
 * empty. */
2596 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2597 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2598 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2599 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2600 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2601 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2602 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2603 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2604 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2605 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2606 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2607 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2608 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2609 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2610 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2611 }\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2612 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2613 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2614 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2615 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2616 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2617 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2618 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2619 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2620
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
/* Instantiate the 2-tap qpel function sets: put_ and avg_ operations, block
 * sizes 16 and 8, for the mmx2 and 3dnow flavours (eight sets in total). */
2621 QPEL_2TAP(put_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2622 QPEL_2TAP(avg_, 16, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2623 QPEL_2TAP(put_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2624 QPEL_2TAP(avg_, 8, mmx2)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2625 QPEL_2TAP(put_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2626 QPEL_2TAP(avg_, 16, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2627 QPEL_2TAP(put_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2628 QPEL_2TAP(avg_, 8, 3dnow)
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2629
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
2630
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
/* Disabled code: the #if 0 guard keeps this no-op function out of the build.
 * It takes no arguments and does nothing but return. */
2631 #if 0
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2632 static void just_return() { return; }
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
2633 #endif
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
2634
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
/* Install one qpel function in all three operation tables at once: assigns
 * put_<postfix2>, put_no_rnd_<postfix2> and avg_<postfix2> to the members
 * c->put_<postfix1>, c->put_no_rnd_<postfix1> and c->avg_<postfix1>.
 * A variable named c must be in scope where the macro is expanded.
 * The expansion is three separate statements, each with its own semicolon,
 * and is not wrapped in do { } while (0): do not use it as the body of an
 * unbraced if/else or loop. */
2635 #define SET_QPEL_FUNC(postfix1, postfix2) \
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2636 c->put_ ## postfix1 = put_ ## postfix2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2637 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
2638 c->avg_ ## postfix1 = avg_ ## postfix2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2639
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2640 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2641 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2642 const int w = 8;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2643 const int ix = ox>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2644 const int iy = oy>>(16+shift);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2645 const int oxs = ox>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2646 const int oys = oy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2647 const int dxxs = dxx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2648 const int dxys = dxy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2649 const int dyxs = dyx>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2650 const int dyys = dyy>>4;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2651 const uint16_t r4[4] = {r,r,r,r};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2652 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2653 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2654 const uint64_t shift2 = 2*shift;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2655 uint8_t edge_buf[(h+1)*stride];
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2656 int x, y;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2657
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2658 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2659 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2660 const int dxh = dxy*(h-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2661 const int dyw = dyx*(w-1);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2662 if( // non-constant fullpel offset (3% of blocks)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2663 (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2664 oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2665 // uses more than 16 bits of subpel mv (only at huge resolution)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2666 || (dxx|dxy|dyx|dyy)&15 )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2667 {
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2668 //FIXME could still use mmx for some of the rows
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2669 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2670 return;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2671 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2672
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2673 src += ix + iy*stride;
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2674 if( (unsigned)ix >= width-w ||
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2675 (unsigned)iy >= height-h )
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2676 {
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2677 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2678 src = edge_buf;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2679 }
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2680
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2681 asm volatile(
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2682 "movd %0, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2683 "pxor %%mm7, %%mm7 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2684 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2685 "punpcklwd %%mm6, %%mm6 \n\t"
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2686 :: "r"(1<<shift)
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2687 );
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2688
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2689 for(x=0; x<w; x+=4){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2690 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2691 oxs - dxys + dxxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2692 oxs - dxys + dxxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2693 oxs - dxys + dxxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2694 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2695 oys - dyys + dyxs*(x+1),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2696 oys - dyys + dyxs*(x+2),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2697 oys - dyys + dyxs*(x+3) };
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2698
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2699 for(y=0; y<h; y++){
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2700 asm volatile(
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2701 "movq %0, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2702 "movq %1, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2703 "paddw %2, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2704 "paddw %3, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2705 "movq %%mm4, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2706 "movq %%mm5, %1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2707 "psrlw $12, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2708 "psrlw $12, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2709 : "+m"(*dx4), "+m"(*dy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2710 : "m"(*dxy4), "m"(*dyy4)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2711 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2712
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2713 asm volatile(
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2714 "movq %%mm6, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2715 "movq %%mm6, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2716 "psubw %%mm4, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2717 "psubw %%mm5, %%mm1 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2718 "movq %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2719 "movq %%mm4, %%mm3 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2720 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2721 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2722 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2723 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2724
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2725 "movd %4, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2726 "movd %3, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2727 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2728 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2729 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2730 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2731
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2732 "movd %2, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2733 "movd %1, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2734 "punpcklbw %%mm7, %%mm5 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2735 "punpcklbw %%mm7, %%mm4 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2736 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2737 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
3250
fec9bc8d63fc gmc_mmx tweaks
lorenm
parents: 3248
diff changeset
2738 "paddw %5, %%mm1 \n\t"
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2739 "paddw %%mm3, %%mm2 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2740 "paddw %%mm1, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2741 "paddw %%mm2, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2742
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2743 "psrlw %6, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2744 "packuswb %%mm0, %%mm0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2745 "movd %%mm0, %0 \n\t"
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2746
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2747 : "=m"(dst[x+y*stride])
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2748 : "m"(src[0]), "m"(src[1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2749 "m"(src[stride]), "m"(src[stride+1]),
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2750 "m"(*r4), "m"(shift2)
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2751 );
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2752 src += stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2753 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2754 src += 4-h*stride;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2755 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2756 }
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
2757
3777
20545fbb6f7c add some #ifdef CONFIG_ENCODERS/DECODERS
mru
parents: 3721
diff changeset
2758 #ifdef CONFIG_ENCODERS
5024
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2759
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2760 #define PHADDD(a, t)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2761 "movq "#a", "#t" \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2762 "psrlq $32, "#a" \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2763 "paddd "#t", "#a" \n\t"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2764 /*
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2765 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2766 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2767 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2768 */
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2769 #define PMULHRW(x, y, s, o)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2770 "pmulhw " #s ", "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2771 "pmulhw " #s ", "#y " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2772 "paddw " #o ", "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2773 "paddw " #o ", "#y " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2774 "psraw $1, "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2775 "psraw $1, "#y " \n\t"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2776 #define DEF(x) x ## _mmx
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2777 #define SET_RND MOVQ_WONE
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2778 #define SCALE_OFFSET 1
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2779
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2780 #include "dsputil_mmx_qns.h"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2781
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2782 #undef DEF
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2783 #undef SET_RND
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2784 #undef SCALE_OFFSET
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2785 #undef PMULHRW
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2786
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2787 #define DEF(x) x ## _3dnow
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2788 #define SET_RND(x)
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2789 #define SCALE_OFFSET 0
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2790 #define PMULHRW(x, y, s, o)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2791 "pmulhrw " #s ", "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2792 "pmulhrw " #s ", "#y " \n\t"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2793
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2794 #include "dsputil_mmx_qns.h"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2795
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2796 #undef DEF
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2797 #undef SET_RND
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2798 #undef SCALE_OFFSET
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2799 #undef PMULHRW
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2800
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2801 #ifdef HAVE_SSSE3
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2802 #undef PHADDD
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2803 #define DEF(x) x ## _ssse3
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2804 #define SET_RND(x)
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2805 #define SCALE_OFFSET -1
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2806 #define PHADDD(a, t)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2807 "pshufw $0x0E, "#a", "#t" \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2808 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2809 #define PMULHRW(x, y, s, o)\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2810 "pmulhrsw " #s ", "#x " \n\t"\
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2811 "pmulhrsw " #s ", "#y " \n\t"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2812
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2813 #include "dsputil_mmx_qns.h"
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2814
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2815 #undef DEF
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2816 #undef SET_RND
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2817 #undef SCALE_OFFSET
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2818 #undef PMULHRW
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2819 #undef PHADDD
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2820 #endif //HAVE_SSSE3
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
2821
3777
20545fbb6f7c add some #ifdef CONFIG_ENCODERS/DECODERS
mru
parents: 3721
diff changeset
2822 #endif /* CONFIG_ENCODERS */
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents: 2753
diff changeset
2823
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2824 #define PREFETCH(name, op) \
4172
608e2dfcb86e adding more static keywords
mru
parents: 4127
diff changeset
2825 static void name(void *mem, int stride, int h){\
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2826 const uint8_t *p= mem;\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2827 do{\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2828 asm volatile(#op" %0" :: "m"(*p));\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2829 p+= stride;\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2830 }while(--h);\
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2831 }
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2832 PREFETCH(prefetch_mmx2, prefetcht0)
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2833 PREFETCH(prefetch_3dnow, prefetch)
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2834 #undef PREFETCH
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
2835
2754
a49f140179e9 sort H.264 mmx dsp functions into their own file
lorenm
parents: 2753
diff changeset
2836 #include "h264dsp_mmx.c"
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2837
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2838 /* AVS specific */
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2839 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2840
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2841 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2842 put_pixels8_mmx(dst, src, stride, 8);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2843 }
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2844 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2845 avg_pixels8_mmx(dst, src, stride, 8);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2846 }
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2847 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2848 put_pixels16_mmx(dst, src, stride, 16);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2849 }
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2850 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2851 avg_pixels16_mmx(dst, src, stride, 16);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2852 }
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
2853
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2854 /* external functions, from idct_mmx.c */
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2855 void ff_mmx_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2856 void ff_mmxext_idct(DCTELEM *block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2857
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2858 /* XXX: those functions should be suppressed ASAP when all IDCTs are
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2859 converted */
4020
723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents: 4001
diff changeset
2860 #ifdef CONFIG_GPL
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2861 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2862 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2863 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2864 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2865 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2866 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2867 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2868 ff_mmx_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2869 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2870 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2871 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2872 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2873 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2874 put_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2875 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2876 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2877 {
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2878 ff_mmxext_idct (block);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2879 add_pixels_clamped_mmx(block, dest, line_size);
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
2880 }
4020
723818b5de0f Put libmpeg2 IDCT functions under CONFIG_GPL, fixes link failure
diego
parents: 4001
diff changeset
2881 #endif
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2882 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2883 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2884 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2885 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2886 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2887 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2888 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2889 ff_idct_xvid_mmx (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2890 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2891 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2892 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2893 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2894 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2895 put_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2896 }
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2897 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2898 {
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2899 ff_idct_xvid_mmx2 (block);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2900 add_pixels_clamped_mmx(block, dest, line_size);
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
2901 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
2902
3541
3fbddeb13686 10l, vorbis_inverse_coupling_sse() was really 3dnow
lorenm
parents: 3536
diff changeset
/**
 * Vorbis inverse channel coupling, 3DNow! version.
 *
 * Rewrites mag[] and ang[] in place, two floats per loop iteration
 * (8-byte movq loads/stores at mag[pos] and ang[pos]).
 *
 * With m = mag[i], a = ang[i] and t = (m >= 0.0) ? -a : a, each element becomes
 *   a >= 0.0:  ang[i] = m + t,  mag[i] = m
 *   a <  0.0:  ang[i] = m,      mag[i] = m - t
 *
 * MMX state is released with femms before returning.
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int pos;

    /* mm7 = 0; every iteration below compares against it */
    __asm__ volatile("pxor %%mm7, %%mm7":);

    for(pos=0; pos<blocksize; pos+=2) {
        __asm__ volatile(
            "movq    %0,    %%mm0 \n\t" // mm0 = m
            "movq    %1,    %%mm1 \n\t" // mm1 = a
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // mm2 = all-ones where m >= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // mm3 = all-ones where a >= 0.0
            "pslld   $31,   %%mm2 \n\t" // reduce the m mask to the sign bit only
            "pxor    %%mm2, %%mm1 \n\t" // mm1 = t (a with sign flipped where m >= 0.0)
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t" // mm3 = (a >= 0.0) ? t : 0
            "pandn   %%mm1, %%mm4 \n\t" // mm4 = (a >= 0.0) ? 0 : t
            "pfadd   %%mm0, %%mm3 \n\t" // new a = m + mm3
            "pfsub   %%mm4, %%mm0 \n\t" // new m = m - mm4
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[pos]), "+m"(ang[pos])
            ::"memory"
        );
    }

    __asm__ volatile("femms");
}
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2930 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2931 {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2932 int i;
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2933
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2934 asm volatile(
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2935 "movaps %0, %%xmm5 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2936 ::"m"(ff_pdw_80000000[0])
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2937 );
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2938 for(i=0; i<blocksize; i+=4) {
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2939 asm volatile(
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2940 "movaps %0, %%xmm0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2941 "movaps %1, %%xmm1 \n\t"
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2942 "xorps %%xmm2, %%xmm2 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2943 "xorps %%xmm3, %%xmm3 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2944 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2945 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2946 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2947 "xorps %%xmm2, %%xmm1 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2948 "movaps %%xmm3, %%xmm4 \n\t"
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2949 "andps %%xmm1, %%xmm3 \n\t"
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
2950 "andnps %%xmm1, %%xmm4 \n\t"
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2951 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2952 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2953 "movaps %%xmm3, %1 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2954 "movaps %%xmm0, %0 \n\t"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2955 :"+m"(mag[i]), "+m"(ang[i])
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2956 ::"memory"
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2957 );
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2958 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2959 }
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
2960
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
/**
 * In-place element-wise product, 3DNow! version: dst[k] *= src[k].
 *
 * Works on groups of four floats, starting with the group at element len-4
 * and stepping down to element 0. The loop body runs before the counter is
 * tested, so at least one group is always processed (callers presumably pass
 * a positive multiple of 4 -- TODO confirm). Ends with femms.
 */
static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long off = (len-4)*4;   /* byte offset of the last group of four floats */

    __asm__ volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"   /* dst[k], dst[k+1] */
        "movq   8(%1,%0), %%mm1 \n\t"   /* dst[k+2], dst[k+3] */
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0           \n\t"   /* previous group; loop while offset >= 0 */
        "jge  1b                \n\t"
        "femms                  \n\t"
        :"+r"(off)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
/**
 * In-place element-wise product, SSE version: dst[k] *= src[k].
 *
 * Works on groups of eight floats, starting with the group at element len-8
 * and stepping down to element 0. All accesses are aligned (movaps / mulps
 * with memory operands), so dst and src must be 16-byte aligned. The loop body
 * runs before the counter is tested, so at least one group is always
 * processed (callers presumably pass a positive multiple of 8 -- TODO confirm).
 */
static void vector_fmul_sse(float *dst, const float *src, int len){
    long off = (len-8)*4;   /* byte offset of the last group of eight floats */

    __asm__ volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"   /* dst[k..k+3] */
        "movaps  16(%1,%0), %%xmm1 \n\t"   /* dst[k+4..k+7] */
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0              \n\t"   /* previous group; loop while offset >= 0 */
        "jge  1b                   \n\t"
        :"+r"(off)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
2996
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
/**
 * Element-wise product with the second source reversed, 3DNow!ext version:
 *     dst[k] = src0[k] * src1[len-1-k]
 *
 * Works on groups of four floats. src1 is walked forwards while dst/src0 are
 * walked backwards from element len-4; pswapd swaps the two floats inside
 * each 8-byte load so the pair order is reversed as well. The loop body runs
 * before the counter is tested, so at least one group is always processed
 * (callers presumably pass a positive multiple of 4 -- TODO confirm).
 * Ends with femms.
 */
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;      /* byte offset of the last group in dst/src0 */
    __asm__ volatile(
        "1: \n\t"
        "pswapd  8(%1),   %%mm0 \n\t"   /* src1[j+3], src1[j+2] */
        "pswapd   (%1),   %%mm1 \n\t"   /* src1[j+1], src1[j]   */
        "pfmul   (%3,%0), %%mm0 \n\t"
        "pfmul  8(%3,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%2,%0) \n\t"
        "movq   %%mm1, 8(%2,%0) \n\t"
        "add  $16, %1           \n\t"   /* src1 forwards  */
        "sub  $16, %0           \n\t"   /* dst/src0 backwards; loop while offset >= 0 */
        "jge  1b                \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        /* The asm reads src0/src1 and writes dst through plain register
         * operands, so the compiler must be told that memory is touched. */
        :"memory"
    );
    __asm__ volatile("femms");
}
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
/**
 * Element-wise product with the second source reversed, SSE version:
 *     dst[k] = src0[k] * src1[len-1-k]
 *
 * Works on groups of eight floats. src1 is walked forwards while dst/src0 are
 * walked backwards from element len-8; shufps $0x1b reverses the four floats
 * inside each register. All accesses are aligned (movaps / mulps with memory
 * operands), so dst, src0 and src1 must be 16-byte aligned. The loop body
 * runs before the counter is tested, so at least one group is always
 * processed (callers presumably pass a positive multiple of 8 -- TODO confirm).
 */
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;      /* byte offset of the last group in dst/src0 */
    __asm__ volatile(
        "1: \n\t"
        "movaps  16(%1),       %%xmm0 \n\t"   /* src1[j+4..j+7] */
        "movaps    (%1),       %%xmm1 \n\t"   /* src1[j..j+3]   */
        "shufps  $0x1b, %%xmm0, %%xmm0 \n\t"  /* reverse lane order */
        "shufps  $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps     (%3,%0),    %%xmm0 \n\t"
        "mulps   16(%3,%0),    %%xmm1 \n\t"
        "movaps  %%xmm0,      (%2,%0) \n\t"
        "movaps  %%xmm1,    16(%2,%0) \n\t"
        "add  $32, %1                 \n\t"   /* src1 forwards  */
        "sub  $32, %0                 \n\t"   /* dst/src0 backwards; loop while offset >= 0 */
        "jge  1b                      \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        /* The asm reads src0/src1 and writes dst through plain register
         * operands, so the compiler must be told that memory is touched. */
        :"memory"
    );
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3034
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
/**
 * Multiply-add with optional output stride, 3DNow! version.
 *
 * Fast paths (both require src3 == 0), four floats per iteration, walking
 * from element len-4 down to element 0:
 *   step == 1:  dst[k]   = src0[k] * src1[k] + src2[k]
 *   step == 2:  dst[2*k] = src0[k] * src1[k] + src2[k]
 * Every other combination is delegated to ff_vector_fmul_add_add_c().
 *
 * femms is issued on every path before returning.
 */
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long off = (len-4)*4;   /* byte offset of the last group in the sources */

    if(src3 == 0 && step == 2){
        /* interleaved output: the last group starts at dst[2*(len-4)] */
        dst += (len-4)*2;
        __asm__ volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movd   %%mm0,   (%1)   \n\t"   /* result 0 -> dst[0] */
            "movd   %%mm1, 16(%1)   \n\t"   /* result 2 -> dst[4] */
            "psrlq  $32,    %%mm0   \n\t"   /* bring the upper float down */
            "psrlq  $32,    %%mm1   \n\t"
            "movd   %%mm0,  8(%1)   \n\t"   /* result 1 -> dst[2] */
            "movd   %%mm1, 24(%1)   \n\t"   /* result 3 -> dst[6] */
            "sub  $32, %1           \n\t"
            "sub  $16, %0           \n\t"
            "jge  1b                \n\t"
            :"+r"(off), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(src3 == 0 && step == 1){
        /* contiguous output */
        __asm__ volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq   %%mm0,  (%1,%0) \n\t"
            "movq   %%mm1, 8(%1,%0) \n\t"
            "sub  $16, %0           \n\t"
            "jge  1b                \n\t"
            :"+r"(off)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);

    __asm__ volatile("femms");
}
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
/**
 * Multiply-add with optional output stride, SSE version.
 *
 * Fast paths (both require src3 == 0), eight floats per iteration, walking
 * from element len-8 down to element 0:
 *   step == 1:  dst[k]   = src0[k] * src1[k] + src2[k]
 *   step == 2:  dst[2*k] = src0[k] * src1[k] + src2[k]
 * Every other combination is delegated to ff_vector_fmul_add_add_c().
 *
 * The sources are accessed with aligned instructions (movaps and
 * mulps/addps memory operands) and so must be 16-byte aligned; the step == 1
 * path also stores to dst with movaps.
 */
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long off = (len-8)*4;   /* byte offset of the last group in the sources */

    if(src3 == 0 && step == 2){
        /* interleaved output: the last group starts at dst[2*(len-8)] */
        dst += (len-8)*2;
        __asm__ volatile(
            "1: \n\t"
            "movaps    (%2,%0), %%xmm0 \n\t"
            "movaps  16(%2,%0), %%xmm1 \n\t"
            "mulps     (%3,%0), %%xmm0 \n\t"
            "mulps   16(%3,%0), %%xmm1 \n\t"
            "addps     (%4,%0), %%xmm0 \n\t"
            "addps   16(%4,%0), %%xmm1 \n\t"
            "movss   %%xmm0,   (%1)    \n\t"   /* result 0 */
            "movss   %%xmm1, 32(%1)    \n\t"   /* result 4 */
            "movhlps %%xmm0, %%xmm2    \n\t"   /* upper half -> low lanes */
            "movhlps %%xmm1, %%xmm3    \n\t"
            "movss   %%xmm2, 16(%1)    \n\t"   /* result 2 */
            "movss   %%xmm3, 48(%1)    \n\t"   /* result 6 */
            "shufps  $0xb1, %%xmm0, %%xmm0 \n\t" /* swap neighbours within each pair */
            "shufps  $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss   %%xmm0,  8(%1)    \n\t"   /* result 1 */
            "movss   %%xmm1, 40(%1)    \n\t"   /* result 5 */
            "movhlps %%xmm0, %%xmm2    \n\t"
            "movhlps %%xmm1, %%xmm3    \n\t"
            "movss   %%xmm2, 24(%1)    \n\t"   /* result 3 */
            "movss   %%xmm3, 56(%1)    \n\t"   /* result 7 */
            "sub  $64, %1              \n\t"
            "sub  $32, %0              \n\t"
            "jge  1b                   \n\t"
            :"+r"(off), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(src3 == 0 && step == 1){
        /* contiguous output */
        __asm__ volatile(
            "1: \n\t"
            "movaps    (%2,%0), %%xmm0 \n\t"
            "movaps  16(%2,%0), %%xmm1 \n\t"
            "mulps     (%3,%0), %%xmm0 \n\t"
            "mulps   16(%3,%0), %%xmm1 \n\t"
            "addps     (%4,%0), %%xmm0 \n\t"
            "addps   16(%4,%0), %%xmm1 \n\t"
            "movaps  %%xmm0,   (%1,%0) \n\t"
            "movaps  %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0              \n\t"
            "jge  1b                   \n\t"
            :"+r"(off)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3140
4172
608e2dfcb86e adding more static keywords
mru
parents: 4127
diff changeset
/**
 * Convert floats to 16-bit integers, 3DNow! version.
 *
 * Handles four samples per loop iteration: two pf2id conversions of two
 * floats each, packed to four int16 with packssdw and stored with one movq.
 * The loop advances by 4, so a len that is not a multiple of 4 makes the last
 * iteration touch elements past len. Ends with femms.
 */
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "pf2id    %1,    %%mm0 \n\t"   /* src[i],   src[i+1] -> two int32 */
            "pf2id    %2,    %%mm1 \n\t"   /* src[i+2], src[i+3] -> two int32 */
            "packssdw %%mm1, %%mm0 \n\t"   /* four int32 -> four int16 */
            "movq     %%mm0, %0    \n\t"   /* dst[i..i+3] */
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            /* Each "m" operand names one element only, yet the instructions
             * read 8 bytes of src and write 8 bytes of dst; the clobber
             * covers the accesses the operands do not describe. */
            :"memory"
        );
    }
    __asm__ volatile("femms");
}
4172
608e2dfcb86e adding more static keywords
mru
parents: 4127
diff changeset
/**
 * Convert len floats from src to int16_t in dst, four samples per loop
 * iteration, using SSE cvtps2pi into MMX registers followed by packssdw
 * (signed saturation to int16_t) and one 8-byte store.
 * NOTE(review): the loop steps by 4 with no scalar tail, so a len that is
 * not a multiple of 4 makes the last iteration access src/dst past len --
 * confirm that callers always pass a multiple of 4.
 */
3156 static void float_to_int16_sse(int16_t *dst, const float *src, int len){
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3157 int i;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3158 for(i=0; i<len; i+=4) {
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3159 asm volatile(
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3160 "cvtps2pi %1, %%mm0 \n\t" // src[i], src[i+1] -> two int32 in mm0 (rounding follows the current MXCSR mode)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3161 "cvtps2pi %2, %%mm1 \n\t" // src[i+2], src[i+3] -> two int32 in mm1
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3162 "packssdw %%mm1, %%mm0 \n\t" // narrow both pairs to four int16 with signed saturation
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3163 "movq %%mm0, %0 \n\t" // 8-byte store: dst[i] .. dst[i+3]
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3164 :"=m"(dst[i]) // NOTE(review): operand names only dst[i] although 8 bytes are written, and there is no memory clobber -- confirm
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3165 :"m"(src[i]), "m"(src[i+2]) // each operand is read as 8 bytes (two floats)
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3166 );
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3167 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3168 asm volatile("emms"); // mm0/mm1 were used above; reset MMX state once, after the loop, so x87 code can run again
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3169 }
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3170
4589
30261f4ed12d Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated.
diego
parents: 4436
diff changeset
/* Prototypes for Snow routines that are only declared when CONFIG_SNOW_DECODER
 * is defined; their definitions are not in this part of the file. Going by the
 * name suffixes, each routine presumably has an SSE2 and an MMX variant. */
3171 #ifdef CONFIG_SNOW_DECODER
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
3172 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
3173 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3174 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3175 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
4436
d3e389536b0a Add the const specifier as needed to reduce the number of warnings.
takis
parents: 4197
diff changeset
3176 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
3177 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
4436
d3e389536b0a Add the const specifier as needed to reduce the number of warnings.
takis
parents: 4197
diff changeset
3178 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
3179 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
3180 #endif
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3181
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3182 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3183 {
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3184 mm_flags = mm_support();
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
3185
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
3186 if (avctx->dsp_mask) {
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3187 if (avctx->dsp_mask & FF_MM_FORCE)
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3188 mm_flags |= (avctx->dsp_mask & 0xffff);
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3189 else
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3190 mm_flags &= ~(avctx->dsp_mask & 0xffff);
1122
ddc3b0140b8f * oooooops - sorry for this one - wrong logic
kabi
parents: 1115
diff changeset
3191 }
1115
74a46d77e061 * support FF_MM_FORCE
kabi
parents: 1092
diff changeset
3192
631
47a8964ba5cd be less verbose patch by (Lennert Buytenhek <buytenh at math dot leidenuniv dot nl>)
michaelni
parents: 629
diff changeset
3193 #if 0
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3194 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3195 if (mm_flags & MM_MMX)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3196 av_log(avctx, AV_LOG_INFO, " mmx");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3197 if (mm_flags & MM_MMXEXT)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3198 av_log(avctx, AV_LOG_INFO, " mmxext");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3199 if (mm_flags & MM_3DNOW)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3200 av_log(avctx, AV_LOG_INFO, " 3dnow");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3201 if (mm_flags & MM_SSE)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3202 av_log(avctx, AV_LOG_INFO, " sse");
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3203 if (mm_flags & MM_SSE2)
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3204 av_log(avctx, AV_LOG_INFO, " sse2");
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3205 av_log(avctx, AV_LOG_INFO, "\n");
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3206 #endif
986e461dc072 Initial revision
glantau
parents:
diff changeset
3207
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3208 if (mm_flags & MM_MMX) {
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3209 const int idct_algo= avctx->idct_algo;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3210
1232
e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents: 1186
diff changeset
3211 #ifdef CONFIG_ENCODERS
2024
f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents: 1985
diff changeset
3212 const int dct_algo = avctx->dct_algo;
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3213 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3214 if(mm_flags & MM_SSE2){
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
3215 c->fdct = ff_fdct_sse2;
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3216 }else if(mm_flags & MM_MMXEXT){
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3217 c->fdct = ff_fdct_mmx2;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3218 }else{
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3219 c->fdct = ff_fdct_mmx;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3220 }
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1530
diff changeset
3221 }
1232
e88d3b1fb2a1 more #ifdef CONFIG_ENCODERS by (Wolfgang Hesseler <qv at multimediaware dot com>)
michaelni
parents: 1186
diff changeset
3222 #endif //CONFIG_ENCODERS
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3223 if(avctx->lowres==0){
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3224 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3225 c->idct_put= ff_simple_idct_put_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3226 c->idct_add= ff_simple_idct_add_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3227 c->idct = ff_simple_idct_mmx;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3228 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
3717
ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents: 3712
diff changeset
3229 #ifdef CONFIG_GPL
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3230 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3231 if(mm_flags & MM_MMXEXT){
2256
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3232 c->idct_put= ff_libmpeg2mmx2_idct_put;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3233 c->idct_add= ff_libmpeg2mmx2_idct_add;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3234 c->idct = ff_mmxext_idct;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3235 }else{
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3236 c->idct_put= ff_libmpeg2mmx_idct_put;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3237 c->idct_add= ff_libmpeg2mmx_idct_add;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3238 c->idct = ff_mmx_idct;
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3239 }
7e0b2e86afa9 1/2 resolution decoding
michael
parents: 2217
diff changeset
3240 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3717
ea9fe1c9d126 Remove the LGPL exception clause as discussed on ffmpeg-devel
diego
parents: 3712
diff changeset
3241 #endif
5007
f7edc4fe94db Make vp3dsp*.c compilation optional.
takis
parents: 4988
diff changeset
3242 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
f7edc4fe94db Make vp3dsp*.c compilation optional.
takis
parents: 4988
diff changeset
3243 idct_algo==FF_IDCT_VP3 &&
3721
2000e401593d disable vp3 mmx idct for theora files to avoid artifacts
aurel
parents: 3717
diff changeset
3244 avctx->codec->id!=CODEC_ID_THEORA &&
3712
f7f75f718efb Enables back the mmx/sse optimized version of the vp3 idct.
aurel
parents: 3666
diff changeset
3245 !(avctx->flags & CODEC_FLAG_BITEXACT)){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3246 if(mm_flags & MM_SSE2){
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3247 c->idct_put= ff_vp3_idct_put_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3248 c->idct_add= ff_vp3_idct_add_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3249 c->idct = ff_vp3_idct_sse2;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3250 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3251 }else{
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3252 ff_vp3_dsp_init_mmx();
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3253 c->idct_put= ff_vp3_idct_put_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3254 c->idct_add= ff_vp3_idct_add_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3255 c->idct = ff_vp3_idct_mmx;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3256 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 2691
diff changeset
3257 }
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
3258 }else if(idct_algo==FF_IDCT_CAVS){
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
3259 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3260 }else if(idct_algo==FF_IDCT_XVIDMMX){
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3261 if(mm_flags & MM_MMXEXT){
2868
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3262 c->idct_put= ff_idct_xvid_mmx2_put;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3263 c->idct_add= ff_idct_xvid_mmx2_add;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3264 c->idct = ff_idct_xvid_mmx2;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3265 }else{
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3266 c->idct_put= ff_idct_xvid_mmx_put;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3267 c->idct_add= ff_idct_xvid_mmx_add;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3268 c->idct = ff_idct_xvid_mmx;
666064f710d4 xvids mmx&mmx2 idcts
michael
parents: 2864
diff changeset
3269 }
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3270 }
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3271 }
1868
771dcc2d4a0c use optimized VP3 functions where appropriate
melanson
parents: 1845
diff changeset
3272
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3273 #ifdef CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3274 c->get_pixels = get_pixels_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3275 c->diff_pixels = diff_pixels_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3276 #endif //CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3277 c->put_pixels_clamped = put_pixels_clamped_mmx;
1984
ef919e9ef73e separate out put_signed_pixels_clamped() into its own function and
melanson
parents: 1977
diff changeset
3278 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3279 c->add_pixels_clamped = add_pixels_clamped_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3280 c->clear_blocks = clear_blocks_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3281 #ifdef CONFIG_ENCODERS
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3282 c->pix_sum = pix_sum16_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3283 #endif //CONFIG_ENCODERS
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
3284
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3285 c->put_pixels_tab[0][0] = put_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3286 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3287 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3288 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3289
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3290 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3291 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3292 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3293 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3294
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3295 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3296 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3297 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3298 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
3299
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3300 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3301 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3302 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3303 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3304
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3305 c->put_pixels_tab[1][0] = put_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3306 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3307 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3308 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3309
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3310 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3311 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3312 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3313 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3314
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3315 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3316 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3317 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3318 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3319
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3320 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3321 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3322 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3323 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3324
3248
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
3325 c->gmc= gmc_mmx;
7aa9f80e7954 mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents: 3215
diff changeset
3326
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
3327 c->add_bytes= add_bytes_mmx;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3328 #ifdef CONFIG_ENCODERS
866
725ef4ea3ecc huffyuv
michaelni
parents: 853
diff changeset
3329 c->diff_bytes= diff_bytes_mmx;
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
3330 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3331
936
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
3332 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
caa77cd960c0 qpel encoding
michaelni
parents: 866
diff changeset
3333 c->hadamard8_diff[1]= hadamard8_diff_mmx;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3334
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3335 c->pix_norm1 = pix_norm1_mmx;
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3336 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3337 c->sse[1] = sse8_mmx;
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3338 c->vsad[4]= vsad_intra16_mmx;
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3339
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3340 c->nsse[0] = nsse16_mmx;
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3341 c->nsse[1] = nsse8_mmx;
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3342 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3343 c->vsad[0] = vsad16_mmx;
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3344 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3345
1784
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
3346 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
3347 c->try_8x8basis= try_8x8basis_mmx;
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
3348 }
65f7bd09f37b quantizer noise shaping optimization
michael
parents: 1772
diff changeset
3349 c->add_8x8basis= add_8x8basis_mmx;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3350
4749
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
3351 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
7011f597e473 mmx 16-bit ssd. 2.3x faster svq1 encoding.
lorenm
parents: 4589
diff changeset
3352
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3353 #endif //CONFIG_ENCODERS
1647
c943c1d2d099 h263_v_loop_filter_mmx
michael
parents: 1566
diff changeset
3354
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
3355 if (ENABLE_ANY_H263) {
5278
ef85411bb7e8 cosmetics: indentation
aurel
parents: 5277
diff changeset
3356 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
ef85411bb7e8 cosmetics: indentation
aurel
parents: 5277
diff changeset
3357 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
5277
7b3fcb7c61ce Avoid linking with h263.c functions when the relevant codecs
aurel
parents: 5255
diff changeset
3358 }
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3359 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
3360 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3361
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
3362 c->h264_idct_dc_add=
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
3363 c->h264_idct_add= ff_h264_idct_add_mmx;
3174
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
3364 c->h264_idct8_dc_add=
b65cbae9d940 h264_idct8_add_mmx
lorenm
parents: 3173
diff changeset
3365 c->h264_idct8_add= ff_h264_idct8_add_mmx;
3173
9a2cc7b0fbdb h264_idct_add only needs mmx1
lorenm
parents: 3105
diff changeset
3366
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3367 if (mm_flags & MM_MMXEXT) {
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
3368 c->prefetch = prefetch_mmx2;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
3369
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3370 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3371 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3372
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3373 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3374 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3375 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
415
1c3f42442fba * added simple test main - see comments about how to
kabi
parents: 402
diff changeset
3376
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3377 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3378 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3379
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3380 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3381 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3382 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3383
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3384 #ifdef CONFIG_ENCODERS
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
3385 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
3386 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
3387 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3388 c->vsad[4]= vsad_intra16_mmx2;
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO
melanson
parents: 1527
diff changeset
3389 #endif //CONFIG_ENCODERS
1153
2725c8eb3c81 faster hadamard transform
michaelni
parents: 1122
diff changeset
3390
3105
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3089
diff changeset
3391 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
2d35fb3cb940 h264: special case dc-only idct. ~1% faster overall
lorenm
parents: 3089
diff changeset
3392 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
2745
42d3e9068e32 MMX for H.264 iDCT (adapted from x264)
lorenm
parents: 2732
diff changeset
3393
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3394 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3395 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3396 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3397 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3398 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3399 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3400 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1772
8cd5257195c9 vsad16_mmx2 only applies if encoders are turned on
melanson
parents: 1765
diff changeset
3401 #ifdef CONFIG_ENCODERS
1729
a4a5e7521339 interlaced dct decision cleanup
michael
parents: 1708
diff changeset
3402 c->vsad[0] = vsad16_mmx2;
1772
8cd5257195c9 vsad16_mmx2 only applies if encoders are turned on
melanson
parents: 1765
diff changeset
3403 #endif //CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3404 }
959
3ec070eef24a qpel in b frames bugfixes
michaelni
parents: 958
diff changeset
3405
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
3406 #if 1
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3407 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3408 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3409 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3410 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3411 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3412 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3413 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3414 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3415 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3416 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3417 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3418 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3419 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3420 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3421 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3422 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3423 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3424 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3425 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3426 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3427 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3428 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3429 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3430 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3431 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3432 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3433 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3434 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3435 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3436 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3437 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3438 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
961
f8c5babc7b4e 1000l (push & esp) using mangle now ...
michaelni
parents: 959
diff changeset
3439 #endif
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
3440
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3441 //FIXME 3dnow too
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3442 #define dspfunc(PFX, IDX, NUM) \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3443 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3444 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3445 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3446 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3447 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3448 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3449 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3450 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3451 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3452 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3453 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3454 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3455 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3456 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3457 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3458 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3459
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3460 dspfunc(put_h264_qpel, 0, 16);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3461 dspfunc(put_h264_qpel, 1, 8);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3462 dspfunc(put_h264_qpel, 2, 4);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3463 dspfunc(avg_h264_qpel, 0, 16);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3464 dspfunc(avg_h264_qpel, 1, 8);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3465 dspfunc(avg_h264_qpel, 2, 4);
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3466
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3467 dspfunc(put_2tap_qpel, 0, 16);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3468 dspfunc(put_2tap_qpel, 1, 8);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3469 dspfunc(avg_2tap_qpel, 0, 16);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3470 dspfunc(avg_2tap_qpel, 1, 8);
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3471 #undef dspfunc
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3472
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3473 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
3474 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
3213
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3211
diff changeset
3475 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
57d31bdbebe8 added mmx implementation of h264_chroma_mc2
lorenm
parents: 3211
diff changeset
3476 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3477 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3478 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3479 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3480 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
2707
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
3481 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
360024d31dab H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents: 2696
diff changeset
3482 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
3645
47821be55b6c mmx implementation of deblocking strength decision.
lorenm
parents: 3576
diff changeset
3483 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
2633
72e6ffa1f3a5 MMX for H.264 deblocking filter
lorenm
parents: 2505
diff changeset
3484
2902
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3485 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3486 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3487 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3488 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3489 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3490 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3491 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3492 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3493
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3494 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3495 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3496 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3497 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3498 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3499 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3500 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3501 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
3c79bc9f3aa9 h264 mmx weighted prediction. up to 3% overall speedup.
lorenm
parents: 2899
diff changeset
3502
3524
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
3503 #ifdef CONFIG_CAVS_DECODER
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
3504 ff_cavsdsp_init_mmx2(c, avctx);
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
3505 #endif
419409926166 some MMX optimizations for the CAVS decoder
stefang
parents: 3496
diff changeset
3506
1686
68abbec33289 Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents: 1648
diff changeset
3507 #ifdef CONFIG_ENCODERS
1527
8ffd0c00e6df mmx2 optimization of huffyuv median encoding
michael
parents: 1324
diff changeset
3508 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1686
68abbec33289 Here are juste two added #ifdef CONFIG_ENCODERS to allow
michael
parents: 1648
diff changeset
3509 #endif //CONFIG_ENCODERS
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3510 } else if (mm_flags & MM_3DNOW) {
3215
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
3511 c->prefetch = prefetch_3dnow;
06f98047ff26 prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents: 3213
diff changeset
3512
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3513 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3514 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
393
bf164fce2c14 removed debug function
glantau
parents: 387
diff changeset
3515
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3516 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3517 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3518 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
651
45e8f39fda50 put/avg_pixels16
michaelni
parents: 631
diff changeset
3519
853
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3520 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3521 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3522
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3523 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3524 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
eacc2dd8fd9d * using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents: 706
diff changeset
3525 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3526
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3527 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3528 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3529 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3530 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3531 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3532 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3533 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3534 }
984
e162c09efbe7 qpel fix
michaelni
parents: 967
diff changeset
3535
954
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3536 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3537 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3538 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3539 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3540 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3541 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3542 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3543 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3544 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3545 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3546 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3547 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3548 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3549 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3550 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3551 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3552 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3553 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3554 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3555 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3556 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3557 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3558 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3559 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3560 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3561 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3562 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3563 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3564 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3565 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3566 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
13aec7e50c52 qpel in mmx2/3dnow
michaelni
parents: 936
diff changeset
3567 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
2209
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3568
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3569 #define dspfunc(PFX, IDX, NUM) \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3570 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3571 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3572 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3573 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3574 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3575 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3576 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3577 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3578 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3579 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3580 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3581 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3582 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3583 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3584 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3585 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3586
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3587 dspfunc(put_h264_qpel, 0, 16);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3588 dspfunc(put_h264_qpel, 1, 8);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3589 dspfunc(put_h264_qpel, 2, 4);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3590 dspfunc(avg_h264_qpel, 0, 16);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3591 dspfunc(avg_h264_qpel, 1, 8);
c4a476971abc h264 luma motion compensation in mmx2/3dnow
michael
parents: 2207
diff changeset
3592 dspfunc(avg_h264_qpel, 2, 4);
2732
473ee06ec3a1 MMX code for (put|avg)_h264_chroma_mc8
hzoli
parents: 2707
diff changeset
3593
3807
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3594 dspfunc(put_2tap_qpel, 0, 16);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3595 dspfunc(put_2tap_qpel, 1, 8);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3596 dspfunc(avg_2tap_qpel, 0, 16);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3597 dspfunc(avg_2tap_qpel, 1, 8);
6a40092eb9e6 approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents: 3777
diff changeset
3598
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
3599 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
2922
d772011258ec faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
lorenm
parents: 2902
diff changeset
3600 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3601 }
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3602
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3603 #ifdef CONFIG_ENCODERS
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3604 if(mm_flags & MM_SSE2){
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
3605 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3606 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3607 c->hadamard8_diff[1]= hadamard8_diff_sse2;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3608 }
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3609
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3610 #ifdef HAVE_SSSE3
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3611 if(mm_flags & MM_SSSE3){
5024
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3612 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3613 c->try_8x8basis= try_8x8basis_ssse3;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3614 }
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3615 c->add_8x8basis= add_8x8basis_ssse3;
4988
689490842cf5 factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents: 4987
diff changeset
3616 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
4987
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3617 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3618 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3619 }
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3620 #endif
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3621 #endif
02199b094850 sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents: 4946
diff changeset
3622
4589
30261f4ed12d Fix wrong conditional, Snow decoding, not encoding, was SIMD-accelerated.
diego
parents: 4436
diff changeset
3623 #ifdef CONFIG_SNOW_DECODER
5591
642588a60570 update mmx code to latest snow changes
michael
parents: 5587
diff changeset
3624 if(mm_flags & MM_SSE2 & 0){
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
3625 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3626 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
3627 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3628 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3629 else{
3210
81cafbc23b8d snow mmx+sse2 optimizations, part 4
corey
parents: 3207
diff changeset
3630 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3631 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
3211
b77b5e7072d6 add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents: 3210
diff changeset
3632 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3633 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents: 3174
diff changeset
3634 #endif
3536
545a15c19c91 sse & sse2 implementations of vorbis channel coupling.
lorenm
parents: 3524
diff changeset
3635
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3636 if(mm_flags & MM_3DNOW){
5024
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3637 #ifdef CONFIG_ENCODERS
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3638 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3639 c->try_8x8basis= try_8x8basis_3dnow;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3640 }
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3641 c->add_8x8basis= add_8x8basis_3dnow;
8a3bc96c366f 3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents: 5014
diff changeset
3642 #endif //CONFIG_ENCODERS
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3643 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3644 c->vector_fmul = vector_fmul_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3645 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3646 c->float_to_int16 = float_to_int16_3dnow;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3647 }
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3648 if(mm_flags & MM_3DNOWEXT)
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3649 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3650 if(mm_flags & MM_SSE){
3557
8e13ec0f8aa3 change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents: 3541
diff changeset
3651 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3652 c->vector_fmul = vector_fmul_sse;
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3653 c->float_to_int16 = float_to_int16_sse;
3569
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3654 c->vector_fmul_reverse = vector_fmul_reverse_sse;
c42c03f3b402 convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to sse
michael
parents: 3568
diff changeset
3655 c->vector_fmul_add_add = vector_fmul_add_add_sse;
3568
945caa35ee9a sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents: 3561
diff changeset
3656 }
4197
bbe0bc387a19 revert bad checkin
mru
parents: 4196
diff changeset
3657 if(mm_flags & MM_3DNOW)
3574
f549d1e685f7 vorbis simd tweaks
lorenm
parents: 3569
diff changeset
3658 c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3659 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2940
diff changeset
3660
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO)
melanson
parents: 1527
diff changeset
3661 #ifdef CONFIG_ENCODERS
1092
f59c3f66363b MpegEncContext.(i)dct_* -> DspContext.(i)dct_*
michaelni
parents: 1065
diff changeset
3662 dsputil_init_pix_mmx(c, avctx);
1530
3b31998fe22f disable encoders where appropriate (patch courtesy of BERO)
melanson
parents: 1527
diff changeset
3663 #endif //CONFIG_ENCODERS
247
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3664 #if 0
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3665 // for speed testing
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3666 get_pixels = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3667 put_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3668 add_pixels_clamped = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3669
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3670 pix_abs16x16 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3671 pix_abs16x16_x2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3672 pix_abs16x16_y2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3673 pix_abs16x16_xy2 = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3674
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3675 put_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3676 put_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3677 put_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3678 put_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3679
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3680 put_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3681 put_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3682 put_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3683 put_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3684
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3685 avg_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3686 avg_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3687 avg_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3688 avg_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3689
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3690 avg_no_rnd_pixels_tab[0] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3691 avg_no_rnd_pixels_tab[1] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3692 avg_no_rnd_pixels_tab[2] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3693 avg_no_rnd_pixels_tab[3] = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3694
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3695 //av_fdct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3696 //ff_idct = just_return;
6f48cacd9ed9 * some modifications to allow gcc to compile same code for -fPIC
kabi
parents: 188
diff changeset
3697 #endif
0
986e461dc072 Initial revision
glantau
parents:
diff changeset
3698 }