Mercurial > libavcodec.hg

annotate i386/dsputil_mmx.c @ 6362:78aa57eba353

FLAT objects cannot have multiple sections, so using the L1 attributes breaks
linking. The FDPIC relocs also break for any other format. Thus check the
compiler environment and select the appropriate sections/relocs.
patch by Mike Frysinger, vapier.adi a gmail d com

author:   diego
date:     Sat, 16 Feb 2008 15:17:31 +0000
parents:  ef3fb5a7e275
children: 0a403ade8c81
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "dsputil.h"
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

//#undef NDEBUG
//#include <assert.h>

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// For a shared library it is better to synthesize these constants in a
// register than to reference them in memory: pcmpeqd sets all bits (-1).
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

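/* Editor's sketch (not part of the original file): a scalar model of how
 * the PIC variants above synthesize their constants in-register instead of
 * referencing memory. Lane-exact for these particular values. */
#include <stdint.h>
#include <assert.h>

static void movq_bone_wtwo_model(void)
{
    uint64_t r = ~(uint64_t)0;                   /* pcmpeqd: all bits set  */
    uint64_t wone = 0;
    for (int i = 0; i < 64; i += 16)             /* psrlw $15: word -> 1   */
        wone |= (((r >> i) & 0xFFFF) >> 15) << i;
    uint64_t bone = wone | wone << 8;            /* packuswb: bytes = 0x01 */
    uint64_t wtwo = wone << 1;                   /* psllw $1: word -> 2    */
    assert(bone == 0x0101010101010101ULL);       /* == ff_bone             */
    assert(wtwo == 0x0002000200020002ULL);       /* == ff_wtwo             */
}
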
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"

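/* Editor's sketch (not part of the original file): the two byte-average
 * identities the macros above implement, in scalar form. The technique
 * follows http://aggregate.org/MAGIC; the 0xfe mask drops the bit that
 * psrlq would otherwise shift across byte lanes.
 *   truncating: (a & b) + (((a ^ b) & 0xfe) >> 1) == (a + b)     >> 1
 *   rounding:   (a | b) - (((a ^ b) & 0xfe) >> 1) == (a + b + 1) >> 1 */
#include <stdint.h>
#include <assert.h>

static void pavgb_model_check(void)
{
    for (int a = 0; a < 256; a++)
        for (int b = 0; b < 256; b++) {
            uint8_t no_rnd = (a & b) + (((a ^ b) & 0xfe) >> 1);
            uint8_t rnd    = (a | b) - (((a ^ b) & 0xfe) >> 1);
            assert(no_rnd == (a + b)     / 2);
            assert(rnd    == (a + b + 1) / 2);
        }
}
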
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx

/***********************************/
/* standard MMX */

#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}

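/* Editor's sketch (not part of the original file): the scalar operation
 * performed by the loop above. Each 8-bit pixel of an 8x8 block is
 * zero-extended to a 16-bit DCT coefficient, which is exactly what the
 * punpcklbw/punpckhbw pairs against the zeroed mm7 achieve. */
#include <stdint.h>

static void get_pixels_model(int16_t *block, const uint8_t *pixels,
                             int line_size)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            block[8*y + x] = pixels[x];
        pixels += line_size;
    }
}
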
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS

void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq   %3, %%mm0               \n\t"
        "movq   8%3, %%mm1              \n\t"
        "movq   16%3, %%mm2             \n\t"
        "movq   24%3, %%mm3             \n\t"
        "movq   32%3, %%mm4             \n\t"
        "movq   40%3, %%mm5             \n\t"
        "movq   48%3, %%mm6             \n\t"
        "movq   56%3, %%mm7             \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq   %%mm0, (%0)             \n\t"
        "movq   %%mm2, (%0, %1)         \n\t"
        "movq   %%mm4, (%0, %1, 2)      \n\t"
        "movq   %%mm6, (%0, %2)         \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if this were an exact copy of the block above, the compiler would
    // generate some very strange code; thus the "r" constraint is used here
    __asm __volatile(
        "movq   (%3), %%mm0             \n\t"
        "movq   8(%3), %%mm1            \n\t"
        "movq   16(%3), %%mm2           \n\t"
        "movq   24(%3), %%mm3           \n\t"
        "movq   32(%3), %%mm4           \n\t"
        "movq   40(%3), %%mm5           \n\t"
        "movq   48(%3), %%mm6           \n\t"
        "movq   56(%3), %%mm7           \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq   %%mm0, (%0)             \n\t"
        "movq   %%mm2, (%0, %1)         \n\t"
        "movq   %%mm4, (%0, %1, 2)      \n\t"
        "movq   %%mm6, (%0, %2)         \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
        :"memory");
}

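/* Editor's sketch (not part of the original file): packuswb performs the
 * clamping implicitly by packing signed 16-bit words to unsigned bytes
 * with saturation; this is the equivalent scalar clamp. */
#include <stdint.h>

static void put_pixels_clamped_model(const int16_t *block, uint8_t *pixels,
                                     int line_size)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            int v = block[8*y + x];
            pixels[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}
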
static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq   (%2), %%mm0         \n\t"
            "movq   8(%2), %%mm1        \n\t"
            "movq   16(%2), %%mm2       \n\t"
            "movq   24(%2), %%mm3       \n\t"
            "movq   %0, %%mm4           \n\t"
            "movq   %1, %%mm6           \n\t"
            "movq   %%mm4, %%mm5        \n\t"
            "punpcklbw %%mm7, %%mm4     \n\t"
            "punpckhbw %%mm7, %%mm5     \n\t"
            "paddsw %%mm4, %%mm0        \n\t"
            "paddsw %%mm5, %%mm1        \n\t"
            "movq   %%mm6, %%mm5        \n\t"
            "punpcklbw %%mm7, %%mm6     \n\t"
            "punpckhbw %%mm7, %%mm5     \n\t"
            "paddsw %%mm6, %%mm2        \n\t"
            "paddsw %%mm5, %%mm3        \n\t"
            "packuswb %%mm1, %%mm0      \n\t"
            "packuswb %%mm3, %%mm2      \n\t"
            "movq   %%mm0, %0           \n\t"
            "movq   %%mm2, %1           \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%1, %3), %%mm1           \n\t"
        "movd %%mm0, (%2)               \n\t"
        "movd %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%1, %3), %%mm1           \n\t"
        "movd %%mm0, (%2)               \n\t"
        "movd %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 8(%1), %%mm4              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 8(%1), %%mm4              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}

static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1,%3), %%xmm1         \n\t"
        "movdqu (%1,%3,2), %%xmm2       \n\t"
        "movdqu (%1,%4), %%xmm3         \n\t"
        "movdqa %%xmm0, (%2)            \n\t"
        "movdqa %%xmm1, (%2,%3)         \n\t"
        "movdqa %%xmm2, (%2,%3,2)       \n\t"
        "movdqa %%xmm3, (%2,%4)         \n\t"
        "subl $4, %0                    \n\t"
        "lea (%1,%3,4), %1              \n\t"
        "lea (%2,%3,4), %2              \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size), "r"(3L*line_size)
        : "memory"
    );
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1,%3), %%xmm1         \n\t"
        "movdqu (%1,%3,2), %%xmm2       \n\t"
        "movdqu (%1,%4), %%xmm3         \n\t"
        "pavgb  (%2), %%xmm0            \n\t"
        "pavgb  (%2,%3), %%xmm1         \n\t"
        "pavgb  (%2,%3,2), %%xmm2       \n\t"
        "pavgb  (%2,%4), %%xmm3         \n\t"
        "movdqa %%xmm0, (%2)            \n\t"
        "movdqa %%xmm1, (%2,%3)         \n\t"
        "movdqa %%xmm2, (%2,%3,2)       \n\t"
        "movdqa %%xmm3, (%2,%4)         \n\t"
        "subl $4, %0                    \n\t"
        "lea (%1,%3,4), %1              \n\t"
        "lea (%2,%3,4), %2              \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size), "r"(3L*line_size)
        : "memory"
    );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128*6, %%"REG_a"         \n\t"
        "1:                             \n\t"
        "movq %%mm7, (%0, %%"REG_a")    \n\t"
        "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
        "add $32, %%"REG_a"             \n\t"
        " js 1b                         \n\t"
        : : "r" (((uint8_t *)blocks)+128*6)
        : "%"REG_a
    );
}

#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
    );

    return sum;
}
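
/* Editor's sketch (not part of the original file): the tail of
 * pix_sum16_mmx folds four 16-bit partial sums into one word with two
 * shift-and-add steps (psrlq $32/paddw, then psrlq $16/paddw). A
 * lane-exact scalar model, wrapping at 16 bits just as paddw does: */
#include <stdint.h>

static unsigned pix_sum_fold_model(const uint16_t w[4])
{
    uint16_t lo = (uint16_t)(w[0] + w[2]);   /* psrlq $32 ; paddw        */
    uint16_t hi = (uint16_t)(w[1] + w[3]);
    return (uint16_t)(lo + hi);              /* psrlq $16 ; paddw ; andl */
}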
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

#define H263_LOOP_FILTER \
        "pxor %%mm7, %%mm7              \n\t"\
        "movq  %0, %%mm0                \n\t"\
        "movq  %0, %%mm1                \n\t"\
        "movq  %3, %%mm2                \n\t"\
        "movq  %3, %%mm3                \n\t"\
        "punpcklbw %%mm7, %%mm0         \n\t"\
        "punpckhbw %%mm7, %%mm1         \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm3, %%mm1             \n\t"\
        "movq  %1, %%mm2                \n\t"\
        "movq  %1, %%mm3                \n\t"\
        "movq  %2, %%mm4                \n\t"\
        "movq  %2, %%mm5                \n\t"\
        "punpcklbw %%mm7, %%mm2         \n\t"\
        "punpckhbw %%mm7, %%mm3         \n\t"\
        "punpcklbw %%mm7, %%mm4         \n\t"\
        "punpckhbw %%mm7, %%mm5         \n\t"\
        "psubw %%mm2, %%mm4             \n\t"\
        "psubw %%mm3, %%mm5             \n\t"\
        "psllw $2, %%mm4                \n\t"\
        "psllw $2, %%mm5                \n\t"\
        "paddw %%mm0, %%mm4             \n\t"\
        "paddw %%mm1, %%mm5             \n\t"\
        "pxor %%mm6, %%mm6              \n\t"\
        "pcmpgtw %%mm4, %%mm6           \n\t"\
        "pcmpgtw %%mm5, %%mm7           \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "pxor %%mm7, %%mm5              \n\t"\
        "psubw %%mm6, %%mm4             \n\t"\
        "psubw %%mm7, %%mm5             \n\t"\
        "psrlw $3, %%mm4                \n\t"\
        "psrlw $3, %%mm5                \n\t"\
        "packuswb %%mm5, %%mm4          \n\t"\
        "packsswb %%mm7, %%mm6          \n\t"\
        "pxor %%mm7, %%mm7              \n\t"\
        "movd %4, %%mm2                 \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "punpcklbw %%mm2, %%mm2         \n\t"\
        "psubusb %%mm4, %%mm2           \n\t"\
        "movq %%mm2, %%mm3              \n\t"\
        "psubusb %%mm4, %%mm3           \n\t"\
        "psubb %%mm3, %%mm2             \n\t"\
        "movq %1, %%mm3                 \n\t"\
        "movq %2, %%mm4                 \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm3           \n\t"\
        "psubusb %%mm2, %%mm4           \n\t"\
        "pxor %%mm6, %%mm3              \n\t"\
        "pxor %%mm6, %%mm4              \n\t"\
        "paddusb %%mm2, %%mm2           \n\t"\
        "packsswb %%mm1, %%mm0          \n\t"\
        "pcmpgtb %%mm0, %%mm7           \n\t"\
        "pxor %%mm7, %%mm0              \n\t"\
        "psubb %%mm7, %%mm0             \n\t"\
        "movq %%mm0, %%mm1              \n\t"\
        "psubusb %%mm2, %%mm0           \n\t"\
        "psubb %%mm0, %%mm1             \n\t"\
        "pand %5, %%mm1                 \n\t"\
        "psrlw $2, %%mm1                \n\t"\
        "pxor %%mm7, %%mm1              \n\t"\
        "psubb %%mm7, %%mm1             \n\t"\
        "movq %0, %%mm5                 \n\t"\
        "movq %3, %%mm6                 \n\t"\
        "psubb %%mm1, %%mm5             \n\t"\
        "paddb %%mm1, %%mm6             \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd  %4, %%mm0                \n\t"
        "movd  %5, %%mm1                \n\t"
        "movd  %6, %%mm2                \n\t"
        "movd  %7, %%mm3                \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd  %%mm0, %0                \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd  %%mm0, %1                \n\t"
        "movd  %%mm1, %2                \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd  %%mm1, %3                \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

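/* Editor's sketch (not part of the original file): transpose4x4 above
 * computes dst[x][y] = src[y][x]; the punpcklbw/punpcklwd/punpckhdq
 * sequence is a 4x4 byte transpose carried out entirely in registers. */
#include <stdint.h>

static void transpose4x4_model(uint8_t *dst, const uint8_t *src,
                               int dst_stride, int src_stride)
{
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            dst[x*dst_stride + y] = src[y*src_stride + x];
}
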
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp[4]);
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                            pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

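/* Editor's sketch (not part of the original file): MMX has no packed
 * absolute difference of unsigned bytes, so sse8/sse16 build it from two
 * saturating subtractions - one of them is always 0, the other |a-b| -
 * ORed together, then squared and accumulated via pmaddwd. */
#include <stdint.h>

static int sse_row_model(const uint8_t *pix1, const uint8_t *pix2, int n)
{
    int sum = 0;
    for (int x = 0; x < n; x++) {
        int d1 = pix1[x] > pix2[x] ? pix1[x] - pix2[x] : 0; /* psubusb */
        int d2 = pix2[x] > pix1[x] ? pix2[x] - pix1[x] : 0; /* psubusb */
        int d  = d1 | d2;                                   /* por     */
        sum   += d * d;
    }
    return sum;
}
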
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"         /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"         /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"        /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"        /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"   /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"   /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

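/* Editorial note: for reference, a scalar sketch of what sse16_mmx computes,
 * the 16-wide sum of squared differences over h rows. The name sse16_c_ref
 * is hypothetical; the sketch is kept under #if 0 and is not part of the
 * original file. */
#if 0
static int sse16_c_ref(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;               /* accumulate the squared difference */
        }
        pix1 += line_size;              /* step both blocks one row down */
        pix2 += line_size;
    }
    return sum;
}
#endif
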
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n"      /* xmm0 = 0 */
        "pxor %%xmm7,%%xmm7\n"      /* xmm7 holds the sum */
        "1:\n"
        "movdqu (%0),%%xmm1\n"      /* xmm1 = pix1[0][0-15] */
        "movdqu (%1),%%xmm2\n"      /* xmm2 = pix2[0][0-15] */
        "movdqu (%0,%4),%%xmm3\n"   /* xmm3 = pix1[1][0-15] */
        "movdqu (%1,%4),%%xmm4\n"   /* xmm4 = pix2[1][0-15] */

        /* todo: xmm1-xmm2, xmm3-xmm4 */
        /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n" /* xmm1 now spread over (xmm1,xmm2) */
        "punpcklbw %%xmm0,%%xmm3\n" /* xmm4 now spread over (xmm3,xmm4) */

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
        "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((long)line_size));
    return tmp;
}

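/* Editorial note: both sse16 versions rely on the same byte-wise identity,
 * |a - b| == sat(a - b) | sat(b - a) for unsigned a, b: one saturating
 * difference is zero and the other is the absolute difference, so psubusb
 * plus por yields |a - b| without any compare. Scalar sketch of one lane
 * (hypothetical name, kept under #if 0): */
#if 0
static uint8_t absdiff_u8_ref(uint8_t a, uint8_t b)
{
    uint8_t d1 = a > b ? a - b : 0;     /* psubusb: saturates to 0 if a <= b */
    uint8_t d2 = b > a ? b - a : 0;     /* psubusb the other way round */
    return d1 | d2;                     /* por: at most one operand is nonzero */
}
#endif
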
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}

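/* Editorial note: hf_noise8 measures high-frequency content as the sum of
 * absolute differences between the horizontal gradients of vertically
 * adjacent rows. A scalar sketch of that reading (hypothetical name, kept
 * under #if 0; the byte shifts above leave 7 usable column differences per
 * 8-pixel row, and h rows give h-1 row pairs): */
#if 0
static int hf_noise8_c_ref(uint8_t *pix, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 7; x++) {
            int g0 = pix[x]             - pix[x + 1];              /* gradient in row y */
            int g1 = pix[x + line_size] - pix[x + 1 + line_size];  /* gradient in row y+1 */
            sum += FFABS(g0 - g1);
        }
        pix += line_size;
    }
    return sum;
}
#endif
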
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

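/* Editorial note: the nsse ("noise preserving sum of squared differences")
 * comparators above become reachable through the generic compare-function
 * tables; a typical encoder setup would be something like the following
 * sketch (assuming the FF_CMP_NSSE selector and the nsse_weight field of
 * this era's AVCodecContext):
 *
 *     avctx->mb_cmp      = FF_CMP_NSSE;
 *     avctx->nsse_weight = 8;             // same default the code falls back to
 */
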
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

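/* Editorial note: vsad_intra sums absolute differences between vertically
 * adjacent rows of a single block, a cheap vertical-activity measure; h rows
 * give h-1 row pairs, and the MMX version truncates the result to 16 bits
 * (the & 0xFFFF above). Scalar sketch (hypothetical name, kept under #if 0): */
#if 0
static int vsad_intra16_c_ref(uint8_t *pix, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 16; x++)
            sum += FFABS(pix[x] - pix[x + line_size]);
        pix += line_size;
    }
    return sum & 0xFFFF;    /* model the word-lane truncation of the MMX code */
}
#endif
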
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

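/* Editorial note: in the MMX2 variant one psadbw per 8 bytes replaces the
 * whole psubusb/por/punpck/paddw cascade of the plain MMX SUM above; psadbw
 * computes the sum of absolute differences of two 8-byte vectors directly,
 * so no 16-bit widening or final & 0xFFFF is needed. */
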
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

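/* Editorial note: vsad16 needs |d[y][x] - d[y+1][x]| where d = pix1 - pix2
 * may be negative. psubb produces the signed difference mod 256, and XORing
 * with 0x80 (mm7 is filled with 0x80 bytes via pcmpeqw/psllw/packsswb) adds
 * a +128 bias that makes the bytes comparable as unsigned, valid for
 * differences within [-128, 127]. Scalar sketch of the bias step
 * (hypothetical name, kept under #if 0): */
#if 0
static uint8_t bias_diff_u8_ref(uint8_t a, uint8_t b)
{
    uint8_t d = (uint8_t)(a - b);   /* psubb: signed difference mod 256 */
    return d ^ 0x80;                /* pxor with 0x80: ordered as unsigned */
}
#endif
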
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

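/* Editorial note: the pmaxub/pminub sequence above clamps the gradient
 * predictor L + T - LT to [min(L,T), max(L,T)], which equals the median of
 * (L, T, L+T-LT), the HuffYUV median predictor. Scalar sketch of one
 * predicted byte (hypothetical name, kept under #if 0): */
#if 0
static uint8_t median_pred_ref(uint8_t L, uint8_t T, uint8_t LT)
{
    int g  = L + T - LT;                        /* gradient predictor */
    int lo = FFMIN(L, T), hi = FFMAX(L, T);
    return g < lo ? lo : g > hi ? hi : g;       /* clamp == median of the three */
}
#endif
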
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"            \n\t"\
    "mov"#m" "#p2", "#t"            \n\t"\
    "punpcklbw "#a", "#t"           \n\t"\
    "punpcklbw "#a", "#a"           \n\t"\
    "psubw     "#t", "#a"           \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                     \n\t"\
        "add %4, %2                     \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0            \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0            \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

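/* Editorial note: DIFF_PIXELS_1 widens and subtracts without a zero register.
 * After "punpcklbw a,t" each word lane of t holds 256*p1 + p2, and after
 * "punpcklbw a,a" each lane of a holds 256*p1 + p1, so psubw leaves exactly
 * p1 - p2. Scalar sketch of one lane (hypothetical name, kept under #if 0): */
#if 0
static int16_t diff_pixels_lane_ref(uint8_t p1, uint8_t p2)
{
    uint16_t t = (uint16_t)(p1 * 256 + p2);     /* punpcklbw a, t */
    uint16_t a = (uint16_t)(p1 * 256 + p1);     /* punpcklbw a, a */
    return (int16_t)(a - t);                    /* psubw: p1 - p2 */
}
#endif
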
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "         \n\t"\
    "paddw " #b2 ", " #a2 "         \n\t"\
    "paddw " #b1 ", " #b1 "         \n\t"\
    "paddw " #b2 ", " #b2 "         \n\t"\
    "psubw " #a1 ", " #b1 "         \n\t"\
    "psubw " #a2 ", " #b2 "         \n\t"

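/* Editorial note: LBUTTERFLY2 is an in-place butterfly on two pairs, turning
 * (a, b) into (a+b, b-a) with three adds/subs per pair. Scalar sketch
 * (hypothetical name, kept under #if 0): */
#if 0
static void lbutterfly_ref(int16_t *a, int16_t *b)
{
    *a += *b;       /* a' = a + b */
    *b += *b;       /* b  = 2b */
    *b -= *a;       /* b' = 2b - (a + b) = b - a */
}
#endif
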
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

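/* Editorial note: HADAMARD8 chains three butterfly rounds (distance 1, 2, 4)
 * across eight registers, i.e. an unnormalized 8-point Walsh-Hadamard
 * transform applied independently to every 16-bit lane. Scalar sketch of the
 * same round structure (hypothetical name, kept under #if 0): */
#if 0
static void hadamard8_ref(int16_t v[8])
{
    int step, i, j;
    for (step = 1; step < 8; step <<= 1)            /* three rounds: 1, 2, 4 */
        for (i = 0; i < 8; i += 2 * step)
            for (j = i; j < i + step; j++) {
                int16_t a = v[j], b = v[j + step];
                v[j]        = a + b;                /* matches LBUTTERFLY2 */
                v[j + step] = b - a;
            }
}
#endif
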
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "psubw " #a ", " #z "           \n\t"\
    "pmaxsw " #z ", " #a "          \n\t"

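/* Editorial note: two classic word-wise abs idioms. MMABS_MMX builds a sign
 * mask with pcmpgtw (all ones where a < 0) and uses abs(a) = (a ^ mask) - mask;
 * MMABS_MMX2 simply takes max(a, -a) with pmaxsw. Scalar sketches
 * (hypothetical names, kept under #if 0): */
#if 0
static int16_t abs_signmask_ref(int16_t a)
{
    int16_t mask = (0 > a) ? -1 : 0;    /* pcmpgtw: 0xFFFF where a is negative */
    return (a ^ mask) - mask;           /* pxor + psubw */
}
static int16_t abs_maxneg_ref(int16_t a)
{
    int n = -(int)a;                    /* psubw from a zeroed register */
    return a > n ? a : (int16_t)n;      /* pmaxsw */
}
#endif
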
#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "           \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "       \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0         \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)            \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2            \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0         \n\t"
#endif

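/* Editorial note: only x86-64 has xmm8/xmm9, so MMABS_SUM_8x8_NOSPILL can
 * keep two scratch registers live. The 32-bit fallback above has just
 * xmm0-xmm7 and therefore spills xmm7 to the scratch buffer at (%1), reuses
 * it as the abs temporary, and folds the reloaded value back in at the end. */
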
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"         \n\t"\
    "movq "#o"+8(%1),  "#b"         \n\t"\
    "movq "#o"+16(%1), "#c"         \n\t"\
    "movq "#o"+24(%1), "#d"         \n\t"\

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)            \n\t"\
    "movq "#b", "#o"+8(%1)          \n\t"\
    "movq "#c", "#o"+16(%1)         \n\t"\
    "movq "#d", "#o"+24(%1)         \n\t"\

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                \n\t"\
    "psrlq $32, "#a"                \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movq "#a", "#t"                \n\t"\
    "psrlq $16, "#a"                \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"       \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshufw $0x01, "#a", "#t"       \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"             \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshuflw $0x0E, "#a", "#t"      \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshuflw $0x01, "#a", "#t"      \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"\

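/* Editorial note: the HSUM_* macros reduce the 16-bit partial sums in one
 * register to a scalar by repeatedly folding the high half onto the low half
 * with saturating adds (hence the 64k saturation noted in the FIXME above).
 * Scalar sketch of the 4-lane MMX fold (hypothetical name, kept under #if 0): */
#if 0
static unsigned hsum4_u16_ref(const uint16_t v[4])
{
    unsigned s02 = FFMIN(v[0] + v[2], 0xFFFF);  /* psrlq $32 + paddusw */
    unsigned s13 = FFMIN(v[1] + v[3], 0xFFFF);
    return FFMIN(s02 + s13, 0xFFFF);            /* psrlq $16 + paddusw */
}
#endif
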
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
        );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1734 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1735 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1736 \ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1737 "movq 96(%1), %%mm7 \n\t"\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1738 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1739 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1740 "movq %%mm6, %%mm7 \n\t"\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1741 "movq %%mm0, %%mm6 \n\t"\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1742 \ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1743 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1744 \ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1745 HADAMARD48\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1746 "movq %%mm7, 64(%1) \n\t"\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1747 MMABS(%%mm0, %%mm7)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1748 MMABS(%%mm1, %%mm7)\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1749 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1750 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1751 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1752 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1753 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1754 "movq 64(%1), %%mm2 \n\t"\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1755 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1756 "paddusw %%mm1, %%mm0 \n\t"\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1757 "movq %%mm0, 64(%1) \n\t"\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1758 \ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1759 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1760 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1761 \ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1762 HADAMARD48\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1763 "movq %%mm7, (%1) \n\t"\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1764 MMABS(%%mm0, %%mm7)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1765 MMABS(%%mm1, %%mm7)\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1766 MMABS_SUM(%%mm2, %%mm7, %%mm0)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1767 MMABS_SUM(%%mm3, %%mm7, %%mm1)\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1768 MMABS_SUM(%%mm4, %%mm7, %%mm0)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1769 MMABS_SUM(%%mm5, %%mm7, %%mm1)\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1770 MMABS_SUM(%%mm6, %%mm7, %%mm0)\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1771 "movq (%1), %%mm2 \n\t"\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1772 MMABS_SUM(%%mm2, %%mm7, %%mm1)\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1773 "paddusw 64(%1), %%mm0 \n\t"\ |
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1774 "paddusw %%mm1, %%mm0 \n\t"\ |
4946
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1775 \ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1776 HSUM(%%mm0, %%mm1, %0)\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1777 \ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1778 : "=r" (sum)\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1779 : "r"(temp)\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1780 );\ |
c1fb4544bd59
cosmetics: remove code duplication in hadamard8_diff_mmx
lorenm
parents:
4939
diff
changeset
|
1781 return sum&0xFFFF;\ |
4987
02199b094850
sse2 & ssse3 versions of hadamard. unroll and inline diff_pixels.
lorenm
parents:
4946
diff
changeset
|
1782 }\ |
6056
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6030
diff
changeset
|
1783 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) |
4987 | 1784
1785 #define HADAMARD8_DIFF_SSE2(cpu) \
1786 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1787 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1788 int sum;\
1789 \
1790 assert(h==8);\
1791 \
1792 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1793 \
1794 asm volatile(\
1795 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1796 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1797 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1798 MMABS_SUM_8x8\
1799 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1800 : "=r" (sum)\
1801 : "r"(temp)\
1802 );\
1803 return sum&0xFFFF;\
1804 }\
6056 | 1805 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
936 | 1806
4946 | 1807 #define MMABS(a,z) MMABS_MMX(a,z)
1808 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1809 HADAMARD8_DIFF_MMX(mmx)
1810 #undef MMABS
1811 #undef HSUM
1812
1813 #define MMABS(a,z) MMABS_MMX2(a,z)
4987 | 1814 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
4946 | 1815 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1816 HADAMARD8_DIFF_MMX(mmx2)
4987 | 1817 HADAMARD8_DIFF_SSE2(sse2)
4946 | 1818 #undef MMABS
4987 | 1819 #undef MMABS_SUM_8x8
4946 | 1820 #undef HSUM
1153 | 1821
4987 | 1822 #ifdef HAVE_SSSE3
1823 #define MMABS(a,z) MMABS_SSSE3(a,z)
1824 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1825 HADAMARD8_DIFF_SSE2(ssse3)
1826 #undef MMABS
1827 #undef MMABS_SUM_8x8
1828 #endif
4749 | 1829
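For orientation before the DCT-domain metric below: the hadamard8_diff_* functions generated above score a block by Hadamard-transforming the 8x8 pixel difference and summing absolute coefficients, and WRAPPER8_16_SQ extends that to 16x16 by summing the four 8x8 quadrants. A plain-C sketch of the metric (a simplified reference under that reading of the asm, not the generic dsputil source):

    #include <stdint.h>
    #include <stdlib.h>

    /* In-place 8-point Hadamard transform: three butterfly stages. */
    static void hadamard8_1d_ref(int *v)
    {
        int j, k;
        for (k = 1; k < 8; k <<= 1)
            for (j = 0; j < 8; j++)
                if (!(j & k)) {
                    int a = v[j], b = v[j + k];
                    v[j]     = a + b;
                    v[j + k] = a - b;
                }
    }

    /* Reference for hadamard8_diff: transform rows, then columns, of
     * src1 - src2 and return the sum of absolute coefficients. */
    static int hadamard8_diff_ref(uint8_t *src1, uint8_t *src2, int stride)
    {
        int d[8][8], col[8], i, j, sum = 0;

        for (i = 0; i < 8; i++)
            for (j = 0; j < 8; j++)
                d[i][j] = src1[i*stride + j] - src2[i*stride + j];

        for (i = 0; i < 8; i++)
            hadamard8_1d_ref(d[i]);            /* horizontal pass */

        for (j = 0; j < 8; j++) {              /* vertical pass + sum */
            for (i = 0; i < 8; i++)
                col[i] = d[i][j];
            hadamard8_1d_ref(col);
            for (i = 0; i < 8; i++)
                sum += abs(col[i]);
        }
        return sum;
    }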
4988 | 1830 #define DCT_SAD4(m,mm,o)\
1831 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1832 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1833 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1834 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1835 MMABS_SUM(mm##2, mm##6, mm##0)\
1836 MMABS_SUM(mm##3, mm##7, mm##1)\
1837 MMABS_SUM(mm##4, mm##6, mm##0)\
1838 MMABS_SUM(mm##5, mm##7, mm##1)\
1839
1840 #define DCT_SAD_MMX\
1841 "pxor %%mm0, %%mm0 \n\t"\
1842 "pxor %%mm1, %%mm1 \n\t"\
1843 DCT_SAD4(q, %%mm, 0)\
1844 DCT_SAD4(q, %%mm, 8)\
1845 DCT_SAD4(q, %%mm, 64)\
1846 DCT_SAD4(q, %%mm, 72)\
1847 "paddusw %%mm1, %%mm0 \n\t"\
1848 HSUM(%%mm0, %%mm1, %0)
1849
1850 #define DCT_SAD_SSE2\
1851 "pxor %%xmm0, %%xmm0 \n\t"\
1852 "pxor %%xmm1, %%xmm1 \n\t"\
1853 DCT_SAD4(dqa, %%xmm, 0)\
1854 DCT_SAD4(dqa, %%xmm, 64)\
1855 "paddusw %%xmm1, %%xmm0 \n\t"\
1856 HSUM(%%xmm0, %%xmm1, %0)
1857
1858 #define DCT_SAD_FUNC(cpu) \
1859 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1860 int sum;\
1861 asm volatile(\
1862 DCT_SAD\
1863 :"=r"(sum)\
1864 :"r"(block)\
1865 );\
1866 return sum&0xFFFF;\
1867 }
1868
1869 #define DCT_SAD DCT_SAD_MMX
1870 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1871 #define MMABS(a,z) MMABS_MMX(a,z)
1872 DCT_SAD_FUNC(mmx)
1873 #undef MMABS
1874 #undef HSUM
1875
1876 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1877 #define MMABS(a,z) MMABS_MMX2(a,z)
1878 DCT_SAD_FUNC(mmx2)
1879 #undef HSUM
1880 #undef DCT_SAD
1881
1882 #define DCT_SAD DCT_SAD_SSE2
1883 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1884 DCT_SAD_FUNC(sse2)
1885 #undef MMABS
1886
1887 #ifdef HAVE_SSSE3
1888 #define MMABS(a,z) MMABS_SSSE3(a,z)
1889 DCT_SAD_FUNC(ssse3)
1890 #undef MMABS
1891 #endif
1892 #undef HSUM
1893 #undef DCT_SAD
1894
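The DCT_SAD machinery above is, in scalar terms, a one-line loop; the SIMD versions accumulate with saturating paddusw, which is why the functions mask the result with sum&0xFFFF. A scalar reference:

    /* Scalar reference for sum_abs_dctelem_*: sum of absolute values
     * of the 64 coefficients of one 8x8 block. */
    static int sum_abs_dctelem_ref(DCTELEM *block)
    {
        int i, sum = 0;
        for (i = 0; i < 64; i++)
            sum += block[i] < 0 ? -block[i] : block[i];
        return sum;
    }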
5255 | 1895 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
4749 | 1896 int sum;
1897 long i=size;
1898 asm volatile(
1899 "pxor %%mm4, %%mm4 \n"
1900 "1: \n"
1901 "sub $8, %0 \n"
1902 "movq (%2,%0), %%mm2 \n"
1903 "movq (%3,%0,2), %%mm0 \n"
1904 "movq 8(%3,%0,2), %%mm1 \n"
1905 "punpckhbw %%mm2, %%mm3 \n"
1906 "punpcklbw %%mm2, %%mm2 \n"
1907 "psraw $8, %%mm3 \n"
1908 "psraw $8, %%mm2 \n"
1909 "psubw %%mm3, %%mm1 \n"
1910 "psubw %%mm2, %%mm0 \n"
1911 "pmaddwd %%mm1, %%mm1 \n"
1912 "pmaddwd %%mm0, %%mm0 \n"
1913 "paddd %%mm1, %%mm4 \n"
1914 "paddd %%mm0, %%mm4 \n"
1915 "jg 1b \n"
1916 "movq %%mm4, %%mm3 \n"
1917 "psrlq $32, %%mm3 \n"
1918 "paddd %%mm3, %%mm4 \n"
1919 "movd %%mm4, %1 \n"
1920 :"+r"(i), "=r"(sum)
1921 :"r"(pix1), "r"(pix2)
1922 );
1923 return sum;
1924 }
1925
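ssd_int8_vs_int16_mmx walks both arrays backwards eight samples per iteration, sign-extends the int8 side with punpck+psraw, and squares and accumulates the differences with pmaddwd/paddd. Its scalar equivalent, for reference:

    #include <stdint.h>

    /* Scalar reference: sum of squared differences between an int8_t
     * and an int16_t array of the same length. */
    static int ssd_int8_vs_int16_ref(const int8_t *pix1,
                                     const int16_t *pix2, int size)
    {
        int i, sum = 0;
        for (i = 0; i < size; i++) {
            int d = pix2[i] - pix1[i];  /* int8 operand sign-extended */
            sum += d * d;
        }
        return sum;
    }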
1530 | 1926 #endif //CONFIG_ENCODERS
866 | 1927
954 | 1928 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2979 | 1929 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
1930 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
1931 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
1932 "movq "#in7", " #m3 " \n\t" /* d */\
1933 "movq "#in0", %%mm5 \n\t" /* D */\
1934 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
1935 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
1936 "movq "#in1", %%mm5 \n\t" /* C */\
1937 "movq "#in2", %%mm6 \n\t" /* B */\
1938 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
1939 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
1940 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
1941 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
1942 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
1943 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
1944 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
1945 "psraw $5, %%mm5 \n\t"\
1946 "packuswb %%mm5, %%mm5 \n\t"\
954 | 1947 OP(%%mm5, out, %%mm7, d)
1948
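Per the comments in the asm, QPEL_V_LOW evaluates one step of the MPEG-4 half-sample lowpass filter: with x1..x4 the symmetric pair sums of the eight surrounding samples (x1 innermost, x4 outermost), each output pixel is (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, clamped to a byte by packuswb. A hedged scalar sketch (names hypothetical, not part of this file):

    #include <stdint.h>

    static uint8_t qpel_lowpass_ref(int x1, int x2, int x3, int x4, int rnd)
    {
        int v = (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5;
        return v < 0 ? 0 : v > 255 ? 255 : v;   /* packuswb clamp */
    }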
959 | 1949 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
1057 | 1950 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
954 | 1951 uint64_t temp;\
1952 \
1953 asm volatile(\
2979 | 1954 "pxor %%mm7, %%mm7 \n\t"\
1955 "1: \n\t"\
1956 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1957 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1958 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1959 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1960 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1961 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1962 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1963 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1964 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1965 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1966 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1967 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1968 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1969 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1970 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1971 "paddw %%mm3, %%mm5 \n\t" /* b */\
1972 "paddw %%mm2, %%mm6 \n\t" /* c */\
1973 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1974 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1975 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1976 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1977 "paddw %%mm4, %%mm0 \n\t" /* a */\
1978 "paddw %%mm1, %%mm5 \n\t" /* d */\
1979 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1980 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1981 "paddw %6, %%mm6 \n\t"\
1982 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1983 "psraw $5, %%mm0 \n\t"\
1984 "movq %%mm0, %5 \n\t"\
954 | 1985 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1986 \
2979 | 1987 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1988 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1989 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1990 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1991 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1992 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1993 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1994 "paddw %%mm0, %%mm2 \n\t" /* b */\
1995 "paddw %%mm5, %%mm3 \n\t" /* c */\
1996 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1997 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1998 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1999 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
2000 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
2001 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
2002 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2003 "paddw %%mm2, %%mm1 \n\t" /* a */\
2004 "paddw %%mm6, %%mm4 \n\t" /* d */\
2005 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2006 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
2007 "paddw %6, %%mm1 \n\t"\
2008 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
2009 "psraw $5, %%mm3 \n\t"\
2010 "movq %5, %%mm1 \n\t"\
2011 "packuswb %%mm3, %%mm1 \n\t"\
959 | 2012 OP_MMX2(%%mm1, (%1),%%mm4, q)\
954 | 2013 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
2014 \
2979 | 2015 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
2016 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
2017 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
2018 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
2019 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
2020 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
2021 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
2022 "paddw %%mm1, %%mm5 \n\t" /* b */\
2023 "paddw %%mm4, %%mm0 \n\t" /* c */\
2024 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2025 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
2026 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
2027 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
2028 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
2029 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
2030 "paddw %%mm3, %%mm2 \n\t" /* d */\
2031 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
2032 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
2033 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
2034 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
2035 "paddw %%mm2, %%mm6 \n\t" /* a */\
2036 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
2037 "paddw %6, %%mm0 \n\t"\
2038 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2039 "psraw $5, %%mm0 \n\t"\
954 | 2040 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
2041 \
2979 | 2042 "paddw %%mm5, %%mm3 \n\t" /* a */\
2043 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
2044 "paddw %%mm4, %%mm6 \n\t" /* b */\
2045 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
2046 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
2047 "paddw %%mm1, %%mm4 \n\t" /* c */\
2048 "paddw %%mm2, %%mm5 \n\t" /* d */\
2049 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
2050 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
2051 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
2052 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
2053 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
2054 "paddw %6, %%mm4 \n\t"\
2055 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
2056 "psraw $5, %%mm4 \n\t"\
2057 "packuswb %%mm4, %%mm0 \n\t"\
959 | 2058 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
954 | 2059 \
2979 | 2060 "add %3, %0 \n\t"\
2061 "add %4, %1 \n\t"\
2062 "decl %2 \n\t"\
2063 " jnz 1b \n\t"\
6335 | 2064 : "+a"(src), "+c"(dst), "+g"(h)\
2293 | 2065 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 | 2066 : "memory"\
954 | 2067 );\
2068 }\
2069 \
2070 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2071 int i;\
2072 int16_t temp[16];\
2073 /* quick HACK, XXX FIXME MUST be optimized */\
2074 for(i=0; i<h; i++)\
2075 {\
2076 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2077 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2078 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2079 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2080 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2081 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
2082 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
2083 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
2084 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
2085 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
2086 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
2087 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
2088 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
2089 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
2090 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
2091 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
2092 asm volatile(\
2979 | 2093 "movq (%0), %%mm0 \n\t"\
2094 "movq 8(%0), %%mm1 \n\t"\
2095 "paddw %2, %%mm0 \n\t"\
2096 "paddw %2, %%mm1 \n\t"\
2097 "psraw $5, %%mm0 \n\t"\
2098 "psraw $5, %%mm1 \n\t"\
2099 "packuswb %%mm1, %%mm0 \n\t"\
959 | 2100 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2979 | 2101 "movq 16(%0), %%mm0 \n\t"\
2102 "movq 24(%0), %%mm1 \n\t"\
2103 "paddw %2, %%mm0 \n\t"\
2104 "paddw %2, %%mm1 \n\t"\
2105 "psraw $5, %%mm0 \n\t"\
2106 "psraw $5, %%mm1 \n\t"\
2107 "packuswb %%mm1, %%mm0 \n\t"\
959 | 2108 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
954 | 2109 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966 | 2110 : "memory"\
954 | 2111 );\
2112 dst+=dstStride;\
2113 src+=srcStride;\
2114 }\
2115 }\
2116 \
1057 | 2117 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
959 | 2118 uint64_t temp;\
2119 \
2120 asm volatile(\
2979 | 2121 "pxor %%mm7, %%mm7 \n\t"\
2122 "1: \n\t"\
2123 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
2124 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
2125 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
2126 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
2127 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
2128 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
2129 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
2130 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
2131 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
2132 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
2133 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
2134 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
2135 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
2136 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
2137 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
2138 "paddw %%mm3, %%mm5 \n\t" /* b */\
2139 "paddw %%mm2, %%mm6 \n\t" /* c */\
2140 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
2141 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
2142 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
2143 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
2144 "paddw %%mm4, %%mm0 \n\t" /* a */\
2145 "paddw %%mm1, %%mm5 \n\t" /* d */\
2146 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
2147 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
2148 "paddw %6, %%mm6 \n\t"\
2149 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
2150 "psraw $5, %%mm0 \n\t"\
959 | 2151 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
2152 \
2979 | 2153 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
2154 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
2155 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
2156 "paddw %%mm5, %%mm1 \n\t" /* a */\
2157 "paddw %%mm6, %%mm2 \n\t" /* b */\
2158 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
2159 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
2160 "paddw %%mm6, %%mm3 \n\t" /* c */\
2161 "paddw %%mm5, %%mm4 \n\t" /* d */\
2162 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
2163 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
2164 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
2165 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
2166 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
2167 "paddw %6, %%mm1 \n\t"\
2168 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
2169 "psraw $5, %%mm3 \n\t"\
2170 "packuswb %%mm3, %%mm0 \n\t"\
959 | 2171 OP_MMX2(%%mm0, (%1), %%mm4, q)\
2172 \
2979 | 2173 "add %3, %0 \n\t"\
2174 "add %4, %1 \n\t"\
2175 "decl %2 \n\t"\
2176 " jnz 1b \n\t"\
6335 | 2177 : "+a"(src), "+c"(dst), "+g"(h)\
2293 | 2178 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
966 | 2179 : "memory"\
959 | 2180 );\
2181 }\
2182 \
2183 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
2184 int i;\
2185 int16_t temp[8];\
2186 /* quick HACK, XXX FIXME MUST be optimized */\
2187 for(i=0; i<h; i++)\
2188 {\
2189 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
2190 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
2191 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
2192 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
2193 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
2194 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
2195 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
2196 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
2197 asm volatile(\
2979 | 2198 "movq (%0), %%mm0 \n\t"\
2199 "movq 8(%0), %%mm1 \n\t"\
2200 "paddw %2, %%mm0 \n\t"\
2201 "paddw %2, %%mm1 \n\t"\
2202 "psraw $5, %%mm0 \n\t"\
2203 "psraw $5, %%mm1 \n\t"\
2204 "packuswb %%mm1, %%mm0 \n\t"\
959 | 2205 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
2206 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
966 | 2207 :"memory"\
959 | 2208 );\
2209 dst+=dstStride;\
2210 src+=srcStride;\
2211 }\
2212 }
2213
2214 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
2215 \
2216 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954 | 2217 uint64_t temp[17*4];\
2218 uint64_t *temp_ptr= temp;\
2219 int count= 17;\
2220 \
2221 /*FIXME unroll */\
2222 asm volatile(\
2979 | 2223 "pxor %%mm7, %%mm7 \n\t"\
2224 "1: \n\t"\
2225 "movq (%0), %%mm0 \n\t"\
2226 "movq (%0), %%mm1 \n\t"\
2227 "movq 8(%0), %%mm2 \n\t"\
2228 "movq 8(%0), %%mm3 \n\t"\
2229 "punpcklbw %%mm7, %%mm0 \n\t"\
2230 "punpckhbw %%mm7, %%mm1 \n\t"\
2231 "punpcklbw %%mm7, %%mm2 \n\t"\
2232 "punpckhbw %%mm7, %%mm3 \n\t"\
2233 "movq %%mm0, (%1) \n\t"\
2234 "movq %%mm1, 17*8(%1) \n\t"\
2235 "movq %%mm2, 2*17*8(%1) \n\t"\
2236 "movq %%mm3, 3*17*8(%1) \n\t"\
2237 "add $8, %1 \n\t"\
2238 "add %3, %0 \n\t"\
2239 "decl %2 \n\t"\
2240 " jnz 1b \n\t"\
954 | 2241 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2293 | 2242 : "r" ((long)srcStride)\
966 | 2243 : "memory"\
954 | 2244 );\
2245 \
2246 temp_ptr= temp;\
2247 count=4;\
2248 \
2249 /*FIXME reorder for speed */\
2250 asm volatile(\
2979 | 2251 /*"pxor %%mm7, %%mm7 \n\t"*/\
2252 "1: \n\t"\
2253 "movq (%0), %%mm0 \n\t"\
2254 "movq 8(%0), %%mm1 \n\t"\
2255 "movq 16(%0), %%mm2 \n\t"\
2256 "movq 24(%0), %%mm3 \n\t"\
961 | 2257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2258 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979 | 2259 "add %4, %1 \n\t"\
961 | 2260 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954 | 2261 \
961 | 2262 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979 | 2263 "add %4, %1 \n\t"\
961 | 2264 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2265 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
2979 | 2266 "add %4, %1 \n\t"\
961 | 2267 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
2268 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
2979 | 2269 "add %4, %1 \n\t"\
961 | 2270 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
2271 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
2979 | 2272 "add %4, %1 \n\t"\
961 | 2273 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
2274 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
2979 | 2275 "add %4, %1 \n\t"\
961 | 2276 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
954 | 2277 \
961 | 2278 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
2979 | 2279 "add %4, %1 \n\t" \
961 | 2280 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
2281 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
954 | 2282 \
2979 | 2283 "add $136, %0 \n\t"\
2284 "add %6, %1 \n\t"\
2285 "decl %2 \n\t"\
2286 " jnz 1b \n\t"\
958 | 2287 \
967 | 2288 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2293 | 2289 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
966 | 2290 :"memory"\
954 | 2291 );\
2292 }\
2293 \
1057 | 2294 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2209 | 2295 uint64_t temp[9*2];\
954 | 2296 uint64_t *temp_ptr= temp;\
2297 int count= 9;\
2298 \
2299 /*FIXME unroll */\
2300 asm volatile(\
2979 | 2301 "pxor %%mm7, %%mm7 \n\t"\
2302 "1: \n\t"\
2303 "movq (%0), %%mm0 \n\t"\
2304 "movq (%0), %%mm1 \n\t"\
2305 "punpcklbw %%mm7, %%mm0 \n\t"\
2306 "punpckhbw %%mm7, %%mm1 \n\t"\
2307 "movq %%mm0, (%1) \n\t"\
2308 "movq %%mm1, 9*8(%1) \n\t"\
2309 "add $8, %1 \n\t"\
2310 "add %3, %0 \n\t"\
2311 "decl %2 \n\t"\
2312 " jnz 1b \n\t"\
954 | 2313 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2293 | 2314 : "r" ((long)srcStride)\
966 | 2315 : "memory"\
954 | 2316 );\
2317 \
2318 temp_ptr= temp;\
2319 count=2;\
2320 \
2321 /*FIXME reorder for speed */\
2322 asm volatile(\
2979 | 2323 /*"pxor %%mm7, %%mm7 \n\t"*/\
2324 "1: \n\t"\
2325 "movq (%0), %%mm0 \n\t"\
2326 "movq 8(%0), %%mm1 \n\t"\
2327 "movq 16(%0), %%mm2 \n\t"\
2328 "movq 24(%0), %%mm3 \n\t"\
961 | 2329 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2330 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2979 | 2331 "add %4, %1 \n\t"\
961 | 2332 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
954 | 2333 \
961 | 2334 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2979 | 2335 "add %4, %1 \n\t"\
961 | 2336 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
954 | 2337 \
961 | 2338 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2979 | 2339 "add %4, %1 \n\t"\
961 | 2340 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2341 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
954 | 2342 \
2979 | 2343 "add $72, %0 \n\t"\
2344 "add %6, %1 \n\t"\
2345 "decl %2 \n\t"\
2346 " jnz 1b \n\t"\
954 | 2347 \
961 | 2348 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2293 | 2349 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
966 | 2350 : "memory"\
2351 );\
959 | 2352 }\
954 | 2353 \
1064 | 2354 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6321 | 2355 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
954 | 2356 }\
2357 \
1064 | 2358 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2359 uint64_t temp[8];\
954 | 2360 uint8_t * const half= (uint8_t*)temp;\
2361 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
2207 | 2362 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
954 | 2363 }\
2364 \
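The mcXY dispatchers here and below compose planes: X and Y are the quarter-pel fractional offsets, and the pixels*_l2 helpers average two planes. A hedged sketch of the qpel8_mc10 pattern just above, assuming a scalar counterpart of the horizontal lowpass (put_mpeg4_qpel8_h_lowpass_ref is hypothetical) and rounding-up averaging for the l2 step:

    #include <stdint.h>

    static void qpel8_mc10_ref(uint8_t *dst, uint8_t *src, int stride)
    {
        uint8_t half[8*8];
        int i, j;
        /* half-pel plane from the horizontal lowpass filter */
        put_mpeg4_qpel8_h_lowpass_ref(half, src, 8, stride, 8);
        /* (1/4, 0) position: average input with the half-pel plane,
         * which is what pixels8_l2 implements */
        for (i = 0; i < 8; i++)
            for (j = 0; j < 8; j++)
                dst[i*stride + j] = (src[i*stride + j] + half[i*8 + j] + 1) >> 1;
    }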
1064 | 2365 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
954 | 2366 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ |
2367 }\ | |
2368 \ | |
1064 | 2369 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2370 uint64_t temp[8];\ |
954 | 2371 uint8_t * const half= (uint8_t*)temp;\ |
2372 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2373 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ |
954 | 2374 }\ |
2375 \ | |
1064 | 2376 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2377 uint64_t temp[8];\ |
954 | 2378 uint8_t * const half= (uint8_t*)temp;\ |
959 | 2379 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2380 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ |
954 | 2381 }\ |
2382 \ | |
1064 | 2383 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
959 | 2384 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
954 | 2385 }\ |
2386 \ | |
1064 | 2387 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2388 uint64_t temp[8];\ |
954 | 2389 uint8_t * const half= (uint8_t*)temp;\ |
959 | 2390 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2391 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ |
954 | 2392 }\ |
1064 | 2393 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2394 uint64_t half[8 + 9];\ |
2395 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2396 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2397 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2398 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
959 | 2399 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2400 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
954 | 2401 }\ |
1064 | 2402 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
984 | 2403 uint64_t half[8 + 9];\ |
2404 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
2405 uint8_t * const halfHV= ((uint8_t*)half);\ | |
954 | 2406 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
2207
22b768f1261a
10000l fix and use more mmx2/3dnow code for mpeg4 qpel which has been written and commited long time ago but appearently never used, qpel motion compensation is 5% faster
michael
parents:
2067
diff
changeset
|
2407 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 | 2408 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 | 2409 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 | 2410 }\
1064 | 2411 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2412 uint64_t half[8 + 9];\
2413 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2414 uint8_t * const halfHV= ((uint8_t*)half);\
954 | 2415 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 | 2416 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
959 | 2417 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 | 2418 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 | 2419 }\
1064 | 2420 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2421 uint64_t half[8 + 9];\
2422 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2423 uint8_t * const halfHV= ((uint8_t*)half);\
2424 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 | 2425 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
959 | 2426 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 | 2427 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 | 2428 }\
1064 | 2429 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2430 uint64_t half[8 + 9];\
954 | 2431 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2432 uint8_t * const halfHV= ((uint8_t*)half);\
2433 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 | 2434 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 | 2435 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
954 | 2436 }\
1064 | 2437 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2438 uint64_t half[8 + 9];\
954 | 2439 uint8_t * const halfH= ((uint8_t*)half) + 64;\
2440 uint8_t * const halfHV= ((uint8_t*)half);\
2441 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 | 2442 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
2207 | 2443 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
954 | 2444 }\
1064 | 2445 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2446 uint64_t half[8 + 9];\
2447 uint8_t * const halfH= ((uint8_t*)half);\
954 | 2448 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 | 2449 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
984 | 2450 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 | 2451 }\
1064 | 2452 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2453 uint64_t half[8 + 9];\
2454 uint8_t * const halfH= ((uint8_t*)half);\
954 | 2455 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
2207 | 2456 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
984 | 2457 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 | 2458 }\
1064 | 2459 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2460 uint64_t half[9];\
954 | 2461 uint8_t * const halfH= ((uint8_t*)half);\
2462 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
959 | 2463 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
954 | 2464 }\
1064 | 2465 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
6321 | 2466 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
954 | 2467 }\
2468 \
1064 | 2469 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 | 2470 uint64_t temp[32];\
2471 uint8_t * const half= (uint8_t*)temp;\
2472 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207 | 2473 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 | 2474 }\
2475 \
1064 | 2476 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 | 2477 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
2478 }\
2479 \
1064 | 2480 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 | 2481 uint64_t temp[32];\
2482 uint8_t * const half= (uint8_t*)temp;\
2483 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
2207 | 2484 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
954 | 2485 }\
2486 \
1064 | 2487 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 | 2488 uint64_t temp[32];\
2489 uint8_t * const half= (uint8_t*)temp;\
959 | 2490 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207 | 2491 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
954 | 2492 }\
2493 \
1064 | 2494 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
959 | 2495 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
954 | 2496 }\
2497 \
1064 | 2498 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 | 2499 uint64_t temp[32];\
2500 uint8_t * const half= (uint8_t*)temp;\
959 | 2501 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
2207 | 2502 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
954 | 2503 }\
1064 | 2504 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2505 uint64_t half[16*2 + 17*2];\
2506 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2507 uint8_t * const halfHV= ((uint8_t*)half);\
954 | 2508 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 | 2509 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 | 2510 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 | 2511 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 | 2512 }\
1064 | 2513 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2514 uint64_t half[16*2 + 17*2];\
2515 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2516 uint8_t * const halfHV= ((uint8_t*)half);\
954 | 2517 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 | 2518 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 | 2519 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 | 2520 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 | 2521 }\
1064 | 2522 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2523 uint64_t half[16*2 + 17*2];\
2524 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2525 uint8_t * const halfHV= ((uint8_t*)half);\
954 | 2526 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 | 2527 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
959 | 2528 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 | 2529 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 | 2530 }\
1064 | 2531 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2532 uint64_t half[16*2 + 17*2];\
2533 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2534 uint8_t * const halfHV= ((uint8_t*)half);\
2535 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 | 2536 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
959 | 2537 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 | 2538 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 | 2539 }\
1064 | 2540 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 | 2541 uint64_t half[16*2 + 17*2];\
2542 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2543 uint8_t * const halfHV= ((uint8_t*)half);\
2544 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 | 2545 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 | 2546 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
954 | 2547 }\
1064 | 2548 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 | 2549 uint64_t half[16*2 + 17*2];\
2550 uint8_t * const halfH= ((uint8_t*)half) + 256;\
2551 uint8_t * const halfHV= ((uint8_t*)half);\
2552 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 | 2553 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
2207 | 2554 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
954 | 2555 }\
1064 | 2556 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2557 uint64_t half[17*2];\
2558 uint8_t * const halfH= ((uint8_t*)half);\
954 | 2559 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 | 2560 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
984 | 2561 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 | 2562 }\
1064 | 2563 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
984 | 2564 uint64_t half[17*2];\
2565 uint8_t * const halfH= ((uint8_t*)half);\
954 | 2566 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
2207 | 2567 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
984 | 2568 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 | 2569 }\
1064 | 2570 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
954 | 2571 uint64_t half[17*2];\
2572 uint8_t * const halfH= ((uint8_t*)half);\
2573 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
959 | 2574 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
954 | 2575 }
2576
2979 | 2577 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
959 | 2578 #define AVG_3DNOW_OP(a,b,temp, size) \
2979 | 2579 "mov" #size " " #b ", " #temp " \n\t"\
2580 "pavgusb " #temp ", " #a " \n\t"\
2581 "mov" #size " " #a ", " #b " \n\t"
959 | 2582 #define AVG_MMX2_OP(a,b,temp, size) \
2979 | 2583 "mov" #size " " #b ", " #temp " \n\t"\
2584 "pavgb " #temp ", " #a " \n\t"\
2585 "mov" #size " " #a ", " #b " \n\t"
959 | 2586
2587 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
2588 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
2589 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
2590 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
2591 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
2592 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
954 | 2593 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) |
959 | 2594 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) |
954 | 2595 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) |
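/* Each expansion above generates the complete set of quarter-pel motion
   compensation functions for one rounding mode and instruction set, e.g.
   put_qpel16_mc32_mmx2(). In the mcXY suffix, X is the horizontal and Y
   the vertical offset of the sampling position in quarter pixels. */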
2596
3807 | 2597 /***********************************/
2598 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
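/* These replace the exact 8-tap MPEG-4 qpel filters above with cheap
   2-tap (bilinear) approximations, trading some quality for decoding
   speed on B-frames. */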
2599
2600 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
2601 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2602 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
2603 }
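/* positions that coincide with half-pel samples map directly onto the
   existing half-pel primitives (_x2_, _y2_, _xy2_) */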
2604 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
2605 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2606 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
2607 }
2608
2609 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
2610 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
2611 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
2612 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
2613 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
2614 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
2615 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
2616 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
2617 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
2618 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
2619 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2620 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
2621 }\
2622 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
2623 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
2624 }\
2625 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
2626 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
2627 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
2628 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
2629 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
2630 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
2631 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
2632 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
2633
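/* The remaining positions are approximated by blending pixels from up to
   three nearby offsets: S0 picks the base source offset, S1 and S2 the
   two extra taps passed on to the _l3 helper. */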
2634 QPEL_2TAP(put_, 16, mmx2)
2635 QPEL_2TAP(avg_, 16, mmx2)
2636 QPEL_2TAP(put_, 8, mmx2)
2637 QPEL_2TAP(avg_, 8, mmx2)
2638 QPEL_2TAP(put_, 16, 3dnow)
2639 QPEL_2TAP(avg_, 16, 3dnow)
2640 QPEL_2TAP(put_, 8, 3dnow)
2641 QPEL_2TAP(avg_, 8, 3dnow)
2642
2643
393 | 2644 #if 0
247 | 2645 static void just_return() { return; }
393 | 2646 #endif
247 | 2647
3248 | 2648 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
2649 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
2650 const int w = 8;
2651 const int ix = ox>>(16+shift);
2652 const int iy = oy>>(16+shift);
2653 const int oxs = ox>>4;
2654 const int oys = oy>>4;
2655 const int dxxs = dxx>>4;
2656 const int dxys = dxy>>4;
2657 const int dyxs = dyx>>4;
2658 const int dyys = dyy>>4;
2659 const uint16_t r4[4] = {r,r,r,r};
2660 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
2661 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
2662 const uint64_t shift2 = 2*shift;
2663 uint8_t edge_buf[(h+1)*stride];
2664 int x, y;
2665
2666 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
2667 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
2668 const int dxh = dxy*(h-1);
2669 const int dyw = dyx*(w-1);
2670 if( // non-constant fullpel offset (3% of blocks)
6196 | 2671 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
2672 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
3248 | 2673 // uses more than 16 bits of subpel mv (only at huge resolution)
2674 || (dxx|dxy|dyx|dyy)&15 )
2675 {
2676 //FIXME could still use mmx for some of the rows
2677 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
2678 return;
2679 }
2680
3250 | 2681 src += ix + iy*stride;
3248 | 2682 if( (unsigned)ix >= width-w ||
2683 (unsigned)iy >= height-h )
2684 {
3250 | 2685 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
3248 | 2686 src = edge_buf;
2687 }
3250 | 2688
2689 asm volatile(
2690 "movd %0, %%mm6 \n\t"
2691 "pxor %%mm7, %%mm7 \n\t"
2692 "punpcklwd %%mm6, %%mm6 \n\t"
2693 "punpcklwd %%mm6, %%mm6 \n\t"
2694 :: "r"(1<<shift)
2695 );
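/* mm6 = the interpolation scale s = 1<<shift broadcast to four words;
   mm7 = 0, used below for unpacking bytes to words */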
3248 | 2696
2697 for(x=0; x<w; x+=4){
2698 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
2699 oxs - dxys + dxxs*(x+1),
2700 oxs - dxys + dxxs*(x+2),
2701 oxs - dxys + dxxs*(x+3) };
2702 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
2703 oys - dyys + dyxs*(x+1),
2704 oys - dyys + dyxs*(x+2),
2705 oys - dyys + dyxs*(x+3) };
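/* the positions start one row "early" (dxys/dyys subtracted) because the
   inner loop adds the per-row deltas before each use */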
2706
2707 for(y=0; y<h; y++){
2708 asm volatile(
2709 "movq %0, %%mm4 \n\t"
2710 "movq %1, %%mm5 \n\t"
2711 "paddw %2, %%mm4 \n\t"
2712 "paddw %3, %%mm5 \n\t"
2713 "movq %%mm4, %0 \n\t"
2714 "movq %%mm5, %1 \n\t"
2715 "psrlw $12, %%mm4 \n\t"
2716 "psrlw $12, %%mm5 \n\t"
2717 : "+m"(*dx4), "+m"(*dy4)
2718 : "m"(*dxy4), "m"(*dyy4)
2719 );
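/* mm4/mm5 now hold dx and dy, the subpel weights for these four pixels;
   the stepped positions were already written back for the next row */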
2720
2721 asm volatile(
2722 "movq %%mm6, %%mm2 \n\t"
2723 "movq %%mm6, %%mm1 \n\t"
2724 "psubw %%mm4, %%mm2 \n\t"
2725 "psubw %%mm5, %%mm1 \n\t"
2726 "movq %%mm2, %%mm0 \n\t"
2727 "movq %%mm4, %%mm3 \n\t"
2728 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
2729 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
2730 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
2731 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
2732
2733 "movd %4, %%mm5 \n\t"
2734 "movd %3, %%mm4 \n\t"
2735 "punpcklbw %%mm7, %%mm5 \n\t"
2736 "punpcklbw %%mm7, %%mm4 \n\t"
2737 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
2738 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
2739
2740 "movd %2, %%mm5 \n\t"
2741 "movd %1, %%mm4 \n\t"
2742 "punpcklbw %%mm7, %%mm5 \n\t"
2743 "punpcklbw %%mm7, %%mm4 \n\t"
2744 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
2745 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
3250 | 2746 "paddw %5, %%mm1 \n\t"
3248 | 2747 "paddw %%mm3, %%mm2 \n\t"
2748 "paddw %%mm1, %%mm0 \n\t"
2749 "paddw %%mm2, %%mm0 \n\t"
2750
2751 "psrlw %6, %%mm0 \n\t"
2752 "packuswb %%mm0, %%mm0 \n\t"
2753 "movd %%mm0, %0 \n\t"
2754
2755 : "=m"(dst[x+y*stride])
2756 : "m"(src[0]), "m"(src[1]),
2757 "m"(src[stride]), "m"(src[stride+1]),
2758 "m"(*r4), "m"(shift2)
2759 );
2760 src += stride;
2761 }
2762 src += 4-h*stride;
2763 }
2764 }
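/* Per pixel, the loops above compute the standard bilinear blend
   ((s-dx)*(s-dy)*src[0,0] + dx*(s-dy)*src[1,0] +
    (s-dx)*dy*src[0,1] + dx*dy*src[1,1] + r) >> (2*shift),
   i.e. the same result as the ff_gmc_c() fallback, four pixels at a time. */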
2765
3777 | 2766 #ifdef CONFIG_ENCODERS
5024 | 2767
2768 #define PHADDD(a, t)\
2769 "movq "#a", "#t" \n\t"\
2770 "psrlq $32, "#a" \n\t"\
2771 "paddd "#t", "#a" \n\t"
2772 /*
2773 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
2774 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
2775 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
2776 */
2777 #define PMULHRW(x, y, s, o)\
2778 "pmulhw " #s ", "#x " \n\t"\
2779 "pmulhw " #s ", "#y " \n\t"\
2780 "paddw " #o ", "#x " \n\t"\
2781 "paddw " #o ", "#y " \n\t"\
2782 "psraw $1, "#x " \n\t"\
2783 "psraw $1, "#y " \n\t"
2784 #define DEF(x) x ## _mmx
2785 #define SET_RND MOVQ_WONE
2786 #define SCALE_OFFSET 1
2787
2788 #include "dsputil_mmx_qns.h"
2789
2790 #undef DEF
2791 #undef SET_RND
2792 #undef SCALE_OFFSET
2793 #undef PMULHRW
2794
2795 #define DEF(x) x ## _3dnow
2796 #define SET_RND(x)
2797 #define SCALE_OFFSET 0
2798 #define PMULHRW(x, y, s, o)\
2799 "pmulhrw " #s ", "#x " \n\t"\
2800 "pmulhrw " #s ", "#y " \n\t"
2801
2802 #include "dsputil_mmx_qns.h"
2803
2804 #undef DEF
2805 #undef SET_RND
2806 #undef SCALE_OFFSET
2807 #undef PMULHRW
2808
2809 #ifdef HAVE_SSSE3
2810 #undef PHADDD
2811 #define DEF(x) x ## _ssse3
2812 #define SET_RND(x)
2813 #define SCALE_OFFSET -1
2814 #define PHADDD(a, t)\
2815 "pshufw $0x0E, "#a", "#t" \n\t"\
2816 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
2817 #define PMULHRW(x, y, s, o)\
2818 "pmulhrsw " #s ", "#x " \n\t"\
2819 "pmulhrsw " #s ", "#y " \n\t"
2820
2821 #include "dsputil_mmx_qns.h"
2822
2823 #undef DEF
2824 #undef SET_RND
2825 #undef SCALE_OFFSET
2826 #undef PMULHRW
2827 #undef PHADDD
2828 #endif //HAVE_SSSE3
2829
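/* dsputil_mmx_qns.h acts as a template: it is included three times with
   different DEF/SET_RND/SCALE_OFFSET/PMULHRW/PHADDD bindings to produce
   the _mmx, _3dnow and _ssse3 flavors of the QNS functions. */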
3777 | 2830 #endif /* CONFIG_ENCODERS */
2754 | 2831
3215 | 2832 #define PREFETCH(name, op) \
4172 | 2833 static void name(void *mem, int stride, int h){\
3215 | 2834 const uint8_t *p= mem;\
2835 do{\
2836 asm volatile(#op" %0" :: "m"(*p));\
2837 p+= stride;\
2838 }while(--h);\
2839 }
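/* one prefetch per row pulls the block's cache lines in ahead of the
   motion compensation code that will read them; prefetcht0 is the SSE
   hint, prefetch its 3DNow! counterpart */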
2840 PREFETCH(prefetch_mmx2, prefetcht0)
2841 PREFETCH(prefetch_3dnow, prefetch)
2842 #undef PREFETCH
2843
2754 | 2844 #include "h264dsp_mmx.c"
2967 | 2845
6009 | 2846 /* CAVS specific */
3524 | 2847 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
2848
2849 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2850 put_pixels8_mmx(dst, src, stride, 8);
2851 }
2852 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2853 avg_pixels8_mmx(dst, src, stride, 8);
2854 }
2855 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2856 put_pixels16_mmx(dst, src, stride, 16);
2857 }
2858 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
2859 avg_pixels16_mmx(dst, src, stride, 16);
2860 }
2861
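/* mc00 is the full-pel position, so the CAVS entry points above can
   simply reuse the generic MMX copy/average routines */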
6030 | 2862 /* FLAC specific */
2863 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
2864 double *autoc);
2865
5948 | 2866 /* VC1 specific */
2867 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
2868
2869 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2870 put_pixels8_mmx(dst, src, stride, 8);
2871 }
2872
1092 | 2873 /* external functions, from idct_mmx.c */
2874 void ff_mmx_idct(DCTELEM *block);
2875 void ff_mmxext_idct(DCTELEM *block);
2876
2877 /* XXX: these functions should be removed as soon as all IDCTs are
2878 converted */
4020 | 2879 #ifdef CONFIG_GPL
1092 | 2880 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2881 {
2882 ff_mmx_idct (block);
2883 put_pixels_clamped_mmx(block, dest, line_size);
2884 }
2885 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2886 {
2887 ff_mmx_idct (block);
2888 add_pixels_clamped_mmx(block, dest, line_size);
2889 }
2890 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2891 {
2892 ff_mmxext_idct (block);
2893 put_pixels_clamped_mmx(block, dest, line_size);
2894 }
2895 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2896 {
2897 ff_mmxext_idct (block);
2898 add_pixels_clamped_mmx(block, dest, line_size);
2899 }
4020 | 2900 #endif
2868 | 2901 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2902 {
2903 ff_idct_xvid_mmx (block);
2904 put_pixels_clamped_mmx(block, dest, line_size);
2905 }
2906 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2907 {
2908 ff_idct_xvid_mmx (block);
2909 add_pixels_clamped_mmx(block, dest, line_size);
2910 }
2911 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2912 {
2913 ff_idct_xvid_mmx2 (block);
2914 put_pixels_clamped_mmx(block, dest, line_size);
2915 }
2916 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2917 {
2918 ff_idct_xvid_mmx2 (block);
2919 add_pixels_clamped_mmx(block, dest, line_size);
2920 }
2967 | 2921
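/* the wrappers above pair a raw IDCT with the clamped put/add stores so
   the combination can be plugged into the DSPContext idct_put/idct_add
   slots */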
3541 | 2922 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
3536 | 2923 {
2924 int i;
2925 asm volatile("pxor %%mm7, %%mm7":);
2926 for(i=0; i<blocksize; i+=2) {
2927 asm volatile(
2928 "movq %0, %%mm0 \n\t"
2929 "movq %1, %%mm1 \n\t"
2930 "movq %%mm0, %%mm2 \n\t"
2931 "movq %%mm1, %%mm3 \n\t"
2932 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2933 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2934 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2935 "pxor %%mm2, %%mm1 \n\t"
2936 "movq %%mm3, %%mm4 \n\t"
2937 "pand %%mm1, %%mm3 \n\t"
2938 "pandn %%mm1, %%mm4 \n\t"
2939 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2940 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2941 "movq %%mm3, %1 \n\t"
2942 "movq %%mm0, %0 \n\t"
2943 :"+m"(mag[i]), "+m"(ang[i])
2944 ::"memory"
2945 );
2946 }
3561 | 2947 asm volatile("femms"); |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2948 } |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2949 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2950 { |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2951 int i; |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2952 |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2953 asm volatile( |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2954 "movaps %0, %%xmm5 \n\t" |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2955 ::"m"(ff_pdw_80000000[0]) |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2956 ); |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2957 for(i=0; i<blocksize; i+=4) { |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2958 asm volatile( |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2959 "movaps %0, %%xmm0 \n\t" |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2960 "movaps %1, %%xmm1 \n\t" |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2961 "xorps %%xmm2, %%xmm2 \n\t" |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2962 "xorps %%xmm3, %%xmm3 \n\t" |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2963 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2964 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2965 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2966 "xorps %%xmm2, %%xmm1 \n\t" |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2967 "movaps %%xmm3, %%xmm4 \n\t" |
3557
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2968 "andps %%xmm1, %%xmm3 \n\t" |
8e13ec0f8aa3
change vorbis_inverse_coupling_sse2() so it works on sse1 cpus
michael
parents:
3541
diff
changeset
|
2969 "andnps %%xmm1, %%xmm4 \n\t" |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2970 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2971 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2972 "movaps %%xmm3, %1 \n\t" |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2973 "movaps %%xmm0, %0 \n\t" |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2974 :"+m"(mag[i]), "+m"(ang[i]) |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2975 ::"memory" |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2976 ); |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2977 } |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2978 } |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3524
diff
changeset
|
2979 |
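/* Reference-only scalar version of the branchy computation the two coupling
 * routines above implement branchlessly with sign masks; it mirrors the C
 * fallback the Vorbis decoder otherwise uses, and is kept out of the build
 * here purely as documentation of the asm's intent. */
#if 0
static void vorbis_inverse_coupling_ref(float *mag, float *ang, int blocksize)
{
    int i;
    for (i = 0; i < blocksize; i++) {
        if (mag[i] > 0.0) {
            if (ang[i] > 0.0) {
                ang[i] = mag[i] - ang[i];
            } else {
                float t = ang[i];
                ang[i]  = mag[i];
                mag[i] += t;
            }
        } else {
            if (ang[i] > 0.0) {
                ang[i] += mag[i];
            } else {
                float t = ang[i];
                ang[i]  = mag[i];
                mag[i] -= t;
            }
        }
    }
}
#endif
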
static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;
    asm volatile(
        "1: \n\t"
        "movq    (%1,%0), %%mm0 \n\t"
        "movq   8(%1,%0), %%mm1 \n\t"
        "pfmul   (%2,%0), %%mm0 \n\t"
        "pfmul  8(%2,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub  $16, %0 \n\t"
        "jge  1b \n\t"
        "femms \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;
    asm volatile(
        "1: \n\t"
        "movaps    (%1,%0), %%xmm0 \n\t"
        "movaps  16(%1,%0), %%xmm1 \n\t"
        "mulps     (%2,%0), %%xmm0 \n\t"
        "mulps   16(%2,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub  $32, %0 \n\t"
        "jge  1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}

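/* Reference-only scalar equivalent of the two SIMD loops above: an in-place
 * element-wise multiply, dst[i] *= src[i]. The asm iterates the byte offset
 * downwards from the end of the arrays to zero, which is why it starts at
 * (len-4)*4 resp. (len-8)*4. Kept out of the build; _ref is illustrative. */
#if 0
static void vector_fmul_ref(float *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] *= src[i];
}
#endif
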
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    asm volatile(
        "1: \n\t"
        "pswapd    8(%1), %%mm0 \n\t"
        "pswapd     (%1), %%mm1 \n\t"
        "pfmul   (%3,%0), %%mm0 \n\t"
        "pfmul  8(%3,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%2,%0) \n\t"
        "movq   %%mm1, 8(%2,%0) \n\t"
        "add  $16, %1 \n\t"
        "sub  $16, %0 \n\t"
        "jge  1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    asm volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    asm volatile(
        "1: \n\t"
        "movaps         16(%1), %%xmm0 \n\t"
        "movaps           (%1), %%xmm1 \n\t"
        "shufps  $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps  $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps         (%3,%0), %%xmm0 \n\t"
        "mulps       16(%3,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%2,%0) \n\t"
        "movaps  %%xmm1, 16(%2,%0) \n\t"
        "add  $32, %1 \n\t"
        "sub  $32, %0 \n\t"
        "jge  1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}

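/* Reference-only scalar equivalent: dst is src0 multiplied by src1 read
 * backwards, as used for MDCT windowing. The asm walks src1 forwards while
 * the dst/src0 offset counts down, reversing each 4-float group with
 * pswapd resp. shufps $0x1b. Kept out of the build; _ref is illustrative. */
#if 0
static void vector_fmul_reverse_ref(float *dst, const float *src0,
                                    const float *src1, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - i - 1];
}
#endif
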
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;
    if(step == 2 && src3 == 0){
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movd   %%mm0,   (%1) \n\t"
            "movd   %%mm1, 16(%1) \n\t"
            "psrlq  $32, %%mm0 \n\t"
            "psrlq  $32, %%mm1 \n\t"
            "movd   %%mm0,  8(%1) \n\t"
            "movd   %%mm1, 24(%1) \n\t"
            "sub  $32, %1 \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movq    (%2,%0), %%mm0 \n\t"
            "movq   8(%2,%0), %%mm1 \n\t"
            "pfmul   (%3,%0), %%mm0 \n\t"
            "pfmul  8(%3,%0), %%mm1 \n\t"
            "pfadd   (%4,%0), %%mm0 \n\t"
            "pfadd  8(%4,%0), %%mm1 \n\t"
            "movq   %%mm0,  (%1,%0) \n\t"
            "movq   %%mm1, 8(%1,%0) \n\t"
            "sub  $16, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");
}
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;
    if(step == 2 && src3 == 0){
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movss   %%xmm0,   (%1) \n\t"
            "movss   %%xmm1, 32(%1) \n\t"
            "movhlps %%xmm0, %%xmm2 \n\t"
            "movhlps %%xmm1, %%xmm3 \n\t"
            "movss   %%xmm2, 16(%1) \n\t"
            "movss   %%xmm3, 48(%1) \n\t"
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss   %%xmm0,  8(%1) \n\t"
            "movss   %%xmm1, 40(%1) \n\t"
            "movhlps %%xmm0, %%xmm2 \n\t"
            "movhlps %%xmm1, %%xmm3 \n\t"
            "movss   %%xmm2, 24(%1) \n\t"
            "movss   %%xmm3, 56(%1) \n\t"
            "sub  $64, %1 \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        asm volatile(
            "1: \n\t"
            "movaps   (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps    (%3,%0), %%xmm0 \n\t"
            "mulps  16(%3,%0), %%xmm1 \n\t"
            "addps    (%4,%0), %%xmm0 \n\t"
            "addps  16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0,   (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub  $32, %0 \n\t"
            "jge  1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}

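/* Reference-only scalar equivalent of the fast paths above:
 * dst[i*step] = src0[i]*src1[i] + src2[i] + src3. The SIMD versions only
 * handle src3 == 0 with step 1 or 2 and fall back to
 * ff_vector_fmul_add_add_c() otherwise. Kept out of the build. */
#if 0
static void vector_fmul_add_add_ref(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int src3, int len, int step)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i*step] = src0[i]*src1[i] + src2[i] + src3;
}
#endif
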
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id       %1, %%mm0 \n\t"
            "pf2id       %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");
}
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi    %1, %%mm0 \n\t"
            "cvtps2pi    %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq     %%mm0, %0    \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");
}

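/* Plausible scalar counterpart of the conversions above: float to int16 with
 * saturation, which the asm gets for free from packssdw. Rounding mode is
 * left aside, as the pf2id comment above notes. Kept out of the build;
 * _ref is illustrative and would need <math.h> for lrintf(). */
#if 0
static void float_to_int16_ref(int16_t *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++) {
        int v = lrintf(src[i]);
        if (v >  32767) v =  32767; /* saturate like packssdw */
        if (v < -32768) v = -32768;
        dst[i] = v;
    }
}
#endif
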
extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                         int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);

void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & MM_MMXEXT)
        av_log(avctx, AV_LOG_INFO, " mmxext");
    if (mm_flags & MM_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & MM_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & MM_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & MM_MMX) {
        const int idct_algo= avctx->idct_algo;

#ifdef CONFIG_ENCODERS
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }
#endif //CONFIG_ENCODERS
        if(avctx->lowres==0){
            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                c->idct_put= ff_simple_idct_put_mmx;
                c->idct_add= ff_simple_idct_add_mmx;
                c->idct    = ff_simple_idct_mmx;
                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
#ifdef CONFIG_GPL
            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_libmpeg2mmx2_idct_put;
                    c->idct_add= ff_libmpeg2mmx2_idct_add;
                    c->idct    = ff_mmxext_idct;
                }else{
                    c->idct_put= ff_libmpeg2mmx_idct_put;
                    c->idct_add= ff_libmpeg2mmx_idct_add;
                    c->idct    = ff_mmx_idct;
                }
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
            }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
                     idct_algo==FF_IDCT_VP3 &&
                     avctx->codec->id!=CODEC_ID_THEORA &&
                     !(avctx->flags & CODEC_FLAG_BITEXACT)){
                if(mm_flags & MM_SSE2){
                    c->idct_put= ff_vp3_idct_put_sse2;
                    c->idct_add= ff_vp3_idct_add_sse2;
                    c->idct    = ff_vp3_idct_sse2;
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
                }else{
                    ff_vp3_dsp_init_mmx();
                    c->idct_put= ff_vp3_idct_put_mmx;
                    c->idct_add= ff_vp3_idct_add_mmx;
                    c->idct    = ff_vp3_idct_mmx;
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
            }else if(idct_algo==FF_IDCT_CAVS){
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                if(mm_flags & MM_MMXEXT){
                    c->idct_put= ff_idct_xvid_mmx2_put;
                    c->idct_add= ff_idct_xvid_mmx2_add;
                    c->idct    = ff_idct_xvid_mmx2;
                }else{
                    c->idct_put= ff_idct_xvid_mmx_put;
                    c->idct_add= ff_idct_xvid_mmx_add;
                    c->idct    = ff_idct_xvid_mmx;
                }
            }
        }

#ifdef CONFIG_ENCODERS
        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
#endif //CONFIG_ENCODERS
        c->put_pixels_clamped = put_pixels_clamped_mmx;
        c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
        c->add_pixels_clamped = add_pixels_clamped_mmx;
        c->clear_blocks = clear_blocks_mmx;
#ifdef CONFIG_ENCODERS
        c->pix_sum = pix_sum16_mmx;
#endif //CONFIG_ENCODERS

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU

        SET_HPEL_FUNCS(put, 0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg, 0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put, 1, 8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
        SET_HPEL_FUNCS(avg, 1, 8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);

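        /* For instance, SET_HPEL_FUNCS(put, 0, 16, mmx) above expands to:
         *     c->put_pixels_tab[0][0] = put_pixels16_mmx;
         *     c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
         *     c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
         *     c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
         * i.e. one half-pel interpolation function per phase (none, x2, y2, xy2). */
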
        c->gmc= gmc_mmx;

        c->add_bytes= add_bytes_mmx;
#ifdef CONFIG_ENCODERS
        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

#endif //CONFIG_ENCODERS

        if (ENABLE_ANY_H263) {
            c->h263_v_loop_filter= h263_v_loop_filter_mmx;
            c->h263_h_loop_filter= h263_h_loop_filter_mmx;
        }
        c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
        c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
        c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;

        c->h264_idct_dc_add=
        c->h264_idct_add= ff_h264_idct_add_mmx;
        c->h264_idct8_dc_add=
        c->h264_idct8_add= ff_h264_idct8_add_mmx;
        if (mm_flags & MM_SSE2)
            c->h264_idct8_add= ff_h264_idct8_add_sse2;

        if (mm_flags & MM_MMXEXT) {
            c->prefetch = prefetch_mmx2;

            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

            c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

            c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
            c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

            c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;

#ifdef CONFIG_ENCODERS
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;
#endif //CONFIG_ENCODERS

            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
#ifdef CONFIG_ENCODERS
                c->vsad[0] = vsad16_mmx2;
#endif //CONFIG_ENCODERS
            }

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU

            SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
            SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);

            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);

            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
            SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);

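            /* The 16 entries of each qpel table cover the quarter-pel phases;
             * the mcXY suffix encodes the sub-pixel position and the table
             * index is x + 4*y, so e.g. SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2)
             * assigns put_h264_qpel16_mc21_mmx2 to
             * c->put_h264_qpel_pixels_tab[0][6]. */
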
            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
            c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
            c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
            c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
            c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
            c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;

            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

            if (ENABLE_CAVS_DECODER)
                ff_cavsdsp_init_mmx2(c, avctx);

            if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
                ff_vc1dsp_init_mmx(c, avctx);

#ifdef CONFIG_ENCODERS
            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
#endif //CONFIG_ENCODERS
        } else if (mm_flags & MM_3DNOW) {
            c->prefetch = prefetch_3dnow;

            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

            c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

            c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
            c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

            c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
            }
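            /* put/avg_pixels_tab[a][b]: a selects the block width (0: 16
             * pixels, 1: 8), b the half-pel position (0: aligned, 1: "x2"
             * horizontal half-pel, 2: "y2" vertical, 3: "xy2" diagonal).
             * A scalar sketch of the x2 case (put_pixels16_x2_ref is a
             * hypothetical reference; the no_rnd variants drop the +1
             * rounding term): */
#if 0
static void put_pixels16_x2_ref(uint8_t *block, const uint8_t *pixels,
                                int line_size, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            block[j] = (pixels[j] + pixels[j+1] + 1) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}
#endif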

            SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);

            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);

            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
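            /* SET_QPEL_FUNCS (defined earlier in this file) fills all 16
             * quarter-pel positions of one table in one go; schematically it
             * expands to something like: */
#if 0
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
    /* ... one assignment per mcXY position ... */ \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU;
#endif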

            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
        }

#define H264_QPEL_FUNCS(x, y, CPU)\
    c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
    c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
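        /* x and y are quarter-pel fractional offsets, each in 0..3, so
         * pixels_tab[size][x+y*4] covers all 16 interpolation positions;
         * e.g. H264_QPEL_FUNCS(2, 3, sse2) wires up the mc23 functions
         * (horizontal offset 2/4 pel, vertical offset 3/4 pel). */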
        if((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)){
            // these functions are slower than mmx on AMD, but faster on Intel
/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
            c->put_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
*/
            H264_QPEL_FUNCS(0, 0, sse2);
        }
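        /* MM_3DNOW doubles as an "is AMD" test in the condition above: CPUs
         * of that era reporting 3DNow! alongside SSE2 were AMD parts, where
         * the MMX versions stay faster. */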
        if(mm_flags & MM_SSE2){
            H264_QPEL_FUNCS(0, 1, sse2);
            H264_QPEL_FUNCS(0, 2, sse2);
            H264_QPEL_FUNCS(0, 3, sse2);
            H264_QPEL_FUNCS(1, 1, sse2);
            H264_QPEL_FUNCS(1, 2, sse2);
            H264_QPEL_FUNCS(1, 3, sse2);
            H264_QPEL_FUNCS(2, 1, sse2);
            H264_QPEL_FUNCS(2, 2, sse2);
            H264_QPEL_FUNCS(2, 3, sse2);
            H264_QPEL_FUNCS(3, 1, sse2);
            H264_QPEL_FUNCS(3, 2, sse2);
            H264_QPEL_FUNCS(3, 3, sse2);
        }
#ifdef HAVE_SSSE3
        if(mm_flags & MM_SSSE3){
            H264_QPEL_FUNCS(1, 0, ssse3);
            H264_QPEL_FUNCS(1, 1, ssse3);
            H264_QPEL_FUNCS(1, 2, ssse3);
            H264_QPEL_FUNCS(1, 3, ssse3);
            H264_QPEL_FUNCS(2, 0, ssse3);
            H264_QPEL_FUNCS(2, 1, ssse3);
            H264_QPEL_FUNCS(2, 2, ssse3);
            H264_QPEL_FUNCS(2, 3, ssse3);
            H264_QPEL_FUNCS(3, 0, ssse3);
            H264_QPEL_FUNCS(3, 1, ssse3);
            H264_QPEL_FUNCS(3, 2, ssse3);
            H264_QPEL_FUNCS(3, 3, ssse3);
        }
#endif

#ifdef CONFIG_ENCODERS
        if(mm_flags & MM_SSE2){
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (ENABLE_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }
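        /* hadamard8_diff computes an SATD-style metric: the sum of absolute
         * values of the 8x8 Hadamard transform of the difference of two
         * blocks.  A scalar sketch (hadamard8_diff_ref is a hypothetical
         * reference, not the exact C fallback in dsputil.c, which also
         * special-cases the DC term in some variants): */
#if 0
static int hadamard8_diff_ref(uint8_t *a, uint8_t *b, int stride)
{
    int t[8][8], i, j, k, sum = 0;
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            t[i][j] = a[i*stride + j] - b[i*stride + j];
    for (k = 1; k < 8; k <<= 1)                 /* butterflies along rows */
        for (i = 0; i < 8; i++)
            for (j = 0; j < 8; j++)
                if (!(j & k)) {
                    int s = t[i][j], d = t[i][j+k];
                    t[i][j] = s + d;  t[i][j+k] = s - d;
                }
    for (k = 1; k < 8; k <<= 1)                 /* butterflies along columns */
        for (j = 0; j < 8; j++)
            for (i = 0; i < 8; i++)
                if (!(i & k)) {
                    int s = t[i][j], d = t[i+k][j];
                    t[i][j] = s + d;  t[i+k][j] = s - d;
                }
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            sum += FFABS(t[i][j]);
    return sum;
}
#endif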

#ifdef HAVE_SSSE3
        if(mm_flags & MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif
#endif
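
/* Note for the Snow block below: "mm_flags & MM_SSE2 & 0" always evaluates
 * to zero, i.e. the SSE2 compose path is deliberately disabled in favour of
 * the MMX one. */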
#ifdef CONFIG_SNOW_DECODER
        if(mm_flags & MM_SSE2 & 0){
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#ifdef HAVE_7REGS
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
        }
        else{
            if(mm_flags & MM_MMXEXT){
                c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#ifdef HAVE_7REGS
                c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
            }
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
        }
#endif
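        /* The compose97i functions above undo the integer lifting steps of
         * Snow's biorthogonal 9/7 inverse DWT.  Shape of one vertical
         * lifting step (lifting_step_ref and its coefficients are
         * illustrative only, not Snow's actual ones): */
#if 0
static void lifting_step_ref(int *b0, int *b1, int *b2, int width)
{
    int i;
    /* update one band from its two neighbours: b1 += w*(b0 + b2), rounded */
    for (i = 0; i < width; i++)
        b1[i] += (b0[i] + b2[i] + 2) >> 2;
}
#endif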

        if(mm_flags & MM_3DNOW){
#ifdef CONFIG_ENCODERS
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
#endif //CONFIG_ENCODERS
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
            c->vector_fmul = vector_fmul_3dnow;
            if(!(avctx->flags & CODEC_FLAG_BITEXACT))
                c->float_to_int16 = float_to_int16_3dnow;
        }
        if(mm_flags & MM_3DNOWEXT)
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
        if(mm_flags & MM_SSE){
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
            c->vector_fmul = vector_fmul_sse;
            c->float_to_int16 = float_to_int16_sse;
            c->vector_fmul_reverse = vector_fmul_reverse_sse;
            c->vector_fmul_add_add = vector_fmul_add_add_sse;
        }
        if(mm_flags & MM_3DNOW)
            c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
    }
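    /* Scalar sketches of the float routines selected above (the _ref helpers
     * are illustrative shapes, not the exact C fallbacks in dsputil.c;
     * lrintf needs math.h): */
#if 0
static void vector_fmul_ref(float *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] *= src[i];                        /* element-wise, in place */
}

static void float_to_int16_ref(int16_t *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = av_clip(lrintf(src[i]), -32768, 32767); /* saturate to s16 */
}
#endif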

#ifdef CONFIG_ENCODERS
    dsputil_init_pix_mmx(c, avctx);
#endif //CONFIG_ENCODERS
#if 0
    // for speed testing
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}
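
/* The "#if 0" block above is a profiling trick: pointing every dsp hook at a
 * do-nothing function takes the dsp work out of timing runs, isolating the
 * callers' overhead.  Schematic shape of such a stub (the file's real
 * just_return may differ): */
#if 0
static void just_return(void) { return; }
#endif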