Mercurial > libavcodec.hg
annotate x86/dsputil_mmx.c @ 10936:b2ea6b0d17bf

Changeset: Update libx264.c to use new libx264 features
With b_keyframe instead of IDR for detecting keyframes, ffmpeg should now
support encoding with periodic intra refresh (although there is no
interface option for it yet).
Set the new timebase values for full VFR input support.
Bump configure to check for API version 83.

author:   darkshikari
date:     Tue, 19 Jan 2010 04:00:08 +0000
parents:  78c2be62260a
children: 34a65026fa06
8430 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Fabrice Bellard |
8430 | 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
5 * | |
6 * This file is part of FFmpeg. | |
7 * | |
8 * FFmpeg is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU Lesser General Public | |
10 * License as published by the Free Software Foundation; either | |
11 * version 2.1 of the License, or (at your option) any later version. | |
12 * | |
13 * FFmpeg is distributed in the hope that it will be useful, | |
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 * Lesser General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU Lesser General Public | |
19 * License along with FFmpeg; if not, write to the Free Software | |
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 * | |
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
23 */ | |
24 | |
25 #include "libavutil/x86_cpu.h" | |
26 #include "libavcodec/dsputil.h" | |
27 #include "libavcodec/mpegvideo.h" | |
28 #include "libavcodec/simple_idct.h" | |
29 #include "dsputil_mmx.h" | |
30 #include "vp3dsp_mmx.h" | |
31 #include "vp3dsp_sse2.h" | |
8817 | 32 #include "vp6dsp_mmx.h" |
8818 | 33 #include "vp6dsp_sse2.h" |
8430 | 34 #include "idct_xvid.h" |
35 | |
36 //#undef NDEBUG | |
37 //#include <assert.h> | |
38 | |
39 int mm_flags; /* multimedia extension flags */ | |
40 | |
41 /* pixel operations */ | |
42 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL; | |
43 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL; | |
44 | |
45 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) = | |
46 {0x8000000080000000ULL, 0x8000000080000000ULL}; | |
47 | |
48 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; | |
49 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; | |
50 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; | |
51 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; | |
52 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; | |
53 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; | |
54 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; | |
55 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; | |
56 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; | |
57 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; | |
58 DECLARE_ALIGNED_16(const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; |
8430 | 59 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; |
60 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL; | |
61 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; | |
62 | |
63 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; | |
64 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; | |
65 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; | |
66 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; | |
67 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; | |
68 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; | |
69 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; | |
70 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; | |
71 | |
72 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 }; | |
73 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 }; | |
74 | |
75 #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) | |
76 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) | |
77 | |
78 #define MOVQ_BFE(regd) \ | |
79 __asm__ volatile ( \ | |
80 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ | |
81 "paddb %%" #regd ", %%" #regd " \n\t" ::) | |
82 | |
83 #ifndef PIC | |
84 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) | |
85 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) | |
86 #else | |
87 // for shared libraries it is better to access the constants this way
88 // pcmpeqd -> -1 | |
89 #define MOVQ_BONE(regd) \ | |
90 __asm__ volatile ( \ | |
91 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
92 "psrlw $15, %%" #regd " \n\t" \ | |
93 "packuswb %%" #regd ", %%" #regd " \n\t" ::) | |
94 | |
95 #define MOVQ_WTWO(regd) \ | |
96 __asm__ volatile ( \ | |
97 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
98 "psrlw $15, %%" #regd " \n\t" \ | |
99 "psllw $1, %%" #regd " \n\t"::) | |
100 | |
101 #endif | |
102 | |
103 // regr is used as a temporary and holds the output result
104 // the first argument is unmodified, the second is trashed
105 // regfe is supposed to contain 0xfefefefefefefefe
106 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ | |
107 "movq " #rega ", " #regr " \n\t"\ | |
108 "pand " #regb ", " #regr " \n\t"\ | |
109 "pxor " #rega ", " #regb " \n\t"\ | |
110 "pand " #regfe "," #regb " \n\t"\ | |
111 "psrlq $1, " #regb " \n\t"\ | |
112 "paddb " #regb ", " #regr " \n\t" | |
113 | |
114 #define PAVGB_MMX(rega, regb, regr, regfe) \ | |
115 "movq " #rega ", " #regr " \n\t"\ | |
116 "por " #regb ", " #regr " \n\t"\ | |
117 "pxor " #rega ", " #regb " \n\t"\ | |
118 "pand " #regfe "," #regb " \n\t"\ | |
119 "psrlq $1, " #regb " \n\t"\ | |
120 "psubb " #regb ", " #regr " \n\t" | |
121 | |
122 // mm6 is supposed to contain 0xfefefefefefefefe | |
123 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ | |
124 "movq " #rega ", " #regr " \n\t"\ | |
125 "movq " #regc ", " #regp " \n\t"\ | |
126 "pand " #regb ", " #regr " \n\t"\ | |
127 "pand " #regd ", " #regp " \n\t"\ | |
128 "pxor " #rega ", " #regb " \n\t"\ | |
129 "pxor " #regc ", " #regd " \n\t"\ | |
130 "pand %%mm6, " #regb " \n\t"\ | |
131 "pand %%mm6, " #regd " \n\t"\ | |
132 "psrlq $1, " #regb " \n\t"\ | |
133 "psrlq $1, " #regd " \n\t"\ | |
134 "paddb " #regb ", " #regr " \n\t"\ | |
135 "paddb " #regd ", " #regp " \n\t" | |
136 | |
137 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ | |
138 "movq " #rega ", " #regr " \n\t"\ | |
139 "movq " #regc ", " #regp " \n\t"\ | |
140 "por " #regb ", " #regr " \n\t"\ | |
141 "por " #regd ", " #regp " \n\t"\ | |
142 "pxor " #rega ", " #regb " \n\t"\ | |
143 "pxor " #regc ", " #regd " \n\t"\ | |
144 "pand %%mm6, " #regb " \n\t"\ | |
145 "pand %%mm6, " #regd " \n\t"\ | |
146 "psrlq $1, " #regd " \n\t"\ | |
147 "psrlq $1, " #regb " \n\t"\ | |
148 "psubb " #regb ", " #regr " \n\t"\ | |
149 "psubb " #regd ", " #regp " \n\t" | |
150 | |
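/* Illustrative scalar sketch (not part of the original file) of the per-byte
 * averages that the PAVGB and PAVGBP macros above compute without unpacking to
 * words.  The 0xFE mask plays the role of the 0xfefefefefefefefe register
 * (regfe / mm6); function names here are only for illustration. */
static inline uint8_t avg8_no_rnd_sketch(uint8_t a, uint8_t b)
{
    return (a & b) + (((a ^ b) & 0xFE) >> 1);   /* == (a + b) >> 1     */
}
static inline uint8_t avg8_rnd_sketch(uint8_t a, uint8_t b)
{
    return (a | b) - (((a ^ b) & 0xFE) >> 1);   /* == (a + b + 1) >> 1 */
}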
151 /***********************************/ | |
152 /* MMX no rounding */ | |
153 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx | |
154 #define SET_RND MOVQ_WONE | |
155 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) | |
156 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) | |
157 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) |
8430 | 158 |
159 #include "dsputil_mmx_rnd_template.c" | |
160 | |
161 #undef DEF | |
162 #undef SET_RND | |
163 #undef PAVGBP | |
164 #undef PAVGB | |
165 /***********************************/ | |
166 /* MMX rounding */ | |
167 | |
168 #define DEF(x, y) x ## _ ## y ##_mmx | |
169 #define SET_RND MOVQ_WTWO | |
170 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) | |
171 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) | |
172 | |
173 #include "dsputil_mmx_rnd_template.c" | |
174 | |
175 #undef DEF | |
176 #undef SET_RND | |
177 #undef PAVGBP | |
178 #undef PAVGB | |
179 #undef OP_AVG |
8430 | 180 |
181 /***********************************/ | |
182 /* 3Dnow specific */ | |
183 | |
184 #define DEF(x) x ## _3dnow | |
185 #define PAVGB "pavgusb" | |
186 #define OP_AVG PAVGB |
8430 | 187 |
188 #include "dsputil_mmx_avg_template.c" | |
189 | |
190 #undef DEF | |
191 #undef PAVGB | |
192 #undef OP_AVG |
8430 | 193 |
194 /***********************************/ | |
195 /* MMX2 specific */ | |
196 | |
197 #define DEF(x) x ## _mmx2 | |
198 | |
199 /* Introduced only in MMX2 set */ | |
200 #define PAVGB "pavgb" | |
201 #define OP_AVG PAVGB |
8430 | 202 |
203 #include "dsputil_mmx_avg_template.c" | |
204 | |
205 #undef DEF | |
206 #undef PAVGB | |
207 #undef OP_AVG |
8430 | 208 |
209 #define put_no_rnd_pixels16_mmx put_pixels16_mmx | |
210 #define put_no_rnd_pixels8_mmx put_pixels8_mmx | |
211 #define put_pixels16_mmx2 put_pixels16_mmx | |
212 #define put_pixels8_mmx2 put_pixels8_mmx | |
213 #define put_pixels4_mmx2 put_pixels4_mmx | |
214 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx | |
215 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx | |
216 #define put_pixels16_3dnow put_pixels16_mmx | |
217 #define put_pixels8_3dnow put_pixels8_mmx | |
218 #define put_pixels4_3dnow put_pixels4_mmx | |
219 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx | |
220 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx | |
221 | |
222 /***********************************/ | |
223 /* standard MMX */ | |
224 | |
225 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) | |
226 { | |
227 const DCTELEM *p; | |
228 uint8_t *pix; | |
229 | |
230 /* read the pixels */ | |
231 p = block; | |
232 pix = pixels; | |
233 /* unrolled loop */ | |
234 __asm__ volatile( | |
235 "movq %3, %%mm0 \n\t" | |
236 "movq 8%3, %%mm1 \n\t" | |
237 "movq 16%3, %%mm2 \n\t" | |
238 "movq 24%3, %%mm3 \n\t" | |
239 "movq 32%3, %%mm4 \n\t" | |
240 "movq 40%3, %%mm5 \n\t" | |
241 "movq 48%3, %%mm6 \n\t" | |
242 "movq 56%3, %%mm7 \n\t" | |
243 "packuswb %%mm1, %%mm0 \n\t" | |
244 "packuswb %%mm3, %%mm2 \n\t" | |
245 "packuswb %%mm5, %%mm4 \n\t" | |
246 "packuswb %%mm7, %%mm6 \n\t" | |
247 "movq %%mm0, (%0) \n\t" | |
248 "movq %%mm2, (%0, %1) \n\t" | |
249 "movq %%mm4, (%0, %1, 2) \n\t" | |
250 "movq %%mm6, (%0, %2) \n\t" | |
251 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) | |
252 :"memory"); | |
253 pix += line_size*4; | |
254 p += 32; | |
255 | |
256 // if this were an exact copy of the code above, the compiler
257 // would generate some very strange code, therefore the block
258 // pointer is passed in a register ("r") instead of as a memory operand
259 __asm__ volatile( | |
260 "movq (%3), %%mm0 \n\t" | |
261 "movq 8(%3), %%mm1 \n\t" | |
262 "movq 16(%3), %%mm2 \n\t" | |
263 "movq 24(%3), %%mm3 \n\t" | |
264 "movq 32(%3), %%mm4 \n\t" | |
265 "movq 40(%3), %%mm5 \n\t" | |
266 "movq 48(%3), %%mm6 \n\t" | |
267 "movq 56(%3), %%mm7 \n\t" | |
268 "packuswb %%mm1, %%mm0 \n\t" | |
269 "packuswb %%mm3, %%mm2 \n\t" | |
270 "packuswb %%mm5, %%mm4 \n\t" | |
271 "packuswb %%mm7, %%mm6 \n\t" | |
272 "movq %%mm0, (%0) \n\t" | |
273 "movq %%mm2, (%0, %1) \n\t" | |
274 "movq %%mm4, (%0, %1, 2) \n\t" | |
275 "movq %%mm6, (%0, %2) \n\t" | |
276 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) | |
277 :"memory"); | |
278 } | |
279 | |
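/* Reference sketch (illustrative, not part of the original file) of what the
 * two asm blocks above do: packuswb saturates each signed 16-bit coefficient
 * to [0,255] before it is stored, eight rows of eight pixels. */
static void put_pixels_clamped_sketch(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i*8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}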
 280 DECLARE_ASM_CONST(8, uint8_t, ff_vector128[8]) =
 281   { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
 282
 283 #define put_signed_pixels_clamped_mmx_half(off) \
 284     "movq    "#off"(%2), %%mm1          \n\t"\
 285     "movq 16+"#off"(%2), %%mm2          \n\t"\
 286     "movq 32+"#off"(%2), %%mm3          \n\t"\
 287     "movq 48+"#off"(%2), %%mm4          \n\t"\
 288     "packsswb  8+"#off"(%2), %%mm1      \n\t"\
 289     "packsswb 24+"#off"(%2), %%mm2      \n\t"\
 290     "packsswb 40+"#off"(%2), %%mm3      \n\t"\
 291     "packsswb 56+"#off"(%2), %%mm4      \n\t"\
 292     "paddb %%mm0, %%mm1                 \n\t"\
 293     "paddb %%mm0, %%mm2                 \n\t"\
 294     "paddb %%mm0, %%mm3                 \n\t"\
 295     "paddb %%mm0, %%mm4                 \n\t"\
 296     "movq %%mm1, (%0)                   \n\t"\
 297     "movq %%mm2, (%0, %3)               \n\t"\
 298     "movq %%mm3, (%0, %3, 2)            \n\t"\
 299     "movq %%mm4, (%0, %1)               \n\t"
 300
 301 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 302 {
 303     x86_reg line_skip = line_size;
 304     x86_reg line_skip3;
 305
 306     __asm__ volatile (
 307             "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
 308             "lea (%3, %3, 2), %1                \n\t"
 309             put_signed_pixels_clamped_mmx_half(0)
 310             "lea (%0, %3, 4), %0                \n\t"
 311             put_signed_pixels_clamped_mmx_half(64)
 312             :"+&r" (pixels), "=&r" (line_skip3)
 313             :"r" (block), "r"(line_skip)
 314             :"memory");
 315 }
316 | |
317 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) | |
318 { | |
319 const DCTELEM *p; | |
320 uint8_t *pix; | |
321 int i; | |
322 | |
323 /* read the pixels */ | |
324 p = block; | |
325 pix = pixels; | |
326 MOVQ_ZERO(mm7); | |
327 i = 4; | |
328 do { | |
329 __asm__ volatile( | |
330 "movq (%2), %%mm0 \n\t" | |
331 "movq 8(%2), %%mm1 \n\t" | |
332 "movq 16(%2), %%mm2 \n\t" | |
333 "movq 24(%2), %%mm3 \n\t" | |
334 "movq %0, %%mm4 \n\t" | |
335 "movq %1, %%mm6 \n\t" | |
336 "movq %%mm4, %%mm5 \n\t" | |
337 "punpcklbw %%mm7, %%mm4 \n\t" | |
338 "punpckhbw %%mm7, %%mm5 \n\t" | |
339 "paddsw %%mm4, %%mm0 \n\t" | |
340 "paddsw %%mm5, %%mm1 \n\t" | |
341 "movq %%mm6, %%mm5 \n\t" | |
342 "punpcklbw %%mm7, %%mm6 \n\t" | |
343 "punpckhbw %%mm7, %%mm5 \n\t" | |
344 "paddsw %%mm6, %%mm2 \n\t" | |
345 "paddsw %%mm5, %%mm3 \n\t" | |
346 "packuswb %%mm1, %%mm0 \n\t" | |
347 "packuswb %%mm3, %%mm2 \n\t" | |
348 "movq %%mm0, %0 \n\t" | |
349 "movq %%mm2, %1 \n\t" | |
350 :"+m"(*pix), "+m"(*(pix+line_size)) | |
351 :"r"(p) | |
352 :"memory"); | |
353 pix += line_size*2; | |
354 p += 16; | |
355 } while (--i); | |
356 } | |
357 | |
358 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
359 { | |
360 __asm__ volatile( | |
361 "lea (%3, %3), %%"REG_a" \n\t" | |
362 ASMALIGN(3) | |
363 "1: \n\t" | |
364 "movd (%1), %%mm0 \n\t" | |
365 "movd (%1, %3), %%mm1 \n\t" | |
366 "movd %%mm0, (%2) \n\t" | |
367 "movd %%mm1, (%2, %3) \n\t" | |
368 "add %%"REG_a", %1 \n\t" | |
369 "add %%"REG_a", %2 \n\t" | |
370 "movd (%1), %%mm0 \n\t" | |
371 "movd (%1, %3), %%mm1 \n\t" | |
372 "movd %%mm0, (%2) \n\t" | |
373 "movd %%mm1, (%2, %3) \n\t" | |
374 "add %%"REG_a", %1 \n\t" | |
375 "add %%"REG_a", %2 \n\t" | |
376 "subl $4, %0 \n\t" | |
377 "jnz 1b \n\t" | |
378 : "+g"(h), "+r" (pixels), "+r" (block) | |
379 : "r"((x86_reg)line_size) | |
380 : "%"REG_a, "memory" | |
381 ); | |
382 } | |
383 | |
384 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
385 { | |
386 __asm__ volatile( | |
387 "lea (%3, %3), %%"REG_a" \n\t" | |
388 ASMALIGN(3) | |
389 "1: \n\t" | |
390 "movq (%1), %%mm0 \n\t" | |
391 "movq (%1, %3), %%mm1 \n\t" | |
392 "movq %%mm0, (%2) \n\t" | |
393 "movq %%mm1, (%2, %3) \n\t" | |
394 "add %%"REG_a", %1 \n\t" | |
395 "add %%"REG_a", %2 \n\t" | |
396 "movq (%1), %%mm0 \n\t" | |
397 "movq (%1, %3), %%mm1 \n\t" | |
398 "movq %%mm0, (%2) \n\t" | |
399 "movq %%mm1, (%2, %3) \n\t" | |
400 "add %%"REG_a", %1 \n\t" | |
401 "add %%"REG_a", %2 \n\t" | |
402 "subl $4, %0 \n\t" | |
403 "jnz 1b \n\t" | |
404 : "+g"(h), "+r" (pixels), "+r" (block) | |
405 : "r"((x86_reg)line_size) | |
406 : "%"REG_a, "memory" | |
407 ); | |
408 } | |
409 | |
410 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
411 { | |
412 __asm__ volatile( | |
413 "lea (%3, %3), %%"REG_a" \n\t" | |
414 ASMALIGN(3) | |
415 "1: \n\t" | |
416 "movq (%1), %%mm0 \n\t" | |
417 "movq 8(%1), %%mm4 \n\t" | |
418 "movq (%1, %3), %%mm1 \n\t" | |
419 "movq 8(%1, %3), %%mm5 \n\t" | |
420 "movq %%mm0, (%2) \n\t" | |
421 "movq %%mm4, 8(%2) \n\t" | |
422 "movq %%mm1, (%2, %3) \n\t" | |
423 "movq %%mm5, 8(%2, %3) \n\t" | |
424 "add %%"REG_a", %1 \n\t" | |
425 "add %%"REG_a", %2 \n\t" | |
426 "movq (%1), %%mm0 \n\t" | |
427 "movq 8(%1), %%mm4 \n\t" | |
428 "movq (%1, %3), %%mm1 \n\t" | |
429 "movq 8(%1, %3), %%mm5 \n\t" | |
430 "movq %%mm0, (%2) \n\t" | |
431 "movq %%mm4, 8(%2) \n\t" | |
432 "movq %%mm1, (%2, %3) \n\t" | |
433 "movq %%mm5, 8(%2, %3) \n\t" | |
434 "add %%"REG_a", %1 \n\t" | |
435 "add %%"REG_a", %2 \n\t" | |
436 "subl $4, %0 \n\t" | |
437 "jnz 1b \n\t" | |
438 : "+g"(h), "+r" (pixels), "+r" (block) | |
439 : "r"((x86_reg)line_size) | |
440 : "%"REG_a, "memory" | |
441 ); | |
442 } | |
443 | |
444 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
445 { | |
446 __asm__ volatile( | |
447 "1: \n\t" | |
448 "movdqu (%1), %%xmm0 \n\t" | |
449 "movdqu (%1,%3), %%xmm1 \n\t" | |
450 "movdqu (%1,%3,2), %%xmm2 \n\t" | |
451 "movdqu (%1,%4), %%xmm3 \n\t" | |
452 "movdqa %%xmm0, (%2) \n\t" | |
453 "movdqa %%xmm1, (%2,%3) \n\t" | |
454 "movdqa %%xmm2, (%2,%3,2) \n\t" | |
455 "movdqa %%xmm3, (%2,%4) \n\t" | |
456 "subl $4, %0 \n\t" | |
457 "lea (%1,%3,4), %1 \n\t" | |
458 "lea (%2,%3,4), %2 \n\t" | |
459 "jnz 1b \n\t" | |
460 : "+g"(h), "+r" (pixels), "+r" (block) | |
461 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) | |
462 : "memory" | |
463 ); | |
464 } | |
465 | |
466 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
467 { | |
468 __asm__ volatile( | |
469 "1: \n\t" | |
470 "movdqu (%1), %%xmm0 \n\t" | |
471 "movdqu (%1,%3), %%xmm1 \n\t" | |
472 "movdqu (%1,%3,2), %%xmm2 \n\t" | |
473 "movdqu (%1,%4), %%xmm3 \n\t" | |
474 "pavgb (%2), %%xmm0 \n\t" | |
475 "pavgb (%2,%3), %%xmm1 \n\t" | |
476 "pavgb (%2,%3,2), %%xmm2 \n\t" | |
477 "pavgb (%2,%4), %%xmm3 \n\t" | |
478 "movdqa %%xmm0, (%2) \n\t" | |
479 "movdqa %%xmm1, (%2,%3) \n\t" | |
480 "movdqa %%xmm2, (%2,%3,2) \n\t" | |
481 "movdqa %%xmm3, (%2,%4) \n\t" | |
482 "subl $4, %0 \n\t" | |
483 "lea (%1,%3,4), %1 \n\t" | |
484 "lea (%2,%3,4), %2 \n\t" | |
485 "jnz 1b \n\t" | |
486 : "+g"(h), "+r" (pixels), "+r" (block) | |
487 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) | |
488 : "memory" | |
489 ); | |
490 } | |
491 | |
492 #define CLEAR_BLOCKS(name,n) \ | |
493 static void name(DCTELEM *blocks)\ | |
494 {\ | |
495 __asm__ volatile(\ | |
496 "pxor %%mm7, %%mm7 \n\t"\ | |
497 "mov %1, %%"REG_a" \n\t"\ | |
498 "1: \n\t"\ | |
499 "movq %%mm7, (%0, %%"REG_a") \n\t"\ | |
500 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\ | |
501 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\ | |
502 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\ | |
503 "add $32, %%"REG_a" \n\t"\ | |
504 " js 1b \n\t"\ | |
505 : : "r" (((uint8_t *)blocks)+128*n),\ | |
506 "i" (-128*n)\ | |
507 : "%"REG_a\ | |
508 );\ | |
509 } | |
510 CLEAR_BLOCKS(clear_blocks_mmx, 6) | |
511 CLEAR_BLOCKS(clear_block_mmx, 1) | |
512 | |
513 static void clear_block_sse(DCTELEM *block) | |
514 { | |
515 __asm__ volatile( | |
516 "xorps %%xmm0, %%xmm0 \n" | |
517 "movaps %%xmm0, (%0) \n" | |
518 "movaps %%xmm0, 16(%0) \n" | |
519 "movaps %%xmm0, 32(%0) \n" | |
520 "movaps %%xmm0, 48(%0) \n" | |
521 "movaps %%xmm0, 64(%0) \n" | |
522 "movaps %%xmm0, 80(%0) \n" | |
523 "movaps %%xmm0, 96(%0) \n" | |
524 "movaps %%xmm0, 112(%0) \n" | |
525 :: "r"(block) | |
526 : "memory" | |
527 ); | |
528 } | |
529 | |
9861 | 530 static void clear_blocks_sse(DCTELEM *blocks) |
531 {
532 __asm__ volatile( | |
533 "xorps %%xmm0, %%xmm0 \n" | |
534 "mov %1, %%"REG_a" \n" | |
535 "1: \n" | |
536 "movaps %%xmm0, (%0, %%"REG_a") \n" | |
537 "movaps %%xmm0, 16(%0, %%"REG_a") \n" | |
538 "movaps %%xmm0, 32(%0, %%"REG_a") \n" | |
539 "movaps %%xmm0, 48(%0, %%"REG_a") \n" | |
540 "movaps %%xmm0, 64(%0, %%"REG_a") \n" | |
541 "movaps %%xmm0, 80(%0, %%"REG_a") \n" | |
542 "movaps %%xmm0, 96(%0, %%"REG_a") \n" | |
543 "movaps %%xmm0, 112(%0, %%"REG_a") \n" | |
544 "add $128, %%"REG_a" \n" | |
545 " js 1b \n" | |
546 : : "r" (((uint8_t *)blocks)+128*6), | |
547 "i" (-128*6) | |
548 : "%"REG_a | |
549 ); | |
550 } | |
551 | |
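/* Functionally (illustrative sketch only, assuming memset is available via the
 * usual headers), the clear_block(s) routines above are just wide aligned-store
 * versions of zeroing the coefficient memory: */
static void clear_blocks_sketch(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM) * 6 * 64);   /* 6 blocks of 64 coefficients */
}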
8430 | 552 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ |
553 x86_reg i=0; | |
554 __asm__ volatile( | |
555 "jmp 2f \n\t" | |
556 "1: \n\t" | |
557 "movq (%1, %0), %%mm0 \n\t" | |
558 "movq (%2, %0), %%mm1 \n\t" | |
559 "paddb %%mm0, %%mm1 \n\t" | |
560 "movq %%mm1, (%2, %0) \n\t" | |
561 "movq 8(%1, %0), %%mm0 \n\t" | |
562 "movq 8(%2, %0), %%mm1 \n\t" | |
563 "paddb %%mm0, %%mm1 \n\t" | |
564 "movq %%mm1, 8(%2, %0) \n\t" | |
565 "add $16, %0 \n\t" | |
566 "2: \n\t" | |
567 "cmp %3, %0 \n\t" | |
568 " js 1b \n\t" | |
569 : "+r" (i) | |
570 : "r"(src), "r"(dst), "r"((x86_reg)w-15) | |
571 ); | |
572 for(; i<w; i++) | |
573 dst[i+0] += src[i+0]; | |
574 } | |
575 | |
576 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ | |
577 x86_reg i=0; | |
578 __asm__ volatile( | |
579 "jmp 2f \n\t" | |
580 "1: \n\t" | |
581 "movq (%2, %0), %%mm0 \n\t" | |
582 "movq 8(%2, %0), %%mm1 \n\t" | |
583 "paddb (%3, %0), %%mm0 \n\t" | |
584 "paddb 8(%3, %0), %%mm1 \n\t" | |
585 "movq %%mm0, (%1, %0) \n\t" | |
586 "movq %%mm1, 8(%1, %0) \n\t" | |
587 "add $16, %0 \n\t" | |
588 "2: \n\t" | |
589 "cmp %4, %0 \n\t" | |
590 " js 1b \n\t" | |
591 : "+r" (i) | |
592 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) | |
593 ); | |
594 for(; i<w; i++) | |
595 dst[i] = src1[i] + src2[i]; | |
596 } | |
597 | |
598 #if HAVE_7REGS && HAVE_TEN_OPERANDS |
10431 | 599 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) { |
8760 | 600 x86_reg w2 = -w; |
601 x86_reg x; | |
602 int l = *left & 0xff; | |
603 int tl = *left_top & 0xff; | |
604 int t; | |
605 __asm__ volatile( | |
606 "mov %7, %3 \n" | |
607 "1: \n" | |
608 "movzx (%3,%4), %2 \n" | |
609 "mov %2, %k3 \n" | |
610 "sub %b1, %b3 \n" | |
611 "add %b0, %b3 \n" | |
612 "mov %2, %1 \n" | |
613 "cmp %0, %2 \n" | |
614 "cmovg %0, %2 \n" | |
615 "cmovg %1, %0 \n" | |
616 "cmp %k3, %0 \n" | |
617 "cmovg %k3, %0 \n" | |
618 "mov %7, %3 \n" | |
619 "cmp %2, %0 \n" | |
620 "cmovl %2, %0 \n" | |
621 "add (%6,%4), %b0 \n" | |
622 "mov %b0, (%5,%4) \n" | |
623 "inc %4 \n" | |
624 "jl 1b \n" | |
625 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) | |
626 :"r"(dst+w), "r"(diff+w), "rm"(top+w) | |
627 ); | |
628 *left = l; | |
629 *left_top = tl; | |
630 } | |
631 #endif | |
632 | |
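/* Scalar sketch (illustrative, not part of this file) of the median prediction
 * implemented by the cmov version above: predict each byte as the median of
 * left, top and left+top-topleft, then add the stored difference. */
static void add_hfyu_median_prediction_sketch(uint8_t *dst, const uint8_t *top,
                                              const uint8_t *diff, int w,
                                              int *left, int *left_top)
{
    int i, l = *left, tl = *left_top;
    for (i = 0; i < w; i++) {
        int t    = top[i];
        int pred = l + t - tl;                     /* gradient predictor */
        /* clipping the gradient between left and top gives the median of 3 */
        if (pred > FFMAX(l, t)) pred = FFMAX(l, t);
        if (pred < FFMIN(l, t)) pred = FFMIN(l, t);
        tl = t;
        l  = dst[i] = (pred + diff[i]) & 0xff;
    }
    *left     = l;
    *left_top = tl;
}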
8430 | 633 #define H263_LOOP_FILTER \ |
634 "pxor %%mm7, %%mm7 \n\t"\ | |
635 "movq %0, %%mm0 \n\t"\ | |
636 "movq %0, %%mm1 \n\t"\ | |
637 "movq %3, %%mm2 \n\t"\ | |
638 "movq %3, %%mm3 \n\t"\ | |
639 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
640 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
641 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
642 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
643 "psubw %%mm2, %%mm0 \n\t"\ | |
644 "psubw %%mm3, %%mm1 \n\t"\ | |
645 "movq %1, %%mm2 \n\t"\ | |
646 "movq %1, %%mm3 \n\t"\ | |
647 "movq %2, %%mm4 \n\t"\ | |
648 "movq %2, %%mm5 \n\t"\ | |
649 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
650 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
651 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
652 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
653 "psubw %%mm2, %%mm4 \n\t"\ | |
654 "psubw %%mm3, %%mm5 \n\t"\ | |
655 "psllw $2, %%mm4 \n\t"\ | |
656 "psllw $2, %%mm5 \n\t"\ | |
657 "paddw %%mm0, %%mm4 \n\t"\ | |
658 "paddw %%mm1, %%mm5 \n\t"\ | |
659 "pxor %%mm6, %%mm6 \n\t"\ | |
660 "pcmpgtw %%mm4, %%mm6 \n\t"\ | |
661 "pcmpgtw %%mm5, %%mm7 \n\t"\ | |
662 "pxor %%mm6, %%mm4 \n\t"\ | |
663 "pxor %%mm7, %%mm5 \n\t"\ | |
664 "psubw %%mm6, %%mm4 \n\t"\ | |
665 "psubw %%mm7, %%mm5 \n\t"\ | |
666 "psrlw $3, %%mm4 \n\t"\ | |
667 "psrlw $3, %%mm5 \n\t"\ | |
668 "packuswb %%mm5, %%mm4 \n\t"\ | |
669 "packsswb %%mm7, %%mm6 \n\t"\ | |
670 "pxor %%mm7, %%mm7 \n\t"\ | |
671 "movd %4, %%mm2 \n\t"\ | |
672 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
673 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
674 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
675 "psubusb %%mm4, %%mm2 \n\t"\ | |
676 "movq %%mm2, %%mm3 \n\t"\ | |
677 "psubusb %%mm4, %%mm3 \n\t"\ | |
678 "psubb %%mm3, %%mm2 \n\t"\ | |
679 "movq %1, %%mm3 \n\t"\ | |
680 "movq %2, %%mm4 \n\t"\ | |
681 "pxor %%mm6, %%mm3 \n\t"\ | |
682 "pxor %%mm6, %%mm4 \n\t"\ | |
683 "paddusb %%mm2, %%mm3 \n\t"\ | |
684 "psubusb %%mm2, %%mm4 \n\t"\ | |
685 "pxor %%mm6, %%mm3 \n\t"\ | |
686 "pxor %%mm6, %%mm4 \n\t"\ | |
687 "paddusb %%mm2, %%mm2 \n\t"\ | |
688 "packsswb %%mm1, %%mm0 \n\t"\ | |
689 "pcmpgtb %%mm0, %%mm7 \n\t"\ | |
690 "pxor %%mm7, %%mm0 \n\t"\ | |
691 "psubb %%mm7, %%mm0 \n\t"\ | |
692 "movq %%mm0, %%mm1 \n\t"\ | |
693 "psubusb %%mm2, %%mm0 \n\t"\ | |
694 "psubb %%mm0, %%mm1 \n\t"\ | |
695 "pand %5, %%mm1 \n\t"\ | |
696 "psrlw $2, %%mm1 \n\t"\ | |
697 "pxor %%mm7, %%mm1 \n\t"\ | |
698 "psubb %%mm7, %%mm1 \n\t"\ | |
699 "movq %0, %%mm5 \n\t"\ | |
700 "movq %3, %%mm6 \n\t"\ | |
701 "psubb %%mm1, %%mm5 \n\t"\ | |
702 "paddb %%mm1, %%mm6 \n\t" | |
703 | |
704 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ | |
705 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
8430 | 706 const int strength= ff_h263_loop_filter_strength[qscale]; |
707 | |
708 __asm__ volatile( | |
709 | |
710 H263_LOOP_FILTER | |
711 | |
712 "movq %%mm3, %1 \n\t" | |
713 "movq %%mm4, %2 \n\t" | |
714 "movq %%mm5, %0 \n\t" | |
715 "movq %%mm6, %3 \n\t" | |
716 : "+m" (*(uint64_t*)(src - 2*stride)), | |
717 "+m" (*(uint64_t*)(src - 1*stride)), | |
718 "+m" (*(uint64_t*)(src + 0*stride)), | |
719 "+m" (*(uint64_t*)(src + 1*stride)) | |
720 : "g" (2*strength), "m"(ff_pb_FC) | |
721 ); | |
722 } | |
723 } | |
724 | |
725 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ | |
726 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... | |
727 "movd %4, %%mm0 \n\t" | |
728 "movd %5, %%mm1 \n\t" | |
729 "movd %6, %%mm2 \n\t" | |
730 "movd %7, %%mm3 \n\t" | |
731 "punpcklbw %%mm1, %%mm0 \n\t" | |
732 "punpcklbw %%mm3, %%mm2 \n\t" | |
733 "movq %%mm0, %%mm1 \n\t" | |
734 "punpcklwd %%mm2, %%mm0 \n\t" | |
735 "punpckhwd %%mm2, %%mm1 \n\t" | |
736 "movd %%mm0, %0 \n\t" | |
737 "punpckhdq %%mm0, %%mm0 \n\t" | |
738 "movd %%mm0, %1 \n\t" | |
739 "movd %%mm1, %2 \n\t" | |
740 "punpckhdq %%mm1, %%mm1 \n\t" | |
741 "movd %%mm1, %3 \n\t" | |
742 | |
743 : "=m" (*(uint32_t*)(dst + 0*dst_stride)), | |
744 "=m" (*(uint32_t*)(dst + 1*dst_stride)), | |
745 "=m" (*(uint32_t*)(dst + 2*dst_stride)), | |
746 "=m" (*(uint32_t*)(dst + 3*dst_stride)) | |
747 : "m" (*(uint32_t*)(src + 0*src_stride)), | |
748 "m" (*(uint32_t*)(src + 1*src_stride)), | |
749 "m" (*(uint32_t*)(src + 2*src_stride)), | |
750 "m" (*(uint32_t*)(src + 3*src_stride)) | |
751 ); | |
752 } | |
753 | |
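/* Plain C sketch (illustrative only) of the 4x4 byte transpose that the
 * punpck sequence above performs: dst[i][j] = src[j][i]. */
static inline void transpose4x4_sketch(uint8_t *dst, const uint8_t *src,
                                       int dst_stride, int src_stride)
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            dst[i*dst_stride + j] = src[j*src_stride + i];
}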
754 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ | |
755 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
8430 | 756 const int strength= ff_h263_loop_filter_strength[qscale]; |
757 DECLARE_ALIGNED(8, uint64_t, temp[4]); | |
758 uint8_t *btemp= (uint8_t*)temp; | |
759 | |
760 src -= 2; | |
761 | |
762 transpose4x4(btemp , src , 8, stride); | |
763 transpose4x4(btemp+4, src + 4*stride, 8, stride); | |
764 __asm__ volatile( | |
765 H263_LOOP_FILTER // 5 3 4 6 | |
766 | |
767 : "+m" (temp[0]), | |
768 "+m" (temp[1]), | |
769 "+m" (temp[2]), | |
770 "+m" (temp[3]) | |
771 : "g" (2*strength), "m"(ff_pb_FC) | |
772 ); | |
773 | |
774 __asm__ volatile( | |
775 "movq %%mm5, %%mm1 \n\t" | |
776 "movq %%mm4, %%mm0 \n\t" | |
777 "punpcklbw %%mm3, %%mm5 \n\t" | |
778 "punpcklbw %%mm6, %%mm4 \n\t" | |
779 "punpckhbw %%mm3, %%mm1 \n\t" | |
780 "punpckhbw %%mm6, %%mm0 \n\t" | |
781 "movq %%mm5, %%mm3 \n\t" | |
782 "movq %%mm1, %%mm6 \n\t" | |
783 "punpcklwd %%mm4, %%mm5 \n\t" | |
784 "punpcklwd %%mm0, %%mm1 \n\t" | |
785 "punpckhwd %%mm4, %%mm3 \n\t" | |
786 "punpckhwd %%mm0, %%mm6 \n\t" | |
787 "movd %%mm5, (%0) \n\t" | |
788 "punpckhdq %%mm5, %%mm5 \n\t" | |
789 "movd %%mm5, (%0,%2) \n\t" | |
790 "movd %%mm3, (%0,%2,2) \n\t" | |
791 "punpckhdq %%mm3, %%mm3 \n\t" | |
792 "movd %%mm3, (%0,%3) \n\t" | |
793 "movd %%mm1, (%1) \n\t" | |
794 "punpckhdq %%mm1, %%mm1 \n\t" | |
795 "movd %%mm1, (%1,%2) \n\t" | |
796 "movd %%mm6, (%1,%2,2) \n\t" | |
797 "punpckhdq %%mm6, %%mm6 \n\t" | |
798 "movd %%mm6, (%1,%3) \n\t" | |
799 :: "r" (src), | |
800 "r" (src + 4*stride), | |
801 "r" ((x86_reg) stride ), | |
802 "r" ((x86_reg)(3*stride)) | |
803 ); | |
804 } | |
805 } | |
806 | |
807 /* draw the edges of width 'w' of an image of size width x height;
808 this MMX version can only handle w==8 || w==16 */
809 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) | |
810 { | |
811 uint8_t *ptr, *last_line; | |
812 int i; | |
813 | |
814 last_line = buf + (height - 1) * wrap; | |
815 /* left and right */ | |
816 ptr = buf; | |
817 if(w==8) | |
818 { | |
819 __asm__ volatile( | |
820 "1: \n\t" | |
821 "movd (%0), %%mm0 \n\t" | |
822 "punpcklbw %%mm0, %%mm0 \n\t" | |
823 "punpcklwd %%mm0, %%mm0 \n\t" | |
824 "punpckldq %%mm0, %%mm0 \n\t" | |
825 "movq %%mm0, -8(%0) \n\t" | |
826 "movq -8(%0, %2), %%mm1 \n\t" | |
827 "punpckhbw %%mm1, %%mm1 \n\t" | |
828 "punpckhwd %%mm1, %%mm1 \n\t" | |
829 "punpckhdq %%mm1, %%mm1 \n\t" | |
830 "movq %%mm1, (%0, %2) \n\t" | |
831 "add %1, %0 \n\t" | |
832 "cmp %3, %0 \n\t" | |
833 " jb 1b \n\t" | |
834 : "+r" (ptr) | |
835 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) | |
836 ); | |
837 } | |
838 else | |
839 { | |
840 __asm__ volatile( | |
841 "1: \n\t" | |
842 "movd (%0), %%mm0 \n\t" | |
843 "punpcklbw %%mm0, %%mm0 \n\t" | |
844 "punpcklwd %%mm0, %%mm0 \n\t" | |
845 "punpckldq %%mm0, %%mm0 \n\t" | |
846 "movq %%mm0, -8(%0) \n\t" | |
847 "movq %%mm0, -16(%0) \n\t" | |
848 "movq -8(%0, %2), %%mm1 \n\t" | |
849 "punpckhbw %%mm1, %%mm1 \n\t" | |
850 "punpckhwd %%mm1, %%mm1 \n\t" | |
851 "punpckhdq %%mm1, %%mm1 \n\t" | |
852 "movq %%mm1, (%0, %2) \n\t" | |
853 "movq %%mm1, 8(%0, %2) \n\t" | |
854 "add %1, %0 \n\t" | |
855 "cmp %3, %0 \n\t" | |
856 " jb 1b \n\t" | |
857 : "+r" (ptr) | |
858 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) | |
859 ); | |
860 } | |
861 | |
862 for(i=0;i<w;i+=4) { | |
863 /* top and bottom (and hopefully also the corners) */ | |
864 ptr= buf - (i + 1) * wrap - w; | |
865 __asm__ volatile( | |
866 "1: \n\t" | |
867 "movq (%1, %0), %%mm0 \n\t" | |
868 "movq %%mm0, (%0) \n\t" | |
869 "movq %%mm0, (%0, %2) \n\t" | |
870 "movq %%mm0, (%0, %2, 2) \n\t" | |
871 "movq %%mm0, (%0, %3) \n\t" | |
872 "add $8, %0 \n\t" | |
873 "cmp %4, %0 \n\t" | |
874 " jb 1b \n\t" | |
875 : "+r" (ptr) | |
876 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) | |
877 ); | |
878 ptr= last_line + (i + 1) * wrap - w; | |
879 __asm__ volatile( | |
880 "1: \n\t" | |
881 "movq (%1, %0), %%mm0 \n\t" | |
882 "movq %%mm0, (%0) \n\t" | |
883 "movq %%mm0, (%0, %2) \n\t" | |
884 "movq %%mm0, (%0, %2, 2) \n\t" | |
885 "movq %%mm0, (%0, %3) \n\t" | |
886 "add $8, %0 \n\t" | |
887 "cmp %4, %0 \n\t" | |
888 " jb 1b \n\t" | |
889 : "+r" (ptr) | |
890 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) | |
891 ); | |
892 } | |
893 } | |
894 | |
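/* Illustrative scalar equivalent (not part of this file) of the edge padding
 * above: replicate the outermost pixels 'w' columns to the left/right and 'w'
 * rows above/below so motion compensation may read slightly outside the frame. */
static void draw_edges_sketch(uint8_t *buf, int wrap, int width, int height, int w)
{
    int i, j;
    uint8_t *last_line = buf + (height - 1) * wrap;
    for (i = 0; i < height; i++) {                   /* left and right columns */
        for (j = 1; j <= w; j++) {
            buf[i*wrap - j]             = buf[i*wrap];
            buf[i*wrap + width - 1 + j] = buf[i*wrap + width - 1];
        }
    }
    for (i = 1; i <= w; i++) {                       /* top and bottom rows */
        memcpy(buf       - i*wrap - w, buf       - w, width + 2*w);
        memcpy(last_line + i*wrap - w, last_line - w, width + 2*w);
    }
}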
895 #define PAETH(cpu, abs3)\ | |
896 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ | |
897 {\ | |
898 x86_reg i = -bpp;\ | |
899 x86_reg end = w-3;\ | |
900 __asm__ volatile(\ | |
901 "pxor %%mm7, %%mm7 \n"\ | |
902 "movd (%1,%0), %%mm0 \n"\ | |
903 "movd (%2,%0), %%mm1 \n"\ | |
904 "punpcklbw %%mm7, %%mm0 \n"\ | |
905 "punpcklbw %%mm7, %%mm1 \n"\ | |
906 "add %4, %0 \n"\ | |
907 "1: \n"\ | |
908 "movq %%mm1, %%mm2 \n"\ | |
909 "movd (%2,%0), %%mm1 \n"\ | |
910 "movq %%mm2, %%mm3 \n"\ | |
911 "punpcklbw %%mm7, %%mm1 \n"\ | |
912 "movq %%mm2, %%mm4 \n"\ | |
913 "psubw %%mm1, %%mm3 \n"\ | |
914 "psubw %%mm0, %%mm4 \n"\ | |
915 "movq %%mm3, %%mm5 \n"\ | |
916 "paddw %%mm4, %%mm5 \n"\ | |
917 abs3\ | |
918 "movq %%mm4, %%mm6 \n"\ | |
919 "pminsw %%mm5, %%mm6 \n"\ | |
920 "pcmpgtw %%mm6, %%mm3 \n"\ | |
921 "pcmpgtw %%mm5, %%mm4 \n"\ | |
922 "movq %%mm4, %%mm6 \n"\ | |
923 "pand %%mm3, %%mm4 \n"\ | |
924 "pandn %%mm3, %%mm6 \n"\ | |
925 "pandn %%mm0, %%mm3 \n"\ | |
926 "movd (%3,%0), %%mm0 \n"\ | |
927 "pand %%mm1, %%mm6 \n"\ | |
928 "pand %%mm4, %%mm2 \n"\ | |
929 "punpcklbw %%mm7, %%mm0 \n"\ | |
930 "movq %6, %%mm5 \n"\ | |
931 "paddw %%mm6, %%mm0 \n"\ | |
932 "paddw %%mm2, %%mm3 \n"\ | |
933 "paddw %%mm3, %%mm0 \n"\ | |
934 "pand %%mm5, %%mm0 \n"\ | |
935 "movq %%mm0, %%mm3 \n"\ | |
936 "packuswb %%mm3, %%mm3 \n"\ | |
937 "movd %%mm3, (%1,%0) \n"\ | |
938 "add %4, %0 \n"\ | |
939 "cmp %5, %0 \n"\ | |
940 "jle 1b \n"\ | |
941 :"+r"(i)\ | |
942 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ | |
943 "m"(ff_pw_255)\ | |
944 :"memory"\ | |
945 );\ | |
946 } | |
947 | |
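/* Scalar sketch (illustrative; boundary handling simplified compared to the
 * real decoder) of the PNG Paeth predictor that the PAETH() macro vectorizes:
 * pick whichever of left, top or top-left is closest to left+top-topleft,
 * then add it to the filtered byte. */
static void add_png_paeth_prediction_sketch(uint8_t *dst, const uint8_t *src,
                                            const uint8_t *top, int w, int bpp)
{
    int i;
    for (i = 0; i < w; i++) {
        int a = i >= bpp ? dst[i - bpp] : 0;         /* left     */
        int b = top[i];                              /* top      */
        int c = i >= bpp ? top[i - bpp] : 0;         /* top-left */
        int p  = a + b - c;
        int pa = FFABS(p - a), pb = FFABS(p - b), pc = FFABS(p - c);
        int pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
        dst[i] = pred + src[i];
    }
}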
948 #define ABS3_MMX2\ | |
949 "psubw %%mm5, %%mm7 \n"\ | |
950 "pmaxsw %%mm7, %%mm5 \n"\ | |
951 "pxor %%mm6, %%mm6 \n"\ | |
952 "pxor %%mm7, %%mm7 \n"\ | |
953 "psubw %%mm3, %%mm6 \n"\ | |
954 "psubw %%mm4, %%mm7 \n"\ | |
955 "pmaxsw %%mm6, %%mm3 \n"\ | |
956 "pmaxsw %%mm7, %%mm4 \n"\ | |
957 "pxor %%mm7, %%mm7 \n" | |
958 | |
959 #define ABS3_SSSE3\ | |
960 "pabsw %%mm3, %%mm3 \n"\ | |
961 "pabsw %%mm4, %%mm4 \n"\ | |
962 "pabsw %%mm5, %%mm5 \n" | |
963 | |
964 PAETH(mmx2, ABS3_MMX2) | |
8590 | 965 #if HAVE_SSSE3 |
8430 | 966 PAETH(ssse3, ABS3_SSSE3) |
967 #endif | |
968 | |
969 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ | |
970 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ | |
971 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ | |
972 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ | |
973 "movq "#in7", " #m3 " \n\t" /* d */\ | |
974 "movq "#in0", %%mm5 \n\t" /* D */\ | |
975 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ | |
976 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ | |
977 "movq "#in1", %%mm5 \n\t" /* C */\ | |
978 "movq "#in2", %%mm6 \n\t" /* B */\ | |
979 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ | |
980 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ | |
981 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ | |
982 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ | |
983 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ | |
984 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ | |
985 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ | |
986 "psraw $5, %%mm5 \n\t"\ | |
987 "packuswb %%mm5, %%mm5 \n\t"\ | |
988 OP(%%mm5, out, %%mm7, d) | |
989 | |
990 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ | |
991 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
992 uint64_t temp;\ | |
993 \ | |
994 __asm__ volatile(\ | |
995 "pxor %%mm7, %%mm7 \n\t"\ | |
996 "1: \n\t"\ | |
997 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
998 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
999 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
1000 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
1001 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
1002 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
1003 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
1004 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
1005 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
1006 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
1007 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
1008 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
1009 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
1010 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
1011 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
1012 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
1013 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
1014 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1015 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
1016 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
1017 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ | |
1018 "paddw %%mm4, %%mm0 \n\t" /* a */\ | |
1019 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
1020 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ | |
1021 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ | |
1022 "paddw %6, %%mm6 \n\t"\ | |
1023 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
1024 "psraw $5, %%mm0 \n\t"\ | |
1025 "movq %%mm0, %5 \n\t"\ | |
1026 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ | |
1027 \ | |
1028 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ | |
1029 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ | |
1030 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ | |
1031 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ | |
1032 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ | |
1033 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ | |
1034 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ | |
1035 "paddw %%mm0, %%mm2 \n\t" /* b */\ | |
1036 "paddw %%mm5, %%mm3 \n\t" /* c */\ | |
1037 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1038 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
1039 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ | |
1040 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ | |
1041 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ | |
1042 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ | |
1043 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ | |
1044 "paddw %%mm2, %%mm1 \n\t" /* a */\ | |
1045 "paddw %%mm6, %%mm4 \n\t" /* d */\ | |
1046 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ | |
1047 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ | |
1048 "paddw %6, %%mm1 \n\t"\ | |
1049 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ | |
1050 "psraw $5, %%mm3 \n\t"\ | |
1051 "movq %5, %%mm1 \n\t"\ | |
1052 "packuswb %%mm3, %%mm1 \n\t"\ | |
1053 OP_MMX2(%%mm1, (%1),%%mm4, q)\ | |
1054 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ | |
1055 \ | |
1056 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ | |
1057 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ | |
1058 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ | |
1059 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ | |
1060 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ | |
1061 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ | |
1062 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ | |
1063 "paddw %%mm1, %%mm5 \n\t" /* b */\ | |
1064 "paddw %%mm4, %%mm0 \n\t" /* c */\ | |
1065 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1066 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ | |
1067 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ | |
1068 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ | |
1069 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ | |
1070 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ | |
1071 "paddw %%mm3, %%mm2 \n\t" /* d */\ | |
1072 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ | |
1073 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ | |
1074 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ | |
1075 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ | |
1076 "paddw %%mm2, %%mm6 \n\t" /* a */\ | |
1077 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ | |
1078 "paddw %6, %%mm0 \n\t"\ | |
1079 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
1080 "psraw $5, %%mm0 \n\t"\ | |
1081 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ | |
1082 \ | |
1083 "paddw %%mm5, %%mm3 \n\t" /* a */\ | |
1084 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ | |
1085 "paddw %%mm4, %%mm6 \n\t" /* b */\ | |
1086 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ | |
1087 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ | |
1088 "paddw %%mm1, %%mm4 \n\t" /* c */\ | |
1089 "paddw %%mm2, %%mm5 \n\t" /* d */\ | |
1090 "paddw %%mm6, %%mm6 \n\t" /* 2b */\ | |
1091 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ | |
1092 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ | |
1093 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ | |
1094 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ | |
1095 "paddw %6, %%mm4 \n\t"\ | |
1096 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ | |
1097 "psraw $5, %%mm4 \n\t"\ | |
1098 "packuswb %%mm4, %%mm0 \n\t"\ | |
1099 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ | |
1100 \ | |
1101 "add %3, %0 \n\t"\ | |
1102 "add %4, %1 \n\t"\ | |
1103 "decl %2 \n\t"\ | |
1104 " jnz 1b \n\t"\ | |
1105 : "+a"(src), "+c"(dst), "+D"(h)\ | |
1106 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ | |
1107 : "memory"\ | |
1108 );\ | |
1109 }\ | |
1110 \ | |
1111 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1112 int i;\ | |
1113 int16_t temp[16];\ | |
1114 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1115 for(i=0; i<h; i++)\ | |
1116 {\ | |
1117 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1118 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1119 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1120 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1121 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1122 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ | |
1123 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ | |
1124 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ | |
1125 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ | |
1126 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ | |
1127 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ | |
1128 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ | |
1129 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ | |
1130 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ | |
1131 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ | |
1132 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ | |
1133 __asm__ volatile(\ | |
1134 "movq (%0), %%mm0 \n\t"\ | |
1135 "movq 8(%0), %%mm1 \n\t"\ | |
1136 "paddw %2, %%mm0 \n\t"\ | |
1137 "paddw %2, %%mm1 \n\t"\ | |
1138 "psraw $5, %%mm0 \n\t"\ | |
1139 "psraw $5, %%mm1 \n\t"\ | |
1140 "packuswb %%mm1, %%mm0 \n\t"\ | |
1141 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ | |
1142 "movq 16(%0), %%mm0 \n\t"\ | |
1143 "movq 24(%0), %%mm1 \n\t"\ | |
1144 "paddw %2, %%mm0 \n\t"\ | |
1145 "paddw %2, %%mm1 \n\t"\ | |
1146 "psraw $5, %%mm0 \n\t"\ | |
1147 "psraw $5, %%mm1 \n\t"\ | |
1148 "packuswb %%mm1, %%mm0 \n\t"\ | |
1149 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ | |
1150 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
1151 : "memory"\ | |
1152 );\ | |
1153 dst+=dstStride;\ | |
1154 src+=srcStride;\ | |
1155 }\ | |
1156 }\ | |
1157 \ | |
1158 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1159 __asm__ volatile(\ | |
1160 "pxor %%mm7, %%mm7 \n\t"\ | |
1161 "1: \n\t"\ | |
1162 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
1163 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
1164 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
1165 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
1166 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
1167 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
1168 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
1169 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
1170 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
1171 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
1172 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
1173 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
1174 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
1175 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
1176 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
1177 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
1178 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
1179 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1180 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
1181 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
1182 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ | |
1183 "paddw %%mm4, %%mm0 \n\t" /* a */\ | |
1184 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
1185 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ | |
1186 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ | |
1187 "paddw %5, %%mm6 \n\t"\ | |
1188 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
1189 "psraw $5, %%mm0 \n\t"\ | |
1190 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ | |
1191 \ | |
1192 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ | |
1193 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ | |
1194 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ | |
1195 "paddw %%mm5, %%mm1 \n\t" /* a */\ | |
1196 "paddw %%mm6, %%mm2 \n\t" /* b */\ | |
1197 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ | |
1198 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ | |
1199 "paddw %%mm6, %%mm3 \n\t" /* c */\ | |
1200 "paddw %%mm5, %%mm4 \n\t" /* d */\ | |
1201 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1202 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
1203 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ | |
1204 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ | |
1205 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ | |
1206 "paddw %5, %%mm1 \n\t"\ | |
1207 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ | |
1208 "psraw $5, %%mm3 \n\t"\ | |
1209 "packuswb %%mm3, %%mm0 \n\t"\ | |
1210 OP_MMX2(%%mm0, (%1), %%mm4, q)\ | |
1211 \ | |
1212 "add %3, %0 \n\t"\ | |
1213 "add %4, %1 \n\t"\ | |
1214 "decl %2 \n\t"\ | |
1215 " jnz 1b \n\t"\ | |
1216 : "+a"(src), "+c"(dst), "+d"(h)\ | |
1217 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\ | |
1218 : "memory"\ | |
1219 );\ | |
1220 }\ | |
1221 \ | |
1222 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1223 int i;\ | |
1224 int16_t temp[8];\ | |
1225 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1226 for(i=0; i<h; i++)\ | |
1227 {\ | |
1228 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1229 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1230 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1231 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1232 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1233 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ | |
1234 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ | |
1235 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ | |
1236 __asm__ volatile(\ | |
1237 "movq (%0), %%mm0 \n\t"\ | |
1238 "movq 8(%0), %%mm1 \n\t"\ | |
1239 "paddw %2, %%mm0 \n\t"\ | |
1240 "paddw %2, %%mm1 \n\t"\ | |
1241 "psraw $5, %%mm0 \n\t"\ | |
1242 "psraw $5, %%mm1 \n\t"\ | |
1243 "packuswb %%mm1, %%mm0 \n\t"\ | |
1244 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ | |
1245 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
1246 :"memory"\ | |
1247 );\ | |
1248 dst+=dstStride;\ | |
1249 src+=srcStride;\ | |
1250 }\ | |
1251 } | |
1252 | |
1253 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
1254 \ | |
1255 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1256 uint64_t temp[17*4];\ | |
1257 uint64_t *temp_ptr= temp;\ | |
1258 int count= 17;\ | |
1259 \ | |
1260 /*FIXME unroll */\ | |
1261 __asm__ volatile(\ | |
1262 "pxor %%mm7, %%mm7 \n\t"\ | |
1263 "1: \n\t"\ | |
1264 "movq (%0), %%mm0 \n\t"\ | |
1265 "movq (%0), %%mm1 \n\t"\ | |
1266 "movq 8(%0), %%mm2 \n\t"\ | |
1267 "movq 8(%0), %%mm3 \n\t"\ | |
1268 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1269 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1270 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1271 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1272 "movq %%mm0, (%1) \n\t"\ | |
1273 "movq %%mm1, 17*8(%1) \n\t"\ | |
1274 "movq %%mm2, 2*17*8(%1) \n\t"\ | |
1275 "movq %%mm3, 3*17*8(%1) \n\t"\ | |
1276 "add $8, %1 \n\t"\ | |
1277 "add %3, %0 \n\t"\ | |
1278 "decl %2 \n\t"\ | |
1279 " jnz 1b \n\t"\ | |
1280 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
1281 : "r" ((x86_reg)srcStride)\ | |
1282 : "memory"\ | |
1283 );\ | |
1284 \ | |
1285 temp_ptr= temp;\ | |
1286 count=4;\ | |
1287 \ | |
1288 /*FIXME reorder for speed */\ | |
1289 __asm__ volatile(\ | |
1290 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
1291 "1: \n\t"\ | |
1292 "movq (%0), %%mm0 \n\t"\ | |
1293 "movq 8(%0), %%mm1 \n\t"\ | |
1294 "movq 16(%0), %%mm2 \n\t"\ | |
1295 "movq 24(%0), %%mm3 \n\t"\ | |
1296 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ | |
1297 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
1298 "add %4, %1 \n\t"\ | |
1299 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ | |
1300 \ | |
1301 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ | |
1302 "add %4, %1 \n\t"\ | |
1303 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ | |
1304 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ | |
1305 "add %4, %1 \n\t"\ | |
1306 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ | |
1307 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ | |
1308 "add %4, %1 \n\t"\ | |
1309 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ | |
1310 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ | |
1311 "add %4, %1 \n\t"\ | |
1312 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ | |
1313 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ | |
1314 "add %4, %1 \n\t"\ | |
1315 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ | |
1316 \ | |
1317 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ | |
1318 "add %4, %1 \n\t" \ | |
1319 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ | |
1320 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ | |
1321 \ | |
1322 "add $136, %0 \n\t"\ | |
1323 "add %6, %1 \n\t"\ | |
1324 "decl %2 \n\t"\ | |
1325 " jnz 1b \n\t"\ | |
1326 \ | |
1327 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ | |
1328 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\ | |
1329 :"memory"\ | |
1330 );\ | |
1331 }\ | |
1332 \ | |
1333 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1334 uint64_t temp[9*2];\ | |
1335 uint64_t *temp_ptr= temp;\ | |
1336 int count= 9;\ | |
1337 \ | |
1338 /*FIXME unroll */\ | |
1339 __asm__ volatile(\ | |
1340 "pxor %%mm7, %%mm7 \n\t"\ | |
1341 "1: \n\t"\ | |
1342 "movq (%0), %%mm0 \n\t"\ | |
1343 "movq (%0), %%mm1 \n\t"\ | |
1344 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1345 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1346 "movq %%mm0, (%1) \n\t"\ | |
1347 "movq %%mm1, 9*8(%1) \n\t"\ | |
1348 "add $8, %1 \n\t"\ | |
1349 "add %3, %0 \n\t"\ | |
1350 "decl %2 \n\t"\ | |
1351 " jnz 1b \n\t"\ | |
1352 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
1353 : "r" ((x86_reg)srcStride)\ | |
1354 : "memory"\ | |
1355 );\ | |
1356 \ | |
1357 temp_ptr= temp;\ | |
1358 count=2;\ | |
1359 \ | |
1360 /*FIXME reorder for speed */\ | |
1361 __asm__ volatile(\ | |
1362 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
1363 "1: \n\t"\ | |
1364 "movq (%0), %%mm0 \n\t"\ | |
1365 "movq 8(%0), %%mm1 \n\t"\ | |
1366 "movq 16(%0), %%mm2 \n\t"\ | |
1367 "movq 24(%0), %%mm3 \n\t"\ | |
1368 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ | |
1369 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
1370 "add %4, %1 \n\t"\ | |
1371 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ | |
1372 \ | |
1373 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ | |
1374 "add %4, %1 \n\t"\ | |
1375 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ | |
1376 \ | |
1377 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ | |
1378 "add %4, %1 \n\t"\ | |
1379 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ | |
1380 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ | |
1381 \ | |
1382 "add $72, %0 \n\t"\ | |
1383 "add %6, %1 \n\t"\ | |
1384 "decl %2 \n\t"\ | |
1385 " jnz 1b \n\t"\ | |
1386 \ | |
1387 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ | |
1388 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\ | |
1389 : "memory"\ | |
1390 );\ | |
1391 }\ | |
1392 \ | |
1393 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
1394 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ | |
1395 }\ | |
1396 \ | |
1397 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1398 uint64_t temp[8];\ | |
1399 uint8_t * const half= (uint8_t*)temp;\ | |
1400 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
1401 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ | |
1402 }\ | |
1403 \ | |
1404 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1405 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ | |
1406 }\ | |
1407 \ | |
1408 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1409 uint64_t temp[8];\ | |
1410 uint8_t * const half= (uint8_t*)temp;\ | |
1411 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
1412 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ | |
1413 }\ | |
1414 \ | |
1415 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1416 uint64_t temp[8];\ | |
1417 uint8_t * const half= (uint8_t*)temp;\ | |
1418 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ | |
1419 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ | |
1420 }\ | |
1421 \ | |
1422 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1423 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
1424 }\ | |
1425 \ | |
1426 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1427 uint64_t temp[8];\ | |
1428 uint8_t * const half= (uint8_t*)temp;\ | |
1429 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ | |
1430 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ | |
1431 }\ | |
1432 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1433 uint64_t half[8 + 9];\ | |
1434 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1435 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1436 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1437 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
1438 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1439 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
1440 }\ | |
1441 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1442 uint64_t half[8 + 9];\ | |
1443 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1444 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1445 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1446 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
1447 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1448 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
1449 }\ | |
1450 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1451 uint64_t half[8 + 9];\ | |
1452 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1453 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1454 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1455 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
1456 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1457 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
1458 }\ | |
1459 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1460 uint64_t half[8 + 9];\ | |
1461 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1462 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1463 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1464 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
1465 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1466 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
1467 }\ | |
1468 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1469 uint64_t half[8 + 9];\ | |
1470 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1471 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1472 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1473 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1474 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
1475 }\ | |
1476 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1477 uint64_t half[8 + 9];\ | |
1478 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1479 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1480 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1481 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1482 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
1483 }\ | |
1484 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1485 uint64_t half[8 + 9];\ | |
1486 uint8_t * const halfH= ((uint8_t*)half);\ | |
1487 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1488 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
1489 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
1490 }\ | |
1491 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1492 uint64_t half[8 + 9];\ | |
1493 uint8_t * const halfH= ((uint8_t*)half);\ | |
1494 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1495 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
1496 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
1497 }\ | |
1498 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1499 uint64_t half[9];\ | |
1500 uint8_t * const halfH= ((uint8_t*)half);\ | |
1501 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1502 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
1503 }\ | |
1504 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
1505 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ | |
1506 }\ | |
1507 \ | |
1508 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1509 uint64_t temp[32];\ | |
1510 uint8_t * const half= (uint8_t*)temp;\ | |
1511 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
1512 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ | |
1513 }\ | |
1514 \ | |
1515 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1516 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ | |
1517 }\ | |
1518 \ | |
1519 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1520 uint64_t temp[32];\ | |
1521 uint8_t * const half= (uint8_t*)temp;\ | |
1522 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
1523 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ | |
1524 }\ | |
1525 \ | |
1526 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1527 uint64_t temp[32];\ | |
1528 uint8_t * const half= (uint8_t*)temp;\ | |
1529 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ | |
1530 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ | |
1531 }\ | |
1532 \ | |
1533 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1534 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
1535 }\ | |
1536 \ | |
1537 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1538 uint64_t temp[32];\ | |
1539 uint8_t * const half= (uint8_t*)temp;\ | |
1540 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ | |
1541 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ | |
1542 }\ | |
1543 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1544 uint64_t half[16*2 + 17*2];\ | |
1545 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1546 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1547 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1548 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
1549 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1550 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
1551 }\ | |
1552 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1553 uint64_t half[16*2 + 17*2];\ | |
1554 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1555 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1556 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1557 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
1558 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1559 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
1560 }\ | |
1561 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1562 uint64_t half[16*2 + 17*2];\ | |
1563 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1564 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1565 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1566 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
1567 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1568 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
1569 }\ | |
1570 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1571 uint64_t half[16*2 + 17*2];\ | |
1572 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1573 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1574 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1575 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
1576 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1577 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
1578 }\ | |
1579 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1580 uint64_t half[16*2 + 17*2];\ | |
1581 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1582 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1583 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1584 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1585 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
1586 }\ | |
1587 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1588 uint64_t half[16*2 + 17*2];\ | |
1589 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1590 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1591 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1592 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1593 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
1594 }\ | |
1595 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1596 uint64_t half[17*2];\ | |
1597 uint8_t * const halfH= ((uint8_t*)half);\ | |
1598 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1599 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
1600 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
1601 }\ | |
1602 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1603 uint64_t half[17*2];\ | |
1604 uint8_t * const halfH= ((uint8_t*)half);\ | |
1605 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1606 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
1607 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
1608 }\ | |
1609 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1610 uint64_t half[17*2];\ | |
1611 uint8_t * const halfH= ((uint8_t*)half);\ | |
1612 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1613 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
1614 } | |
1615 | |
1616 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" | |
1617 #define AVG_3DNOW_OP(a,b,temp, size) \ | |
1618 "mov" #size " " #b ", " #temp " \n\t"\ | |
1619 "pavgusb " #temp ", " #a " \n\t"\ | |
1620 "mov" #size " " #a ", " #b " \n\t" | |
1621 #define AVG_MMX2_OP(a,b,temp, size) \ | |
1622 "mov" #size " " #b ", " #temp " \n\t"\ | |
1623 "pavgb " #temp ", " #a " \n\t"\ | |
1624 "mov" #size " " #a ", " #b " \n\t" | |
1625 | |
1626 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) | |
1627 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) | |
1628 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) | |
1629 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) | |
1630 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) | |
1631 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) | |
1632 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) | |
1633 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) | |
1634 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) | |
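/* Naming note (reference, not generated code): the qpelN_mcXY functions select
 * the quarter-pel position, X being the horizontal and Y the vertical offset
 * in quarter-pel units (0..3). mc00 is a plain copy, mc20/mc02 are the
 * horizontal/vertical half-pel lowpass filters, and the remaining positions
 * are composed from the h/v lowpass results plus pixel averaging, as in the
 * macro bodies above. */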
1635 | |
1636 /***********************************/ | |
1637 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
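/* Sketch of the approach: half-pel positions reuse the hpel _x2/_y2/_xy2
 * averaging functions, and the remaining quarter-pel positions are
 * approximated by blending nearby full- and half-pel samples (the 2tap/_l3
 * helpers below), instead of running the full MPEG-4 quarter-pel filter. */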
1638 | |
1639 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\ | |
1640 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1641 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\ | |
1642 } | |
1643 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\ | |
1644 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1645 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\ | |
1646 } | |
1647 | |
1648 #define QPEL_2TAP(OPNAME, SIZE, MMX)\ | |
1649 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\ | |
1650 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\ | |
1651 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\ | |
1652 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\ | |
1653 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\ | |
1654 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\ | |
1655 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\ | |
1656 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\ | |
1657 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\ | |
1658 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1659 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\ | |
1660 }\ | |
1661 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1662 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\ | |
1663 }\ | |
1664 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\ | |
1665 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\ | |
1666 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\ | |
1667 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\ | |
1668 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\ | |
1669 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\ | |
1670 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\ | |
1671 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\ | |
1672 | |
1673 QPEL_2TAP(put_, 16, mmx2) | |
1674 QPEL_2TAP(avg_, 16, mmx2) | |
1675 QPEL_2TAP(put_, 8, mmx2) | |
1676 QPEL_2TAP(avg_, 8, mmx2) | |
1677 QPEL_2TAP(put_, 16, 3dnow) | |
1678 QPEL_2TAP(avg_, 16, 3dnow) | |
1679 QPEL_2TAP(put_, 8, 3dnow) | |
1680 QPEL_2TAP(avg_, 8, 3dnow) | |
1681 | |
1682 | |
1683 #if 0 | |
1684 static void just_return(void) { return; } |
8430 | 1685 #endif |
1686 | |
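/* gmc_mmx: global motion compensation with per-pixel bilinear blending.
 * Reference for the inner loop below (sketch), with s = 1<<shift, dx/dy the
 * fractional offsets kept in dx4/dy4, and r the rounding constant:
 *   dst[x + y*stride] = ( src[0]        * (s-dx)*(s-dy)
 *                       + src[1]        *    dx *(s-dy)
 *                       + src[stride]   * (s-dx)*  dy
 *                       + src[stride+1] *    dx *  dy  + r ) >> (2*shift);
 */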
1687 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, | |
1688 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){ | |
1689 const int w = 8; | |
1690 const int ix = ox>>(16+shift); | |
1691 const int iy = oy>>(16+shift); | |
1692 const int oxs = ox>>4; | |
1693 const int oys = oy>>4; | |
1694 const int dxxs = dxx>>4; | |
1695 const int dxys = dxy>>4; | |
1696 const int dyxs = dyx>>4; | |
1697 const int dyys = dyy>>4; | |
1698 const uint16_t r4[4] = {r,r,r,r}; | |
1699 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; | |
1700 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; | |
1701 const uint64_t shift2 = 2*shift; | |
1702 uint8_t edge_buf[(h+1)*stride]; | |
1703 int x, y; | |
1704 | |
1705 const int dxw = (dxx-(1<<(16+shift)))*(w-1); | |
1706 const int dyh = (dyy-(1<<(16+shift)))*(h-1); | |
1707 const int dxh = dxy*(h-1); | |
1708 const int dyw = dyx*(w-1); | |
1709 if( // non-constant fullpel offset (3% of blocks) | |
1710 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) | | |
1711 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift) | |
1712 // uses more than 16 bits of subpel mv (only at huge resolution) | |
1713 || (dxx|dxy|dyx|dyy)&15 ) | |
1714 { | |
1715 //FIXME could still use mmx for some of the rows | |
1716 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); | |
1717 return; | |
1718 } | |
1719 | |
1720 src += ix + iy*stride; | |
1721 if( (unsigned)ix >= width-w || | |
1722 (unsigned)iy >= height-h ) | |
1723 { | |
1724 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); | |
1725 src = edge_buf; | |
1726 } | |
1727 | |
1728 __asm__ volatile( | |
1729 "movd %0, %%mm6 \n\t" | |
1730 "pxor %%mm7, %%mm7 \n\t" | |
1731 "punpcklwd %%mm6, %%mm6 \n\t" | |
1732 "punpcklwd %%mm6, %%mm6 \n\t" | |
1733 :: "r"(1<<shift) | |
1734 ); | |
1735 | |
1736 for(x=0; x<w; x+=4){ | |
1737 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0), | |
1738 oxs - dxys + dxxs*(x+1), | |
1739 oxs - dxys + dxxs*(x+2), | |
1740 oxs - dxys + dxxs*(x+3) }; | |
1741 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0), | |
1742 oys - dyys + dyxs*(x+1), | |
1743 oys - dyys + dyxs*(x+2), | |
1744 oys - dyys + dyxs*(x+3) }; | |
1745 | |
1746 for(y=0; y<h; y++){ | |
1747 __asm__ volatile( | |
1748 "movq %0, %%mm4 \n\t" | |
1749 "movq %1, %%mm5 \n\t" | |
1750 "paddw %2, %%mm4 \n\t" | |
1751 "paddw %3, %%mm5 \n\t" | |
1752 "movq %%mm4, %0 \n\t" | |
1753 "movq %%mm5, %1 \n\t" | |
1754 "psrlw $12, %%mm4 \n\t" | |
1755 "psrlw $12, %%mm5 \n\t" | |
1756 : "+m"(*dx4), "+m"(*dy4) | |
1757 : "m"(*dxy4), "m"(*dyy4) | |
1758 ); | |
1759 | |
1760 __asm__ volatile( | |
1761 "movq %%mm6, %%mm2 \n\t" | |
1762 "movq %%mm6, %%mm1 \n\t" | |
1763 "psubw %%mm4, %%mm2 \n\t" | |
1764 "psubw %%mm5, %%mm1 \n\t" | |
1765 "movq %%mm2, %%mm0 \n\t" | |
1766 "movq %%mm4, %%mm3 \n\t" | |
1767 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy) | |
1768 "pmullw %%mm5, %%mm3 \n\t" // dx*dy | |
1769 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy | |
1770 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy) | |
1771 | |
1772 "movd %4, %%mm5 \n\t" | |
1773 "movd %3, %%mm4 \n\t" | |
1774 "punpcklbw %%mm7, %%mm5 \n\t" | |
1775 "punpcklbw %%mm7, %%mm4 \n\t" | |
1776 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy | |
1777 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy | |
1778 | |
1779 "movd %2, %%mm5 \n\t" | |
1780 "movd %1, %%mm4 \n\t" | |
1781 "punpcklbw %%mm7, %%mm5 \n\t" | |
1782 "punpcklbw %%mm7, %%mm4 \n\t" | |
1783 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy) | |
1784 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy) | |
1785 "paddw %5, %%mm1 \n\t" | |
1786 "paddw %%mm3, %%mm2 \n\t" | |
1787 "paddw %%mm1, %%mm0 \n\t" | |
1788 "paddw %%mm2, %%mm0 \n\t" | |
1789 | |
1790 "psrlw %6, %%mm0 \n\t" | |
1791 "packuswb %%mm0, %%mm0 \n\t" | |
1792 "movd %%mm0, %0 \n\t" | |
1793 | |
1794 : "=m"(dst[x+y*stride]) | |
1795 : "m"(src[0]), "m"(src[1]), | |
1796 "m"(src[stride]), "m"(src[stride+1]), | |
1797 "m"(*r4), "m"(shift2) | |
1798 ); | |
1799 src += stride; | |
1800 } | |
1801 src += 4-h*stride; | |
1802 } | |
1803 } | |
1804 | |
1805 #define PREFETCH(name, op) \ | |
1806 static void name(void *mem, int stride, int h){\ | |
1807 const uint8_t *p= mem;\ | |
1808 do{\ | |
1809 __asm__ volatile(#op" %0" :: "m"(*p));\ | |
1810 p+= stride;\ | |
1811 }while(--h);\ | |
1812 } | |
1813 PREFETCH(prefetch_mmx2, prefetcht0) | |
1814 PREFETCH(prefetch_3dnow, prefetch) | |
1815 #undef PREFETCH | |
1816 | |
1817 #include "h264dsp_mmx.c" | |
1818 #include "rv40dsp_mmx.c" |
8430 | 1819 |
1820 /* CAVS specific */ | |
1821 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx); | |
1822 void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx); | |
1823 | |
1824 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | |
1825 put_pixels8_mmx(dst, src, stride, 8); | |
1826 } | |
1827 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | |
1828 avg_pixels8_mmx(dst, src, stride, 8); | |
1829 } | |
1830 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | |
1831 put_pixels16_mmx(dst, src, stride, 16); | |
1832 } | |
1833 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | |
1834 avg_pixels16_mmx(dst, src, stride, 16); | |
1835 } | |
1836 | |
1837 /* VC1 specific */ | |
1838 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); | |
1839 | |
1840 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { | |
1841 put_pixels8_mmx(dst, src, stride, 8); | |
1842 } | |
9441 | 1843 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { |
1844 avg_pixels8_mmx2(dst, src, stride, 8); | |
1845 } | |
8430 | 1846 |
1847 /* external functions, from idct_mmx.c */ | |
1848 void ff_mmx_idct(DCTELEM *block); | |
1849 void ff_mmxext_idct(DCTELEM *block); | |
1850 | |
1851 /* XXX: these functions should be removed as soon as all IDCTs are
1852 converted */
8590 | 1853 #if CONFIG_GPL |
8430 | 1854 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) |
1855 { | |
1856 ff_mmx_idct (block); | |
1857 put_pixels_clamped_mmx(block, dest, line_size); | |
1858 } | |
1859 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1860 { | |
1861 ff_mmx_idct (block); | |
1862 add_pixels_clamped_mmx(block, dest, line_size); | |
1863 } | |
1864 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) | |
1865 { | |
1866 ff_mmxext_idct (block); | |
1867 put_pixels_clamped_mmx(block, dest, line_size); | |
1868 } | |
1869 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1870 { | |
1871 ff_mmxext_idct (block); | |
1872 add_pixels_clamped_mmx(block, dest, line_size); | |
1873 } | |
1874 #endif | |
1875 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) | |
1876 { | |
1877 ff_idct_xvid_mmx (block); | |
1878 put_pixels_clamped_mmx(block, dest, line_size); | |
1879 } | |
1880 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1881 { | |
1882 ff_idct_xvid_mmx (block); | |
1883 add_pixels_clamped_mmx(block, dest, line_size); | |
1884 } | |
1885 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) | |
1886 { | |
1887 ff_idct_xvid_mmx2 (block); | |
1888 put_pixels_clamped_mmx(block, dest, line_size); | |
1889 } | |
1890 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1891 { | |
1892 ff_idct_xvid_mmx2 (block); | |
1893 add_pixels_clamped_mmx(block, dest, line_size); | |
1894 } | |
1895 | |
1896 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) | |
1897 { | |
1898 int i; | |
1899 __asm__ volatile("pxor %%mm7, %%mm7":); | |
1900 for(i=0; i<blocksize; i+=2) { | |
1901 __asm__ volatile( | |
1902 "movq %0, %%mm0 \n\t" | |
1903 "movq %1, %%mm1 \n\t" | |
1904 "movq %%mm0, %%mm2 \n\t" | |
1905 "movq %%mm1, %%mm3 \n\t" | |
1906 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 | |
1907 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 | |
1908 "pslld $31, %%mm2 \n\t" // keep only the sign bit | |
1909 "pxor %%mm2, %%mm1 \n\t" | |
1910 "movq %%mm3, %%mm4 \n\t" | |
1911 "pand %%mm1, %%mm3 \n\t" | |
1912 "pandn %%mm1, %%mm4 \n\t" | |
1913 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) | |
1914 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) | |
1915 "movq %%mm3, %1 \n\t" | |
1916 "movq %%mm0, %0 \n\t" | |
1917 :"+m"(mag[i]), "+m"(ang[i]) | |
1918 ::"memory" | |
1919 ); | |
1920 } | |
1921 __asm__ volatile("femms"); | |
1922 } | |
1923 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) | |
1924 { | |
1925 int i; | |
1926 | |
1927 __asm__ volatile( | |
1928 "movaps %0, %%xmm5 \n\t" | |
1929 ::"m"(ff_pdw_80000000[0]) | |
1930 ); | |
1931 for(i=0; i<blocksize; i+=4) { | |
1932 __asm__ volatile( | |
1933 "movaps %0, %%xmm0 \n\t" | |
1934 "movaps %1, %%xmm1 \n\t" | |
1935 "xorps %%xmm2, %%xmm2 \n\t" | |
1936 "xorps %%xmm3, %%xmm3 \n\t" | |
1937 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 | |
1938 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 | |
1939 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit | |
1940 "xorps %%xmm2, %%xmm1 \n\t" | |
1941 "movaps %%xmm3, %%xmm4 \n\t" | |
1942 "andps %%xmm1, %%xmm3 \n\t" | |
1943 "andnps %%xmm1, %%xmm4 \n\t" | |
1944 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) | |
1945 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) | |
1946 "movaps %%xmm3, %1 \n\t" | |
1947 "movaps %%xmm0, %0 \n\t" | |
1948 :"+m"(mag[i]), "+m"(ang[i]) | |
1949 ::"memory" | |
1950 ); | |
1951 } | |
1952 } | |
1953 | |
1954 #define IF1(x) x | |
1955 #define IF0(x) | |
1956 | |
1957 #define MIX5(mono,stereo)\ | |
1958 __asm__ volatile(\ | |
1959 "movss 0(%2), %%xmm5 \n"\ | |
1960 "movss 8(%2), %%xmm6 \n"\ | |
1961 "movss 24(%2), %%xmm7 \n"\ | |
1962 "shufps $0, %%xmm5, %%xmm5 \n"\ | |
1963 "shufps $0, %%xmm6, %%xmm6 \n"\ | |
1964 "shufps $0, %%xmm7, %%xmm7 \n"\ | |
1965 "1: \n"\ | |
1966 "movaps (%0,%1), %%xmm0 \n"\ | |
1967 "movaps 0x400(%0,%1), %%xmm1 \n"\ | |
1968 "movaps 0x800(%0,%1), %%xmm2 \n"\ | |
1969 "movaps 0xc00(%0,%1), %%xmm3 \n"\ | |
1970 "movaps 0x1000(%0,%1), %%xmm4 \n"\ | |
1971 "mulps %%xmm5, %%xmm0 \n"\ | |
1972 "mulps %%xmm6, %%xmm1 \n"\ | |
1973 "mulps %%xmm5, %%xmm2 \n"\ | |
1974 "mulps %%xmm7, %%xmm3 \n"\ | |
1975 "mulps %%xmm7, %%xmm4 \n"\ | |
1976 stereo("addps %%xmm1, %%xmm0 \n")\ | |
1977 "addps %%xmm1, %%xmm2 \n"\ | |
1978 "addps %%xmm3, %%xmm0 \n"\ | |
1979 "addps %%xmm4, %%xmm2 \n"\ | |
1980 mono("addps %%xmm2, %%xmm0 \n")\ | |
1981 "movaps %%xmm0, (%0,%1) \n"\ | |
1982 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\ | |
1983 "add $16, %0 \n"\ | |
1984 "jl 1b \n"\ | |
1985 :"+&r"(i)\ | |
1986 :"r"(samples[0]+len), "r"(matrix)\ | |
1987 :"memory"\ | |
1988 ); | |
1989 | |
1990 #define MIX_MISC(stereo)\ | |
1991 __asm__ volatile(\ | |
1992 "1: \n"\ | |
1993 "movaps (%3,%0), %%xmm0 \n"\ | |
1994 stereo("movaps %%xmm0, %%xmm1 \n")\ | |
1995 "mulps %%xmm6, %%xmm0 \n"\ | |
1996 stereo("mulps %%xmm7, %%xmm1 \n")\ | |
1997 "lea 1024(%3,%0), %1 \n"\ | |
1998 "mov %5, %2 \n"\ | |
1999 "2: \n"\ | |
2000 "movaps (%1), %%xmm2 \n"\ | |
2001 stereo("movaps %%xmm2, %%xmm3 \n")\ | |
2002 "mulps (%4,%2), %%xmm2 \n"\ | |
2003 stereo("mulps 16(%4,%2), %%xmm3 \n")\ | |
2004 "addps %%xmm2, %%xmm0 \n"\ | |
2005 stereo("addps %%xmm3, %%xmm1 \n")\ | |
2006 "add $1024, %1 \n"\ | |
2007 "add $32, %2 \n"\ | |
2008 "jl 2b \n"\ | |
2009 "movaps %%xmm0, (%3,%0) \n"\ | |
2010 stereo("movaps %%xmm1, 1024(%3,%0) \n")\ | |
2011 "add $16, %0 \n"\ | |
2012 "jl 1b \n"\ | |
2013 :"+&r"(i), "=&r"(j), "=&r"(k)\ | |
2014 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\ | |
2015 :"memory"\ | |
2016 ); | |
2017 | |
2018 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len) | |
2019 { | |
2020 int (*matrix_cmp)[2] = (int(*)[2])matrix; | |
2021 intptr_t i,j,k; | |
2022 | |
2023 i = -len*sizeof(float); | |
2024 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) { | |
2025 MIX5(IF0,IF1); | |
2026 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { | |
2027 MIX5(IF1,IF0); | |
2028 } else { | |
2029 DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]); | |
2030 j = 2*in_ch*sizeof(float); | |
2031 __asm__ volatile( | |
2032 "1: \n" | |
2033 "sub $8, %0 \n" | |
2034 "movss (%2,%0), %%xmm6 \n" | |
2035 "movss 4(%2,%0), %%xmm7 \n" | |
2036 "shufps $0, %%xmm6, %%xmm6 \n" | |
2037 "shufps $0, %%xmm7, %%xmm7 \n" | |
2038 "movaps %%xmm6, (%1,%0,4) \n" | |
2039 "movaps %%xmm7, 16(%1,%0,4) \n" | |
2040 "jg 1b \n" | |
2041 :"+&r"(j) | |
2042 :"r"(matrix_simd), "r"(matrix) | |
2043 :"memory" | |
2044 ); | |
2045 if(out_ch == 2) { | |
2046 MIX_MISC(IF1); | |
2047 } else { | |
2048 MIX_MISC(IF0); | |
2049 } | |
2050 } | |
2051 } | |
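/* Reference for the generic MIX_MISC path (sketch): per sample, each output
 * channel is a dot product of the in_ch input planes with one matrix column,
 * accumulated before the in-place write-back:
 *   for(i=0; i<len; i++){
 *       float v0=0, v1=0;
 *       for(c=0; c<in_ch; c++){
 *           v0 += samples[c][i] * matrix[c][0];
 *           v1 += samples[c][i] * matrix[c][1];
 *       }
 *       samples[0][i] = v0;
 *       if(out_ch == 2) samples[1][i] = v1;
 *   }
 * MIX5 is the same computation specialized for the 5-channel-to-stereo and
 * 5-channel-to-mono matrix patterns detected above. */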
2052 | |
2053 static void vector_fmul_3dnow(float *dst, const float *src, int len){ | |
2054 x86_reg i = (len-4)*4; | |
2055 __asm__ volatile( | |
2056 "1: \n\t" | |
2057 "movq (%1,%0), %%mm0 \n\t" | |
2058 "movq 8(%1,%0), %%mm1 \n\t" | |
2059 "pfmul (%2,%0), %%mm0 \n\t" | |
2060 "pfmul 8(%2,%0), %%mm1 \n\t" | |
2061 "movq %%mm0, (%1,%0) \n\t" | |
2062 "movq %%mm1, 8(%1,%0) \n\t" | |
2063 "sub $16, %0 \n\t" | |
2064 "jge 1b \n\t" | |
2065 "femms \n\t" | |
2066 :"+r"(i) | |
2067 :"r"(dst), "r"(src) | |
2068 :"memory" | |
2069 ); | |
2070 } | |
2071 static void vector_fmul_sse(float *dst, const float *src, int len){ | |
2072 x86_reg i = (len-8)*4; | |
2073 __asm__ volatile( | |
2074 "1: \n\t" | |
2075 "movaps (%1,%0), %%xmm0 \n\t" | |
2076 "movaps 16(%1,%0), %%xmm1 \n\t" | |
2077 "mulps (%2,%0), %%xmm0 \n\t" | |
2078 "mulps 16(%2,%0), %%xmm1 \n\t" | |
2079 "movaps %%xmm0, (%1,%0) \n\t" | |
2080 "movaps %%xmm1, 16(%1,%0) \n\t" | |
2081 "sub $32, %0 \n\t" | |
2082 "jge 1b \n\t" | |
2083 :"+r"(i) | |
2084 :"r"(dst), "r"(src) | |
2085 :"memory" | |
2086 ); | |
2087 } | |
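/* Scalar equivalent of vector_fmul_3dnow/_sse above:
 *   for(i=0; i<len; i++) dst[i] *= src[i];
 */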
2088 | |
2089 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ | |
2090 x86_reg i = len*4-16; | |
2091 __asm__ volatile( | |
2092 "1: \n\t" | |
2093 "pswapd 8(%1), %%mm0 \n\t" | |
2094 "pswapd (%1), %%mm1 \n\t" | |
2095 "pfmul (%3,%0), %%mm0 \n\t" | |
2096 "pfmul 8(%3,%0), %%mm1 \n\t" | |
2097 "movq %%mm0, (%2,%0) \n\t" | |
2098 "movq %%mm1, 8(%2,%0) \n\t" | |
2099 "add $16, %1 \n\t" | |
2100 "sub $16, %0 \n\t" | |
2101 "jge 1b \n\t" | |
2102 :"+r"(i), "+r"(src1) | |
2103 :"r"(dst), "r"(src0) | |
2104 ); | |
2105 __asm__ volatile("femms"); | |
2106 } | |
2107 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){ | |
2108 x86_reg i = len*4-32; | |
2109 __asm__ volatile( | |
2110 "1: \n\t" | |
2111 "movaps 16(%1), %%xmm0 \n\t" | |
2112 "movaps (%1), %%xmm1 \n\t" | |
2113 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" | |
2114 "shufps $0x1b, %%xmm1, %%xmm1 \n\t" | |
2115 "mulps (%3,%0), %%xmm0 \n\t" | |
2116 "mulps 16(%3,%0), %%xmm1 \n\t" | |
2117 "movaps %%xmm0, (%2,%0) \n\t" | |
2118 "movaps %%xmm1, 16(%2,%0) \n\t" | |
2119 "add $32, %1 \n\t" | |
2120 "sub $32, %0 \n\t" | |
2121 "jge 1b \n\t" | |
2122 :"+r"(i), "+r"(src1) | |
2123 :"r"(dst), "r"(src0) | |
2124 ); | |
2125 } | |
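/* Scalar equivalent of both vector_fmul_reverse versions:
 *   for(i=0; i<len; i++) dst[i] = src0[i] * src1[len-1-i];
 */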
2126 | |
10300 | 2127 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2128 const float *src2, int len){
8430 | 2129 x86_reg i = (len-4)*4; |
10301 | 2130 __asm__ volatile( |
2131 "1: \n\t" | |
2132 "movq (%2,%0), %%mm0 \n\t" | |
2133 "movq 8(%2,%0), %%mm1 \n\t" | |
2134 "pfmul (%3,%0), %%mm0 \n\t" | |
2135 "pfmul 8(%3,%0), %%mm1 \n\t" | |
2136 "pfadd (%4,%0), %%mm0 \n\t" | |
2137 "pfadd 8(%4,%0), %%mm1 \n\t" | |
2138 "movq %%mm0, (%1,%0) \n\t" | |
2139 "movq %%mm1, 8(%1,%0) \n\t" | |
2140 "sub $16, %0 \n\t" | |
2141 "jge 1b \n\t" | |
2142 :"+r"(i) | |
2143 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) | |
2144 :"memory" | |
2145 ); | |
8430 | 2146 __asm__ volatile("femms"); |
2147 } | |
10300 | 2148 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2149 const float *src2, int len){
8430 | 2150 x86_reg i = (len-8)*4; |
10301 | 2151 __asm__ volatile( |
2152 "1: \n\t" | |
2153 "movaps (%2,%0), %%xmm0 \n\t" | |
2154 "movaps 16(%2,%0), %%xmm1 \n\t" | |
2155 "mulps (%3,%0), %%xmm0 \n\t" | |
2156 "mulps 16(%3,%0), %%xmm1 \n\t" | |
2157 "addps (%4,%0), %%xmm0 \n\t" | |
2158 "addps 16(%4,%0), %%xmm1 \n\t" | |
2159 "movaps %%xmm0, (%1,%0) \n\t" | |
2160 "movaps %%xmm1, 16(%1,%0) \n\t" | |
2161 "sub $32, %0 \n\t" | |
2162 "jge 1b \n\t" | |
2163 :"+r"(i) | |
2164 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) | |
2165 :"memory" | |
2166 ); | |
8430 | 2167 } |
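/* Scalar equivalent of vector_fmul_add_3dnow/_sse:
 *   for(i=0; i<len; i++) dst[i] = src0[i]*src1[i] + src2[i];
 */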
2168 | |
2169 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, | |
2170 const float *win, float add_bias, int len){ | |
8590 | 2171 #if HAVE_6REGS |
8430 | 2172 if(add_bias == 0){ |
2173 x86_reg i = -len*4; | |
2174 x86_reg j = len*4-8; | |
2175 __asm__ volatile( | |
2176 "1: \n" | |
2177 "pswapd (%5,%1), %%mm1 \n" | |
2178 "movq (%5,%0), %%mm0 \n" | |
2179 "pswapd (%4,%1), %%mm5 \n" | |
2180 "movq (%3,%0), %%mm4 \n" | |
2181 "movq %%mm0, %%mm2 \n" | |
2182 "movq %%mm1, %%mm3 \n" | |
2183 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] | |
2184 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] | |
2185 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] | |
2186 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] | |
2187 "pfadd %%mm3, %%mm2 \n" | |
2188 "pfsub %%mm0, %%mm1 \n" | |
2189 "pswapd %%mm2, %%mm2 \n" | |
2190 "movq %%mm1, (%2,%0) \n" | |
2191 "movq %%mm2, (%2,%1) \n" | |
2192 "sub $8, %1 \n" | |
2193 "add $8, %0 \n" | |
2194 "jl 1b \n" | |
2195 "femms \n" | |
2196 :"+r"(i), "+r"(j) | |
2197 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | |
2198 ); | |
2199 }else | |
2200 #endif | |
2201 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); | |
2202 } | |
2203 | |
2204 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, | |
2205 const float *win, float add_bias, int len){ | |
8590 | 2206 #if HAVE_6REGS |
8430 | 2207 if(add_bias == 0){ |
2208 x86_reg i = -len*4; | |
2209 x86_reg j = len*4-16; | |
2210 __asm__ volatile( | |
2211 "1: \n" | |
2212 "movaps (%5,%1), %%xmm1 \n" | |
2213 "movaps (%5,%0), %%xmm0 \n" | |
2214 "movaps (%4,%1), %%xmm5 \n" | |
2215 "movaps (%3,%0), %%xmm4 \n" | |
2216 "shufps $0x1b, %%xmm1, %%xmm1 \n" | |
2217 "shufps $0x1b, %%xmm5, %%xmm5 \n" | |
2218 "movaps %%xmm0, %%xmm2 \n" | |
2219 "movaps %%xmm1, %%xmm3 \n" | |
2220 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] | |
2221 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] | |
2222 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] | |
2223 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] | |
2224 "addps %%xmm3, %%xmm2 \n" | |
2225 "subps %%xmm0, %%xmm1 \n" | |
2226 "shufps $0x1b, %%xmm2, %%xmm2 \n" | |
2227 "movaps %%xmm1, (%2,%0) \n" | |
2228 "movaps %%xmm2, (%2,%1) \n" | |
2229 "sub $16, %1 \n" | |
2230 "add $16, %0 \n" | |
2231 "jl 1b \n" | |
2232 :"+r"(i), "+r"(j) | |
2233 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | |
2234 ); | |
2235 }else | |
2236 #endif | |
2237 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); | |
2238 } | |
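/* Reference for the add_bias == 0 overlap-add path (sketch), in the same
 * indexing as the register comments above (i = -len..-1, j = -i-1):
 *   dst[len+i] = src0[len+i]*win[len+j] - src1[j]*win[len+i];
 *   dst[len+j] = src0[len+i]*win[len+i] + src1[j]*win[len+j];
 * i.e. the usual MDCT windowed overlap; every other case falls back to
 * ff_vector_fmul_window_c(). */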
2239 | |
2240 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | |
2241 { | |
2242 x86_reg i = -4*len; | |
2243 __asm__ volatile( | |
2244 "movss %3, %%xmm4 \n" | |
2245 "shufps $0, %%xmm4, %%xmm4 \n" | |
2246 "1: \n" | |
2247 "cvtpi2ps (%2,%0), %%xmm0 \n" | |
2248 "cvtpi2ps 8(%2,%0), %%xmm1 \n" | |
2249 "cvtpi2ps 16(%2,%0), %%xmm2 \n" | |
2250 "cvtpi2ps 24(%2,%0), %%xmm3 \n" | |
2251 "movlhps %%xmm1, %%xmm0 \n" | |
2252 "movlhps %%xmm3, %%xmm2 \n" | |
2253 "mulps %%xmm4, %%xmm0 \n" | |
2254 "mulps %%xmm4, %%xmm2 \n" | |
2255 "movaps %%xmm0, (%1,%0) \n" | |
2256 "movaps %%xmm2, 16(%1,%0) \n" | |
2257 "add $32, %0 \n" | |
2258 "jl 1b \n" | |
2259 :"+r"(i) | |
2260 :"r"(dst+len), "r"(src+len), "m"(mul) | |
2261 ); | |
2262 } | |
2263 | |
2264 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) | |
2265 { | |
2266 x86_reg i = -4*len; | |
2267 __asm__ volatile( | |
2268 "movss %3, %%xmm4 \n" | |
2269 "shufps $0, %%xmm4, %%xmm4 \n" | |
2270 "1: \n" | |
2271 "cvtdq2ps (%2,%0), %%xmm0 \n" | |
2272 "cvtdq2ps 16(%2,%0), %%xmm1 \n" | |
2273 "mulps %%xmm4, %%xmm0 \n" | |
2274 "mulps %%xmm4, %%xmm1 \n" | |
2275 "movaps %%xmm0, (%1,%0) \n" | |
2276 "movaps %%xmm1, 16(%1,%0) \n" | |
2277 "add $32, %0 \n" | |
2278 "jl 1b \n" | |
2279 :"+r"(i) | |
2280 :"r"(dst+len), "r"(src+len), "m"(mul) | |
2281 ); | |
2282 } | |
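/* Scalar equivalent of both int32_to_float_fmul_scalar versions:
 *   for(i=0; i<len; i++) dst[i] = src[i] * mul;
 */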
2283 | |
10105 | 2284 static void vector_clipf_sse(float *dst, const float *src, float min, float max, |
10104 | 2285 int len)
2286 {
2287 x86_reg i = (len-16)*4;
2288 __asm__ volatile(
2289 "movss %3, %%xmm4 \n"
2290 "movss %4, %%xmm5 \n"
2291 "shufps $0, %%xmm4, %%xmm4 \n"
2292 "shufps $0, %%xmm5, %%xmm5 \n"
2293 "1: \n\t"
2294 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2295 "movaps 16(%2,%0), %%xmm1 \n\t"
2296 "movaps 32(%2,%0), %%xmm2 \n\t"
2297 "movaps 48(%2,%0), %%xmm3 \n\t"
2298 "maxps %%xmm4, %%xmm0 \n\t"
2299 "maxps %%xmm4, %%xmm1 \n\t"
2300 "maxps %%xmm4, %%xmm2 \n\t"
2301 "maxps %%xmm4, %%xmm3 \n\t"
2302 "minps %%xmm5, %%xmm0 \n\t"
2303 "minps %%xmm5, %%xmm1 \n\t"
2304 "minps %%xmm5, %%xmm2 \n\t"
2305 "minps %%xmm5, %%xmm3 \n\t"
2306 "movaps %%xmm0, (%1,%0) \n\t"
2307 "movaps %%xmm1, 16(%1,%0) \n\t"
2308 "movaps %%xmm2, 32(%1,%0) \n\t"
2309 "movaps %%xmm3, 48(%1,%0) \n\t"
2310 "sub $64, %0 \n\t"
2311 "jge 1b \n\t"
10107 | 2312 :"+&r"(i)
10104 | 2313 :"r"(dst), "r"(src), "m"(min), "m"(max)
2314 :"memory"
2315 );
2316 }
2317 
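/* Scalar equivalent of vector_clipf_sse above:
 *   for(i=0; i<len; i++) dst[i] = FFMIN(FFMAX(src[i], min), max);
 */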
8430 | 2318 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ |
2319 x86_reg reglen = len; | |
2320 // not bit-exact: pf2id uses different rounding than C and SSE | |
2321 __asm__ volatile( | |
2322 "add %0 , %0 \n\t" | |
2323 "lea (%2,%0,2) , %2 \n\t" | |
2324 "add %0 , %1 \n\t" | |
2325 "neg %0 \n\t" | |
2326 "1: \n\t" | |
2327 "pf2id (%2,%0,2) , %%mm0 \n\t" | |
2328 "pf2id 8(%2,%0,2) , %%mm1 \n\t" | |
2329 "pf2id 16(%2,%0,2) , %%mm2 \n\t" | |
2330 "pf2id 24(%2,%0,2) , %%mm3 \n\t" | |
2331 "packssdw %%mm1 , %%mm0 \n\t" | |
2332 "packssdw %%mm3 , %%mm2 \n\t" | |
2333 "movq %%mm0 , (%1,%0) \n\t" | |
2334 "movq %%mm2 , 8(%1,%0) \n\t" | |
2335 "add $16 , %0 \n\t" | |
2336 " js 1b \n\t" | |
2337 "femms \n\t" | |
2338 :"+r"(reglen), "+r"(dst), "+r"(src) | |
2339 ); | |
2340 } | |
2341 static void float_to_int16_sse(int16_t *dst, const float *src, long len){ | |
2342 x86_reg reglen = len; | |
2343 __asm__ volatile( | |
2344 "add %0 , %0 \n\t" | |
2345 "lea (%2,%0,2) , %2 \n\t" | |
2346 "add %0 , %1 \n\t" | |
2347 "neg %0 \n\t" | |
2348 "1: \n\t" | |
2349 "cvtps2pi (%2,%0,2) , %%mm0 \n\t" | |
2350 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" | |
2351 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" | |
2352 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" | |
2353 "packssdw %%mm1 , %%mm0 \n\t" | |
2354 "packssdw %%mm3 , %%mm2 \n\t" | |
2355 "movq %%mm0 , (%1,%0) \n\t" | |
2356 "movq %%mm2 , 8(%1,%0) \n\t" | |
2357 "add $16 , %0 \n\t" | |
2358 " js 1b \n\t" | |
2359 "emms \n\t" | |
2360 :"+r"(reglen), "+r"(dst), "+r"(src) | |
2361 ); | |
2362 } | |
2363 | |
2364 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ | |
2365 x86_reg reglen = len; | |
2366 __asm__ volatile( | |
2367 "add %0 , %0 \n\t" | |
2368 "lea (%2,%0,2) , %2 \n\t" | |
2369 "add %0 , %1 \n\t" | |
2370 "neg %0 \n\t" | |
2371 "1: \n\t" | |
2372 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" | |
2373 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" | |
2374 "packssdw %%xmm1 , %%xmm0 \n\t" | |
2375 "movdqa %%xmm0 , (%1,%0) \n\t" | |
2376 "add $16 , %0 \n\t" | |
2377 " js 1b \n\t" | |
2378 :"+r"(reglen), "+r"(dst), "+r"(src) | |
2379 ); | |
2380 } | |
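/* All three float_to_int16 variants above implement, with saturation to the
 * int16 range (reference sketch):
 *   for(i=0; i<len; i++) dst[i] = av_clip_int16(lrintf(src[i]));
 * the 3DNow! pf2id path truncates instead of rounding, hence the
 * bit-exactness note above. */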
2381 | |
2382 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); | |
2383 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); | |
2384 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); | |
10644 | 2385 int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift); |
2386 int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift); | |
2387 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul); | |
2388 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul); | |
2389 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul); | |
10431 | 2390 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); |
2391 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); | |
2392 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); | |
8430 | 2393 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
2394 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); | |
10645 | 2395 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); |
2396 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); | |
2397 void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); | |
2398 | |
2399 #if HAVE_YASM && ARCH_X86_32 | |
8430 | 2400 void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); |
2401 static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) | |
2402 { | |
2403 ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); | |
2404 ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); | |
2405 } | |
10645 | 2406 #elif !HAVE_YASM |
8430 | 2407 #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) |
2408 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |
2409 #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |
2410 #endif | |
2411 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse | |
2412 | |
2413 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ | |
2414 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2 */\
2415 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |
2416 DECLARE_ALIGNED_16(int16_t, tmp[len]);\ | |
2417 int i,j,c;\ | |
2418 for(c=0; c<channels; c++){\ | |
2419 float_to_int16_##cpu(tmp, src[c], len);\ | |
2420 for(i=0, j=c; i<len; i++, j+=channels)\ | |
2421 dst[j] = tmp[i];\ | |
2422 }\ | |
2423 }\ | |
2424 \ | |
2425 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |
2426 if(channels==1)\ | |
2427 float_to_int16_##cpu(dst, src[0], len);\ | |
2428 else if(channels==2){\ | |
2429 x86_reg reglen = len; \ | |
2430 const float *src0 = src[0];\ | |
2431 const float *src1 = src[1];\ | |
2432 __asm__ volatile(\ | |
2433 "shl $2, %0 \n"\ | |
2434 "add %0, %1 \n"\ | |
2435 "add %0, %2 \n"\ | |
2436 "add %0, %3 \n"\ | |
2437 "neg %0 \n"\ | |
2438 body\ | |
2439 :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ | |
2440 );\ | |
2441 }else if(channels==6){\ | |
2442 ff_float_to_int16_interleave6_##cpu(dst, src, len);\ | |
2443 }else\ | |
2444 float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ | |
2445 } | |
2446 | |
2447 FLOAT_TO_INT16_INTERLEAVE(3dnow, | |
2448 "1: \n" | |
2449 "pf2id (%2,%0), %%mm0 \n" | |
2450 "pf2id 8(%2,%0), %%mm1 \n" | |
2451 "pf2id (%3,%0), %%mm2 \n" | |
2452 "pf2id 8(%3,%0), %%mm3 \n" | |
2453 "packssdw %%mm1, %%mm0 \n" | |
2454 "packssdw %%mm3, %%mm2 \n" | |
2455 "movq %%mm0, %%mm1 \n" | |
2456 "punpcklwd %%mm2, %%mm0 \n" | |
2457 "punpckhwd %%mm2, %%mm1 \n" | |
2458 "movq %%mm0, (%1,%0)\n" | |
2459 "movq %%mm1, 8(%1,%0)\n" | |
2460 "add $16, %0 \n" | |
2461 "js 1b \n" | |
2462 "femms \n" | |
2463 ) | |
2464 | |
2465 FLOAT_TO_INT16_INTERLEAVE(sse, | |
2466 "1: \n" | |
2467 "cvtps2pi (%2,%0), %%mm0 \n" | |
2468 "cvtps2pi 8(%2,%0), %%mm1 \n" | |
2469 "cvtps2pi (%3,%0), %%mm2 \n" | |
2470 "cvtps2pi 8(%3,%0), %%mm3 \n" | |
2471 "packssdw %%mm1, %%mm0 \n" | |
2472 "packssdw %%mm3, %%mm2 \n" | |
2473 "movq %%mm0, %%mm1 \n" | |
2474 "punpcklwd %%mm2, %%mm0 \n" | |
2475 "punpckhwd %%mm2, %%mm1 \n" | |
2476 "movq %%mm0, (%1,%0)\n" | |
2477 "movq %%mm1, 8(%1,%0)\n" | |
2478 "add $16, %0 \n" | |
2479 "js 1b \n" | |
2480 "emms \n" | |
2481 ) | |
2482 | |
2483 FLOAT_TO_INT16_INTERLEAVE(sse2, | |
2484 "1: \n" | |
2485 "cvtps2dq (%2,%0), %%xmm0 \n" | |
2486 "cvtps2dq (%3,%0), %%xmm1 \n" | |
2487 "packssdw %%xmm1, %%xmm0 \n" | |
2488 "movhlps %%xmm0, %%xmm1 \n" | |
2489 "punpcklwd %%xmm1, %%xmm0 \n" | |
2490 "movdqa %%xmm0, (%1,%0) \n" | |
2491 "add $16, %0 \n" | |
2492 "js 1b \n" | |
2493 ) | |
2494 | |
2495 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ | |
2496 if(channels==6) | |
2497 ff_float_to_int16_interleave6_3dn2(dst, src, len); | |
2498 else | |
2499 float_to_int16_interleave_3dnow(dst, src, len, channels); | |
2500 } | |
2501 | |
2502 | |
2503 void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width); | |
2504 void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width); | |
2505 void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); | |
2506 void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); | |
2507 void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, | |
2508 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); | |
2509 void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, | |
2510 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); | |
2511 | |
2512 | |
2513 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |
2514 { | |
2515 mm_flags = mm_support(); | |
2516 | |
2517 if (avctx->dsp_mask) { | |
2518 if (avctx->dsp_mask & FF_MM_FORCE) | |
2519 mm_flags |= (avctx->dsp_mask & 0xffff); | |
2520 else | |
2521 mm_flags &= ~(avctx->dsp_mask & 0xffff); | |
2522 } | |
2523 | |
2524 #if 0 | |
2525 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); | |
2526 if (mm_flags & FF_MM_MMX) | |
2527 av_log(avctx, AV_LOG_INFO, " mmx"); | |
9342 | 2528 if (mm_flags & FF_MM_MMX2)
2529 av_log(avctx, AV_LOG_INFO, " mmx2");
8430 | 2530 if (mm_flags & FF_MM_3DNOW) |
2531 av_log(avctx, AV_LOG_INFO, " 3dnow"); | |
2532 if (mm_flags & FF_MM_SSE) | |
2533 av_log(avctx, AV_LOG_INFO, " sse"); | |
2534 if (mm_flags & FF_MM_SSE2) | |
2535 av_log(avctx, AV_LOG_INFO, " sse2"); | |
2536 av_log(avctx, AV_LOG_INFO, "\n"); | |
2537 #endif | |
2538 | |
2539 if (mm_flags & FF_MM_MMX) { | |
2540 const int idct_algo= avctx->idct_algo; | |
2541 | |
2542 if(avctx->lowres==0){ | |
2543 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ | |
2544 c->idct_put= ff_simple_idct_put_mmx; | |
2545 c->idct_add= ff_simple_idct_add_mmx; | |
2546 c->idct = ff_simple_idct_mmx; | |
2547 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; | |
8590 | 2548 #if CONFIG_GPL |
8430 | 2549 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ |
2550 if(mm_flags & FF_MM_MMX2){ |
8430 | 2551 c->idct_put= ff_libmpeg2mmx2_idct_put; |
2552 c->idct_add= ff_libmpeg2mmx2_idct_add; | |
2553 c->idct = ff_mmxext_idct; | |
2554 }else{ | |
2555 c->idct_put= ff_libmpeg2mmx_idct_put; | |
2556 c->idct_add= ff_libmpeg2mmx_idct_add; | |
2557 c->idct = ff_mmx_idct; | |
2558 } | |
2559 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
2560 #endif | |
2561 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) && |
8430 | 2562 idct_algo==FF_IDCT_VP3){ |
2563 if(mm_flags & FF_MM_SSE2){ | |
2564 c->idct_put= ff_vp3_idct_put_sse2; | |
2565 c->idct_add= ff_vp3_idct_add_sse2; | |
2566 c->idct = ff_vp3_idct_sse2; | |
2567 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; | |
2568 }else{ | |
2569 c->idct_put= ff_vp3_idct_put_mmx; | |
2570 c->idct_add= ff_vp3_idct_add_mmx; | |
2571 c->idct = ff_vp3_idct_mmx; | |
2572 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; | |
2573 } | |
2574 }else if(idct_algo==FF_IDCT_CAVS){ | |
2575 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; | |
2576 }else if(idct_algo==FF_IDCT_XVIDMMX){ | |
2577 if(mm_flags & FF_MM_SSE2){ | |
2578 c->idct_put= ff_idct_xvid_sse2_put; | |
2579 c->idct_add= ff_idct_xvid_sse2_add; | |
2580 c->idct = ff_idct_xvid_sse2; | |
2581 c->idct_permutation_type= FF_SSE2_IDCT_PERM; | |
2582 }else if(mm_flags & FF_MM_MMX2){ |
8430 | 2583 c->idct_put= ff_idct_xvid_mmx2_put; |
2584 c->idct_add= ff_idct_xvid_mmx2_add; | |
2585 c->idct = ff_idct_xvid_mmx2; | |
2586 }else{ | |
2587 c->idct_put= ff_idct_xvid_mmx_put; | |
2588 c->idct_add= ff_idct_xvid_mmx_add; | |
2589 c->idct = ff_idct_xvid_mmx; | |
2590 } | |
2591 } | |
2592 } | |
2593 | |
2594 c->put_pixels_clamped = put_pixels_clamped_mmx; | |
2595 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; | |
2596 c->add_pixels_clamped = add_pixels_clamped_mmx; | |
2597 c->clear_block = clear_block_mmx; | |
2598 c->clear_blocks = clear_blocks_mmx; | |
10766 | 2599 if ((mm_flags & FF_MM_SSE) && |
2600 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){ |
2601 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ |
9861 | 2602 c->clear_block = clear_block_sse; |
2603 c->clear_blocks = clear_blocks_sse; | |
2604 } | |
8430 | 2605 |
2606 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | |
2607 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ | |
2608 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ | |
2609 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ | |
2610 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU | |
2611 | |
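/* For reference, one SET_HPEL_FUNCS() invocation below expands roughly to:
 *     SET_HPEL_FUNCS(put, 0, 16, mmx);
 *         c->put_pixels_tab[0][0] = put_pixels16_mmx;
 *         c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
 *         c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
 *         c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
 * i.e. slot 0 is the full-pel copy and slots 1-3 are the x, y and xy
 * half-pel interpolations for the given block size and CPU flavour. */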
2612 SET_HPEL_FUNCS(put, 0, 16, mmx); | |
2613 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx); | |
2614 SET_HPEL_FUNCS(avg, 0, 16, mmx); | |
2615 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx); | |
2616 SET_HPEL_FUNCS(put, 1, 8, mmx); | |
2617 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); | |
2618 SET_HPEL_FUNCS(avg, 1, 8, mmx); | |
2619 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); | |
2620 | |
2621 c->gmc= gmc_mmx; | |
2622 | |
2623 c->add_bytes= add_bytes_mmx; | |
2624 c->add_bytes_l2= add_bytes_l2_mmx; | |
2625 | |
2626 c->draw_edges = draw_edges_mmx; | |
2627 | |
10749 | 2628 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
8430 | 2629 c->h263_v_loop_filter= h263_v_loop_filter_mmx; |
2630 c->h263_h_loop_filter= h263_h_loop_filter_mmx; | |
2631 } | |
2632 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd; | |
2633 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; | |
9439 | 2634 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_vc1_chroma_mc8_mmx_nornd; |
8430 | 2635 |
8519 | 2636 c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx; |
2637 c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx; |
2638 |
8430 | 2639 c->h264_idct_dc_add= |
2640 c->h264_idct_add= ff_h264_idct_add_mmx; | |
2641 c->h264_idct8_dc_add= | |
2642 c->h264_idct8_add= ff_h264_idct8_add_mmx; | |
2643 | |
2644 c->h264_idct_add16 = ff_h264_idct_add16_mmx; | |
2645 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; | |
2646 c->h264_idct_add8 = ff_h264_idct_add8_mmx; | |
2647 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; | |
2648 | |
8817 | 2649 if (CONFIG_VP6_DECODER) { |
2650 c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; | |
2651 } | |
2652 | |
9342 | 2653 if (mm_flags & FF_MM_MMX2) { |
8430 | 2654 c->prefetch = prefetch_mmx2; |
2655 | |
2656 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; | |
2657 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; | |
2658 | |
2659 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; | |
2660 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; | |
2661 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; | |
2662 | |
2663 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; | |
2664 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; | |
2665 | |
2666 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; | |
2667 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; | |
2668 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; | |
2669 | |
2670 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; | |
2671 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; | |
2672 c->h264_idct_add16 = ff_h264_idct_add16_mmx2; | |
2673 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; | |
2674 c->h264_idct_add8 = ff_h264_idct_add8_mmx2; | |
2675 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; | |
2676 | |
2677 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2678 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | |
2679 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; | |
2680 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | |
2681 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | |
2682 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; | |
2683 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | |
2684 | |
9975 | 2685 if (CONFIG_VP3_DECODER) { |
8430 | 2686 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2; |
2687 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2; | |
2688 } | |
2689 } | |
2690 | |
2691 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | |
2692 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ | |
2693 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \ | |
2694 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \ | |
2695 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \ | |
2696 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \ | |
2697 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \ | |
2698 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \ | |
2699 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \ | |
2700 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \ | |
2701 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \ | |
2702 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \ | |
2703 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \ | |
2704 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \ | |
2705 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \ | |
2706 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \ | |
2707 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU | |
2708 | |
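/* For reference, SET_QPEL_FUNCS() fills all 16 quarter-pel positions of one
 * table: index i selects the mcXY motion-compensation function with
 * X = i % 4 (horizontal qpel offset) and Y = i / 4 (vertical qpel offset).
 * For example,
 *     SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
 * sets, among the 16 entries, c->put_qpel_pixels_tab[0][9] = put_qpel16_mc12_mmx2; */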
2709 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2); | |
2710 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2); | |
2711 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2); | |
2712 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2); | |
2713 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2); | |
2714 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2); | |
2715 | |
2716 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2); | |
2717 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2); | |
2718 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2); | |
2719 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2); | |
2720 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2); | |
2721 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2); | |
2722 | |
2723 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2); | |
2724 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2); | |
2725 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2); | |
2726 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2); | |
2727 | |
8519 | 2728 c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2; |
2729 c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2; |
2730 |
9440 | 2731 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_vc1_chroma_mc8_mmx2_nornd; |
2732 | |
8430 | 2733 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd; |
2734 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; | |
2735 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2; | |
2736 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2; | |
2737 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; | |
2738 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; | |
2739 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; | |
2740 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; | |
2741 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; | |
2742 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; | |
2743 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; | |
2744 | |
2745 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; | |
2746 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; | |
2747 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; | |
2748 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; | |
2749 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; | |
2750 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; | |
2751 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; | |
2752 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; | |
2753 | |
2754 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; | |
2755 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; | |
2756 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; | |
2757 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; | |
2758 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; | |
2759 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; | |
2760 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; | |
2761 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; | |
2762 | |
8760 | 2763 #if HAVE_YASM |
2764 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; | |
2765 #endif | |
8798 | 2766 #if HAVE_7REGS && HAVE_TEN_OPERANDS |
8760 | 2767 if( mm_flags&FF_MM_3DNOW ) |
2768 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; | |
2769 #endif | |
2770 | |
8596 | 2771 if (CONFIG_CAVS_DECODER) |
8430 | 2772 ff_cavsdsp_init_mmx2(c, avctx); |
2773 | |
9995 | 2774 if (CONFIG_VC1_DECODER) |
8430 | 2775 ff_vc1dsp_init_mmx(c, avctx); |
2776 | |
2777 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; | |
2778 } else if (mm_flags & FF_MM_3DNOW) { | |
2779 c->prefetch = prefetch_3dnow; | |
2780 | |
2781 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; | |
2782 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; | |
2783 | |
2784 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; | |
2785 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; | |
2786 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; | |
2787 | |
2788 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; | |
2789 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; | |
2790 | |
2791 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; | |
2792 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; | |
2793 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; | |
2794 | |
2795 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2796 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; | |
2797 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; | |
2798 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; | |
2799 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; | |
2800 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; | |
2801 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; | |
2802 } | |
2803 | |
2804 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow); | |
2805 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow); | |
2806 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow); | |
2807 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow); | |
2808 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow); | |
2809 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow); | |
2810 | |
2811 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow); | |
2812 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow); | |
2813 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow); | |
2814 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow); | |
2815 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow); | |
2816 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow); | |
2817 | |
2818 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow); | |
2819 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow); | |
2820 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow); | |
2821 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); | |
2822 | |
2823 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd; | |
2824 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; | |
2825 | |
8519 | 2826 c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow; |
2827 c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow; |
2828 |
8596 | 2829 if (CONFIG_CAVS_DECODER) |
8430 | 2830 ff_cavsdsp_init_3dnow(c, avctx); |
2831 } | |
2832 | |
2833 | |
2834 #define H264_QPEL_FUNCS(x, y, CPU)\ | |
2835 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ | |
2836 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ | |
2837 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ | |
2838 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; | |
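/* H264_QPEL_FUNCS(x, y, CPU) overrides slot x + y*4 of both the put and avg
 * H.264 qpel tables, for the 16x16 (index 0) and 8x8 (index 1) block sizes,
 * with the CPU-specific version; e.g. H264_QPEL_FUNCS(0, 0, sse2) below
 * replaces only the mc00 (full-pel) entries set up by the MMX2/3DNow code above. */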
2839 if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){ | |
2840 // these functions are slower than mmx on AMD, but faster on Intel (the 3DNow flag is checked as a proxy for AMD CPUs) |
2841 /* FIXME works in most codecs, but crashes svq1 due to unaligned chroma | |
2842 c->put_pixels_tab[0][0] = put_pixels16_sse2; | |
2843 c->avg_pixels_tab[0][0] = avg_pixels16_sse2; | |
2844 */ | |
2845 H264_QPEL_FUNCS(0, 0, sse2); | |
2846 } | |
2847 if(mm_flags & FF_MM_SSE2){ | |
2848 c->h264_idct8_add = ff_h264_idct8_add_sse2; | |
2849 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; | |
2850 | |
2851 H264_QPEL_FUNCS(0, 1, sse2); | |
2852 H264_QPEL_FUNCS(0, 2, sse2); | |
2853 H264_QPEL_FUNCS(0, 3, sse2); | |
2854 H264_QPEL_FUNCS(1, 1, sse2); | |
2855 H264_QPEL_FUNCS(1, 2, sse2); | |
2856 H264_QPEL_FUNCS(1, 3, sse2); | |
2857 H264_QPEL_FUNCS(2, 1, sse2); | |
2858 H264_QPEL_FUNCS(2, 2, sse2); | |
2859 H264_QPEL_FUNCS(2, 3, sse2); | |
2860 H264_QPEL_FUNCS(3, 1, sse2); | |
2861 H264_QPEL_FUNCS(3, 2, sse2); | |
2862 H264_QPEL_FUNCS(3, 3, sse2); | |
8818 | 2863 |
2864 if (CONFIG_VP6_DECODER) { | |
2865 c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; | |
2866 } | |
8430 | 2867 } |
8590 | 2868 #if HAVE_SSSE3 |
8430 | 2869 if(mm_flags & FF_MM_SSSE3){ |
2870 H264_QPEL_FUNCS(1, 0, ssse3); | |
2871 H264_QPEL_FUNCS(1, 1, ssse3); | |
2872 H264_QPEL_FUNCS(1, 2, ssse3); | |
2873 H264_QPEL_FUNCS(1, 3, ssse3); | |
2874 H264_QPEL_FUNCS(2, 0, ssse3); | |
2875 H264_QPEL_FUNCS(2, 1, ssse3); | |
2876 H264_QPEL_FUNCS(2, 2, ssse3); | |
2877 H264_QPEL_FUNCS(2, 3, ssse3); | |
2878 H264_QPEL_FUNCS(3, 0, ssse3); | |
2879 H264_QPEL_FUNCS(3, 1, ssse3); | |
2880 H264_QPEL_FUNCS(3, 2, ssse3); | |
2881 H264_QPEL_FUNCS(3, 3, ssse3); | |
9439 | 2882 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_vc1_chroma_mc8_ssse3_nornd; |
9440 | 2883 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_vc1_chroma_mc8_ssse3_nornd; |
8430 | 2884 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; |
2885 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; | |
2886 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; | |
2887 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; | |
2888 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; | |
10430 | 2889 #if HAVE_YASM |
2890 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; | |
2891 if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe | |
2892 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; | |
2893 #endif | |
8430 | 2894 } |
2895 #endif | |
2896 | |
8590 | 2897 #if CONFIG_GPL && HAVE_YASM |
9342 | 2898 if (mm_flags & FF_MM_MMX2){ |
8590 | 2899 #if ARCH_X86_32 |
8430 | 2900 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; |
2901 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; | |
2902 #endif | |
8510 | 2903 if( mm_flags&FF_MM_SSE2 ){ |
9959 | 2904 #if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 |
8430 | 2905 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; |
2906 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; | |
2907 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; | |
2908 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; | |
8510 | 2909 #endif |
2910 c->h264_idct_add16 = ff_h264_idct_add16_sse2; |
2911 c->h264_idct_add8 = ff_h264_idct_add8_sse2; |
2912 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; |
8430 | 2913 } |
2914 } | |
2915 #endif | |
2916 | |
8590 | 2917 #if CONFIG_SNOW_DECODER |
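/* Note: the "& 0" in the condition below makes the SSE2 branch unreachable,
 * so the MMX path in the else clause is what actually runs; the SSE2 snow
 * compose functions appear to be deliberately disabled here. */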
8430 | 2918 if(mm_flags & FF_MM_SSE2 & 0){ |
2919 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; | |
8590 | 2920 #if HAVE_7REGS |
8430 | 2921 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; |
2922 #endif | |
2923 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; | |
2924 } | |
2925 else{ | |
9342 | 2926 if(mm_flags & FF_MM_MMX2){ |
8430 | 2927 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; |
8590 | 2928 #if HAVE_7REGS |
8430 | 2929 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; |
2930 #endif | |
2931 } | |
2932 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; | |
2933 } | |
2934 #endif | |
2935 | |
2936 if(mm_flags & FF_MM_3DNOW){ | |
2937 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; | |
2938 c->vector_fmul = vector_fmul_3dnow; | |
2939 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2940 c->float_to_int16 = float_to_int16_3dnow; | |
2941 c->float_to_int16_interleave = float_to_int16_interleave_3dnow; | |
2942 } | |
2943 } | |
2944 if(mm_flags & FF_MM_3DNOWEXT){ | |
2945 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | |
2946 c->vector_fmul_window = vector_fmul_window_3dnow2; | |
2947 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2948 c->float_to_int16_interleave = float_to_int16_interleave_3dn2; | |
2949 } | |
2950 } | |
10633 | 2951 if(mm_flags & FF_MM_MMX2){ |
2952 #if HAVE_YASM | |
2953 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2; | |
10644 | 2954 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2; |
10633 | 2955 #endif |
2956 } | |
8430 | 2957 if(mm_flags & FF_MM_SSE){ |
2958 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; | |
2959 c->ac3_downmix = ac3_downmix_sse; | |
2960 c->vector_fmul = vector_fmul_sse; | |
2961 c->vector_fmul_reverse = vector_fmul_reverse_sse; | |
10300 | 2962 c->vector_fmul_add = vector_fmul_add_sse; |
8430 | 2963 c->vector_fmul_window = vector_fmul_window_sse; |
2964 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | |
10104 | 2965 c->vector_clipf = vector_clipf_sse; |
8430 | 2966 c->float_to_int16 = float_to_int16_sse; |
2967 c->float_to_int16_interleave = float_to_int16_interleave_sse; | |
2968 } | |
2969 if(mm_flags & FF_MM_3DNOW) | |
10300 | 2970 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse |
8430 | 2971 if(mm_flags & FF_MM_SSE2){ |
2972 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; | |
2973 c->float_to_int16 = float_to_int16_sse2; | |
2974 c->float_to_int16_interleave = float_to_int16_interleave_sse2; | |
10633 | 2975 #if HAVE_YASM |
2976 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | |
10644 | 2977 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; |
10633 | 2978 #endif |
8430 | 2979 } |
10644 | 2980 if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit |
2981 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; | |
8430 | 2982 } |
2983 | |
8596 | 2984 if (CONFIG_ENCODERS) |
8430 | 2985 dsputilenc_init_mmx(c, avctx); |
2986 | |
2987 #if 0 | |
2988 // for speed testing | |
2989 get_pixels = just_return; | |
2990 put_pixels_clamped = just_return; | |
2991 add_pixels_clamped = just_return; | |
2992 | |
2993 pix_abs16x16 = just_return; | |
2994 pix_abs16x16_x2 = just_return; | |
2995 pix_abs16x16_y2 = just_return; | |
2996 pix_abs16x16_xy2 = just_return; | |
2997 | |
2998 put_pixels_tab[0] = just_return; | |
2999 put_pixels_tab[1] = just_return; | |
3000 put_pixels_tab[2] = just_return; | |
3001 put_pixels_tab[3] = just_return; | |
3002 | |
3003 put_no_rnd_pixels_tab[0] = just_return; | |
3004 put_no_rnd_pixels_tab[1] = just_return; | |
3005 put_no_rnd_pixels_tab[2] = just_return; | |
3006 put_no_rnd_pixels_tab[3] = just_return; | |
3007 | |
3008 avg_pixels_tab[0] = just_return; | |
3009 avg_pixels_tab[1] = just_return; | |
3010 avg_pixels_tab[2] = just_return; | |
3011 avg_pixels_tab[3] = just_return; | |
3012 | |
3013 avg_no_rnd_pixels_tab[0] = just_return; | |
3014 avg_no_rnd_pixels_tab[1] = just_return; | |
3015 avg_no_rnd_pixels_tab[2] = just_return; | |
3016 avg_no_rnd_pixels_tab[3] = just_return; | |
3017 | |
3018 //av_fdct = just_return; | |
3019 //ff_idct = just_return; | |
3020 #endif | |
3021 } |