annotate x86/dsputil_mmx.c @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | 3941687b4fa9 |
children | a5ddb39627fd |
rev | line source |
---|---|
8430 | 1 /* |
2 * MMX optimized DSP utils | |
3 * Copyright (c) 2000, 2001 Fabrice Bellard |
8430 | 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
5 * | |
6 * This file is part of FFmpeg. | |
7 * | |
8 * FFmpeg is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU Lesser General Public | |
10 * License as published by the Free Software Foundation; either | |
11 * version 2.1 of the License, or (at your option) any later version. | |
12 * | |
13 * FFmpeg is distributed in the hope that it will be useful, | |
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 * Lesser General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU Lesser General Public | |
19 * License along with FFmpeg; if not, write to the Free Software | |
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 * | |
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
23 */ | |
24 | |
25 #include "libavutil/x86_cpu.h" | |
26 #include "libavcodec/dsputil.h" | |
11499 | 27 #include "libavcodec/h264dsp.h" |
8430 | 28 #include "libavcodec/mpegvideo.h" |
29 #include "libavcodec/simple_idct.h" | |
30 #include "dsputil_mmx.h" | |
31 #include "idct_xvid.h" | |
32 | |
33 //#undef NDEBUG | |
34 //#include <assert.h> | |
35 | |
36 /* pixel operations */ | |
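/* Constant naming: ff_pw_N = packed 16-bit words of value N, ff_pb_XX = packed
   bytes of value 0xXX, ff_pd_* = packed doubles.  The 8-byte uint64_t constants
   are meant for MMX registers, the 16-byte xmm_reg ones for SSE2. */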
11369 | 37 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; |
38 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; | |
8430 | 39 |
11369 | 40 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = |
8430 | 41 {0x8000000080000000ULL, 0x8000000080000000ULL}; |
42 | |
11369 | 43 DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; |
12143 | 44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; |
11369 | 45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; |
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; | |
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL}; |
11369 | 48 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; |
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; | |
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL}; |
11369 | 51 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; |
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL}; |
11369 | 53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; |
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; | |
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; | |
12206 | 56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL; |
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL}; |
11369 | 58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; |
59 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; | |
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; | |
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; | |
8430 | 62 |
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL}; |
64 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL}; |
65 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL}; |
66 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL}; |
11369 | 67 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; |
68 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; | |
69 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; | |
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL}; |
11369 | 71 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; |
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL}; |
73 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL}; |
11369 | 74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; |
75 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL}; |
8430 | 76 |
11369 | 77 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; |
78 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; | |
8430 | 79 |
80 #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) | |
81 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) | |
82 | |
83 #define MOVQ_BFE(regd) \ | |
84 __asm__ volatile ( \ | |
85 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ | |
86 "paddb %%" #regd ", %%" #regd " \n\t" ::) | |
87 | |
88 #ifndef PIC | |
89 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) | |
90 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) | |
91 #else | |
92 // for shared libraries it is better to access the constants this way | |
93 // pcmpeqd -> -1 | |
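// pcmpeqd  -> all bits set (-1); psrlw $15 -> each 16-bit word = 0x0001;
// packuswb -> each byte = 0x01, i.e. the value of ff_bone.  For MOVQ_WTWO,
// psllw $1 after the shift leaves each word = 0x0002, the value of ff_wtwo.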
94 #define MOVQ_BONE(regd) \ | |
95 __asm__ volatile ( \ | |
96 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
97 "psrlw $15, %%" #regd " \n\t" \ | |
98 "packuswb %%" #regd ", %%" #regd " \n\t" ::) | |
99 | |
100 #define MOVQ_WTWO(regd) \ | |
101 __asm__ volatile ( \ | |
102 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
103 "psrlw $15, %%" #regd " \n\t" \ | |
104 "psllw $1, %%" #regd " \n\t"::) | |
105 | |
106 #endif | |
107 | |
108 // using regr as temporary and for the output result | |
109 // first argument is unmodified and second is trashed | |
110 // regfe is supposed to contain 0xfefefefefefefefe | |
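// The trick: a + b == 2*(a & b) + (a ^ b), therefore
//   (a + b)     >> 1 == (a & b) + ((a ^ b) >> 1)   (truncating, PAVGB_MMX_NO_RND)
//   (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1)   (rounding,   PAVGB_MMX)
// regfe (0xFE in every byte) clears the low bit of each byte before the 64-bit
// psrlq so that no bit leaks into the neighbouring byte.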
111 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ | |
112 "movq " #rega ", " #regr " \n\t"\ | |
113 "pand " #regb ", " #regr " \n\t"\ | |
114 "pxor " #rega ", " #regb " \n\t"\ | |
115 "pand " #regfe "," #regb " \n\t"\ | |
116 "psrlq $1, " #regb " \n\t"\ | |
117 "paddb " #regb ", " #regr " \n\t" | |
118 | |
119 #define PAVGB_MMX(rega, regb, regr, regfe) \ | |
120 "movq " #rega ", " #regr " \n\t"\ | |
121 "por " #regb ", " #regr " \n\t"\ | |
122 "pxor " #rega ", " #regb " \n\t"\ | |
123 "pand " #regfe "," #regb " \n\t"\ | |
124 "psrlq $1, " #regb " \n\t"\ | |
125 "psubb " #regb ", " #regr " \n\t" | |
126 | |
127 // mm6 is supposed to contain 0xfefefefefefefefe | |
128 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ | |
129 "movq " #rega ", " #regr " \n\t"\ | |
130 "movq " #regc ", " #regp " \n\t"\ | |
131 "pand " #regb ", " #regr " \n\t"\ | |
132 "pand " #regd ", " #regp " \n\t"\ | |
133 "pxor " #rega ", " #regb " \n\t"\ | |
134 "pxor " #regc ", " #regd " \n\t"\ | |
135 "pand %%mm6, " #regb " \n\t"\ | |
136 "pand %%mm6, " #regd " \n\t"\ | |
137 "psrlq $1, " #regb " \n\t"\ | |
138 "psrlq $1, " #regd " \n\t"\ | |
139 "paddb " #regb ", " #regr " \n\t"\ | |
140 "paddb " #regd ", " #regp " \n\t" | |
141 | |
142 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ | |
143 "movq " #rega ", " #regr " \n\t"\ | |
144 "movq " #regc ", " #regp " \n\t"\ | |
145 "por " #regb ", " #regr " \n\t"\ | |
146 "por " #regd ", " #regp " \n\t"\ | |
147 "pxor " #rega ", " #regb " \n\t"\ | |
148 "pxor " #regc ", " #regd " \n\t"\ | |
149 "pand %%mm6, " #regb " \n\t"\ | |
150 "pand %%mm6, " #regd " \n\t"\ | |
151 "psrlq $1, " #regd " \n\t"\ | |
152 "psrlq $1, " #regb " \n\t"\ | |
153 "psubb " #regb ", " #regr " \n\t"\ | |
154 "psubb " #regd ", " #regp " \n\t" | |
155 | |
156 /***********************************/ | |
157 /* MMX no rounding */ | |
158 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx | |
159 #define SET_RND MOVQ_WONE | |
160 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) | |
161 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) | |
162 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e) |
8430 | 163 |
164 #include "dsputil_mmx_rnd_template.c" | |
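/* Including the template with the macros above generates the no-rounding
   put/avg pixel functions; the names are built by DEF, e.g. DEF(put, pixels8_x2)
   expands to put_no_rnd_pixels8_x2_mmx.  Note that OP_AVG is still the rounding
   PAVGB_MMX: averaging with the existing destination always uses
   (dst + pix + 1) >> 1. */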
165 | |
166 #undef DEF | |
167 #undef SET_RND | |
168 #undef PAVGBP | |
169 #undef PAVGB | |
170 /***********************************/ | |
171 /* MMX rounding */ | |
172 | |
173 #define DEF(x, y) x ## _ ## y ##_mmx | |
174 #define SET_RND MOVQ_WTWO | |
175 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) | |
176 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) | |
177 | |
178 #include "dsputil_mmx_rnd_template.c" | |
179 | |
180 #undef DEF | |
181 #undef SET_RND | |
182 #undef PAVGBP | |
183 #undef PAVGB | |
184 #undef OP_AVG |
8430 | 185 |
186 /***********************************/ | |
187 /* 3Dnow specific */ | |
188 | |
189 #define DEF(x) x ## _3dnow | |
190 #define PAVGB "pavgusb" | |
191 #define OP_AVG PAVGB |
8430 | 192 |
193 #include "dsputil_mmx_avg_template.c" | |
194 | |
195 #undef DEF | |
196 #undef PAVGB | |
197 #undef OP_AVG |
8430 | 198 |
199 /***********************************/ | |
200 /* MMX2 specific */ | |
201 | |
202 #define DEF(x) x ## _mmx2 | |
203 | |
204 /* Introduced only in MMX2 set */ | |
205 #define PAVGB "pavgb" | |
206 #define OP_AVG PAVGB |
8430 | 207 |
208 #include "dsputil_mmx_avg_template.c" | |
209 | |
210 #undef DEF | |
211 #undef PAVGB | |
212 #undef OP_AVG |
8430 | 213 |
214 #define put_no_rnd_pixels16_mmx put_pixels16_mmx | |
215 #define put_no_rnd_pixels8_mmx put_pixels8_mmx | |
216 #define put_pixels16_mmx2 put_pixels16_mmx | |
217 #define put_pixels8_mmx2 put_pixels8_mmx | |
218 #define put_pixels4_mmx2 put_pixels4_mmx | |
219 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx | |
220 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx | |
221 #define put_pixels16_3dnow put_pixels16_mmx | |
222 #define put_pixels8_3dnow put_pixels8_mmx | |
223 #define put_pixels4_3dnow put_pixels4_mmx | |
224 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx | |
225 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx | |
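/* Full-pel copies do no averaging, so the no_rnd variants are identical to the
   plain ones and the MMX2/3DNow! names can simply reuse the MMX versions. */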
226 | |
227 /***********************************/ | |
228 /* standard MMX */ | |
229 | |
230 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
8430 | 231 { |
232 const DCTELEM *p; | |
233 uint8_t *pix; | |
234 | |
235 /* read the pixels */ | |
236 p = block; | |
237 pix = pixels; | |
238 /* unrolled loop */ | |
239 __asm__ volatile( | |
240 "movq %3, %%mm0 \n\t" | |
241 "movq 8%3, %%mm1 \n\t" | |
242 "movq 16%3, %%mm2 \n\t" | |
243 "movq 24%3, %%mm3 \n\t" | |
244 "movq 32%3, %%mm4 \n\t" | |
245 "movq 40%3, %%mm5 \n\t" | |
246 "movq 48%3, %%mm6 \n\t" | |
247 "movq 56%3, %%mm7 \n\t" | |
248 "packuswb %%mm1, %%mm0 \n\t" | |
249 "packuswb %%mm3, %%mm2 \n\t" | |
250 "packuswb %%mm5, %%mm4 \n\t" | |
251 "packuswb %%mm7, %%mm6 \n\t" | |
252 "movq %%mm0, (%0) \n\t" | |
253 "movq %%mm2, (%0, %1) \n\t" | |
254 "movq %%mm4, (%0, %1, 2) \n\t" | |
255 "movq %%mm6, (%0, %2) \n\t" | |
256 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) | |
257 :"memory"); | |
258 pix += line_size*4; | |
259 p += 32; | |
260 | |
261 // an exact copy of the block above would make the compiler | |
262 // generate some very strange code here, so the block pointer | |
263 // is passed in a register ("r") instead of as a memory operand | |
264 __asm__ volatile( | |
265 "movq (%3), %%mm0 \n\t" | |
266 "movq 8(%3), %%mm1 \n\t" | |
267 "movq 16(%3), %%mm2 \n\t" | |
268 "movq 24(%3), %%mm3 \n\t" | |
269 "movq 32(%3), %%mm4 \n\t" | |
270 "movq 40(%3), %%mm5 \n\t" | |
271 "movq 48(%3), %%mm6 \n\t" | |
272 "movq 56(%3), %%mm7 \n\t" | |
273 "packuswb %%mm1, %%mm0 \n\t" | |
274 "packuswb %%mm3, %%mm2 \n\t" | |
275 "packuswb %%mm5, %%mm4 \n\t" | |
276 "packuswb %%mm7, %%mm6 \n\t" | |
277 "movq %%mm0, (%0) \n\t" | |
278 "movq %%mm2, (%0, %1) \n\t" | |
279 "movq %%mm4, (%0, %1, 2) \n\t" | |
280 "movq %%mm6, (%0, %2) \n\t" | |
281 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) | |
282 :"memory"); | |
283 } | |
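/* Reference only: packuswb saturates to [0,255], so the function above is
   equivalent to this plain-C sketch (illustrative, not part of the build):

       for (i = 0; i < 8; i++)
           for (j = 0; j < 8; j++)
               pixels[i * line_size + j] = av_clip_uint8(block[i * 8 + j]);
*/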
284 | |
285 DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] = |
8430 | 286 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; |
287 | |
288 #define put_signed_pixels_clamped_mmx_half(off) \ |
289 "movq "#off"(%2), %%mm1 \n\t"\ |
290 "movq 16+"#off"(%2), %%mm2 \n\t"\ |
291 "movq 32+"#off"(%2), %%mm3 \n\t"\ |
292 "movq 48+"#off"(%2), %%mm4 \n\t"\ |
293 "packsswb 8+"#off"(%2), %%mm1 \n\t"\ |
294 "packsswb 24+"#off"(%2), %%mm2 \n\t"\ |
295 "packsswb 40+"#off"(%2), %%mm3 \n\t"\ |
296 "packsswb 56+"#off"(%2), %%mm4 \n\t"\ |
297 "paddb %%mm0, %%mm1 \n\t"\ |
298 "paddb %%mm0, %%mm2 \n\t"\ |
299 "paddb %%mm0, %%mm3 \n\t"\ |
300 "paddb %%mm0, %%mm4 \n\t"\ |
301 "movq %%mm1, (%0) \n\t"\ |
302 "movq %%mm2, (%0, %3) \n\t"\ |
303 "movq %%mm3, (%0, %3, 2) \n\t"\ |
304 "movq %%mm4, (%0, %1) \n\t" |
305 |
306 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
8430 | 307 { |
308 x86_reg line_skip = line_size; |
309 x86_reg line_skip3; |
8430 | 310 |
311 __asm__ volatile ( |
312 "movq "MANGLE(ff_vector128)", %%mm0 \n\t" |
313 "lea (%3, %3, 2), %1 \n\t" |
314 put_signed_pixels_clamped_mmx_half(0) |
315 "lea (%0, %3, 4), %0 \n\t" |
316 put_signed_pixels_clamped_mmx_half(64) |
317 :"+&r" (pixels), "=&r" (line_skip3) |
318 :"r" (block), "r"(line_skip) |
319 :"memory"); |
8430 | 320 } |
321 | |
322 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
8430 | 323 { |
324 const DCTELEM *p; | |
325 uint8_t *pix; | |
326 int i; | |
327 | |
328 /* read the pixels */ | |
329 p = block; | |
330 pix = pixels; | |
331 MOVQ_ZERO(mm7); | |
332 i = 4; | |
333 do { | |
334 __asm__ volatile( | |
335 "movq (%2), %%mm0 \n\t" | |
336 "movq 8(%2), %%mm1 \n\t" | |
337 "movq 16(%2), %%mm2 \n\t" | |
338 "movq 24(%2), %%mm3 \n\t" | |
339 "movq %0, %%mm4 \n\t" | |
340 "movq %1, %%mm6 \n\t" | |
341 "movq %%mm4, %%mm5 \n\t" | |
342 "punpcklbw %%mm7, %%mm4 \n\t" | |
343 "punpckhbw %%mm7, %%mm5 \n\t" | |
344 "paddsw %%mm4, %%mm0 \n\t" | |
345 "paddsw %%mm5, %%mm1 \n\t" | |
346 "movq %%mm6, %%mm5 \n\t" | |
347 "punpcklbw %%mm7, %%mm6 \n\t" | |
348 "punpckhbw %%mm7, %%mm5 \n\t" | |
349 "paddsw %%mm6, %%mm2 \n\t" | |
350 "paddsw %%mm5, %%mm3 \n\t" | |
351 "packuswb %%mm1, %%mm0 \n\t" | |
352 "packuswb %%mm3, %%mm2 \n\t" | |
353 "movq %%mm0, %0 \n\t" | |
354 "movq %%mm2, %1 \n\t" | |
355 :"+m"(*pix), "+m"(*(pix+line_size)) | |
356 :"r"(p) | |
357 :"memory"); | |
358 pix += line_size*2; | |
359 p += 16; | |
360 } while (--i); | |
361 } | |
362 | |
363 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
364 { | |
365 __asm__ volatile( | |
366 "lea (%3, %3), %%"REG_a" \n\t" | |
367 ASMALIGN(3) | |
368 "1: \n\t" | |
369 "movd (%1), %%mm0 \n\t" | |
370 "movd (%1, %3), %%mm1 \n\t" | |
371 "movd %%mm0, (%2) \n\t" | |
372 "movd %%mm1, (%2, %3) \n\t" | |
373 "add %%"REG_a", %1 \n\t" | |
374 "add %%"REG_a", %2 \n\t" | |
375 "movd (%1), %%mm0 \n\t" | |
376 "movd (%1, %3), %%mm1 \n\t" | |
377 "movd %%mm0, (%2) \n\t" | |
378 "movd %%mm1, (%2, %3) \n\t" | |
379 "add %%"REG_a", %1 \n\t" | |
380 "add %%"REG_a", %2 \n\t" | |
381 "subl $4, %0 \n\t" | |
382 "jnz 1b \n\t" | |
383 : "+g"(h), "+r" (pixels), "+r" (block) | |
384 : "r"((x86_reg)line_size) | |
385 : "%"REG_a, "memory" | |
386 ); | |
387 } | |
388 | |
389 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
390 { | |
391 __asm__ volatile( | |
392 "lea (%3, %3), %%"REG_a" \n\t" | |
393 ASMALIGN(3) | |
394 "1: \n\t" | |
395 "movq (%1), %%mm0 \n\t" | |
396 "movq (%1, %3), %%mm1 \n\t" | |
397 "movq %%mm0, (%2) \n\t" | |
398 "movq %%mm1, (%2, %3) \n\t" | |
399 "add %%"REG_a", %1 \n\t" | |
400 "add %%"REG_a", %2 \n\t" | |
401 "movq (%1), %%mm0 \n\t" | |
402 "movq (%1, %3), %%mm1 \n\t" | |
403 "movq %%mm0, (%2) \n\t" | |
404 "movq %%mm1, (%2, %3) \n\t" | |
405 "add %%"REG_a", %1 \n\t" | |
406 "add %%"REG_a", %2 \n\t" | |
407 "subl $4, %0 \n\t" | |
408 "jnz 1b \n\t" | |
409 : "+g"(h), "+r" (pixels), "+r" (block) | |
410 : "r"((x86_reg)line_size) | |
411 : "%"REG_a, "memory" | |
412 ); | |
413 } | |
414 | |
415 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
416 { | |
417 __asm__ volatile( | |
418 "lea (%3, %3), %%"REG_a" \n\t" | |
419 ASMALIGN(3) | |
420 "1: \n\t" | |
421 "movq (%1), %%mm0 \n\t" | |
422 "movq 8(%1), %%mm4 \n\t" | |
423 "movq (%1, %3), %%mm1 \n\t" | |
424 "movq 8(%1, %3), %%mm5 \n\t" | |
425 "movq %%mm0, (%2) \n\t" | |
426 "movq %%mm4, 8(%2) \n\t" | |
427 "movq %%mm1, (%2, %3) \n\t" | |
428 "movq %%mm5, 8(%2, %3) \n\t" | |
429 "add %%"REG_a", %1 \n\t" | |
430 "add %%"REG_a", %2 \n\t" | |
431 "movq (%1), %%mm0 \n\t" | |
432 "movq 8(%1), %%mm4 \n\t" | |
433 "movq (%1, %3), %%mm1 \n\t" | |
434 "movq 8(%1, %3), %%mm5 \n\t" | |
435 "movq %%mm0, (%2) \n\t" | |
436 "movq %%mm4, 8(%2) \n\t" | |
437 "movq %%mm1, (%2, %3) \n\t" | |
438 "movq %%mm5, 8(%2, %3) \n\t" | |
439 "add %%"REG_a", %1 \n\t" | |
440 "add %%"REG_a", %2 \n\t" | |
441 "subl $4, %0 \n\t" | |
442 "jnz 1b \n\t" | |
443 : "+g"(h), "+r" (pixels), "+r" (block) | |
444 : "r"((x86_reg)line_size) | |
445 : "%"REG_a, "memory" | |
446 ); | |
447 } | |
448 | |
449 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
450 { | |
451 __asm__ volatile( | |
452 "1: \n\t" | |
453 "movdqu (%1), %%xmm0 \n\t" | |
454 "movdqu (%1,%3), %%xmm1 \n\t" | |
455 "movdqu (%1,%3,2), %%xmm2 \n\t" | |
456 "movdqu (%1,%4), %%xmm3 \n\t" | |
457 "movdqa %%xmm0, (%2) \n\t" | |
458 "movdqa %%xmm1, (%2,%3) \n\t" | |
459 "movdqa %%xmm2, (%2,%3,2) \n\t" | |
460 "movdqa %%xmm3, (%2,%4) \n\t" | |
461 "subl $4, %0 \n\t" | |
462 "lea (%1,%3,4), %1 \n\t" | |
463 "lea (%2,%3,4), %2 \n\t" | |
464 "jnz 1b \n\t" | |
465 : "+g"(h), "+r" (pixels), "+r" (block) | |
466 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) | |
467 : "memory" | |
468 ); | |
469 } | |
470 | |
471 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
472 { | |
473 __asm__ volatile( | |
474 "1: \n\t" | |
475 "movdqu (%1), %%xmm0 \n\t" | |
476 "movdqu (%1,%3), %%xmm1 \n\t" | |
477 "movdqu (%1,%3,2), %%xmm2 \n\t" | |
478 "movdqu (%1,%4), %%xmm3 \n\t" | |
479 "pavgb (%2), %%xmm0 \n\t" | |
480 "pavgb (%2,%3), %%xmm1 \n\t" | |
481 "pavgb (%2,%3,2), %%xmm2 \n\t" | |
482 "pavgb (%2,%4), %%xmm3 \n\t" | |
483 "movdqa %%xmm0, (%2) \n\t" | |
484 "movdqa %%xmm1, (%2,%3) \n\t" | |
485 "movdqa %%xmm2, (%2,%3,2) \n\t" | |
486 "movdqa %%xmm3, (%2,%4) \n\t" | |
487 "subl $4, %0 \n\t" | |
488 "lea (%1,%3,4), %1 \n\t" | |
489 "lea (%2,%3,4), %2 \n\t" | |
490 "jnz 1b \n\t" | |
491 : "+g"(h), "+r" (pixels), "+r" (block) | |
492 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) | |
493 : "memory" | |
494 ); | |
495 } | |
496 | |
497 #define CLEAR_BLOCKS(name,n) \ | |
498 static void name(DCTELEM *blocks)\ | |
499 {\ | |
500 __asm__ volatile(\ | |
501 "pxor %%mm7, %%mm7 \n\t"\ | |
502 "mov %1, %%"REG_a" \n\t"\ | |
503 "1: \n\t"\ | |
504 "movq %%mm7, (%0, %%"REG_a") \n\t"\ | |
505 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\ | |
506 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\ | |
507 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\ | |
508 "add $32, %%"REG_a" \n\t"\ | |
509 " js 1b \n\t"\ | |
510 : : "r" (((uint8_t *)blocks)+128*n),\ | |
511 "i" (-128*n)\ | |
512 : "%"REG_a\ | |
513 );\ | |
514 } | |
515 CLEAR_BLOCKS(clear_blocks_mmx, 6) | |
516 CLEAR_BLOCKS(clear_block_mmx, 1) | |
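/* n is the number of 64-coefficient DCTELEM blocks to clear (128 bytes each);
   the loop writes 32 bytes per iteration, counting a negative offset up to 0. */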
517 | |
518 static void clear_block_sse(DCTELEM *block) | |
519 { | |
520 __asm__ volatile( | |
521 "xorps %%xmm0, %%xmm0 \n" | |
522 "movaps %%xmm0, (%0) \n" | |
523 "movaps %%xmm0, 16(%0) \n" | |
524 "movaps %%xmm0, 32(%0) \n" | |
525 "movaps %%xmm0, 48(%0) \n" | |
526 "movaps %%xmm0, 64(%0) \n" | |
527 "movaps %%xmm0, 80(%0) \n" | |
528 "movaps %%xmm0, 96(%0) \n" | |
529 "movaps %%xmm0, 112(%0) \n" | |
530 :: "r"(block) | |
531 : "memory" | |
532 ); | |
533 } | |
534 | |
9861 | 535 static void clear_blocks_sse(DCTELEM *blocks) |
536 {
537 __asm__ volatile( | |
538 "xorps %%xmm0, %%xmm0 \n" | |
539 "mov %1, %%"REG_a" \n" | |
540 "1: \n" | |
541 "movaps %%xmm0, (%0, %%"REG_a") \n" | |
542 "movaps %%xmm0, 16(%0, %%"REG_a") \n" | |
543 "movaps %%xmm0, 32(%0, %%"REG_a") \n" | |
544 "movaps %%xmm0, 48(%0, %%"REG_a") \n" | |
545 "movaps %%xmm0, 64(%0, %%"REG_a") \n" | |
546 "movaps %%xmm0, 80(%0, %%"REG_a") \n" | |
547 "movaps %%xmm0, 96(%0, %%"REG_a") \n" | |
548 "movaps %%xmm0, 112(%0, %%"REG_a") \n" | |
549 "add $128, %%"REG_a" \n" | |
550 " js 1b \n" | |
551 : : "r" (((uint8_t *)blocks)+128*6), | |
552 "i" (-128*6) | |
553 : "%"REG_a | |
554 ); | |
555 } | |
556 | |
8430 | 557 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ |
558 x86_reg i=0; | |
559 __asm__ volatile( | |
560 "jmp 2f \n\t" | |
561 "1: \n\t" | |
562 "movq (%1, %0), %%mm0 \n\t" | |
563 "movq (%2, %0), %%mm1 \n\t" | |
564 "paddb %%mm0, %%mm1 \n\t" | |
565 "movq %%mm1, (%2, %0) \n\t" | |
566 "movq 8(%1, %0), %%mm0 \n\t" | |
567 "movq 8(%2, %0), %%mm1 \n\t" | |
568 "paddb %%mm0, %%mm1 \n\t" | |
569 "movq %%mm1, 8(%2, %0) \n\t" | |
570 "add $16, %0 \n\t" | |
571 "2: \n\t" | |
572 "cmp %3, %0 \n\t" | |
573 " js 1b \n\t" | |
574 : "+r" (i) | |
575 : "r"(src), "r"(dst), "r"((x86_reg)w-15) | |
576 ); | |
577 for(; i<w; i++) | |
578 dst[i+0] += src[i+0]; | |
579 } | |
580 | |
581 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ | |
582 x86_reg i=0; | |
583 __asm__ volatile( | |
584 "jmp 2f \n\t" | |
585 "1: \n\t" | |
586 "movq (%2, %0), %%mm0 \n\t" | |
587 "movq 8(%2, %0), %%mm1 \n\t" | |
588 "paddb (%3, %0), %%mm0 \n\t" | |
589 "paddb 8(%3, %0), %%mm1 \n\t" | |
590 "movq %%mm0, (%1, %0) \n\t" | |
591 "movq %%mm1, 8(%1, %0) \n\t" | |
592 "add $16, %0 \n\t" | |
593 "2: \n\t" | |
594 "cmp %4, %0 \n\t" | |
595 " js 1b \n\t" | |
596 : "+r" (i) | |
597 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) | |
598 ); | |
599 for(; i<w; i++) | |
600 dst[i] = src1[i] + src2[i]; | |
601 } | |
602 | |
603 #if HAVE_7REGS && HAVE_TEN_OPERANDS |
10431 | 604 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) { |
8760 | 605 x86_reg w2 = -w; |
606 x86_reg x; | |
607 int l = *left & 0xff; | |
608 int tl = *left_top & 0xff; | |
609 int t; | |
610 __asm__ volatile( | |
611 "mov %7, %3 \n" | |
612 "1: \n" | |
613 "movzx (%3,%4), %2 \n" | |
614 "mov %2, %k3 \n" | |
615 "sub %b1, %b3 \n" | |
616 "add %b0, %b3 \n" | |
617 "mov %2, %1 \n" | |
618 "cmp %0, %2 \n" | |
619 "cmovg %0, %2 \n" | |
620 "cmovg %1, %0 \n" | |
621 "cmp %k3, %0 \n" | |
622 "cmovg %k3, %0 \n" | |
623 "mov %7, %3 \n" | |
624 "cmp %2, %0 \n" | |
625 "cmovl %2, %0 \n" | |
626 "add (%6,%4), %b0 \n" | |
627 "mov %b0, (%5,%4) \n" | |
628 "inc %4 \n" | |
629 "jl 1b \n" | |
630 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) | |
631 :"r"(dst+w), "r"(diff+w), "rm"(top+w) | |
632 ); | |
633 *left = l; | |
634 *left_top = tl; | |
635 } | |
636 #endif | |
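/* Reference only: the cmov loop above is the HuffYUV median predictor.  A
   plain-C sketch of the same computation (l and tl are uint8_t so arithmetic
   wraps; mid_pred() returns the median of three values):

       l = *left; tl = *left_top;
       for (i = 0; i < w; i++) {
           l      = mid_pred(l, top[i], (l + top[i] - tl) & 0xFF) + diff[i];
           tl     = top[i];
           dst[i] = l;
       }
       *left = l; *left_top = tl;
*/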
637 | |
8430 | 638 #define H263_LOOP_FILTER \ |
639 "pxor %%mm7, %%mm7 \n\t"\ | |
640 "movq %0, %%mm0 \n\t"\ | |
641 "movq %0, %%mm1 \n\t"\ | |
642 "movq %3, %%mm2 \n\t"\ | |
643 "movq %3, %%mm3 \n\t"\ | |
644 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
645 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
646 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
647 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
648 "psubw %%mm2, %%mm0 \n\t"\ | |
649 "psubw %%mm3, %%mm1 \n\t"\ | |
650 "movq %1, %%mm2 \n\t"\ | |
651 "movq %1, %%mm3 \n\t"\ | |
652 "movq %2, %%mm4 \n\t"\ | |
653 "movq %2, %%mm5 \n\t"\ | |
654 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
655 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
656 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
657 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
658 "psubw %%mm2, %%mm4 \n\t"\ | |
659 "psubw %%mm3, %%mm5 \n\t"\ | |
660 "psllw $2, %%mm4 \n\t"\ | |
661 "psllw $2, %%mm5 \n\t"\ | |
662 "paddw %%mm0, %%mm4 \n\t"\ | |
663 "paddw %%mm1, %%mm5 \n\t"\ | |
664 "pxor %%mm6, %%mm6 \n\t"\ | |
665 "pcmpgtw %%mm4, %%mm6 \n\t"\ | |
666 "pcmpgtw %%mm5, %%mm7 \n\t"\ | |
667 "pxor %%mm6, %%mm4 \n\t"\ | |
668 "pxor %%mm7, %%mm5 \n\t"\ | |
669 "psubw %%mm6, %%mm4 \n\t"\ | |
670 "psubw %%mm7, %%mm5 \n\t"\ | |
671 "psrlw $3, %%mm4 \n\t"\ | |
672 "psrlw $3, %%mm5 \n\t"\ | |
673 "packuswb %%mm5, %%mm4 \n\t"\ | |
674 "packsswb %%mm7, %%mm6 \n\t"\ | |
675 "pxor %%mm7, %%mm7 \n\t"\ | |
676 "movd %4, %%mm2 \n\t"\ | |
677 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
678 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
679 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
680 "psubusb %%mm4, %%mm2 \n\t"\ | |
681 "movq %%mm2, %%mm3 \n\t"\ | |
682 "psubusb %%mm4, %%mm3 \n\t"\ | |
683 "psubb %%mm3, %%mm2 \n\t"\ | |
684 "movq %1, %%mm3 \n\t"\ | |
685 "movq %2, %%mm4 \n\t"\ | |
686 "pxor %%mm6, %%mm3 \n\t"\ | |
687 "pxor %%mm6, %%mm4 \n\t"\ | |
688 "paddusb %%mm2, %%mm3 \n\t"\ | |
689 "psubusb %%mm2, %%mm4 \n\t"\ | |
690 "pxor %%mm6, %%mm3 \n\t"\ | |
691 "pxor %%mm6, %%mm4 \n\t"\ | |
692 "paddusb %%mm2, %%mm2 \n\t"\ | |
693 "packsswb %%mm1, %%mm0 \n\t"\ | |
694 "pcmpgtb %%mm0, %%mm7 \n\t"\ | |
695 "pxor %%mm7, %%mm0 \n\t"\ | |
696 "psubb %%mm7, %%mm0 \n\t"\ | |
697 "movq %%mm0, %%mm1 \n\t"\ | |
698 "psubusb %%mm2, %%mm0 \n\t"\ | |
699 "psubb %%mm0, %%mm1 \n\t"\ | |
700 "pand %5, %%mm1 \n\t"\ | |
701 "psrlw $2, %%mm1 \n\t"\ | |
702 "pxor %%mm7, %%mm1 \n\t"\ | |
703 "psubb %%mm7, %%mm1 \n\t"\ | |
704 "movq %0, %%mm5 \n\t"\ | |
705 "movq %3, %%mm6 \n\t"\ | |
706 "psubb %%mm1, %%mm5 \n\t"\ | |
707 "paddb %%mm1, %%mm6 \n\t" | |
708 | |
709 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ | |
710 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
8430 | 711 const int strength= ff_h263_loop_filter_strength[qscale]; |
712 | |
713 __asm__ volatile( | |
714 | |
715 H263_LOOP_FILTER | |
716 | |
717 "movq %%mm3, %1 \n\t" | |
718 "movq %%mm4, %2 \n\t" | |
719 "movq %%mm5, %0 \n\t" | |
720 "movq %%mm6, %3 \n\t" | |
721 : "+m" (*(uint64_t*)(src - 2*stride)), | |
722 "+m" (*(uint64_t*)(src - 1*stride)), | |
723 "+m" (*(uint64_t*)(src + 0*stride)), | |
724 "+m" (*(uint64_t*)(src + 1*stride)) | |
725 : "g" (2*strength), "m"(ff_pb_FC) | |
726 ); | |
727 } | |
728 } | |
729 | |
730 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ | |
731 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
8430 | 732 const int strength= ff_h263_loop_filter_strength[qscale]; |
733 DECLARE_ALIGNED(8, uint64_t, temp)[4]; |
8430 | 734 uint8_t *btemp= (uint8_t*)temp; |
735 | |
736 src -= 2; | |
737 | |
738 transpose4x4(btemp , src , 8, stride); | |
739 transpose4x4(btemp+4, src + 4*stride, 8, stride); | |
740 __asm__ volatile( | |
741 H263_LOOP_FILTER // 5 3 4 6 | |
742 | |
743 : "+m" (temp[0]), | |
744 "+m" (temp[1]), | |
745 "+m" (temp[2]), | |
746 "+m" (temp[3]) | |
747 : "g" (2*strength), "m"(ff_pb_FC) | |
748 ); | |
749 | |
750 __asm__ volatile( | |
751 "movq %%mm5, %%mm1 \n\t" | |
752 "movq %%mm4, %%mm0 \n\t" | |
753 "punpcklbw %%mm3, %%mm5 \n\t" | |
754 "punpcklbw %%mm6, %%mm4 \n\t" | |
755 "punpckhbw %%mm3, %%mm1 \n\t" | |
756 "punpckhbw %%mm6, %%mm0 \n\t" | |
757 "movq %%mm5, %%mm3 \n\t" | |
758 "movq %%mm1, %%mm6 \n\t" | |
759 "punpcklwd %%mm4, %%mm5 \n\t" | |
760 "punpcklwd %%mm0, %%mm1 \n\t" | |
761 "punpckhwd %%mm4, %%mm3 \n\t" | |
762 "punpckhwd %%mm0, %%mm6 \n\t" | |
763 "movd %%mm5, (%0) \n\t" | |
764 "punpckhdq %%mm5, %%mm5 \n\t" | |
765 "movd %%mm5, (%0,%2) \n\t" | |
766 "movd %%mm3, (%0,%2,2) \n\t" | |
767 "punpckhdq %%mm3, %%mm3 \n\t" | |
768 "movd %%mm3, (%0,%3) \n\t" | |
769 "movd %%mm1, (%1) \n\t" | |
770 "punpckhdq %%mm1, %%mm1 \n\t" | |
771 "movd %%mm1, (%1,%2) \n\t" | |
772 "movd %%mm6, (%1,%2,2) \n\t" | |
773 "punpckhdq %%mm6, %%mm6 \n\t" | |
774 "movd %%mm6, (%1,%3) \n\t" | |
775 :: "r" (src), | |
776 "r" (src + 4*stride), | |
777 "r" ((x86_reg) stride ), | |
778 "r" ((x86_reg)(3*stride)) | |
779 ); | |
780 } | |
781 } | |
782 | |
783 /* draw the edges of width 'w' of an image of size width x height; | |
784 this MMX version can only handle w==8 || w==16 */ | |
785 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) | |
786 { | |
787 uint8_t *ptr, *last_line; | |
788 int i; | |
789 | |
790 last_line = buf + (height - 1) * wrap; | |
791 /* left and right */ | |
792 ptr = buf; | |
793 if(w==8) | |
794 { | |
795 __asm__ volatile( | |
796 "1: \n\t" | |
797 "movd (%0), %%mm0 \n\t" | |
798 "punpcklbw %%mm0, %%mm0 \n\t" | |
799 "punpcklwd %%mm0, %%mm0 \n\t" | |
800 "punpckldq %%mm0, %%mm0 \n\t" | |
801 "movq %%mm0, -8(%0) \n\t" | |
802 "movq -8(%0, %2), %%mm1 \n\t" | |
803 "punpckhbw %%mm1, %%mm1 \n\t" | |
804 "punpckhwd %%mm1, %%mm1 \n\t" | |
805 "punpckhdq %%mm1, %%mm1 \n\t" | |
806 "movq %%mm1, (%0, %2) \n\t" | |
807 "add %1, %0 \n\t" | |
808 "cmp %3, %0 \n\t" | |
809 " jb 1b \n\t" | |
810 : "+r" (ptr) | |
811 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) | |
812 ); | |
813 } | |
814 else | |
815 { | |
816 __asm__ volatile( | |
817 "1: \n\t" | |
818 "movd (%0), %%mm0 \n\t" | |
819 "punpcklbw %%mm0, %%mm0 \n\t" | |
820 "punpcklwd %%mm0, %%mm0 \n\t" | |
821 "punpckldq %%mm0, %%mm0 \n\t" | |
822 "movq %%mm0, -8(%0) \n\t" | |
823 "movq %%mm0, -16(%0) \n\t" | |
824 "movq -8(%0, %2), %%mm1 \n\t" | |
825 "punpckhbw %%mm1, %%mm1 \n\t" | |
826 "punpckhwd %%mm1, %%mm1 \n\t" | |
827 "punpckhdq %%mm1, %%mm1 \n\t" | |
828 "movq %%mm1, (%0, %2) \n\t" | |
829 "movq %%mm1, 8(%0, %2) \n\t" | |
830 "add %1, %0 \n\t" | |
831 "cmp %3, %0 \n\t" | |
832 " jb 1b \n\t" | |
833 : "+r" (ptr) | |
834 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) | |
835 ); | |
836 } | |
837 | |
838 for(i=0;i<w;i+=4) { | |
839 /* top and bottom (and hopefully also the corners) */ | |
840 ptr= buf - (i + 1) * wrap - w; | |
841 __asm__ volatile( | |
842 "1: \n\t" | |
843 "movq (%1, %0), %%mm0 \n\t" | |
844 "movq %%mm0, (%0) \n\t" | |
845 "movq %%mm0, (%0, %2) \n\t" | |
846 "movq %%mm0, (%0, %2, 2) \n\t" | |
847 "movq %%mm0, (%0, %3) \n\t" | |
848 "add $8, %0 \n\t" | |
849 "cmp %4, %0 \n\t" | |
850 " jb 1b \n\t" | |
851 : "+r" (ptr) | |
852 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) | |
853 ); | |
854 ptr= last_line + (i + 1) * wrap - w; | |
855 __asm__ volatile( | |
856 "1: \n\t" | |
857 "movq (%1, %0), %%mm0 \n\t" | |
858 "movq %%mm0, (%0) \n\t" | |
859 "movq %%mm0, (%0, %2) \n\t" | |
860 "movq %%mm0, (%0, %2, 2) \n\t" | |
861 "movq %%mm0, (%0, %3) \n\t" | |
862 "add $8, %0 \n\t" | |
863 "cmp %4, %0 \n\t" | |
864 " jb 1b \n\t" | |
865 : "+r" (ptr) | |
866 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) | |
867 ); | |
868 } | |
869 } | |
870 | |
871 #define PAETH(cpu, abs3)\ | |
872 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ | |
873 {\ | |
874 x86_reg i = -bpp;\ | |
875 x86_reg end = w-3;\ | |
876 __asm__ volatile(\ | |
877 "pxor %%mm7, %%mm7 \n"\ | |
878 "movd (%1,%0), %%mm0 \n"\ | |
879 "movd (%2,%0), %%mm1 \n"\ | |
880 "punpcklbw %%mm7, %%mm0 \n"\ | |
881 "punpcklbw %%mm7, %%mm1 \n"\ | |
882 "add %4, %0 \n"\ | |
883 "1: \n"\ | |
884 "movq %%mm1, %%mm2 \n"\ | |
885 "movd (%2,%0), %%mm1 \n"\ | |
886 "movq %%mm2, %%mm3 \n"\ | |
887 "punpcklbw %%mm7, %%mm1 \n"\ | |
888 "movq %%mm2, %%mm4 \n"\ | |
889 "psubw %%mm1, %%mm3 \n"\ | |
890 "psubw %%mm0, %%mm4 \n"\ | |
891 "movq %%mm3, %%mm5 \n"\ | |
892 "paddw %%mm4, %%mm5 \n"\ | |
893 abs3\ | |
894 "movq %%mm4, %%mm6 \n"\ | |
895 "pminsw %%mm5, %%mm6 \n"\ | |
896 "pcmpgtw %%mm6, %%mm3 \n"\ | |
897 "pcmpgtw %%mm5, %%mm4 \n"\ | |
898 "movq %%mm4, %%mm6 \n"\ | |
899 "pand %%mm3, %%mm4 \n"\ | |
900 "pandn %%mm3, %%mm6 \n"\ | |
901 "pandn %%mm0, %%mm3 \n"\ | |
902 "movd (%3,%0), %%mm0 \n"\ | |
903 "pand %%mm1, %%mm6 \n"\ | |
904 "pand %%mm4, %%mm2 \n"\ | |
905 "punpcklbw %%mm7, %%mm0 \n"\ | |
906 "movq %6, %%mm5 \n"\ | |
907 "paddw %%mm6, %%mm0 \n"\ | |
908 "paddw %%mm2, %%mm3 \n"\ | |
909 "paddw %%mm3, %%mm0 \n"\ | |
910 "pand %%mm5, %%mm0 \n"\ | |
911 "movq %%mm0, %%mm3 \n"\ | |
912 "packuswb %%mm3, %%mm3 \n"\ | |
913 "movd %%mm3, (%1,%0) \n"\ | |
914 "add %4, %0 \n"\ | |
915 "cmp %5, %0 \n"\ | |
916 "jle 1b \n"\ | |
917 :"+r"(i)\ | |
918 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ | |
919 "m"(ff_pw_255)\ | |
920 :"memory"\ | |
921 );\ | |
922 } | |
923 | |
924 #define ABS3_MMX2\ | |
925 "psubw %%mm5, %%mm7 \n"\ | |
926 "pmaxsw %%mm7, %%mm5 \n"\ | |
927 "pxor %%mm6, %%mm6 \n"\ | |
928 "pxor %%mm7, %%mm7 \n"\ | |
929 "psubw %%mm3, %%mm6 \n"\ | |
930 "psubw %%mm4, %%mm7 \n"\ | |
931 "pmaxsw %%mm6, %%mm3 \n"\ | |
932 "pmaxsw %%mm7, %%mm4 \n"\ | |
933 "pxor %%mm7, %%mm7 \n" | |
934 | |
935 #define ABS3_SSSE3\ | |
936 "pabsw %%mm3, %%mm3 \n"\ | |
937 "pabsw %%mm4, %%mm4 \n"\ | |
938 "pabsw %%mm5, %%mm5 \n" | |
939 | |
940 PAETH(mmx2, ABS3_MMX2) | |
8590 | 941 #if HAVE_SSSE3 |
8430 | 942 PAETH(ssse3, ABS3_SSSE3) |
943 #endif | |
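/* Reference only: add_png_paeth_prediction_* implement the PNG Paeth filter.
   Per byte, with a = left, b = above, c = upper-left, the scalar equivalent is:

       p  = b - c;                 //  (a + b - c) - a
       pc = a - c;                 //  (a + b - c) - b
       pa = abs(p); pb = abs(pc); pc = abs(p + pc);
       pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
       dst[i] = pred + src[i];
*/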
944 | |
945 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ | |
946 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ | |
947 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ | |
948 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ | |
949 "movq "#in7", " #m3 " \n\t" /* d */\ | |
950 "movq "#in0", %%mm5 \n\t" /* D */\ | |
951 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ | |
952 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ | |
953 "movq "#in1", %%mm5 \n\t" /* C */\ | |
954 "movq "#in2", %%mm6 \n\t" /* B */\ | |
955 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ | |
956 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ | |
957 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ | |
958 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ | |
959 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ | |
960 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ | |
961 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ | |
962 "psraw $5, %%mm5 \n\t"\ | |
963 "packuswb %%mm5, %%mm5 \n\t"\ | |
964 OP(%%mm5, out, %%mm7, d) | |
965 | |
966 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ | |
967 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
968 uint64_t temp;\ | |
969 \ | |
970 __asm__ volatile(\ | |
971 "pxor %%mm7, %%mm7 \n\t"\ | |
972 "1: \n\t"\ | |
973 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
974 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
975 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
976 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
977 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
978 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
979 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
980 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
981 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
982 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
983 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
984 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
985 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
986 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
987 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
988 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
989 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
990 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
991 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
992 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
993 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ | |
994 "paddw %%mm4, %%mm0 \n\t" /* a */\ | |
995 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
996 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ | |
997 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ | |
998 "paddw %6, %%mm6 \n\t"\ | |
999 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
1000 "psraw $5, %%mm0 \n\t"\ | |
1001 "movq %%mm0, %5 \n\t"\ | |
1002 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ | |
1003 \ | |
1004 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ | |
1005 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ | |
1006 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ | |
1007 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ | |
1008 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ | |
1009 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ | |
1010 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ | |
1011 "paddw %%mm0, %%mm2 \n\t" /* b */\ | |
1012 "paddw %%mm5, %%mm3 \n\t" /* c */\ | |
1013 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1014 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
1015 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ | |
1016 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ | |
1017 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ | |
1018 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ | |
1019 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ | |
1020 "paddw %%mm2, %%mm1 \n\t" /* a */\ | |
1021 "paddw %%mm6, %%mm4 \n\t" /* d */\ | |
1022 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ | |
1023 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ | |
1024 "paddw %6, %%mm1 \n\t"\ | |
1025 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ | |
1026 "psraw $5, %%mm3 \n\t"\ | |
1027 "movq %5, %%mm1 \n\t"\ | |
1028 "packuswb %%mm3, %%mm1 \n\t"\ | |
1029 OP_MMX2(%%mm1, (%1),%%mm4, q)\ | |
1030 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ | |
1031 \ | |
1032 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ | |
1033 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ | |
1034 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ | |
1035 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ | |
1036 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ | |
1037 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ | |
1038 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ | |
1039 "paddw %%mm1, %%mm5 \n\t" /* b */\ | |
1040 "paddw %%mm4, %%mm0 \n\t" /* c */\ | |
1041 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1042 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ | |
1043 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ | |
1044 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ | |
1045 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ | |
1046 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ | |
1047 "paddw %%mm3, %%mm2 \n\t" /* d */\ | |
1048 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ | |
1049 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ | |
1050 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ | |
1051 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ | |
1052 "paddw %%mm2, %%mm6 \n\t" /* a */\ | |
1053 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ | |
1054 "paddw %6, %%mm0 \n\t"\ | |
1055 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
1056 "psraw $5, %%mm0 \n\t"\ | |
1057 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ | |
1058 \ | |
1059 "paddw %%mm5, %%mm3 \n\t" /* a */\ | |
1060 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ | |
1061 "paddw %%mm4, %%mm6 \n\t" /* b */\ | |
1062 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ | |
1063 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ | |
1064 "paddw %%mm1, %%mm4 \n\t" /* c */\ | |
1065 "paddw %%mm2, %%mm5 \n\t" /* d */\ | |
1066 "paddw %%mm6, %%mm6 \n\t" /* 2b */\ | |
1067 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ | |
1068 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ | |
1069 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ | |
1070 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ | |
1071 "paddw %6, %%mm4 \n\t"\ | |
1072 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ | |
1073 "psraw $5, %%mm4 \n\t"\ | |
1074 "packuswb %%mm4, %%mm0 \n\t"\ | |
1075 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ | |
1076 \ | |
1077 "add %3, %0 \n\t"\ | |
1078 "add %4, %1 \n\t"\ | |
1079 "decl %2 \n\t"\ | |
1080 " jnz 1b \n\t"\ | |
1081 : "+a"(src), "+c"(dst), "+D"(h)\ | |
1082 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ | |
1083 : "memory"\ | |
1084 );\ | |
1085 }\ | |
1086 \ | |
1087 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1088 int i;\ | |
1089 int16_t temp[16];\ | |
1090 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1091 for(i=0; i<h; i++)\ | |
1092 {\ | |
1093 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1094 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1095 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1096 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1097 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1098 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ | |
1099 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ | |
1100 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ | |
1101 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ | |
1102 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ | |
1103 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ | |
1104 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ | |
1105 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ | |
1106 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ | |
1107 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ | |
1108 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ | |
1109 __asm__ volatile(\ | |
1110 "movq (%0), %%mm0 \n\t"\ | |
1111 "movq 8(%0), %%mm1 \n\t"\ | |
1112 "paddw %2, %%mm0 \n\t"\ | |
1113 "paddw %2, %%mm1 \n\t"\ | |
1114 "psraw $5, %%mm0 \n\t"\ | |
1115 "psraw $5, %%mm1 \n\t"\ | |
1116 "packuswb %%mm1, %%mm0 \n\t"\ | |
1117 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ | |
1118 "movq 16(%0), %%mm0 \n\t"\ | |
1119 "movq 24(%0), %%mm1 \n\t"\ | |
1120 "paddw %2, %%mm0 \n\t"\ | |
1121 "paddw %2, %%mm1 \n\t"\ | |
1122 "psraw $5, %%mm0 \n\t"\ | |
1123 "psraw $5, %%mm1 \n\t"\ | |
1124 "packuswb %%mm1, %%mm0 \n\t"\ | |
1125 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ | |
1126 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
1127 : "memory"\ | |
1128 );\ | |
1129 dst+=dstStride;\ | |
1130 src+=srcStride;\ | |
1131 }\ | |
1132 }\ | |
1133 \ | |
1134 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1135 __asm__ volatile(\ | |
1136 "pxor %%mm7, %%mm7 \n\t"\ | |
1137 "1: \n\t"\ | |
1138 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ | |
1139 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ | |
1140 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ | |
1141 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ | |
1142 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ | |
1143 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ | |
1144 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ | |
1145 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ | |
1146 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ | |
1147 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ | |
1148 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ | |
1149 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ | |
1150 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ | |
1151 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ | |
1152 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ | |
1153 "paddw %%mm3, %%mm5 \n\t" /* b */\ | |
1154 "paddw %%mm2, %%mm6 \n\t" /* c */\ | |
1155 "paddw %%mm5, %%mm5 \n\t" /* 2b */\ | |
1156 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ | |
1157 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ | |
1158 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ | |
1159 "paddw %%mm4, %%mm0 \n\t" /* a */\ | |
1160 "paddw %%mm1, %%mm5 \n\t" /* d */\ | |
1161 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ | |
1162 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ | |
1163 "paddw %5, %%mm6 \n\t"\ | |
1164 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ | |
1165 "psraw $5, %%mm0 \n\t"\ | |
1166 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ | |
1167 \ | |
1168 "movd 5(%0), %%mm5 \n\t" /* FGHI */\ | |
1169 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ | |
1170 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ | |
1171 "paddw %%mm5, %%mm1 \n\t" /* a */\ | |
1172 "paddw %%mm6, %%mm2 \n\t" /* b */\ | |
1173 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ | |
1174 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ | |
1175 "paddw %%mm6, %%mm3 \n\t" /* c */\ | |
1176 "paddw %%mm5, %%mm4 \n\t" /* d */\ | |
1177 "paddw %%mm2, %%mm2 \n\t" /* 2b */\ | |
1178 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ | |
1179 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ | |
1180 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ | |
1181 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ | |
1182 "paddw %5, %%mm1 \n\t"\ | |
1183 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ | |
1184 "psraw $5, %%mm3 \n\t"\ | |
1185 "packuswb %%mm3, %%mm0 \n\t"\ | |
1186 OP_MMX2(%%mm0, (%1), %%mm4, q)\ | |
1187 \ | |
1188 "add %3, %0 \n\t"\ | |
1189 "add %4, %1 \n\t"\ | |
1190 "decl %2 \n\t"\ | |
1191 " jnz 1b \n\t"\ | |
1192 : "+a"(src), "+c"(dst), "+d"(h)\ | |
1193 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\ | |
1194 : "memory"\ | |
1195 );\ | |
1196 }\ | |
1197 \ | |
1198 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
1199 int i;\ | |
1200 int16_t temp[8];\ | |
1201 /* quick HACK, XXX FIXME MUST be optimized */\ | |
1202 for(i=0; i<h; i++)\ | |
1203 {\ | |
1204 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ | |
1205 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ | |
1206 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ | |
1207 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ | |
1208 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ | |
1209 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ | |
1210 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ | |
1211 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ | |
1212 __asm__ volatile(\ | |
1213 "movq (%0), %%mm0 \n\t"\ | |
1214 "movq 8(%0), %%mm1 \n\t"\ | |
1215 "paddw %2, %%mm0 \n\t"\ | |
1216 "paddw %2, %%mm1 \n\t"\ | |
1217 "psraw $5, %%mm0 \n\t"\ | |
1218 "psraw $5, %%mm1 \n\t"\ | |
1219 "packuswb %%mm1, %%mm0 \n\t"\ | |
1220 OP_3DNOW(%%mm0, (%1), %%mm1, q)\ | |
1221 :: "r"(temp), "r"(dst), "m"(ROUNDER)\ | |
1222 :"memory"\ | |
1223 );\ | |
1224 dst+=dstStride;\ | |
1225 src+=srcStride;\ | |
1226 }\ | |
1227 } | |
1228 | |
1229 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
1230 \ | |
1231 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1232 uint64_t temp[17*4];\ | |
1233 uint64_t *temp_ptr= temp;\ | |
1234 int count= 17;\ | |
1235 \ | |
1236 /*FIXME unroll */\ | |
1237 __asm__ volatile(\ | |
1238 "pxor %%mm7, %%mm7 \n\t"\ | |
1239 "1: \n\t"\ | |
1240 "movq (%0), %%mm0 \n\t"\ | |
1241 "movq (%0), %%mm1 \n\t"\ | |
1242 "movq 8(%0), %%mm2 \n\t"\ | |
1243 "movq 8(%0), %%mm3 \n\t"\ | |
1244 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1245 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1246 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1247 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1248 "movq %%mm0, (%1) \n\t"\ | |
1249 "movq %%mm1, 17*8(%1) \n\t"\ | |
1250 "movq %%mm2, 2*17*8(%1) \n\t"\ | |
1251 "movq %%mm3, 3*17*8(%1) \n\t"\ | |
1252 "add $8, %1 \n\t"\ | |
1253 "add %3, %0 \n\t"\ | |
1254 "decl %2 \n\t"\ | |
1255 " jnz 1b \n\t"\ | |
1256 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
1257 : "r" ((x86_reg)srcStride)\ | |
1258 : "memory"\ | |
1259 );\ | |
1260 \ | |
1261 temp_ptr= temp;\ | |
1262 count=4;\ | |
1263 \ | |
1264 /*FIXME reorder for speed */\ | |
1265 __asm__ volatile(\ | |
1266 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
1267 "1: \n\t"\ | |
1268 "movq (%0), %%mm0 \n\t"\ | |
1269 "movq 8(%0), %%mm1 \n\t"\ | |
1270 "movq 16(%0), %%mm2 \n\t"\ | |
1271 "movq 24(%0), %%mm3 \n\t"\ | |
1272 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ | |
1273 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
1274 "add %4, %1 \n\t"\ | |
1275 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ | |
1276 \ | |
1277 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ | |
1278 "add %4, %1 \n\t"\ | |
1279 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ | |
1280 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ | |
1281 "add %4, %1 \n\t"\ | |
1282 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ | |
1283 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ | |
1284 "add %4, %1 \n\t"\ | |
1285 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ | |
1286 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ | |
1287 "add %4, %1 \n\t"\ | |
1288 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ | |
1289 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ | |
1290 "add %4, %1 \n\t"\ | |
1291 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ | |
1292 \ | |
1293 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ | |
1294 "add %4, %1 \n\t" \ | |
1295 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ | |
1296 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ | |
1297 \ | |
1298 "add $136, %0 \n\t"\ | |
1299 "add %6, %1 \n\t"\ | |
1300 "decl %2 \n\t"\ | |
1301 " jnz 1b \n\t"\ | |
1302 \ | |
1303 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ | |
1304 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\ | |
1305 :"memory"\ | |
1306 );\ | |
1307 }\ | |
1308 \ | |
1309 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
1310 uint64_t temp[9*2];\ | |
1311 uint64_t *temp_ptr= temp;\ | |
1312 int count= 9;\ | |
1313 \ | |
1314 /*FIXME unroll */\ | |
1315 __asm__ volatile(\ | |
1316 "pxor %%mm7, %%mm7 \n\t"\ | |
1317 "1: \n\t"\ | |
1318 "movq (%0), %%mm0 \n\t"\ | |
1319 "movq (%0), %%mm1 \n\t"\ | |
1320 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1321 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
1322 "movq %%mm0, (%1) \n\t"\ | |
1323 "movq %%mm1, 9*8(%1) \n\t"\ | |
1324 "add $8, %1 \n\t"\ | |
1325 "add %3, %0 \n\t"\ | |
1326 "decl %2 \n\t"\ | |
1327 " jnz 1b \n\t"\ | |
1328 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ | |
1329 : "r" ((x86_reg)srcStride)\ | |
1330 : "memory"\ | |
1331 );\ | |
1332 \ | |
1333 temp_ptr= temp;\ | |
1334 count=2;\ | |
1335 \ | |
1336 /*FIXME reorder for speed */\ | |
1337 __asm__ volatile(\ | |
1338 /*"pxor %%mm7, %%mm7 \n\t"*/\ | |
1339 "1: \n\t"\ | |
1340 "movq (%0), %%mm0 \n\t"\ | |
1341 "movq 8(%0), %%mm1 \n\t"\ | |
1342 "movq 16(%0), %%mm2 \n\t"\ | |
1343 "movq 24(%0), %%mm3 \n\t"\ | |
1344 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ | |
1345 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ | |
1346 "add %4, %1 \n\t"\ | |
1347 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ | |
1348 \ | |
1349 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ | |
1350 "add %4, %1 \n\t"\ | |
1351 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ | |
1352 \ | |
1353 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ | |
1354 "add %4, %1 \n\t"\ | |
1355 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ | |
1356 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ | |
1357 \ | |
1358 "add $72, %0 \n\t"\ | |
1359 "add %6, %1 \n\t"\ | |
1360 "decl %2 \n\t"\ | |
1361 " jnz 1b \n\t"\ | |
1362 \ | |
1363 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ | |
1364 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\ | |
1365 : "memory"\ | |
1366 );\ | |
1367 }\ | |
1368 \ | |
1369 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
1370 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ | |
1371 }\ | |
1372 \ | |
1373 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1374 uint64_t temp[8];\ | |
1375 uint8_t * const half= (uint8_t*)temp;\ | |
1376 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
1377 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ | |
1378 }\ | |
1379 \ | |
1380 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1381 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ | |
1382 }\ | |
1383 \ | |
1384 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1385 uint64_t temp[8];\ | |
1386 uint8_t * const half= (uint8_t*)temp;\ | |
1387 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
1388 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ | |
1389 }\ | |
1390 \ | |
1391 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1392 uint64_t temp[8];\ | |
1393 uint8_t * const half= (uint8_t*)temp;\ | |
1394 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ | |
1395 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ | |
1396 }\ | |
1397 \ | |
1398 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1399 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
1400 }\ | |
1401 \ | |
1402 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1403 uint64_t temp[8];\ | |
1404 uint8_t * const half= (uint8_t*)temp;\ | |
1405 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ | |
1406 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ | |
1407 }\ | |
1408 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1409 uint64_t half[8 + 9];\ | |
1410 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1411 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1412 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1413 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
1414 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1415 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
1416 }\ | |
1417 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1418 uint64_t half[8 + 9];\ | |
1419 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1420 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1421 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1422 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
1423 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1424 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
1425 }\ | |
1426 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1427 uint64_t half[8 + 9];\ | |
1428 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1429 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1430 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1431 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
1432 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1433 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
1434 }\ | |
1435 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1436 uint64_t half[8 + 9];\ | |
1437 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1438 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1439 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1440 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
1441 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1442 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
1443 }\ | |
1444 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1445 uint64_t half[8 + 9];\ | |
1446 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1447 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1448 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1449 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1450 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
1451 }\ | |
1452 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1453 uint64_t half[8 + 9];\ | |
1454 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
1455 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1456 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1457 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
1458 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
1459 }\ | |
1460 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1461 uint64_t half[8 + 9];\ | |
1462 uint8_t * const halfH= ((uint8_t*)half);\ | |
1463 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1464 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
1465 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
1466 }\ | |
1467 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1468 uint64_t half[8 + 9];\ | |
1469 uint8_t * const halfH= ((uint8_t*)half);\ | |
1470 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1471 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
1472 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
1473 }\ | |
1474 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1475 uint64_t half[9];\ | |
1476 uint8_t * const halfH= ((uint8_t*)half);\ | |
1477 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
1478 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
1479 }\ | |
1480 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
1481 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ | |
1482 }\ | |
1483 \ | |
1484 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1485 uint64_t temp[32];\ | |
1486 uint8_t * const half= (uint8_t*)temp;\ | |
1487 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
1488 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ | |
1489 }\ | |
1490 \ | |
1491 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1492 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ | |
1493 }\ | |
1494 \ | |
1495 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1496 uint64_t temp[32];\ | |
1497 uint8_t * const half= (uint8_t*)temp;\ | |
1498 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
1499 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ | |
1500 }\ | |
1501 \ | |
1502 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1503 uint64_t temp[32];\ | |
1504 uint8_t * const half= (uint8_t*)temp;\ | |
1505 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ | |
1506 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ | |
1507 }\ | |
1508 \ | |
1509 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1510 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
1511 }\ | |
1512 \ | |
1513 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1514 uint64_t temp[32];\ | |
1515 uint8_t * const half= (uint8_t*)temp;\ | |
1516 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ | |
1517 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ | |
1518 }\ | |
1519 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1520 uint64_t half[16*2 + 17*2];\ | |
1521 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1522 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1523 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1524 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
1525 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1526 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
1527 }\ | |
1528 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1529 uint64_t half[16*2 + 17*2];\ | |
1530 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1531 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1532 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1533 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
1534 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1535 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
1536 }\ | |
1537 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1538 uint64_t half[16*2 + 17*2];\ | |
1539 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1540 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1541 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1542 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
1543 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1544 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
1545 }\ | |
1546 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1547 uint64_t half[16*2 + 17*2];\ | |
1548 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1549 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1550 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1551 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
1552 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1553 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
1554 }\ | |
1555 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1556 uint64_t half[16*2 + 17*2];\ | |
1557 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1558 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1559 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1560 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1561 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
1562 }\ | |
1563 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1564 uint64_t half[16*2 + 17*2];\ | |
1565 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
1566 uint8_t * const halfHV= ((uint8_t*)half);\ | |
1567 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1568 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
1569 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
1570 }\ | |
1571 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1572 uint64_t half[17*2];\ | |
1573 uint8_t * const halfH= ((uint8_t*)half);\ | |
1574 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1575 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
1576 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
1577 }\ | |
1578 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1579 uint64_t half[17*2];\ | |
1580 uint8_t * const halfH= ((uint8_t*)half);\ | |
1581 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1582 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
1583 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
1584 }\ | |
1585 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1586 uint64_t half[17*2];\ | |
1587 uint8_t * const halfH= ((uint8_t*)half);\ | |
1588 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
1589 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
1590 } | |
1591 | |
1592 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" | |
1593 #define AVG_3DNOW_OP(a,b,temp, size) \ | |
1594 "mov" #size " " #b ", " #temp " \n\t"\ | |
1595 "pavgusb " #temp ", " #a " \n\t"\ | |
1596 "mov" #size " " #a ", " #b " \n\t" | |
1597 #define AVG_MMX2_OP(a,b,temp, size) \ | |
1598 "mov" #size " " #b ", " #temp " \n\t"\ | |
1599 "pavgb " #temp ", " #a " \n\t"\ | |
1600 "mov" #size " " #a ", " #b " \n\t" | |
1601 | |
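/* The instantiations below generate three families per instruction set:
   rounding "put", rounding "avg" (pavgb on MMX2, pavgusb on 3DNow!) and
   no-rounding "put" (rounder ff_pw_15 instead of ff_pw_16), yielding names
   such as put_qpel16_mc11_mmx2 or avg_qpel8_mc22_3dnow. */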
1602 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) | |
1603 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) | |
1604 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) | |
1605 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) | |
1606 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) | |
1607 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) | |
1608 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) | |
1609 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) | |
1610 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) | |
1611 | |
1612 /***********************************/ | |
1613 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */ | |
1614 | |
1615 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\ | |
1616 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1617 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\ | |
1618 } | |
1619 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\ | |
1620 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1621 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\ | |
1622 } | |
1623 | |
1624 #define QPEL_2TAP(OPNAME, SIZE, MMX)\ | |
1625 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\ | |
1626 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\ | |
1627 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\ | |
1628 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\ | |
1629 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\ | |
1630 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\ | |
1631 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\ | |
1632 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\ | |
1633 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\ | |
1634 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1635 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\ | |
1636 }\ | |
1637 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
1638 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\ | |
1639 }\ | |
1640 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\ | |
1641 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\ | |
1642 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\ | |
1643 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\ | |
1644 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\ | |
1645 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\ | |
1646 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\ | |
1647 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\ | |
1648 | |
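/* 2-tap qpel: each quarter-pel position is approximated either by a plain
   halfpel average (pixels*_x2/_y2/_xy2) or by the *_l3 helper with two extra
   offsets, so the expensive 6-tap filters above are bypassed entirely. */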
1649 QPEL_2TAP(put_, 16, mmx2) | |
1650 QPEL_2TAP(avg_, 16, mmx2) | |
1651 QPEL_2TAP(put_, 8, mmx2) | |
1652 QPEL_2TAP(avg_, 8, mmx2) | |
1653 QPEL_2TAP(put_, 16, 3dnow) | |
1654 QPEL_2TAP(avg_, 16, 3dnow) | |
1655 QPEL_2TAP(put_, 8, 3dnow) | |
1656 QPEL_2TAP(avg_, 8, 3dnow) | |
1657 | |
1658 | |
1659 #if 0 | |
8527 | 1660 static void just_return(void) { return; } |
8430 | 1661 #endif |
1662 | |
1663 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, | |
1664 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){ | |
1665 const int w = 8; | |
1666 const int ix = ox>>(16+shift); | |
1667 const int iy = oy>>(16+shift); | |
1668 const int oxs = ox>>4; | |
1669 const int oys = oy>>4; | |
1670 const int dxxs = dxx>>4; | |
1671 const int dxys = dxy>>4; | |
1672 const int dyxs = dyx>>4; | |
1673 const int dyys = dyy>>4; | |
1674 const uint16_t r4[4] = {r,r,r,r}; | |
1675 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; | |
1676 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; | |
1677 const uint64_t shift2 = 2*shift; | |
1678 uint8_t edge_buf[(h+1)*stride]; | |
1679 int x, y; | |
1680 | |
1681 const int dxw = (dxx-(1<<(16+shift)))*(w-1); | |
1682 const int dyh = (dyy-(1<<(16+shift)))*(h-1); | |
1683 const int dxh = dxy*(h-1); | |
1684 const int dyw = dyx*(w-1); | |
1685 if( // non-constant fullpel offset (3% of blocks) | |
1686 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) | | |
1687 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift) | |
1688 // uses more than 16 bits of subpel mv (only at huge resolution) | |
1689 || (dxx|dxy|dyx|dyy)&15 ) | |
1690 { | |
1691 //FIXME could still use mmx for some of the rows | |
1692 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); | |
1693 return; | |
1694 } | |
1695 | |
1696 src += ix + iy*stride; | |
1697 if( (unsigned)ix >= width-w || | |
1698 (unsigned)iy >= height-h ) | |
1699 { | |
1700 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); | |
1701 src = edge_buf; | |
1702 } | |
1703 | |
1704 __asm__ volatile( | |
1705 "movd %0, %%mm6 \n\t" | |
1706 "pxor %%mm7, %%mm7 \n\t" | |
1707 "punpcklwd %%mm6, %%mm6 \n\t" | |
1708 "punpcklwd %%mm6, %%mm6 \n\t" | |
1709 :: "r"(1<<shift) | |
1710 ); | |
1711 | |
1712 for(x=0; x<w; x+=4){ | |
1713 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0), | |
1714 oxs - dxys + dxxs*(x+1), | |
1715 oxs - dxys + dxxs*(x+2), | |
1716 oxs - dxys + dxxs*(x+3) }; | |
1717 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0), | |
1718 oys - dyys + dyxs*(x+1), | |
1719 oys - dyys + dyxs*(x+2), | |
1720 oys - dyys + dyxs*(x+3) }; | |
1721 | |
1722 for(y=0; y<h; y++){ | |
1723 __asm__ volatile( | |
1724 "movq %0, %%mm4 \n\t" | |
1725 "movq %1, %%mm5 \n\t" | |
1726 "paddw %2, %%mm4 \n\t" | |
1727 "paddw %3, %%mm5 \n\t" | |
1728 "movq %%mm4, %0 \n\t" | |
1729 "movq %%mm5, %1 \n\t" | |
1730 "psrlw $12, %%mm4 \n\t" | |
1731 "psrlw $12, %%mm5 \n\t" | |
1732 : "+m"(*dx4), "+m"(*dy4) | |
1733 : "m"(*dxy4), "m"(*dyy4) | |
1734 ); | |
1735 | |
1736 __asm__ volatile( | |
1737 "movq %%mm6, %%mm2 \n\t" | |
1738 "movq %%mm6, %%mm1 \n\t" | |
1739 "psubw %%mm4, %%mm2 \n\t" | |
1740 "psubw %%mm5, %%mm1 \n\t" | |
1741 "movq %%mm2, %%mm0 \n\t" | |
1742 "movq %%mm4, %%mm3 \n\t" | |
1743 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy) | |
1744 "pmullw %%mm5, %%mm3 \n\t" // dx*dy | |
1745 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy | |
1746 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy) | |
1747 | |
1748 "movd %4, %%mm5 \n\t" | |
1749 "movd %3, %%mm4 \n\t" | |
1750 "punpcklbw %%mm7, %%mm5 \n\t" | |
1751 "punpcklbw %%mm7, %%mm4 \n\t" | |
1752 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy | |
1753 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy | |
1754 | |
1755 "movd %2, %%mm5 \n\t" | |
1756 "movd %1, %%mm4 \n\t" | |
1757 "punpcklbw %%mm7, %%mm5 \n\t" | |
1758 "punpcklbw %%mm7, %%mm4 \n\t" | |
1759 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy) | |
1760 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy) | |
1761 "paddw %5, %%mm1 \n\t" | |
1762 "paddw %%mm3, %%mm2 \n\t" | |
1763 "paddw %%mm1, %%mm0 \n\t" | |
1764 "paddw %%mm2, %%mm0 \n\t" | |
1765 | |
1766 "psrlw %6, %%mm0 \n\t" | |
1767 "packuswb %%mm0, %%mm0 \n\t" | |
1768 "movd %%mm0, %0 \n\t" | |
1769 | |
1770 : "=m"(dst[x+y*stride]) | |
1771 : "m"(src[0]), "m"(src[1]), | |
1772 "m"(src[stride]), "m"(src[stride+1]), | |
1773 "m"(*r4), "m"(shift2) | |
1774 ); | |
1775 src += stride; | |
1776 } | |
1777 src += 4-h*stride; | |
1778 } | |
1779 } | |
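/* Reference for the kernel above (scalar form, per the per-instruction comments):
   with s = 1<<shift and dx,dy the per-pixel fractional offsets held in mm4/mm5,
     dst = (src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy)
          + src[0,1]*(s-dx)*dy    + src[1,1]*dx*dy + r) >> (2*shift)
   i.e. a bilinear blend of the four neighbouring pixels; non-constant fullpel
   offsets and >16-bit subpel vectors fall back to ff_gmc_c above. */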
1780 | |
1781 #define PREFETCH(name, op) \ | |
1782 static void name(void *mem, int stride, int h){\ | |
1783 const uint8_t *p= mem;\ | |
1784 do{\ | |
1785 __asm__ volatile(#op" %0" :: "m"(*p));\ | |
1786 p+= stride;\ | |
1787 }while(--h);\ | |
1788 } | |
1789 PREFETCH(prefetch_mmx2, prefetcht0) | |
1790 PREFETCH(prefetch_3dnow, prefetch) | |
1791 #undef PREFETCH | |
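/* The PREFETCH expansions above issue one prefetch per row, stepping through h
   rows by stride; prefetch_mmx2 uses the SSE prefetcht0 hint, prefetch_3dnow the
   3DNow! prefetch opcode. */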
1792 | |
12450 | 1793 #include "h264_qpel_mmx.c" |
12437 | 1794 |
1795 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, |
1796 int stride, int h, int x, int y); |
1797 void ff_put_vc1_chroma_mc8_mmx_nornd (uint8_t *dst, uint8_t *src, |
1798 int stride, int h, int x, int y); |
1799 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, |
1800 int stride, int h, int x, int y); |
1801 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src, |
1802 int stride, int h, int x, int y); |
1803 void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src, |
1804 int stride, int h, int x, int y); |
1805 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src, |
1806 int stride, int h, int x, int y); |
1807 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src, |
1808 int stride, int h, int x, int y); |
1809 void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src, |
1810 int stride, int h, int x, int y); |
1811 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src, |
1812 int stride, int h, int x, int y); |
1813 |
1814 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, |
1815 int stride, int h, int x, int y); |
1816 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, |
1817 int stride, int h, int x, int y); |
1818 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src, |
1819 int stride, int h, int x, int y); |
1820 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src, |
1821 int stride, int h, int x, int y); |
1822 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, |
1823 int stride, int h, int x, int y); |
1824 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, |
1825 int stride, int h, int x, int y); |
1826 |
1827 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, |
1828 int stride, int h, int x, int y); |
1829 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, |
1830 int stride, int h, int x, int y); |
1831 |
1832 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, |
1833 int stride, int h, int x, int y); |
1834 void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, |
1835 int stride, int h, int x, int y); |
1836 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, |
1837 int stride, int h, int x, int y); |
1838 |
1839 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, |
1840 int stride, int h, int x, int y); |
1841 void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, |
1842 int stride, int h, int x, int y); |
1843 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, |
1844 int stride, int h, int x, int y); |
1845 |
8430 | 1846 |
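/* The mc00 (fullpel) wrappers below for CAVS and VC-1 need no filtering at all,
   so they simply forward to the plain put/avg_pixels8/16 copy and average routines. */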
1847 /* CAVS specific */ | |
1848 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | |
1849 put_pixels8_mmx(dst, src, stride, 8); | |
1850 } | |
1851 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | |
1852 avg_pixels8_mmx(dst, src, stride, 8); | |
1853 } | |
1854 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | |
1855 put_pixels16_mmx(dst, src, stride, 16); | |
1856 } | |
1857 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { | |
1858 avg_pixels16_mmx(dst, src, stride, 16); | |
1859 } | |
1860 | |
1861 /* VC1 specific */ | |
1862 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { | |
1863 put_pixels8_mmx(dst, src, stride, 8); | |
1864 } | |
9441 | 1865 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { |
1866 avg_pixels8_mmx2(dst, src, stride, 8); | |
1867 } | |
8430 | 1868 |
1869 /* XXX: these functions should be removed as soon as all IDCTs are |
1870 converted */ |
8590 | 1871 #if CONFIG_GPL |
8430 | 1872 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) |
1873 { | |
1874 ff_mmx_idct (block); | |
12435 | 1875 ff_put_pixels_clamped_mmx(block, dest, line_size); |
8430 | 1876 } |
1877 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1878 { | |
1879 ff_mmx_idct (block); | |
12435 | 1880 ff_add_pixels_clamped_mmx(block, dest, line_size); |
8430 | 1881 } |
1882 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) | |
1883 { | |
1884 ff_mmxext_idct (block); | |
12435 | 1885 ff_put_pixels_clamped_mmx(block, dest, line_size); |
8430 | 1886 } |
1887 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1888 { | |
1889 ff_mmxext_idct (block); | |
12435 | 1890 ff_add_pixels_clamped_mmx(block, dest, line_size); |
8430 | 1891 } |
1892 #endif | |
1893 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) | |
1894 { | |
1895 ff_idct_xvid_mmx (block); | |
12435 | 1896 ff_put_pixels_clamped_mmx(block, dest, line_size); |
8430 | 1897 } |
1898 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1899 { | |
1900 ff_idct_xvid_mmx (block); | |
12435 | 1901 ff_add_pixels_clamped_mmx(block, dest, line_size); |
8430 | 1902 } |
1903 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) | |
1904 { | |
1905 ff_idct_xvid_mmx2 (block); | |
12435 | 1906 ff_put_pixels_clamped_mmx(block, dest, line_size); |
8430 | 1907 } |
1908 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) | |
1909 { | |
1910 ff_idct_xvid_mmx2 (block); | |
12435 | 1911 ff_add_pixels_clamped_mmx(block, dest, line_size); |
8430 | 1912 } |
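/* Each wrapper above pairs a bare IDCT (libmpeg2 MMX/MMXEXT or Xvid MMX/MMX2)
   with ff_put/add_pixels_clamped_mmx so it matches the idct_put/idct_add
   function-pointer signature (dest, line_size, block) used by dsputil. */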
1913 | |
1914 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) | |
1915 { | |
1916 int i; | |
1917 __asm__ volatile("pxor %%mm7, %%mm7":); | |
1918 for(i=0; i<blocksize; i+=2) { | |
1919 __asm__ volatile( | |
1920 "movq %0, %%mm0 \n\t" | |
1921 "movq %1, %%mm1 \n\t" | |
1922 "movq %%mm0, %%mm2 \n\t" | |
1923 "movq %%mm1, %%mm3 \n\t" | |
1924 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 | |
1925 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 | |
1926 "pslld $31, %%mm2 \n\t" // keep only the sign bit | |
1927 "pxor %%mm2, %%mm1 \n\t" | |
1928 "movq %%mm3, %%mm4 \n\t" | |
1929 "pand %%mm1, %%mm3 \n\t" | |
1930 "pandn %%mm1, %%mm4 \n\t" | |
1931 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) | |
1932 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) | |
1933 "movq %%mm3, %1 \n\t" | |
1934 "movq %%mm0, %0 \n\t" | |
1935 :"+m"(mag[i]), "+m"(ang[i]) | |
1936 ::"memory" | |
1937 ); | |
1938 } | |
1939 __asm__ volatile("femms"); | |
1940 } | |
1941 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) | |
1942 { | |
1943 int i; | |
1944 | |
1945 __asm__ volatile( | |
1946 "movaps %0, %%xmm5 \n\t" | |
1947 ::"m"(ff_pdw_80000000[0]) | |
1948 ); | |
1949 for(i=0; i<blocksize; i+=4) { | |
1950 __asm__ volatile( | |
1951 "movaps %0, %%xmm0 \n\t" | |
1952 "movaps %1, %%xmm1 \n\t" | |
1953 "xorps %%xmm2, %%xmm2 \n\t" | |
1954 "xorps %%xmm3, %%xmm3 \n\t" | |
1955 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 | |
1956 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 | |
1957 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit | |
1958 "xorps %%xmm2, %%xmm1 \n\t" | |
1959 "movaps %%xmm3, %%xmm4 \n\t" | |
1960 "andps %%xmm1, %%xmm3 \n\t" | |
1961 "andnps %%xmm1, %%xmm4 \n\t" | |
1962 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) | |
1963 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) | |
1964 "movaps %%xmm3, %1 \n\t" | |
1965 "movaps %%xmm0, %0 \n\t" | |
1966 :"+m"(mag[i]), "+m"(ang[i]) | |
1967 ::"memory" | |
1968 ); | |
1969 } | |
1970 } | |
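/* The 3DNow! and SSE coupling routines above are the same algorithm at different
   widths (2 vs 4 floats per iteration): the sign tests are done branchlessly with
   pfcmpge/cmpleps masks, and the mask selects whether the angle value is added to
   or subtracted from the magnitude, as annotated on each instruction. */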
1971 | |
1972 #define IF1(x) x | |
1973 #define IF0(x) | |
1974 | |
1975 #define MIX5(mono,stereo)\ | |
1976 __asm__ volatile(\ | |
1977 "movss 0(%2), %%xmm5 \n"\ | |
1978 "movss 8(%2), %%xmm6 \n"\ | |
1979 "movss 24(%2), %%xmm7 \n"\ | |
1980 "shufps $0, %%xmm5, %%xmm5 \n"\ | |
1981 "shufps $0, %%xmm6, %%xmm6 \n"\ | |
1982 "shufps $0, %%xmm7, %%xmm7 \n"\ | |
1983 "1: \n"\ | |
1984 "movaps (%0,%1), %%xmm0 \n"\ | |
1985 "movaps 0x400(%0,%1), %%xmm1 \n"\ | |
1986 "movaps 0x800(%0,%1), %%xmm2 \n"\ | |
1987 "movaps 0xc00(%0,%1), %%xmm3 \n"\ | |
1988 "movaps 0x1000(%0,%1), %%xmm4 \n"\ | |
1989 "mulps %%xmm5, %%xmm0 \n"\ | |
1990 "mulps %%xmm6, %%xmm1 \n"\ | |
1991 "mulps %%xmm5, %%xmm2 \n"\ | |
1992 "mulps %%xmm7, %%xmm3 \n"\ | |
1993 "mulps %%xmm7, %%xmm4 \n"\ | |
1994 stereo("addps %%xmm1, %%xmm0 \n")\ | |
1995 "addps %%xmm1, %%xmm2 \n"\ | |
1996 "addps %%xmm3, %%xmm0 \n"\ | |
1997 "addps %%xmm4, %%xmm2 \n"\ | |
1998 mono("addps %%xmm2, %%xmm0 \n")\ | |
1999 "movaps %%xmm0, (%0,%1) \n"\ | |
2000 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\ | |
2001 "add $16, %0 \n"\ | |
2002 "jl 1b \n"\ | |
2003 :"+&r"(i)\ | |
2004 :"r"(samples[0]+len), "r"(matrix)\ | |
2005 :"memory"\ | |
2006 ); | |
2007 | |
2008 #define MIX_MISC(stereo)\ | |
2009 __asm__ volatile(\ | |
2010 "1: \n"\ | |
2011 "movaps (%3,%0), %%xmm0 \n"\ | |
2012 stereo("movaps %%xmm0, %%xmm1 \n")\ | |
2013 "mulps %%xmm6, %%xmm0 \n"\ | |
2014 stereo("mulps %%xmm7, %%xmm1 \n")\ | |
2015 "lea 1024(%3,%0), %1 \n"\ | |
2016 "mov %5, %2 \n"\ | |
2017 "2: \n"\ | |
2018 "movaps (%1), %%xmm2 \n"\ | |
2019 stereo("movaps %%xmm2, %%xmm3 \n")\ | |
2020 "mulps (%4,%2), %%xmm2 \n"\ | |
2021 stereo("mulps 16(%4,%2), %%xmm3 \n")\ | |
2022 "addps %%xmm2, %%xmm0 \n"\ | |
2023 stereo("addps %%xmm3, %%xmm1 \n")\ | |
2024 "add $1024, %1 \n"\ | |
2025 "add $32, %2 \n"\ | |
2026 "jl 2b \n"\ | |
2027 "movaps %%xmm0, (%3,%0) \n"\ | |
2028 stereo("movaps %%xmm1, 1024(%3,%0) \n")\ | |
2029 "add $16, %0 \n"\ | |
2030 "jl 1b \n"\ | |
2031 :"+&r"(i), "=&r"(j), "=&r"(k)\ | |
2032 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\ | |
2033 :"memory"\ | |
2034 ); | |
2035 | |
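/* MIX5 is the fast path for the common 5-channel downmix where only three distinct
   coefficients are needed (kept splatted in xmm5-xmm7); MIX_MISC is the generic
   path that walks the matrix_simd table built in ac3_downmix_sse below. Channel
   buffers are laid out 0x400 bytes (256 floats) apart. */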
2036 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len) | |
2037 { | |
2038 int (*matrix_cmp)[2] = (int(*)[2])matrix; | |
2039 intptr_t i,j,k; | |
2040 | |
2041 i = -len*sizeof(float); | |
2042 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) { | |
2043 MIX5(IF0,IF1); | |
2044 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { | |
2045 MIX5(IF1,IF0); | |
2046 } else { | |
11369 | 2047 DECLARE_ALIGNED(16, float, matrix_simd)[in_ch][2][4]; |
8430 | 2048 j = 2*in_ch*sizeof(float); |
2049 __asm__ volatile( | |
2050 "1: \n" | |
2051 "sub $8, %0 \n" | |
2052 "movss (%2,%0), %%xmm6 \n" | |
2053 "movss 4(%2,%0), %%xmm7 \n" | |
2054 "shufps $0, %%xmm6, %%xmm6 \n" | |
2055 "shufps $0, %%xmm7, %%xmm7 \n" | |
2056 "movaps %%xmm6, (%1,%0,4) \n" | |
2057 "movaps %%xmm7, 16(%1,%0,4) \n" | |
2058 "jg 1b \n" | |
2059 :"+&r"(j) | |
2060 :"r"(matrix_simd), "r"(matrix) | |
2061 :"memory" | |
2062 ); | |
2063 if(out_ch == 2) { | |
2064 MIX_MISC(IF1); | |
2065 } else { | |
2066 MIX_MISC(IF0); | |
2067 } | |
2068 } | |
2069 } | |
2070 | |
2071 static void vector_fmul_3dnow(float *dst, const float *src, int len){ | |
2072 x86_reg i = (len-4)*4; | |
2073 __asm__ volatile( | |
2074 "1: \n\t" | |
2075 "movq (%1,%0), %%mm0 \n\t" | |
2076 "movq 8(%1,%0), %%mm1 \n\t" | |
2077 "pfmul (%2,%0), %%mm0 \n\t" | |
2078 "pfmul 8(%2,%0), %%mm1 \n\t" | |
2079 "movq %%mm0, (%1,%0) \n\t" | |
2080 "movq %%mm1, 8(%1,%0) \n\t" | |
2081 "sub $16, %0 \n\t" | |
2082 "jge 1b \n\t" | |
2083 "femms \n\t" | |
2084 :"+r"(i) | |
2085 :"r"(dst), "r"(src) | |
2086 :"memory" | |
2087 ); | |
2088 } | |
2089 static void vector_fmul_sse(float *dst, const float *src, int len){ | |
2090 x86_reg i = (len-8)*4; | |
2091 __asm__ volatile( | |
2092 "1: \n\t" | |
2093 "movaps (%1,%0), %%xmm0 \n\t" | |
2094 "movaps 16(%1,%0), %%xmm1 \n\t" | |
2095 "mulps (%2,%0), %%xmm0 \n\t" | |
2096 "mulps 16(%2,%0), %%xmm1 \n\t" | |
2097 "movaps %%xmm0, (%1,%0) \n\t" | |
2098 "movaps %%xmm1, 16(%1,%0) \n\t" | |
2099 "sub $32, %0 \n\t" | |
2100 "jge 1b \n\t" | |
2101 :"+r"(i) | |
2102 :"r"(dst), "r"(src) | |
2103 :"memory" | |
2104 ); | |
2105 } | |
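/* Both vector_fmul variants multiply dst[i] *= src[i] in place, walking the arrays
   backwards from the end; the SSE version uses aligned movaps and handles 8 floats
   per iteration, so it appears to assume 16-byte alignment and len a multiple of 8
   (the 3DNow! one works in steps of 4). */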
2106 | |
2107 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ | |
2108 x86_reg i = len*4-16; | |
2109 __asm__ volatile( | |
2110 "1: \n\t" | |
2111 "pswapd 8(%1), %%mm0 \n\t" | |
2112 "pswapd (%1), %%mm1 \n\t" | |
2113 "pfmul (%3,%0), %%mm0 \n\t" | |
2114 "pfmul 8(%3,%0), %%mm1 \n\t" | |
2115 "movq %%mm0, (%2,%0) \n\t" | |
2116 "movq %%mm1, 8(%2,%0) \n\t" | |
2117 "add $16, %1 \n\t" | |
2118 "sub $16, %0 \n\t" | |
2119 "jge 1b \n\t" | |
2120 :"+r"(i), "+r"(src1) | |
2121 :"r"(dst), "r"(src0) | |
2122 ); | |
2123 __asm__ volatile("femms"); | |
2124 } | |
2125 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){ | |
2126 x86_reg i = len*4-32; | |
2127 __asm__ volatile( | |
2128 "1: \n\t" | |
2129 "movaps 16(%1), %%xmm0 \n\t" | |
2130 "movaps (%1), %%xmm1 \n\t" | |
2131 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" | |
2132 "shufps $0x1b, %%xmm1, %%xmm1 \n\t" | |
2133 "mulps (%3,%0), %%xmm0 \n\t" | |
2134 "mulps 16(%3,%0), %%xmm1 \n\t" | |
2135 "movaps %%xmm0, (%2,%0) \n\t" | |
2136 "movaps %%xmm1, 16(%2,%0) \n\t" | |
2137 "add $32, %1 \n\t" | |
2138 "sub $32, %0 \n\t" | |
2139 "jge 1b \n\t" | |
2140 :"+r"(i), "+r"(src1) | |
2141 :"r"(dst), "r"(src0) | |
2142 ); | |
2143 } | |
2144 | |
10300 | 2145 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1, |
2146 const float *src2, int len){ |
8430 | 2147 x86_reg i = (len-4)*4; |
10301 | 2148 __asm__ volatile( |
2149 "1: \n\t" | |
2150 "movq (%2,%0), %%mm0 \n\t" | |
2151 "movq 8(%2,%0), %%mm1 \n\t" | |
2152 "pfmul (%3,%0), %%mm0 \n\t" | |
2153 "pfmul 8(%3,%0), %%mm1 \n\t" | |
2154 "pfadd (%4,%0), %%mm0 \n\t" | |
2155 "pfadd 8(%4,%0), %%mm1 \n\t" | |
2156 "movq %%mm0, (%1,%0) \n\t" | |
2157 "movq %%mm1, 8(%1,%0) \n\t" | |
2158 "sub $16, %0 \n\t" | |
2159 "jge 1b \n\t" | |
2160 :"+r"(i) | |
2161 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) | |
2162 :"memory" | |
2163 ); | |
8430 | 2164 __asm__ volatile("femms"); |
2165 } | |
10300 | 2166 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1, |
2167 const float *src2, int len){ |
8430 | 2168 x86_reg i = (len-8)*4; |
10301 | 2169 __asm__ volatile( |
2170 "1: \n\t" | |
2171 "movaps (%2,%0), %%xmm0 \n\t" | |
2172 "movaps 16(%2,%0), %%xmm1 \n\t" | |
2173 "mulps (%3,%0), %%xmm0 \n\t" | |
2174 "mulps 16(%3,%0), %%xmm1 \n\t" | |
2175 "addps (%4,%0), %%xmm0 \n\t" | |
2176 "addps 16(%4,%0), %%xmm1 \n\t" | |
2177 "movaps %%xmm0, (%1,%0) \n\t" | |
2178 "movaps %%xmm1, 16(%1,%0) \n\t" | |
2179 "sub $32, %0 \n\t" | |
2180 "jge 1b \n\t" | |
2181 :"+r"(i) | |
2182 :"r"(dst), "r"(src0), "r"(src1), "r"(src2) | |
2183 :"memory" | |
2184 ); | |
8430 | 2185 } |
2186 | |
2187 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, | |
2188 const float *win, float add_bias, int len){ | |
8590 | 2189 #if HAVE_6REGS |
8430 | 2190 if(add_bias == 0){ |
2191 x86_reg i = -len*4; | |
2192 x86_reg j = len*4-8; | |
2193 __asm__ volatile( | |
2194 "1: \n" | |
2195 "pswapd (%5,%1), %%mm1 \n" | |
2196 "movq (%5,%0), %%mm0 \n" | |
2197 "pswapd (%4,%1), %%mm5 \n" | |
2198 "movq (%3,%0), %%mm4 \n" | |
2199 "movq %%mm0, %%mm2 \n" | |
2200 "movq %%mm1, %%mm3 \n" | |
2201 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] | |
2202 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] | |
2203 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] | |
2204 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] | |
2205 "pfadd %%mm3, %%mm2 \n" | |
2206 "pfsub %%mm0, %%mm1 \n" | |
2207 "pswapd %%mm2, %%mm2 \n" | |
2208 "movq %%mm1, (%2,%0) \n" | |
2209 "movq %%mm2, (%2,%1) \n" | |
2210 "sub $8, %1 \n" | |
2211 "add $8, %0 \n" | |
2212 "jl 1b \n" | |
2213 "femms \n" | |
2214 :"+r"(i), "+r"(j) | |
2215 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | |
2216 ); | |
2217 }else | |
2218 #endif | |
2219 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); | |
2220 } | |
2221 | |
2222 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, | |
2223 const float *win, float add_bias, int len){ | |
8590 | 2224 #if HAVE_6REGS |
8430 | 2225 if(add_bias == 0){ |
2226 x86_reg i = -len*4; | |
2227 x86_reg j = len*4-16; | |
2228 __asm__ volatile( | |
2229 "1: \n" | |
2230 "movaps (%5,%1), %%xmm1 \n" | |
2231 "movaps (%5,%0), %%xmm0 \n" | |
2232 "movaps (%4,%1), %%xmm5 \n" | |
2233 "movaps (%3,%0), %%xmm4 \n" | |
2234 "shufps $0x1b, %%xmm1, %%xmm1 \n" | |
2235 "shufps $0x1b, %%xmm5, %%xmm5 \n" | |
2236 "movaps %%xmm0, %%xmm2 \n" | |
2237 "movaps %%xmm1, %%xmm3 \n" | |
2238 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] | |
2239 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] | |
2240 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] | |
2241 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] | |
2242 "addps %%xmm3, %%xmm2 \n" | |
2243 "subps %%xmm0, %%xmm1 \n" | |
2244 "shufps $0x1b, %%xmm2, %%xmm2 \n" | |
2245 "movaps %%xmm1, (%2,%0) \n" | |
2246 "movaps %%xmm2, (%2,%1) \n" | |
2247 "sub $16, %1 \n" | |
2248 "add $16, %0 \n" | |
2249 "jl 1b \n" | |
2250 :"+r"(i), "+r"(j) | |
2251 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | |
2252 ); | |
2253 }else | |
2254 #endif | |
2255 ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); | |
2256 } | |
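/* Both window variants compute the overlap-add windowing in one pass, reading src0
   forwards and src1 backwards (pswapd / shufps $0x1b reversal) as annotated above;
   they only cover the add_bias == 0 case and need 6 GPRs, otherwise they fall back
   to ff_vector_fmul_window_c. */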
2257 | |
2258 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) | |
2259 { | |
2260 x86_reg i = -4*len; | |
2261 __asm__ volatile( | |
2262 "movss %3, %%xmm4 \n" | |
2263 "shufps $0, %%xmm4, %%xmm4 \n" | |
2264 "1: \n" | |
2265 "cvtpi2ps (%2,%0), %%xmm0 \n" | |
2266 "cvtpi2ps 8(%2,%0), %%xmm1 \n" | |
2267 "cvtpi2ps 16(%2,%0), %%xmm2 \n" | |
2268 "cvtpi2ps 24(%2,%0), %%xmm3 \n" | |
2269 "movlhps %%xmm1, %%xmm0 \n" | |
2270 "movlhps %%xmm3, %%xmm2 \n" | |
2271 "mulps %%xmm4, %%xmm0 \n" | |
2272 "mulps %%xmm4, %%xmm2 \n" | |
2273 "movaps %%xmm0, (%1,%0) \n" | |
2274 "movaps %%xmm2, 16(%1,%0) \n" | |
2275 "add $32, %0 \n" | |
2276 "jl 1b \n" | |
2277 :"+r"(i) | |
2278 :"r"(dst+len), "r"(src+len), "m"(mul) | |
2279 ); | |
2280 } | |
2281 | |
2282 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) | |
2283 { | |
2284 x86_reg i = -4*len; | |
2285 __asm__ volatile( | |
2286 "movss %3, %%xmm4 \n" | |
2287 "shufps $0, %%xmm4, %%xmm4 \n" | |
2288 "1: \n" | |
2289 "cvtdq2ps (%2,%0), %%xmm0 \n" | |
2290 "cvtdq2ps 16(%2,%0), %%xmm1 \n" | |
2291 "mulps %%xmm4, %%xmm0 \n" | |
2292 "mulps %%xmm4, %%xmm1 \n" | |
2293 "movaps %%xmm0, (%1,%0) \n" | |
2294 "movaps %%xmm1, 16(%1,%0) \n" | |
2295 "add $32, %0 \n" | |
2296 "jl 1b \n" | |
2297 :"+r"(i) | |
2298 :"r"(dst+len), "r"(src+len), "m"(mul) | |
2299 ); | |
2300 } | |
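/* Both int32_to_float_fmul_scalar versions convert int32 samples to float and scale
   by mul in one pass; the SSE one converts two ints at a time with cvtpi2ps and
   merges pairs with movlhps, while the SSE2 one converts four at once with cvtdq2ps. */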
2301 | |
10105 | 2302 static void vector_clipf_sse(float *dst, const float *src, float min, float max, |
10104 | 2303 int len) |
2304 { |
2305 x86_reg i = (len-16)*4; |
2306 __asm__ volatile( |
2307 "movss %3, %%xmm4 \n" |
2308 "movss %4, %%xmm5 \n" |
2309 "shufps $0, %%xmm4, %%xmm4 \n" |
2310 "shufps $0, %%xmm5, %%xmm5 \n" |
2311 "1: \n\t" |
2312 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel |
2313 "movaps 16(%2,%0), %%xmm1 \n\t" |
2314 "movaps 32(%2,%0), %%xmm2 \n\t" |
2315 "movaps 48(%2,%0), %%xmm3 \n\t" |
2316 "maxps %%xmm4, %%xmm0 \n\t" |
2317 "maxps %%xmm4, %%xmm1 \n\t" |
2318 "maxps %%xmm4, %%xmm2 \n\t" |
2319 "maxps %%xmm4, %%xmm3 \n\t" |
2320 "minps %%xmm5, %%xmm0 \n\t" |
2321 "minps %%xmm5, %%xmm1 \n\t" |
2322 "minps %%xmm5, %%xmm2 \n\t" |
2323 "minps %%xmm5, %%xmm3 \n\t" |
2324 "movaps %%xmm0, (%1,%0) \n\t" |
2325 "movaps %%xmm1, 16(%1,%0) \n\t" |
2326 "movaps %%xmm2, 32(%1,%0) \n\t" |
2327 "movaps %%xmm3, 48(%1,%0) \n\t" |
2328 "sub $64, %0 \n\t" |
2329 "jge 1b \n\t" |
10107 | 2330 :"+&r"(i) |
10104 | 2331 :"r"(dst), "r"(src), "m"(min), "m"(max) |
2332 :"memory" |
2333 ); |
2334 } |
2335 |
8430 | 2336 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ |
2337 x86_reg reglen = len; | |
2338 // not bit-exact: pf2id uses different rounding than C and SSE | |
2339 __asm__ volatile( | |
2340 "add %0 , %0 \n\t" | |
2341 "lea (%2,%0,2) , %2 \n\t" | |
2342 "add %0 , %1 \n\t" | |
2343 "neg %0 \n\t" | |
2344 "1: \n\t" | |
2345 "pf2id (%2,%0,2) , %%mm0 \n\t" | |
2346 "pf2id 8(%2,%0,2) , %%mm1 \n\t" | |
2347 "pf2id 16(%2,%0,2) , %%mm2 \n\t" | |
2348 "pf2id 24(%2,%0,2) , %%mm3 \n\t" | |
2349 "packssdw %%mm1 , %%mm0 \n\t" | |
2350 "packssdw %%mm3 , %%mm2 \n\t" | |
2351 "movq %%mm0 , (%1,%0) \n\t" | |
2352 "movq %%mm2 , 8(%1,%0) \n\t" | |
2353 "add $16 , %0 \n\t" | |
2354 " js 1b \n\t" | |
2355 "femms \n\t" | |
2356 :"+r"(reglen), "+r"(dst), "+r"(src) | |
2357 ); | |
2358 } | |
2359 static void float_to_int16_sse(int16_t *dst, const float *src, long len){ | |
2360 x86_reg reglen = len; | |
2361 __asm__ volatile( | |
2362 "add %0 , %0 \n\t" | |
2363 "lea (%2,%0,2) , %2 \n\t" | |
2364 "add %0 , %1 \n\t" | |
2365 "neg %0 \n\t" | |
2366 "1: \n\t" | |
2367 "cvtps2pi (%2,%0,2) , %%mm0 \n\t" | |
2368 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" | |
2369 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" | |
2370 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" | |
2371 "packssdw %%mm1 , %%mm0 \n\t" | |
2372 "packssdw %%mm3 , %%mm2 \n\t" | |
2373 "movq %%mm0 , (%1,%0) \n\t" | |
2374 "movq %%mm2 , 8(%1,%0) \n\t" | |
2375 "add $16 , %0 \n\t" | |
2376 " js 1b \n\t" | |
2377 "emms \n\t" | |
2378 :"+r"(reglen), "+r"(dst), "+r"(src) | |
2379 ); | |
2380 } | |
2381 | |
2382 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ | |
2383 x86_reg reglen = len; | |
2384 __asm__ volatile( | |
2385 "add %0 , %0 \n\t" | |
2386 "lea (%2,%0,2) , %2 \n\t" | |
2387 "add %0 , %1 \n\t" | |
2388 "neg %0 \n\t" | |
2389 "1: \n\t" | |
2390 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" | |
2391 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" | |
2392 "packssdw %%xmm1 , %%xmm0 \n\t" | |
2393 "movdqa %%xmm0 , (%1,%0) \n\t" | |
2394 "add $16 , %0 \n\t" | |
2395 " js 1b \n\t" | |
2396 :"+r"(reglen), "+r"(dst), "+r"(src) | |
2397 ); | |
2398 } | |
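/* Rough C equivalent of the three float_to_int16 variants above (a sketch
 * only, assumes <math.h> for lrintf): convert each float to an integer and
 * saturate it to int16 range, which is what packssdw provides. pf2id
 * truncates while cvtps2pi/cvtps2dq round, hence the "not bit-exact" note
 * on the 3DNow! path. */
static void float_to_int16_c_sketch(int16_t *dst, const float *src, long len)
{
    long i;
    for (i = 0; i < len; i++) {
        long v = lrintf(src[i]);
        dst[i] = v < -32768 ? -32768 : v > 32767 ? 32767 : v;
    }
}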
2399 | |
2400 void ff_vp3_idct_mmx(int16_t *input_data);
2401 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2402 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2403
2404 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2405
2406 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2407 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2408
2409 void ff_vp3_idct_sse2(int16_t *input_data);
2410 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2411 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2412 |
8430 | 2413 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); |
2414 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); | |
2415 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); | |
11981 | 2416 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); |
2417 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); | |
2418 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); | |
2419 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); | |
2420 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); | |
10431 | 2421 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); |
2422 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); | |
2423 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); | |
10645 | 2424 |
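/* The scalarproduct helpers declared above are implemented in external yasm
 * source; as a plain-C sketch of the contract (the name below is illustrative
 * and not part of the original file): scalarproduct_and_madd_int16 returns
 * the dot product of v1 and v2 while simultaneously doing v1[i] += mul*v3[i]. */
static int32_t scalarproduct_and_madd_int16_sketch(int16_t *v1, const int16_t *v2,
                                                   const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }
    return res;
}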
2425 #if !HAVE_YASM |
8430 | 2426 #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) |
2427 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |
2428 #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) | |
2429 #endif | |
2430 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse | |
2431 | |
2432 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ | |
2433 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ | |
2434 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |
11369 | 2435 DECLARE_ALIGNED(16, int16_t, tmp)[len];\ |
8430 | 2436 int i,j,c;\ |
2437 for(c=0; c<channels; c++){\ | |
2438 float_to_int16_##cpu(tmp, src[c], len);\ | |
2439 for(i=0, j=c; i<len; i++, j+=channels)\ | |
2440 dst[j] = tmp[i];\ | |
2441 }\ | |
2442 }\ | |
2443 \ | |
2444 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |
2445 if(channels==1)\ | |
2446 float_to_int16_##cpu(dst, src[0], len);\ | |
2447 else if(channels==2){\ | |
2448 x86_reg reglen = len; \ | |
2449 const float *src0 = src[0];\ | |
2450 const float *src1 = src[1];\ | |
2451 __asm__ volatile(\ | |
2452 "shl $2, %0 \n"\ | |
2453 "add %0, %1 \n"\ | |
2454 "add %0, %2 \n"\ | |
2455 "add %0, %3 \n"\ | |
2456 "neg %0 \n"\ | |
2457 body\ | |
2458 :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ | |
2459 );\ | |
2460 }else if(channels==6){\ | |
2461 ff_float_to_int16_interleave6_##cpu(dst, src, len);\ | |
2462 }else\ | |
2463 float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ | |
2464 } | |
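/* Sketch of the channels==2 case that the asm "body" argument implements:
 * convert one block of samples from each channel and store them interleaved
 * (c0 c1 c0 c1 ...). Illustrative only; av_clip_int16() is from libavutil
 * and lrintf() assumes <math.h>. */
static void float_to_int16_interleave2_c_sketch(int16_t *dst, const float *src0,
                                                const float *src1, long len)
{
    long i;
    for (i = 0; i < len; i++) {
        dst[2*i  ] = av_clip_int16(lrintf(src0[i]));
        dst[2*i+1] = av_clip_int16(lrintf(src1[i]));
    }
}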
2465 | |
2466 FLOAT_TO_INT16_INTERLEAVE(3dnow, | |
2467 "1: \n" | |
2468 "pf2id (%2,%0), %%mm0 \n" | |
2469 "pf2id 8(%2,%0), %%mm1 \n" | |
2470 "pf2id (%3,%0), %%mm2 \n" | |
2471 "pf2id 8(%3,%0), %%mm3 \n" | |
2472 "packssdw %%mm1, %%mm0 \n" | |
2473 "packssdw %%mm3, %%mm2 \n" | |
2474 "movq %%mm0, %%mm1 \n" | |
2475 "punpcklwd %%mm2, %%mm0 \n" | |
2476 "punpckhwd %%mm2, %%mm1 \n" | |
2477 "movq %%mm0, (%1,%0)\n" | |
2478 "movq %%mm1, 8(%1,%0)\n" | |
2479 "add $16, %0 \n" | |
2480 "js 1b \n" | |
2481 "femms \n" | |
2482 ) | |
2483 | |
2484 FLOAT_TO_INT16_INTERLEAVE(sse, | |
2485 "1: \n" | |
2486 "cvtps2pi (%2,%0), %%mm0 \n" | |
2487 "cvtps2pi 8(%2,%0), %%mm1 \n" | |
2488 "cvtps2pi (%3,%0), %%mm2 \n" | |
2489 "cvtps2pi 8(%3,%0), %%mm3 \n" | |
2490 "packssdw %%mm1, %%mm0 \n" | |
2491 "packssdw %%mm3, %%mm2 \n" | |
2492 "movq %%mm0, %%mm1 \n" | |
2493 "punpcklwd %%mm2, %%mm0 \n" | |
2494 "punpckhwd %%mm2, %%mm1 \n" | |
2495 "movq %%mm0, (%1,%0)\n" | |
2496 "movq %%mm1, 8(%1,%0)\n" | |
2497 "add $16, %0 \n" | |
2498 "js 1b \n" | |
2499 "emms \n" | |
2500 ) | |
2501 | |
2502 FLOAT_TO_INT16_INTERLEAVE(sse2, | |
2503 "1: \n" | |
2504 "cvtps2dq (%2,%0), %%xmm0 \n" | |
2505 "cvtps2dq (%3,%0), %%xmm1 \n" | |
2506 "packssdw %%xmm1, %%xmm0 \n" | |
2507 "movhlps %%xmm0, %%xmm1 \n" | |
2508 "punpcklwd %%xmm1, %%xmm0 \n" | |
2509 "movdqa %%xmm0, (%1,%0) \n" | |
2510 "add $16, %0 \n" | |
2511 "js 1b \n" | |
2512 ) | |
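/* In the SSE2 body above, packssdw leaves the four src[0] samples in the low
 * half of xmm0 and the four src[1] samples in the high half; the
 * movhlps + punpcklwd pair then restores interleaved c0 c1 c0 c1 ... order
 * before the 16-byte store. */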
2513 | |
2514 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ | |
2515 if(channels==6) | |
2516 ff_float_to_int16_interleave6_3dn2(dst, src, len); | |
2517 else | |
2518 float_to_int16_interleave_3dnow(dst, src, len, channels); | |
2519 } | |
2520 | |
10964 | 2521 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); |
2522 | |
8430 | 2523 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
2524 { | |
12414 | 2525 int mm_flags = mm_support(); |
8430 | 2526 |
2527 if (avctx->dsp_mask) { | |
2528 if (avctx->dsp_mask & FF_MM_FORCE) | |
2529 mm_flags |= (avctx->dsp_mask & 0xffff); | |
2530 else | |
2531 mm_flags &= ~(avctx->dsp_mask & 0xffff); | |
2532 } | |
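    /* Illustrative caller usage (not part of this file): setting
     * avctx->dsp_mask = FF_MM_SSE2 clears that flag here and disables the
     * SSE2 paths below, while OR-ing FF_MM_FORCE into the mask makes the
     * listed flags be added instead of removed. */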
2533 | |
2534 #if 0 | |
2535 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); | |
2536 if (mm_flags & FF_MM_MMX) | |
2537 av_log(avctx, AV_LOG_INFO, " mmx"); | |
2538 if (mm_flags & FF_MM_MMX2) |
2539 av_log(avctx, AV_LOG_INFO, " mmx2"); |
8430 | 2540 if (mm_flags & FF_MM_3DNOW) |
2541 av_log(avctx, AV_LOG_INFO, " 3dnow"); | |
2542 if (mm_flags & FF_MM_SSE) | |
2543 av_log(avctx, AV_LOG_INFO, " sse"); | |
2544 if (mm_flags & FF_MM_SSE2) | |
2545 av_log(avctx, AV_LOG_INFO, " sse2"); | |
2546 av_log(avctx, AV_LOG_INFO, "\n"); | |
2547 #endif | |
2548 | |
2549 if (mm_flags & FF_MM_MMX) { | |
2550 const int idct_algo= avctx->idct_algo; | |
2551 | |
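        /* Pick the IDCT from avctx->idct_algo; most branches also set
         * idct_permutation_type so coefficients can be pre-permuted into the
         * scan order that the chosen IDCT expects. */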
2552 if(avctx->lowres==0){ | |
2553 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ | |
2554 c->idct_put= ff_simple_idct_put_mmx; | |
2555 c->idct_add= ff_simple_idct_add_mmx; | |
2556 c->idct = ff_simple_idct_mmx; | |
2557 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; | |
8590 | 2558 #if CONFIG_GPL |
8430 | 2559 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ |
2560 if(mm_flags & FF_MM_MMX2){ |
8430 | 2561 c->idct_put= ff_libmpeg2mmx2_idct_put; |
2562 c->idct_add= ff_libmpeg2mmx2_idct_add; | |
2563 c->idct = ff_mmxext_idct; | |
2564 }else{ | |
2565 c->idct_put= ff_libmpeg2mmx_idct_put; | |
2566 c->idct_add= ff_libmpeg2mmx_idct_add; | |
2567 c->idct = ff_mmx_idct; | |
2568 } | |
2569 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
2570 #endif | |
2571 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2572 idct_algo==FF_IDCT_VP3 && HAVE_YASM){ |
8430 | 2573 if(mm_flags & FF_MM_SSE2){ |
2574 c->idct_put= ff_vp3_idct_put_sse2; | |
2575 c->idct_add= ff_vp3_idct_add_sse2; | |
2576 c->idct = ff_vp3_idct_sse2; | |
2577 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; | |
2578 }else{ | |
2579 c->idct_put= ff_vp3_idct_put_mmx; | |
2580 c->idct_add= ff_vp3_idct_add_mmx; | |
2581 c->idct = ff_vp3_idct_mmx; | |
2582 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; | |
2583 } | |
2584 }else if(idct_algo==FF_IDCT_CAVS){ | |
2585 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; | |
2586 }else if(idct_algo==FF_IDCT_XVIDMMX){ | |
2587 if(mm_flags & FF_MM_SSE2){ | |
2588 c->idct_put= ff_idct_xvid_sse2_put; | |
2589 c->idct_add= ff_idct_xvid_sse2_add; | |
2590 c->idct = ff_idct_xvid_sse2; | |
2591 c->idct_permutation_type= FF_SSE2_IDCT_PERM; | |
2592 }else if(mm_flags & FF_MM_MMX2){ |
8430 | 2593 c->idct_put= ff_idct_xvid_mmx2_put; |
2594 c->idct_add= ff_idct_xvid_mmx2_add; | |
2595 c->idct = ff_idct_xvid_mmx2; | |
2596 }else{ | |
2597 c->idct_put= ff_idct_xvid_mmx_put; | |
2598 c->idct_add= ff_idct_xvid_mmx_add; | |
2599 c->idct = ff_idct_xvid_mmx; | |
2600 } | |
2601 } | |
2602 } | |
2603 | |
2604 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2605 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2606 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
8430 | 2607 c->clear_block = clear_block_mmx; |
2608 c->clear_blocks = clear_blocks_mmx; | |
2609 if ((mm_flags & FF_MM_SSE) &&
2610 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2611 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
9861 | 2612 c->clear_block = clear_block_sse; |
2613 c->clear_blocks = clear_blocks_sse; | |
2614 } | |
8430 | 2615 |
2616 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ | |
2617 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ | |
2618 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ | |
2619 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ | |
2620 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU | |
2621 | |
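        /* pixels_tab[IDX][0..3] = full-pel copy, horizontal half-pel (x2),
         * vertical half-pel (y2) and diagonal half-pel (xy2); IDX 0 covers
         * 16-pixel-wide blocks, IDX 1 covers 8-pixel-wide blocks. */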
2622 SET_HPEL_FUNCS(put, 0, 16, mmx); | |
2623 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx); | |
2624 SET_HPEL_FUNCS(avg, 0, 16, mmx); | |
2625 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx); | |
2626 SET_HPEL_FUNCS(put, 1, 8, mmx); | |
2627 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); | |
2628 SET_HPEL_FUNCS(avg, 1, 8, mmx); | |
2629 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); | |
2630 | |
2631 c->gmc= gmc_mmx; | |
2632 | |
2633 c->add_bytes= add_bytes_mmx; | |
2634 c->add_bytes_l2= add_bytes_l2_mmx; | |
2635 | |
2636 c->draw_edges = draw_edges_mmx; | |
2637 | |
2638 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
8430 | 2639 c->h263_v_loop_filter= h263_v_loop_filter_mmx; |
2640 c->h263_h_loop_filter= h263_h_loop_filter_mmx; | |
2641 } | |
2642 | |
2643 #if HAVE_YASM
2644 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2645 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2646 c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
2647
2648 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
2649 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
2650 #endif
2651 |
2652 if (mm_flags & FF_MM_MMX2) { |
8430 | 2653 c->prefetch = prefetch_mmx2; |
2654 | |
2655 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; | |
2656 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; | |
2657 | |
2658 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; | |
2659 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; | |
2660 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; | |
2661 | |
2662 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; | |
2663 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; | |
2664 | |
2665 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; | |
2666 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; | |
2667 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; | |
2668 | |
2669 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2670 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | |
2671 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; | |
2672 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; | |
2673 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; | |
2674 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; | |
2675 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; | |
2676 | |
2677 if (CONFIG_VP3_DECODER && HAVE_YASM) { |
8430 | 2678 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2; |
2679 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2; | |
2680 } | |
2681 } | |
2682 if (CONFIG_VP3_DECODER && HAVE_YASM) { |
11637 | 2683 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2; |
2684 } | |
8430 | 2685 |
2686 if (CONFIG_VP3_DECODER
2687 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2688 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2689 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2690 }
2691 |
8430 | 2692 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ |
2693 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ | |
2694 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \ | |
2695 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \ | |
2696 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \ | |
2697 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \ | |
2698 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \ | |
2699 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \ | |
2700 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \ | |
2701 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \ | |
2702 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \ | |
2703 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \ | |
2704 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \ | |
2705 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \ | |
2706 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \ | |
2707 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \ | |
2708 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU | |
2709 | |
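        /* pixels_tab[IDX][x + 4*y] is the mc<x><y> quarter-pel function, with
         * x and y the horizontal/vertical quarter-sample offsets (0..3);
         * IDX 0 = 16x16, IDX 1 = 8x8, IDX 2 = 4x4 (H.264 qpel only). */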
2710 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2); | |
2711 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2); | |
2712 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2); | |
2713 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2); | |
2714 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2); | |
2715 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2); | |
2716 | |
2717 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2); | |
2718 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2); | |
2719 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2); | |
2720 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2); | |
2721 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2); | |
2722 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2); | |
2723 | |
2724 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2); | |
2725 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2); | |
2726 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2); | |
2727 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2); | |
2728 | |
2729 #if HAVE_YASM
2730 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2731 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2732 |
2733 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd; |
9440 | 2734 |
2735 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2736 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2737 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2738 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
8430 | 2739 |
8760 | 2740 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; |
2741 #endif | |
2742 #if HAVE_7REGS && HAVE_TEN_OPERANDS |
8760 | 2743 if( mm_flags&FF_MM_3DNOW ) |
2744 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; | |
2745 #endif | |
2746 | |
2747 if (CONFIG_VC1_DECODER) |
8430 | 2748 ff_vc1dsp_init_mmx(c, avctx); |
2749 | |
2750 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; | |
2751 } else if (mm_flags & FF_MM_3DNOW) { | |
2752 c->prefetch = prefetch_3dnow; | |
2753 | |
2754 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; | |
2755 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; | |
2756 | |
2757 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; | |
2758 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; | |
2759 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; | |
2760 | |
2761 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; | |
2762 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; | |
2763 | |
2764 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; | |
2765 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; | |
2766 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; | |
2767 | |
2768 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2769 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; | |
2770 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; | |
2771 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; | |
2772 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; | |
2773 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; | |
2774 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; | |
2775 } | |
2776 | |
2777 if (CONFIG_VP3_DECODER
2778 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2779 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2780 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2781 }
2782 |
8430 | 2783 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow); |
2784 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow); | |
2785 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow); | |
2786 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow); | |
2787 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow); | |
2788 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow); | |
2789 | |
2790 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow); | |
2791 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow); | |
2792 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow); | |
2793 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow); | |
2794 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow); | |
2795 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow); | |
2796 | |
2797 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow); | |
2798 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow); | |
2799 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow); | |
2800 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); | |
2801 | |
2802 #if HAVE_YASM
2803 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2804 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
8430 | 2805 |
2806 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd;
2807
2808 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2809 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2810 #endif
8430 | 2811 } |
2812 | |
2813 | |
2814 #define H264_QPEL_FUNCS(x, y, CPU)\ | |
2815 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ | |
2816 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ | |
2817 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ | |
2818 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; | |
2819 if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){ | |
2820 // these functions are slower than mmx on AMD, but faster on Intel | |
2821 c->put_pixels_tab[0][0] = put_pixels16_sse2; | |
2822 c->avg_pixels_tab[0][0] = avg_pixels16_sse2; | |
2823 H264_QPEL_FUNCS(0, 0, sse2); | |
2824 } | |
2825 if(mm_flags & FF_MM_SSE2){ | |
2826 H264_QPEL_FUNCS(0, 1, sse2); | |
2827 H264_QPEL_FUNCS(0, 2, sse2); | |
2828 H264_QPEL_FUNCS(0, 3, sse2); | |
2829 H264_QPEL_FUNCS(1, 1, sse2); | |
2830 H264_QPEL_FUNCS(1, 2, sse2); | |
2831 H264_QPEL_FUNCS(1, 3, sse2); | |
2832 H264_QPEL_FUNCS(2, 1, sse2); | |
2833 H264_QPEL_FUNCS(2, 2, sse2); | |
2834 H264_QPEL_FUNCS(2, 3, sse2); | |
2835 H264_QPEL_FUNCS(3, 1, sse2); | |
2836 H264_QPEL_FUNCS(3, 2, sse2); | |
2837 H264_QPEL_FUNCS(3, 3, sse2); | |
2838 } | |
8590 | 2839 #if HAVE_SSSE3 |
8430 | 2840 if(mm_flags & FF_MM_SSSE3){ |
2841 H264_QPEL_FUNCS(1, 0, ssse3); | |
2842 H264_QPEL_FUNCS(1, 1, ssse3); | |
2843 H264_QPEL_FUNCS(1, 2, ssse3); | |
2844 H264_QPEL_FUNCS(1, 3, ssse3); | |
2845 H264_QPEL_FUNCS(2, 0, ssse3); | |
2846 H264_QPEL_FUNCS(2, 1, ssse3); | |
2847 H264_QPEL_FUNCS(2, 2, ssse3); | |
2848 H264_QPEL_FUNCS(2, 3, ssse3); | |
2849 H264_QPEL_FUNCS(3, 0, ssse3); | |
2850 H264_QPEL_FUNCS(3, 1, ssse3); | |
2851 H264_QPEL_FUNCS(3, 2, ssse3); | |
2852 H264_QPEL_FUNCS(3, 3, ssse3); | |
2853 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; | |
10430 | 2854 #if HAVE_YASM |
2855 c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
2856 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
2857 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2858 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2859 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2860 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
10430 | 2861 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; |
2862 if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe | |
2863 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; | |
2864 #endif | |
8430 | 2865 } |
2866 #endif | |
2867 | |
2868 if(mm_flags & FF_MM_3DNOW){ | |
2869 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; | |
2870 c->vector_fmul = vector_fmul_3dnow; | |
2871 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2872 c->float_to_int16 = float_to_int16_3dnow; | |
2873 c->float_to_int16_interleave = float_to_int16_interleave_3dnow; | |
2874 } | |
2875 } | |
2876 if(mm_flags & FF_MM_3DNOWEXT){ | |
2877 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; | |
2878 c->vector_fmul_window = vector_fmul_window_3dnow2; | |
2879 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
2880 c->float_to_int16_interleave = float_to_int16_interleave_3dn2; | |
2881 } | |
2882 } | |
10633 | 2883 if(mm_flags & FF_MM_MMX2){ |
2884 #if HAVE_YASM | |
2885 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2; | |
10644 | 2886 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2; |
10633 | 2887 #endif |
2888 } | |
8430 | 2889 if(mm_flags & FF_MM_SSE){ |
2890 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; | |
2891 c->ac3_downmix = ac3_downmix_sse; | |
2892 c->vector_fmul = vector_fmul_sse; | |
2893 c->vector_fmul_reverse = vector_fmul_reverse_sse; | |
2894 c->vector_fmul_add = vector_fmul_add_sse; |
8430 | 2895 c->vector_fmul_window = vector_fmul_window_sse; |
2896 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; | |
2897 c->vector_clipf = vector_clipf_sse; |
8430 | 2898 c->float_to_int16 = float_to_int16_sse; |
2899 c->float_to_int16_interleave = float_to_int16_interleave_sse; | |
10964 | 2900 #if HAVE_YASM |
2901 c->scalarproduct_float = ff_scalarproduct_float_sse; | |
2902 #endif | |
8430 | 2903 } |
2904 if(mm_flags & FF_MM_3DNOW) | |
2905 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse |
8430 | 2906 if(mm_flags & FF_MM_SSE2){ |
2907 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; | |
2908 c->float_to_int16 = float_to_int16_sse2; | |
2909 c->float_to_int16_interleave = float_to_int16_interleave_sse2; | |
10633 | 2910 #if HAVE_YASM |
2911 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | |
10644 | 2912 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; |
10633 | 2913 #endif |
8430 | 2914 } |
10644 | 2915 if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit |
2916 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; | |
8430 | 2917 } |
2918 | |
2919 if (CONFIG_ENCODERS) |
8430 | 2920 dsputilenc_init_mmx(c, avctx); |
2921 | |
2922 #if 0 | |
2923 // for speed testing | |
2924 get_pixels = just_return; | |
2925 put_pixels_clamped = just_return; | |
2926 add_pixels_clamped = just_return; | |
2927 | |
2928 pix_abs16x16 = just_return; | |
2929 pix_abs16x16_x2 = just_return; | |
2930 pix_abs16x16_y2 = just_return; | |
2931 pix_abs16x16_xy2 = just_return; | |
2932 | |
2933 put_pixels_tab[0] = just_return; | |
2934 put_pixels_tab[1] = just_return; | |
2935 put_pixels_tab[2] = just_return; | |
2936 put_pixels_tab[3] = just_return; | |
2937 | |
2938 put_no_rnd_pixels_tab[0] = just_return; | |
2939 put_no_rnd_pixels_tab[1] = just_return; | |
2940 put_no_rnd_pixels_tab[2] = just_return; | |
2941 put_no_rnd_pixels_tab[3] = just_return; | |
2942 | |
2943 avg_pixels_tab[0] = just_return; | |
2944 avg_pixels_tab[1] = just_return; | |
2945 avg_pixels_tab[2] = just_return; | |
2946 avg_pixels_tab[3] = just_return; | |
2947 | |
2948 avg_no_rnd_pixels_tab[0] = just_return; | |
2949 avg_no_rnd_pixels_tab[1] = just_return; | |
2950 avg_no_rnd_pixels_tab[2] = just_return; | |
2951 avg_no_rnd_pixels_tab[3] = just_return; | |
2952 | |
2953 //av_fdct = just_return; | |
2954 //ff_idct = just_return; | |
2955 #endif | |
2956 } |