annotate x86/cavsdsp_mmx.c @ 12483:0159a19bfff7 libavcodec
/*
 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
 * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
 *
 * MMX-optimized DSP functions, based on H.264 optimizations by
 * Michael Niedermayer and Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/cavsdsp.h"
#include "dsputil_mmx.h"

/*****************************************************************************
 *
 * inverse transform
 *
 ****************************************************************************/

static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
    __asm__ volatile(
        "movq 112(%0), %%mm4   \n\t" /* mm4 = src7 */
        "movq  16(%0), %%mm5   \n\t" /* mm5 = src1 */
        "movq  80(%0), %%mm2   \n\t" /* mm2 = src5 */
        "movq  48(%0), %%mm7   \n\t" /* mm7 = src3 */
        "movq  %%mm4, %%mm0    \n\t"
        "movq  %%mm5, %%mm3    \n\t"
        "movq  %%mm2, %%mm6    \n\t"
        "movq  %%mm7, %%mm1    \n\t"

        "paddw %%mm4, %%mm4    \n\t" /* mm4 = 2*src7 */
        "paddw %%mm3, %%mm3    \n\t" /* mm3 = 2*src1 */
        "paddw %%mm6, %%mm6    \n\t" /* mm6 = 2*src5 */
        "paddw %%mm1, %%mm1    \n\t" /* mm1 = 2*src3 */
        "paddw %%mm4, %%mm0    \n\t" /* mm0 = 3*src7 */
        "paddw %%mm3, %%mm5    \n\t" /* mm5 = 3*src1 */
        "paddw %%mm6, %%mm2    \n\t" /* mm2 = 3*src5 */
        "paddw %%mm1, %%mm7    \n\t" /* mm7 = 3*src3 */
        "psubw %%mm4, %%mm5    \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
        "paddw %%mm6, %%mm7    \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
        "psubw %%mm2, %%mm1    \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
        "paddw %%mm0, %%mm3    \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */

        "movq  %%mm5, %%mm4    \n\t"
        "movq  %%mm7, %%mm6    \n\t"
        "movq  %%mm3, %%mm0    \n\t"
        "movq  %%mm1, %%mm2    \n\t"
        SUMSUB_BA( %%mm7, %%mm5 )    /* mm7 = a0 + a1  mm5 = a0 - a1 */
        "paddw %%mm3, %%mm7    \n\t" /* mm7 = a0 + a1 + a3 */
        "paddw %%mm1, %%mm5    \n\t" /* mm5 = a0 - a1 + a2 */
        "paddw %%mm7, %%mm7    \n\t"
        "paddw %%mm5, %%mm5    \n\t"
        "paddw %%mm6, %%mm7    \n\t" /* mm7 = b4 */
        "paddw %%mm4, %%mm5    \n\t" /* mm5 = b5 */

        SUMSUB_BA( %%mm1, %%mm3 )    /* mm1 = a3 + a2  mm3 = a3 - a2 */
        "psubw %%mm1, %%mm4    \n\t" /* mm4 = a0 - a2 - a3 */
        "movq  %%mm4, %%mm1    \n\t" /* mm1 = a0 - a2 - a3 */
        "psubw %%mm6, %%mm3    \n\t" /* mm3 = a3 - a2 - a1 */
        "paddw %%mm1, %%mm1    \n\t"
        "paddw %%mm3, %%mm3    \n\t"
        "psubw %%mm2, %%mm1    \n\t" /* mm1 = b7 */
        "paddw %%mm0, %%mm3    \n\t" /* mm3 = b6 */

        "movq  32(%0), %%mm2   \n\t" /* mm2 = src2 */
        "movq  96(%0), %%mm6   \n\t" /* mm6 = src6 */
        "movq  %%mm2, %%mm4    \n\t"
        "movq  %%mm6, %%mm0    \n\t"
        "psllw $2, %%mm4       \n\t" /* mm4 = 4*src2 */
        "psllw $2, %%mm6       \n\t" /* mm6 = 4*src6 */
        "paddw %%mm4, %%mm2    \n\t" /* mm2 = 5*src2 */
        "paddw %%mm6, %%mm0    \n\t" /* mm0 = 5*src6 */
        "paddw %%mm2, %%mm2    \n\t"
        "paddw %%mm0, %%mm0    \n\t"
        "psubw %%mm0, %%mm4    \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
        "paddw %%mm2, %%mm6    \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */

        "movq    (%0), %%mm2   \n\t" /* mm2 = src0 */
        "movq  64(%0), %%mm0   \n\t" /* mm0 = src4 */
        SUMSUB_BA( %%mm0, %%mm2 )    /* mm0 = src0+src4  mm2 = src0-src4 */
        "psllw $3, %%mm0       \n\t"
        "psllw $3, %%mm2       \n\t"
        "paddw %1, %%mm0       \n\t" /* add rounding bias */
        "paddw %1, %%mm2       \n\t" /* add rounding bias */

        SUMSUB_BA( %%mm6, %%mm0 )    /* mm6 = a4 + a6  mm0 = a4 - a6 */
        SUMSUB_BA( %%mm4, %%mm2 )    /* mm4 = a5 + a7  mm2 = a5 - a7 */
        SUMSUB_BA( %%mm7, %%mm6 )    /* mm7 = dst0  mm6 = dst7 */
        SUMSUB_BA( %%mm5, %%mm4 )    /* mm5 = dst1  mm4 = dst6 */
        SUMSUB_BA( %%mm3, %%mm2 )    /* mm3 = dst2  mm2 = dst5 */
        SUMSUB_BA( %%mm1, %%mm0 )    /* mm1 = dst3  mm0 = dst4 */
        :: "r"(block), "m"(bias)
    );
}
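
/*
 * Illustrative scalar reference for the 1-D pass above (not part of the
 * original file; the name and output array are invented for this sketch).
 * It spells out the a0..a7 and b4..b7 terms named in the register comments,
 * for one column of the 8x8 block (stride 8). The MMX version instead works
 * on four columns at once and leaves dst0..dst7 in mm0-mm7; the caller in
 * cavs_idct8_add_mmx then shifts by 3 (first pass, bias 4) or 7 (second
 * pass, bias 64).
 */
static void cavs_idct8_1d_ref(const int16_t *col, int bias, int dst[8])
{
    const int a0 = 3*col[1*8] - 2*col[7*8];
    const int a1 = 3*col[3*8] + 2*col[5*8];
    const int a2 = 2*col[3*8] - 3*col[5*8];
    const int a3 = 2*col[1*8] + 3*col[7*8];

    const int b4 = 2*(a0 + a1 + a3) + a1;
    const int b5 = 2*(a0 - a1 + a2) + a0;
    const int b6 = 2*(a3 - a2 - a1) + a3;
    const int b7 = 2*(a0 - a2 - a3) - a2;

    const int a6 = 10*col[2*8] +  4*col[6*8];
    const int a7 =  4*col[2*8] - 10*col[6*8];
    const int a4 = 8*(col[0*8] + col[4*8]) + bias;
    const int a5 = 8*(col[0*8] - col[4*8]) + bias;

    dst[0] = a4 + a6 + b4;  dst[7] = a4 + a6 - b4;
    dst[1] = a5 + a7 + b5;  dst[6] = a5 + a7 - b5;
    dst[2] = a5 - a7 + b6;  dst[5] = a5 - a7 - b6;
    dst[3] = a4 - a6 + b7;  dst[4] = a4 - a6 - b7;
}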

static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    DECLARE_ALIGNED(8, int16_t, b2)[64];

    for(i=0; i<2; i++){
        DECLARE_ALIGNED(8, uint64_t, tmp);

        cavs_idct8_1d(block+4*i, ff_pw_4.a);

        __asm__ volatile(
            "psraw $3, %%mm7       \n\t"
            "psraw $3, %%mm6       \n\t"
            "psraw $3, %%mm5       \n\t"
            "psraw $3, %%mm4       \n\t"
            "psraw $3, %%mm3       \n\t"
            "psraw $3, %%mm2       \n\t"
            "psraw $3, %%mm1       \n\t"
            "psraw $3, %%mm0       \n\t"
            "movq  %%mm7, %0       \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq  %%mm0,  8(%1)   \n\t"
            "movq  %%mm6, 24(%1)   \n\t"
            "movq  %%mm7, 40(%1)   \n\t"
            "movq  %%mm4, 56(%1)   \n\t"
            "movq  %0, %%mm7       \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq  %%mm7,   (%1)   \n\t"
            "movq  %%mm1, 16(%1)   \n\t"
            "movq  %%mm0, 32(%1)   \n\t"
            "movq  %%mm3, 48(%1)   \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    for(i=0; i<2; i++){
        cavs_idct8_1d(b2+4*i, ff_pw_64.a);

        __asm__ volatile(
            "psraw $7, %%mm7       \n\t"
            "psraw $7, %%mm6       \n\t"
            "psraw $7, %%mm5       \n\t"
            "psraw $7, %%mm4       \n\t"
            "psraw $7, %%mm3       \n\t"
            "psraw $7, %%mm2       \n\t"
            "psraw $7, %%mm1       \n\t"
            "psraw $7, %%mm0       \n\t"
            "movq  %%mm7,    (%0)  \n\t"
            "movq  %%mm5,  16(%0)  \n\t"
            "movq  %%mm3,  32(%0)  \n\t"
            "movq  %%mm1,  48(%0)  \n\t"
            "movq  %%mm0,  64(%0)  \n\t"
            "movq  %%mm2,  80(%0)  \n\t"
            "movq  %%mm4,  96(%0)  \n\t"
            "movq  %%mm6, 112(%0)  \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    ff_add_pixels_clamped_mmx(b2, dst, stride);
}

/*****************************************************************************
 *
 * motion compensation
 *
 ****************************************************************************/

/* vertical filter [-1 -2 96 42 -7  0] */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
    "pmullw %5, %%mm6           \n\t"\
    "movq "#D", %%mm7           \n\t"\
192 "pmullw "MANGLE(MUL2)", %%mm7\n\t"\ |
8430 | 193 "psllw $3, "#E" \n\t"\ |
194 "psubw "#E", %%mm6 \n\t"\ | |
195 "psraw $3, "#E" \n\t"\ | |
196 "paddw %%mm7, %%mm6 \n\t"\ | |
197 "paddw "#E", %%mm6 \n\t"\ | |
198 "paddw "#B", "#B" \n\t"\ | |
199 "pxor %%mm7, %%mm7 \n\t"\ | |
200 "add %2, %0 \n\t"\ | |
201 "punpcklbw %%mm7, "#F" \n\t"\ | |
202 "psubw "#B", %%mm6 \n\t"\ | |
203 "psraw $1, "#B" \n\t"\ | |
204 "psubw "#A", %%mm6 \n\t"\ | |
205 "paddw %4, %%mm6 \n\t"\ | |
206 "psraw $7, %%mm6 \n\t"\ | |
207 "packuswb %%mm6, %%mm6 \n\t"\ | |
208 OP(%%mm6, (%1), A, d) \ | |
209 "add %3, %1 \n\t" | |
210 | |
211 /* vertical filter [ 0 -1 5 5 -1 0] */ | |
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
    "paddw "#D", %%mm6          \n\t"\
    "pmullw %5, %%mm6           \n\t"\
    "add %2, %0                 \n\t"\
    "punpcklbw %%mm7, "#F"      \n\t"\
    "psubw "#B", %%mm6          \n\t"\
    "psubw "#E", %%mm6          \n\t"\
    "paddw %4, %%mm6            \n\t"\
    "psraw $3, %%mm6            \n\t"\
    "packuswb %%mm6, %%mm6      \n\t"\
    OP(%%mm6, (%1), A, d)            \
    "add %3, %1                 \n\t"

/* vertical filter [ 0 -7 42 96 -2 -1] */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
231 "pmullw "MANGLE(MUL2)", %%mm6\n\t"\ |
8430 | 232 "movq "#D", %%mm7 \n\t"\ |
233 "pmullw %5, %%mm7 \n\t"\ | |
234 "psllw $3, "#B" \n\t"\ | |
235 "psubw "#B", %%mm6 \n\t"\ | |
236 "psraw $3, "#B" \n\t"\ | |
237 "paddw %%mm7, %%mm6 \n\t"\ | |
238 "paddw "#B", %%mm6 \n\t"\ | |
239 "paddw "#E", "#E" \n\t"\ | |
240 "pxor %%mm7, %%mm7 \n\t"\ | |
241 "add %2, %0 \n\t"\ | |
242 "punpcklbw %%mm7, "#F" \n\t"\ | |
243 "psubw "#E", %%mm6 \n\t"\ | |
244 "psraw $1, "#E" \n\t"\ | |
245 "psubw "#F", %%mm6 \n\t"\ | |
246 "paddw %4, %%mm6 \n\t"\ | |
247 "psraw $7, %%mm6 \n\t"\ | |
248 "packuswb %%mm6, %%mm6 \n\t"\ | |
249 OP(%%mm6, (%1), A, d) \ | |
250 "add %3, %1 \n\t" | |
251 | |
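/*
 * Illustrative scalar reference for the three vertical filters above (not
 * part of the original file; the name and the switch layout are invented for
 * this sketch). A..F are six consecutive rows of one column, matching the
 * register roles in QPEL_CAVSV1/2/3; the three tap sets are the ones used by
 * the _mc01/_mc02/_mc03 (1/4-, 1/2-, 3/4-pel vertical) cases wired up in
 * CAVS_MC further down.
 */
static uint8_t cavs_vfilter_ref(const uint8_t *src, int stride, int tap)
{
    const int A = src[-2*stride], B = src[-1*stride], C = src[0*stride];
    const int D = src[ 1*stride], E = src[ 2*stride], F = src[ 3*stride];
    int v;

    switch (tap) {
    case 1:  v = ( -A - 2*B + 96*C + 42*D -  7*E      + 64) >> 7; break; /* QPEL_CAVSV1 */
    case 2:  v = (      -B  +  5*C +  5*D -    E      +  4) >> 3; break; /* QPEL_CAVSV2 */
    default: v = (     -7*B + 42*C + 96*D -  2*E -  F + 64) >> 7; break; /* QPEL_CAVSV3 */
    }
    return av_clip_uint8(v); /* packuswb performs this clip in the MMX code */
}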

#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
      __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
        : "memory"\
      );\
      if(h==16){\
        __asm__ volatile(\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
            : "memory"\
        );\
      }\
      src += 4-(h+5)*srcStride;\
      dst += 4-h*dstStride;\
    }

#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq %5, %%mm6             \n\t"\
        "1:                         \n\t"\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        "movq  -1(%0), %%mm2        \n\t"\
        "movq   2(%0), %%mm4        \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        "movq %6, %%mm5             \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm5, %%mm1         \n\t"\
        "psraw $3, %%mm0            \n\t"\
        "psraw $3, %%mm1            \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm5, q)         \
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
        : "memory"\
    );\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42)    \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)       \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42)    \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

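/*
 * Illustrative scalar reference for the horizontal filter in cavs_qpel8_h
 * above (not part of the original file; the name is invented for this
 * sketch): the same [-1 5 5 -1]/8 half-pel taps, applied along one row.
 */
static uint8_t cavs_hfilter_ref(const uint8_t *src, int x)
{
    return av_clip_uint8((-src[x-1] + 5*src[x] + 5*src[x+1] - src[x+2] + 4) >> 3);
}
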
#define CAVS_MC(OPNAME, SIZE, MMX) \
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp "   \n\t"\
    "pavgusb " #temp ", " #a "        \n\t"\
    "mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp "   \n\t"\
    "pavgb " #temp ", " #a "          \n\t"\
    "mov" #size " " #a ", " #b "      \n\t"

QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
QPEL_CAVS(put_, PUT_OP, mmx2)
QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2)

CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
CAVS_MC(put_, 8, mmx2)
CAVS_MC(put_, 16,mmx2)
CAVS_MC(avg_, 8, mmx2)
CAVS_MC(avg_, 16,mmx2)

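/*
 * Descriptive note: the CAVS_MC instantiations above expand to the concrete
 * functions ff_{put,avg}_cavs_qpel{8,16}_mc{20,01,02,03}_{mmx2,3dnow}, which
 * the init functions below drop into the qpel function tables. The _mc00_
 * entries (plain pixel copies, no interpolation) come from elsewhere.
 */
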
static void ff_cavsdsp_init_mmx2(CAVSDSPContext* c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
}

static void ff_cavsdsp_init_3dnow(CAVSDSPContext* c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
}

void ff_cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX2)  ff_cavsdsp_init_mmx2 (c, avctx);
    if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx);
}
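
/*
 * Hedged sketch (not in this file) of how the entry point above is typically
 * reached: the generic CAVS DSP init in libavcodec/cavsdsp.c sets the C
 * versions and then lets the x86 init override them when MMX support is
 * compiled in. The guard and call-site details here are assumptions, shown
 * for orientation only.
 */
#if 0
void ff_cavsdsp_init(CAVSDSPContext *c, AVCodecContext *avctx)
{
    /* ... install C implementations of cavs_idct8_add and the qpel tables ... */
    if (HAVE_MMX)
        ff_cavsdsp_init_mmx(c, avctx);
}
#endif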