Mercurial > libavcodec.hg
annotate x86/cavsdsp_mmx.c @ 11557:53822d92c3f7 libavcodec
Make sure the EC code does not attempt to use inter based concealment if there
is no reference frame available. (this can happen because the EC code will attempt
to use reference frames even for I/IDR frames)
author | michael |
---|---|
date | Tue, 30 Mar 2010 20:46:46 +0000 |
parents | f5ccf2e590d6 |
children | fa452b243aa6 |
rev | line source |
---|---|
8430 | 1 /* |
2 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder. | |
3 * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de> | |
4 * | |
5 * MMX-optimized DSP functions, based on H.264 optimizations by | |
6 * Michael Niedermayer and Loren Merritt | |
7 * | |
8 * This file is part of FFmpeg. | |
9 * | |
10 * FFmpeg is free software; you can redistribute it and/or | |
11 * modify it under the terms of the GNU Lesser General Public | |
12 * License as published by the Free Software Foundation; either | |
13 * version 2.1 of the License, or (at your option) any later version. | |
14 * | |
15 * FFmpeg is distributed in the hope that it will be useful, | |
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 * Lesser General Public License for more details. | |
19 * | |
20 * You should have received a copy of the GNU Lesser General Public | |
21 * License along with FFmpeg; if not, write to the Free Software | |
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 */ | |
24 | |
25 #include "libavutil/common.h" | |
26 #include "libavutil/x86_cpu.h" | |
27 #include "libavcodec/dsputil.h" | |
28 #include "dsputil_mmx.h" | |
29 | |
30 /***************************************************************************** | |
31 * | |
32 * inverse transform | |
33 * | |
34 ****************************************************************************/ | |
35 | |
/**
 * One 1-D pass of the 8x8 CAVS inverse transform over a strip of four
 * columns (one mmx register = 4 int16 lanes; rows are read at 16-byte,
 * i.e. 8-coefficient, strides from block).
 *
 * @param block first int16_t of the 4-column strip
 * @param bias  packed-word rounding bias (e.g. ff_pw_4 / ff_pw_64) added
 *              to the shifted even part before the final butterflies
 *
 * No memory is written: the eight output rows are left in mm registers
 * for the caller to shift and store:
 *   mm7=dst0 mm5=dst1 mm3=dst2 mm1=dst3 mm0=dst4 mm2=dst5 mm4=dst6 mm6=dst7
 */
static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
    __asm__ volatile(
        /* odd part: combine src1/src3/src5/src7 into a0..a3 */
        "movq 112(%0), %%mm4  \n\t" /* mm4 = src7 */
        "movq  16(%0), %%mm5  \n\t" /* mm5 = src1 */
        "movq  80(%0), %%mm2  \n\t" /* mm2 = src5 */
        "movq  48(%0), %%mm7  \n\t" /* mm7 = src3 */
        "movq %%mm4, %%mm0    \n\t"
        "movq %%mm5, %%mm3    \n\t"
        "movq %%mm2, %%mm6    \n\t"
        "movq %%mm7, %%mm1    \n\t"

        "paddw %%mm4, %%mm4   \n\t" /* mm4 = 2*src7 */
        "paddw %%mm3, %%mm3   \n\t" /* mm3 = 2*src1 */
        "paddw %%mm6, %%mm6   \n\t" /* mm6 = 2*src5 */
        "paddw %%mm1, %%mm1   \n\t" /* mm1 = 2*src3 */
        "paddw %%mm4, %%mm0   \n\t" /* mm0 = 3*src7 */
        "paddw %%mm3, %%mm5   \n\t" /* mm5 = 3*src1 */
        "paddw %%mm6, %%mm2   \n\t" /* mm2 = 3*src5 */
        "paddw %%mm1, %%mm7   \n\t" /* mm7 = 3*src3 */
        "psubw %%mm4, %%mm5   \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
        "paddw %%mm6, %%mm7   \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
        "psubw %%mm2, %%mm1   \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
        "paddw %%mm0, %%mm3   \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */

        /* odd-part butterflies: a0..a3 -> b4..b7 */
        "movq %%mm5, %%mm4    \n\t"
        "movq %%mm7, %%mm6    \n\t"
        "movq %%mm3, %%mm0    \n\t"
        "movq %%mm1, %%mm2    \n\t"
        SUMSUB_BA( %%mm7, %%mm5 )   /* mm7 = a0 + a1  mm5 = a0 - a1 */
        "paddw %%mm3, %%mm7   \n\t" /* mm7 = a0 + a1 + a3 */
        "paddw %%mm1, %%mm5   \n\t" /* mm5 = a0 - a1 + a2 */
        "paddw %%mm7, %%mm7   \n\t"
        "paddw %%mm5, %%mm5   \n\t"
        "paddw %%mm6, %%mm7   \n\t" /* mm7 = b4 */
        "paddw %%mm4, %%mm5   \n\t" /* mm5 = b5 */

        SUMSUB_BA( %%mm1, %%mm3 )   /* mm1 = a3 + a2  mm3 = a3 - a2 */
        "psubw %%mm1, %%mm4   \n\t" /* mm4 = a0 - a2 - a3 */
        "movq %%mm4, %%mm1    \n\t" /* mm1 = a0 - a2 - a3 */
        "psubw %%mm6, %%mm3   \n\t" /* mm3 = a3 - a2 - a1 */
        "paddw %%mm1, %%mm1   \n\t"
        "paddw %%mm3, %%mm3   \n\t"
        "psubw %%mm2, %%mm1   \n\t" /* mm1 = b7 */
        "paddw %%mm0, %%mm3   \n\t" /* mm3 = b6 */

        /* even part: src2/src6 -> a6/a7 */
        "movq 32(%0), %%mm2   \n\t" /* mm2 = src2 */
        "movq 96(%0), %%mm6   \n\t" /* mm6 = src6 */
        "movq %%mm2, %%mm4    \n\t"
        "movq %%mm6, %%mm0    \n\t"
        "psllw $2, %%mm4      \n\t" /* mm4 = 4*src2 */
        "psllw $2, %%mm6      \n\t" /* mm6 = 4*src6 */
        "paddw %%mm4, %%mm2   \n\t" /* mm2 = 5*src2 */
        "paddw %%mm6, %%mm0   \n\t" /* mm0 = 5*src6 */
        "paddw %%mm2, %%mm2   \n\t"
        "paddw %%mm0, %%mm0   \n\t"
        "psubw %%mm0, %%mm4   \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
        "paddw %%mm2, %%mm6   \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */

        /* even part: src0/src4, scaled by 8 and biased for the later shift */
        "movq   (%0), %%mm2   \n\t" /* mm2 = src0 */
        "movq 64(%0), %%mm0   \n\t" /* mm0 = src4 */
        SUMSUB_BA( %%mm0, %%mm2 )   /* mm0 = src0+src4  mm2 = src0-src4 */
        "psllw $3, %%mm0      \n\t"
        "psllw $3, %%mm2      \n\t"
        "paddw %1, %%mm0      \n\t" /* add rounding bias */
        "paddw %1, %%mm2      \n\t" /* add rounding bias */

        /* final butterflies producing dst0..dst7 (see mapping above) */
        SUMSUB_BA( %%mm6, %%mm0 )   /* mm6 = a4 + a6  mm0 = a4 - a6 */
        SUMSUB_BA( %%mm4, %%mm2 )   /* mm4 = a5 + a7  mm2 = a5 - a7 */
        SUMSUB_BA( %%mm7, %%mm6 )   /* mm7 = dst0  mm6 = dst7 */
        SUMSUB_BA( %%mm5, %%mm4 )   /* mm5 = dst1  mm4 = dst6 */
        SUMSUB_BA( %%mm3, %%mm2 )   /* mm3 = dst2  mm2 = dst5 */
        SUMSUB_BA( %%mm1, %%mm0 )   /* mm1 = dst3  mm0 = dst4 */
        :: "r"(block), "m"(bias)
    );
}
112 | |
/**
 * Full 8x8 CAVS inverse transform plus reconstruction.
 *
 * Pass 1: 1-D transform of each 4-column half of block with bias 4,
 * >>3, transposed (4x4-wise) into the temporary b2[].  Pass 2: 1-D
 * transform of b2 with bias 64, >>7, stored back to b2 in row order.
 * Finally add_pixels_clamped_mmx() adds b2 to dst with saturation.
 */
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    DECLARE_ALIGNED(8, int16_t, b2)[64];

    for(i=0; i<2; i++){
        /* spill slot for mm7: TRANSPOSE4 needs a fifth scratch register */
        DECLARE_ALIGNED(8, uint64_t, tmp);

        cavs_idct8_1d(block+4*i, ff_pw_4);

        __asm__ volatile(
            /* first-pass descale (>>3), then transpose the two 4x4
             * quadrants of the result into b2 */
            "psraw $3, %%mm7      \n\t"
            "psraw $3, %%mm6      \n\t"
            "psraw $3, %%mm5      \n\t"
            "psraw $3, %%mm4      \n\t"
            "psraw $3, %%mm3      \n\t"
            "psraw $3, %%mm2      \n\t"
            "psraw $3, %%mm1      \n\t"
            "psraw $3, %%mm0      \n\t"
            "movq %%mm7, %0       \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq %%mm0,  8(%1)   \n\t"
            "movq %%mm6, 24(%1)   \n\t"
            "movq %%mm7, 40(%1)   \n\t"
            "movq %%mm4, 56(%1)   \n\t"
            "movq %0, %%mm7       \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq %%mm7,   (%1)   \n\t"
            "movq %%mm1, 16(%1)   \n\t"
            "movq %%mm0, 32(%1)   \n\t"
            "movq %%mm3, 48(%1)   \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    for(i=0; i<2; i++){
        cavs_idct8_1d(b2+4*i, ff_pw_64.a);

        __asm__ volatile(
            /* second-pass descale (>>7) and store in row order
             * (mm7..mm6 hold dst0..dst7 as left by cavs_idct8_1d) */
            "psraw $7, %%mm7      \n\t"
            "psraw $7, %%mm6      \n\t"
            "psraw $7, %%mm5      \n\t"
            "psraw $7, %%mm4      \n\t"
            "psraw $7, %%mm3      \n\t"
            "psraw $7, %%mm2      \n\t"
            "psraw $7, %%mm1      \n\t"
            "psraw $7, %%mm0      \n\t"
            "movq %%mm7,    (%0)  \n\t"
            "movq %%mm5,  16(%0)  \n\t"
            "movq %%mm3,  32(%0)  \n\t"
            "movq %%mm1,  48(%0)  \n\t"
            "movq %%mm0,  64(%0)  \n\t"
            "movq %%mm2,  80(%0)  \n\t"
            "movq %%mm4,  96(%0)  \n\t"
            "movq %%mm6, 112(%0)  \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    add_pixels_clamped_mmx(b2, dst, stride);
}
177 | |
178 /***************************************************************************** | |
179 * | |
180 * motion compensation | |
181 * | |
182 ****************************************************************************/ | |
183 | |
/* vertical filter [-1 -2 96 42 -7 0] */
/*
 * Emits one output row of the first quarter-pel vertical filter.
 * A..F name the mm registers holding six consecutive source rows; F is
 * loaded from (%0) and zero-extended inside the macro for the next call.
 * Operands: %2/%3 = src/dst strides, %4 = rounding bias, %5 = weight for
 * row C (96); MUL2 (42, for row D) is referenced via MANGLE because all
 * "m" operand slots are in use.  Result is >>7, packed to bytes and
 * written through OP to (%1); both pointers advance one line.
 */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F"            \n\t"\
        "movq "#C", %%mm6           \n\t"\
        "pmullw %5, %%mm6           \n\t"\
        "movq "#D", %%mm7           \n\t"\
        "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
        "psllw $3, "#E"             \n\t"\
        "psubw "#E", %%mm6          \n\t"\
        "psraw $3, "#E"             \n\t"\
        "paddw %%mm7, %%mm6         \n\t"\
        "paddw "#E", %%mm6          \n\t"\
        "paddw "#B", "#B"           \n\t"\
        "pxor %%mm7, %%mm7          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, "#F"      \n\t"\
        "psubw "#B", %%mm6          \n\t"\
        "psraw $1, "#B"             \n\t"\
        "psubw "#A", %%mm6          \n\t"\
        "paddw %4, %%mm6            \n\t"\
        "psraw $7, %%mm6            \n\t"\
        "packuswb %%mm6, %%mm6      \n\t"\
        OP(%%mm6, (%1), A, d)            \
        "add %3, %1                 \n\t"
208 | |
/* vertical filter [ 0 -1 5 5 -1 0] */
/*
 * Emits one output row of the half-pel vertical filter:
 * ((C+D)*5 - B - E + bias) >> 3.  Register and operand roles match
 * QPEL_CAVSV1 (%5 here is the 5 weight); MUL2 is unused but kept so all
 * three row kernels share one signature for QPEL_CAVSVNUM.
 */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F"            \n\t"\
        "movq "#C", %%mm6           \n\t"\
        "paddw "#D", %%mm6          \n\t"\
        "pmullw %5, %%mm6           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, "#F"      \n\t"\
        "psubw "#B", %%mm6          \n\t"\
        "psubw "#E", %%mm6          \n\t"\
        "paddw %4, %%mm6            \n\t"\
        "psraw $3, %%mm6            \n\t"\
        "packuswb %%mm6, %%mm6      \n\t"\
        OP(%%mm6, (%1), A, d)            \
        "add %3, %1                 \n\t"
224 | |
/* vertical filter [ 0 -7 42 96 -2 -1] */
/*
 * Emits one output row of the third quarter-pel vertical filter, the
 * mirror image of QPEL_CAVSV1: 42 (MANGLEd MUL2) on row C, %5 (96) on
 * row D, -7 on B, -2 on E, -1 on F, plus bias %4, >>7.  Register and
 * operand roles otherwise match QPEL_CAVSV1.
 */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F"            \n\t"\
        "movq "#C", %%mm6           \n\t"\
        "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
        "movq "#D", %%mm7           \n\t"\
        "pmullw %5, %%mm7           \n\t"\
        "psllw $3, "#B"             \n\t"\
        "psubw "#B", %%mm6          \n\t"\
        "psraw $3, "#B"             \n\t"\
        "paddw %%mm7, %%mm6         \n\t"\
        "paddw "#B", %%mm6          \n\t"\
        "paddw "#E", "#E"           \n\t"\
        "pxor %%mm7, %%mm7          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, "#F"      \n\t"\
        "psubw "#E", %%mm6          \n\t"\
        "psraw $1, "#E"             \n\t"\
        "psubw "#F", %%mm6          \n\t"\
        "paddw %4, %%mm6            \n\t"\
        "psraw $7, %%mm6            \n\t"\
        "packuswb %%mm6, %%mm6      \n\t"\
        OP(%%mm6, (%1), A, d)            \
        "add %3, %1                 \n\t"
249 | |
250 | |
/*
 * Drives one of the QPEL_CAVSV* row kernels down an 8- or 16-row block,
 * 4 pixels wide per pass (w=2 passes -> 8 pixels total).  src is backed
 * up two rows because the vertical filter needs context above the block.
 * mm0..mm4 are primed with the first five rows; each VOP then produces
 * one output row while loading the next source row, with the register
 * roles rotated by one position per call.  h must be 8 or 16 (the second
 * asm block continues the register rotation for rows 8..15).
 */
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
      __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
        : "memory"\
      );\
      if(h==16){\
        __asm__ volatile(\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
            : "memory"\
        );\
      }\
      /* step to the next 4-pixel-wide column strip */\
      src += 4-(h+5)*srcStride;\
      dst += 4-h*dstStride;\
    }
305 | |
/*
 * Template for one put_/avg_ instruction-set variant of the CAVS
 * sub-pel kernels.  The horizontal half-pel kernel computes
 * ((src[0]+src[1])*5 - (src[-1]+src[2]) + 4) >> 3 for 8 pixels per row
 * (low/high byte halves processed in mm0/mm1); the vertical kernels are
 * generated through QPEL_CAVSVNUM with the weights listed below.
 */
#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq %5, %%mm6             \n\t"\
        "1:                         \n\t"\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        "movq  -1(%0), %%mm2        \n\t"\
        "movq   2(%0), %%mm4        \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        "movq %6, %%mm5             \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm5, %%mm1         \n\t"\
        "psraw $3, %%mm0            \n\t"\
        "psraw $3, %%mm1            \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm5, q)         \
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
        : "memory"\
    );\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42)\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42)\
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

/*
 * Generates the four public sub-pel entry points for one OPNAME
 * (put_/avg_), block SIZE (8/16) and instruction-set suffix MMX:
 * mc20 = horizontal half-pel, mc01/mc02/mc03 = the three vertical
 * sub-pel positions (quarter, half, three-quarter).
 */
#define CAVS_MC(OPNAME, SIZE, MMX) \
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

/* Store helpers plugged into the templates above: PUT_OP writes the
 * result, the AVG variants average it with the bytes already at the
 * destination (pavgusb on 3DNow!, pavgb on MMX2) before storing. */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "    \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
425 | |
/* Instantiate the kernel templates and public entry points for every
 * combination of operation (put/avg), block size (8/16) and
 * instruction set (3DNow!/MMX2). */
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
QPEL_CAVS(put_, PUT_OP, mmx2)
QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2)

CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
CAVS_MC(put_, 8, mmx2)
CAVS_MC(put_, 16,mmx2)
CAVS_MC(avg_, 8, mmx2)
CAVS_MC(avg_, 16,mmx2)
439 | |
440 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) { | |
441 #define dspfunc(PFX, IDX, NUM) \ | |
442 c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ | |
443 c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \ | |
444 c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \ | |
445 c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \ | |
446 c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \ | |
447 | |
448 dspfunc(put_cavs_qpel, 0, 16); | |
449 dspfunc(put_cavs_qpel, 1, 8); | |
450 dspfunc(avg_cavs_qpel, 0, 16); | |
451 dspfunc(avg_cavs_qpel, 1, 8); | |
452 #undef dspfunc | |
453 c->cavs_idct8_add = cavs_idct8_add_mmx; | |
454 } | |
455 | |
456 void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) { | |
457 #define dspfunc(PFX, IDX, NUM) \ | |
458 c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ | |
459 c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ | |
460 c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ | |
461 c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \ | |
462 c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \ | |
463 | |
464 dspfunc(put_cavs_qpel, 0, 16); | |
465 dspfunc(put_cavs_qpel, 1, 8); | |
466 dspfunc(avg_cavs_qpel, 0, 16); | |
467 dspfunc(avg_cavs_qpel, 1, 8); | |
468 #undef dspfunc | |
469 c->cavs_idct8_add = cavs_idct8_add_mmx; | |
470 } |