Mercurial > libavcodec.hg
annotate i386/cavsdsp_mmx.c @ 6920:d02af7474bff libavcodec
Prevent 128*1<<trellis from becoming 0 and creating 0 sized arrays.
fixes CID84 RUN2
CID85 RUN2
CID86 RUN2
CID87 RUN2
CID88 RUN2
CID89 RUN2
CID90 RUN2
CID91 RUN2
CID92 RUN2
CID93 RUN2
CID94 RUN2
CID95 RUN2
CID96 RUN2
CID97 RUN2
CID98 RUN2
CID99 RUN2
CID100 RUN2
CID101 RUN2
CID102 RUN2
CID103 RUN2
CID104 RUN2
CID105 RUN2
CID106 RUN2
author | michael |
---|---|
date | Wed, 28 May 2008 11:59:41 +0000 |
parents | f7cbb7733146 |
children | eebc7209c47f |
rev | line source |
---|---|
3524 | 1 /* |
2 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder. | |
3 * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de> | |
4 * | |
5967
15ed47af1838
Misc spelling fixes, prefer American over British English.
diego
parents:
5963
diff
changeset
|
5 * MMX-optimized DSP functions, based on H.264 optimizations by |
3524 | 6 * Michael Niedermayer and Loren Merritt |
7 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3524
diff
changeset
|
8 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3524
diff
changeset
|
9 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3524
diff
changeset
|
10 * FFmpeg is free software; you can redistribute it and/or |
3524 | 11 * modify it under the terms of the GNU Lesser General Public |
12 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3524
diff
changeset
|
13 * version 2.1 of the License, or (at your option) any later version. |
3524 | 14 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3524
diff
changeset
|
15 * FFmpeg is distributed in the hope that it will be useful, |
3524 | 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 * Lesser General Public License for more details. | |
19 * | |
20 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3524
diff
changeset
|
21 * License along with FFmpeg; if not, write to the Free Software |
5215 | 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
3524 | 23 */ |
24 | |
6763 | 25 #include "libavutil/common.h" |
26 #include "libavutil/x86_cpu.h" | |
27 #include "libavcodec/dsputil.h" | |
5946
55251379b5b1
make ff_p* vars extern so that they can be used in various *_mmx.c files
aurel
parents:
5215
diff
changeset
|
28 #include "dsputil_mmx.h" |
3524 | 29 |
30 /***************************************************************************** | |
31 * | |
32 * inverse transform | |
33 * | |
34 ****************************************************************************/ | |
35 | |
/**
 * One-dimensional 8-point inverse transform, computed on four 16-bit
 * columns in parallel.
 *
 * Eight rows of four int16_t each are read from block at byte offsets
 * 0, 16, ..., 112. Nothing is written back to memory: per the inline
 * comments on the final SUMSUB_BA lines the eight results are left in
 *   mm7 = dst0, mm5 = dst1, mm3 = dst2, mm1 = dst3,
 *   mm0 = dst4, mm2 = dst5, mm4 = dst6, mm6 = dst7
 * and the caller's following asm statement shifts and stores them.
 *
 * @param block first of the four columns to transform
 * @param bias  rounding constant, added as four packed words to the
 *              (src0 +/- src4) * 8 terms
 *
 * SUMSUB_BA is defined outside this file; its effect is taken from the
 * inline comments here (first operand = sum, second = difference).
 */
36 static inline void cavs_idct8_1d(int16_t *block, uint64_t bias) | |
37 { | |
38 asm volatile( | |
/* odd part: input row N lives at byte offset 16*N */
39 "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */ | |
40 "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */ | |
41 "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */ | |
42 "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */ | |
43 "movq %%mm4, %%mm0 \n\t" | |
44 "movq %%mm5, %%mm3 \n\t" | |
45 "movq %%mm2, %%mm6 \n\t" | |
46 "movq %%mm7, %%mm1 \n\t" | |
47 | |
48 "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */ | |
49 "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */ | |
50 "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */ | |
51 "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */ | |
52 "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */ | |
53 "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */ | |
54 "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */ | |
55 "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */ | |
56 "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */ | |
57 "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */ | |
58 "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */ | |
59 "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */ | |
60 | |
/* keep copies of a0..a3; they are needed again for b4..b7 below */
61 "movq %%mm5, %%mm4 \n\t" | |
62 "movq %%mm7, %%mm6 \n\t" | |
63 "movq %%mm3, %%mm0 \n\t" | |
64 "movq %%mm1, %%mm2 \n\t" | |
65 SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */ | |
66 "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */ | |
67 "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */ | |
68 "paddw %%mm7, %%mm7 \n\t" | |
69 "paddw %%mm5, %%mm5 \n\t" | |
70 "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */ | |
71 "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */ | |
72 | |
73 SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */ | |
74 "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */ | |
75 "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */ | |
76 "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */ | |
77 "paddw %%mm1, %%mm1 \n\t" | |
78 "paddw %%mm3, %%mm3 \n\t" | |
79 "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */ | |
80 "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */ | |
81 | |
/* even part: rows 2 and 6 */
82 "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */ | |
83 "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */ | |
84 "movq %%mm2, %%mm4 \n\t" | |
85 "movq %%mm6, %%mm0 \n\t" | |
86 "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */ | |
87 "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */ | |
88 "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */ | |
89 "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */ | |
90 "paddw %%mm2, %%mm2 \n\t" | |
91 "paddw %%mm0, %%mm0 \n\t" | |
92 "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */ | |
93 "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */ | |
94 | |
/* rows 0 and 4: sum and difference, scaled by 8, then biased */
95 "movq (%0), %%mm2 \n\t" /* mm2 = src0 */ | |
96 "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */ | |
97 SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */ | |
98 "psllw $3, %%mm0 \n\t" | |
99 "psllw $3, %%mm2 \n\t" | |
100 "paddw %1, %%mm0 \n\t" /* add rounding bias */ | |
101 "paddw %1, %%mm2 \n\t" /* add rounding bias */ | |
102 | |
/* final butterflies; the outputs stay in mm0..mm7 for the caller */
103 SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */ | |
104 SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */ | |
105 SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */ | |
106 SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */ | |
107 SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */ | |
108 SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */ | |
/* NOTE(review): no outputs and no clobbers are declared although all of
 * mm0..mm7 are overwritten and *block is read; the code relies on the
 * compiler not touching MMX state between this asm and the caller's. */
109 :: "r"(block), "m"(bias) | |
110 ); | |
111 } | |
112 | |
113 static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) | |
114 { | |
115 int i; | |
116 DECLARE_ALIGNED_8(int16_t, b2[64]); | |
117 | |
118 for(i=0; i<2; i++){ | |
119 DECLARE_ALIGNED_8(uint64_t, tmp); | |
120 | |
121 cavs_idct8_1d(block+4*i, ff_pw_4); | |
122 | |
123 asm volatile( | |
124 "psraw $3, %%mm7 \n\t" | |
125 "psraw $3, %%mm6 \n\t" | |
126 "psraw $3, %%mm5 \n\t" | |
127 "psraw $3, %%mm4 \n\t" | |
128 "psraw $3, %%mm3 \n\t" | |
129 "psraw $3, %%mm2 \n\t" | |
130 "psraw $3, %%mm1 \n\t" | |
131 "psraw $3, %%mm0 \n\t" | |
132 "movq %%mm7, %0 \n\t" | |
133 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) | |
134 "movq %%mm0, 8(%1) \n\t" | |
135 "movq %%mm6, 24(%1) \n\t" | |
136 "movq %%mm7, 40(%1) \n\t" | |
137 "movq %%mm4, 56(%1) \n\t" | |
138 "movq %0, %%mm7 \n\t" | |
139 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) | |
140 "movq %%mm7, (%1) \n\t" | |
141 "movq %%mm1, 16(%1) \n\t" | |
142 "movq %%mm0, 32(%1) \n\t" | |
143 "movq %%mm3, 48(%1) \n\t" | |
144 : "=m"(tmp) | |
145 : "r"(b2+32*i) | |
146 : "memory" | |
147 ); | |
148 } | |
149 | |
150 for(i=0; i<2; i++){ | |
151 cavs_idct8_1d(b2+4*i, ff_pw_64); | |
152 | |
153 asm volatile( | |
154 "psraw $7, %%mm7 \n\t" | |
155 "psraw $7, %%mm6 \n\t" | |
156 "psraw $7, %%mm5 \n\t" | |
157 "psraw $7, %%mm4 \n\t" | |
158 "psraw $7, %%mm3 \n\t" | |
159 "psraw $7, %%mm2 \n\t" | |
160 "psraw $7, %%mm1 \n\t" | |
161 "psraw $7, %%mm0 \n\t" | |
162 "movq %%mm7, (%0) \n\t" | |
163 "movq %%mm5, 16(%0) \n\t" | |
164 "movq %%mm3, 32(%0) \n\t" | |
165 "movq %%mm1, 48(%0) \n\t" | |
166 "movq %%mm0, 64(%0) \n\t" | |
167 "movq %%mm2, 80(%0) \n\t" | |
168 "movq %%mm4, 96(%0) \n\t" | |
169 "movq %%mm6, 112(%0) \n\t" | |
170 :: "r"(b2+4*i) | |
171 : "memory" | |
172 ); | |
173 } | |
174 | |
175 add_pixels_clamped_mmx(b2, dst, stride); | |
176 | |
177 /* clear block */ | |
178 asm volatile( | |
179 "pxor %%mm7, %%mm7 \n\t" | |
180 "movq %%mm7, (%0) \n\t" | |
181 "movq %%mm7, 8(%0) \n\t" | |
182 "movq %%mm7, 16(%0) \n\t" | |
183 "movq %%mm7, 24(%0) \n\t" | |
184 "movq %%mm7, 32(%0) \n\t" | |
185 "movq %%mm7, 40(%0) \n\t" | |
186 "movq %%mm7, 48(%0) \n\t" | |
187 "movq %%mm7, 56(%0) \n\t" | |
188 "movq %%mm7, 64(%0) \n\t" | |
189 "movq %%mm7, 72(%0) \n\t" | |
190 "movq %%mm7, 80(%0) \n\t" | |
191 "movq %%mm7, 88(%0) \n\t" | |
192 "movq %%mm7, 96(%0) \n\t" | |
193 "movq %%mm7, 104(%0) \n\t" | |
194 "movq %%mm7, 112(%0) \n\t" | |
195 "movq %%mm7, 120(%0) \n\t" | |
196 :: "r" (block) | |
197 ); | |
198 } | |
199 | |
200 /***************************************************************************** | |
201 * | |
202 * motion compensation | |
203 * | |
204 ****************************************************************************/ | |
205 | |
206 /* vertical filter [-1 -2 96 42 -7 0] */ | |
/*
 * Emits the asm for one output row of four pixels.
 * A..F are MMX registers; A..E must already hold five consecutive source
 * rows as packed 16-bit words, F is loaded here with the next four source
 * bytes from (%0) and unpacked to words.
 * Computes (C*%5 + D*%6 - 7*E - 2*B - A + %4) >> 7, packs it with unsigned
 * saturation and writes four bytes to (%1) through OP, then advances
 * %0 by %2 and %1 by %3. The operand numbers refer to the enclosing asm
 * statement (see QPEL_CAVSVNUM).
 * E and B are scaled in place and shifted back afterwards; mm6 and mm7 are
 * scratch (mm7 is re-zeroed before the unpack); A is handed to OP as its
 * temporary register.
 */
207 #define QPEL_CAVSV1(A,B,C,D,E,F,OP) \ | |
208 "movd (%0), "#F" \n\t"\ | |
209 "movq "#C", %%mm6 \n\t"\ | |
210 "pmullw %5, %%mm6 \n\t"\ | |
211 "movq "#D", %%mm7 \n\t"\ | |
212 "pmullw %6, %%mm7 \n\t"\ | |
213 "psllw $3, "#E" \n\t"\ | |
214 "psubw "#E", %%mm6 \n\t"\ | |
215 "psraw $3, "#E" \n\t"\ | |
216 "paddw %%mm7, %%mm6 \n\t"\ | |
217 "paddw "#E", %%mm6 \n\t"\ | |
218 "paddw "#B", "#B" \n\t"\ | |
219 "pxor %%mm7, %%mm7 \n\t"\ | |
220 "add %2, %0 \n\t"\ | |
221 "punpcklbw %%mm7, "#F" \n\t"\ | |
222 "psubw "#B", %%mm6 \n\t"\ | |
223 "psraw $1, "#B" \n\t"\ | |
224 "psubw "#A", %%mm6 \n\t"\ | |
225 "paddw %4, %%mm6 \n\t"\ | |
226 "psraw $7, %%mm6 \n\t"\ | |
227 "packuswb %%mm6, %%mm6 \n\t"\ | |
228 OP(%%mm6, (%1), A, d) \ | |
229 "add %3, %1 \n\t" | |
230 | |
231 /* vertical filter [ 0 -1 5 5 -1 0] */ | |
/*
 * Emits the asm for one output row of four pixels.
 * Loads the next four source bytes from (%0) into F, then computes
 * ((C + D)*%5 - B - E + %4) >> 3, packs it with unsigned saturation and
 * writes four bytes to (%1) through OP; %0 advances by %2, %1 by %3.
 * Unlike the other two vertical macros this one never clears mm7, so
 * mm7 must already be zero on entry for the byte-to-word unpack of F.
 * mm6 is scratch; A takes no part in the filter and is only passed to
 * OP as its temporary register.
 */
232 #define QPEL_CAVSV2(A,B,C,D,E,F,OP) \ | |
233 "movd (%0), "#F" \n\t"\ | |
234 "movq "#C", %%mm6 \n\t"\ | |
235 "paddw "#D", %%mm6 \n\t"\ | |
236 "pmullw %5, %%mm6 \n\t"\ | |
237 "add %2, %0 \n\t"\ | |
238 "punpcklbw %%mm7, "#F" \n\t"\ | |
239 "psubw "#B", %%mm6 \n\t"\ | |
240 "psubw "#E", %%mm6 \n\t"\ | |
241 "paddw %4, %%mm6 \n\t"\ | |
242 "psraw $3, %%mm6 \n\t"\ | |
243 "packuswb %%mm6, %%mm6 \n\t"\ | |
244 OP(%%mm6, (%1), A, d) \ | |
245 "add %3, %1 \n\t" | |
246 | |
247 /* vertical filter [ 0 -7 42 96 -2 -1] */ | |
/*
 * Emits the asm for one output row of four pixels; mirror image of
 * QPEL_CAVSV1.
 * Loads the next four source bytes from (%0) into F and unpacks them, then
 * computes (C*%6 + D*%5 - 7*B - 2*E - F + %4) >> 7, packs it with unsigned
 * saturation and writes four bytes to (%1) through OP; %0 advances by %2,
 * %1 by %3.
 * B and E are scaled in place and shifted back afterwards; mm6 and mm7 are
 * scratch (mm7 is re-zeroed before the unpack); A takes no part in the
 * filter and is only passed to OP as its temporary register.
 */
248 #define QPEL_CAVSV3(A,B,C,D,E,F,OP) \ | |
249 "movd (%0), "#F" \n\t"\ | |
250 "movq "#C", %%mm6 \n\t"\ | |
251 "pmullw %6, %%mm6 \n\t"\ | |
252 "movq "#D", %%mm7 \n\t"\ | |
253 "pmullw %5, %%mm7 \n\t"\ | |
254 "psllw $3, "#B" \n\t"\ | |
255 "psubw "#B", %%mm6 \n\t"\ | |
256 "psraw $3, "#B" \n\t"\ | |
257 "paddw %%mm7, %%mm6 \n\t"\ | |
258 "paddw "#B", %%mm6 \n\t"\ | |
259 "paddw "#E", "#E" \n\t"\ | |
260 "pxor %%mm7, %%mm7 \n\t"\ | |
261 "add %2, %0 \n\t"\ | |
262 "punpcklbw %%mm7, "#F" \n\t"\ | |
263 "psubw "#E", %%mm6 \n\t"\ | |
264 "psraw $1, "#E" \n\t"\ | |
265 "psubw "#F", %%mm6 \n\t"\ | |
266 "paddw %4, %%mm6 \n\t"\ | |
267 "psraw $7, %%mm6 \n\t"\ | |
268 "packuswb %%mm6, %%mm6 \n\t"\ | |
269 OP(%%mm6, (%1), A, d) \ | |
270 "add %3, %1 \n\t" | |
271 | |
272 | |
/*
 * Function body shared by the cavs_qpel8or16_v* functions. It expects the
 * enclosing function to provide dst, src, dstStride, srcStride and h.
 *
 * VOP  one of the QPEL_CAVSV* row macros
 * OP   store macro forwarded to VOP
 * ADD, MUL1, MUL2  memory operands seen by VOP as %4, %5 and %6
 *
 * Asm operands: %0 = src, %1 = dst, %2 = srcStride, %3 = dstStride.
 *
 * An 8 pixel wide column is processed as two 4 pixel wide strips (w = 2).
 * For each strip, src is moved up two rows, five rows are preloaded and
 * unpacked to words into mm0..mm4, and eight VOP invocations with rotating
 * register roles produce eight output rows. For h == 16 a second asm
 * statement produces eight more rows; it continues with the MMX register
 * contents left by the first statement, so nothing may disturb them in
 * between. After a strip, src is rewound by the h+5 rows that were read
 * and dst by the h rows written, and both step 4 pixels to the right.
 */
273 #define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ | |
274 int w= 2;\ | |
275 src -= 2*srcStride;\ | |
276 \ | |
277 while(w--){\ | |
278 asm volatile(\ | |
279 "pxor %%mm7, %%mm7 \n\t"\ | |
280 "movd (%0), %%mm0 \n\t"\ | |
281 "add %2, %0 \n\t"\ | |
282 "movd (%0), %%mm1 \n\t"\ | |
283 "add %2, %0 \n\t"\ | |
284 "movd (%0), %%mm2 \n\t"\ | |
285 "add %2, %0 \n\t"\ | |
286 "movd (%0), %%mm3 \n\t"\ | |
287 "add %2, %0 \n\t"\ | |
288 "movd (%0), %%mm4 \n\t"\ | |
289 "add %2, %0 \n\t"\ | |
290 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
291 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
292 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
293 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
294 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
295 VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | |
296 VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
297 VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
298 VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
299 VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ | |
300 VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ | |
301 VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | |
302 VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
303 \ | |
304 : "+a"(src), "+c"(dst)\ | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6135
diff
changeset
|
305 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\ |
3524 | 306 : "memory"\ |
307 );\ | |
308 if(h==16){\ | |
309 asm volatile(\ | |
310 VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
311 VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
312 VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ | |
313 VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ | |
314 VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ | |
315 VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ | |
316 VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ | |
317 VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ | |
318 \ | |
319 : "+a"(src), "+c"(dst)\ | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6135
diff
changeset
|
320 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\ |
3524 | 321 : "memory"\ |
322 );\ | |
323 }\ | |
324 src += 4-(h+5)*srcStride;\ | |
325 dst += 4-h*dstStride;\ | |
326 } | |
327 | |
/*
 * Generates one family of static interpolation functions.
 *
 * OPNAME  name prefix of the generated functions (put_ / avg_ below)
 * OP      store macro used to write each result row
 * MMX     name suffix of the generated functions (3dnow / mmx2 below)
 *
 * Generated functions:
 *   <OPNAME>cavs_qpel8_h_<MMX>
 *       8x8 horizontal filter. For each of 8 rows it computes
 *       (5*(s[x] + s[x+1]) - s[x-1] - s[x+2] + 4) >> 3 for x = 0..7, packs
 *       with unsigned saturation and writes 8 bytes through OP. The
 *       constants come from ff_pw_5 (%5) and ff_pw_4 (%6); the 8-byte
 *       loads at src-1 and src+2 touch src[-1] .. src[9] in every row.
 *   <OPNAME>cavs_qpel8or16_v{1,2,3}_<MMX>
 *       8 pixel wide vertical filters of height h, built from
 *       QPEL_CAVSVNUM with QPEL_CAVSV1/2/3.
 *   <OPNAME>cavs_qpel8_v{1,2,3}_<MMX>
 *       the above with h = 8.
 *   <OPNAME>cavs_qpel16_v{1,2,3}_<MMX>
 *       two h = 16 calls, for the left and right 8 pixel halves.
 *   <OPNAME>cavs_qpel16_h_<MMX>
 *       four calls of the 8x8 horizontal filter, one per quadrant.
 */
328 #define QPEL_CAVS(OPNAME, OP, MMX)\ | |
329 static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
330 int h=8;\ | |
331 asm volatile(\ | |
332 "pxor %%mm7, %%mm7 \n\t"\ | |
333 "movq %5, %%mm6 \n\t"\ | |
334 "1: \n\t"\ | |
335 "movq (%0), %%mm0 \n\t"\ | |
336 "movq 1(%0), %%mm2 \n\t"\ | |
337 "movq %%mm0, %%mm1 \n\t"\ | |
338 "movq %%mm2, %%mm3 \n\t"\ | |
339 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
340 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
341 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
342 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
343 "paddw %%mm2, %%mm0 \n\t"\ | |
344 "paddw %%mm3, %%mm1 \n\t"\ | |
345 "pmullw %%mm6, %%mm0 \n\t"\ | |
346 "pmullw %%mm6, %%mm1 \n\t"\ | |
347 "movq -1(%0), %%mm2 \n\t"\ | |
348 "movq 2(%0), %%mm4 \n\t"\ | |
349 "movq %%mm2, %%mm3 \n\t"\ | |
350 "movq %%mm4, %%mm5 \n\t"\ | |
351 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
352 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
353 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
354 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
355 "paddw %%mm4, %%mm2 \n\t"\ | |
356 "paddw %%mm3, %%mm5 \n\t"\ | |
357 "psubw %%mm2, %%mm0 \n\t"\ | |
358 "psubw %%mm5, %%mm1 \n\t"\ | |
359 "movq %6, %%mm5 \n\t"\ | |
360 "paddw %%mm5, %%mm0 \n\t"\ | |
361 "paddw %%mm5, %%mm1 \n\t"\ | |
362 "psraw $3, %%mm0 \n\t"\ | |
363 "psraw $3, %%mm1 \n\t"\ | |
364 "packuswb %%mm1, %%mm0 \n\t"\ | |
365 OP(%%mm0, (%1),%%mm5, q) \ | |
366 "add %3, %0 \n\t"\ | |
367 "add %4, %1 \n\t"\ | |
368 "decl %2 \n\t"\ | |
369 " jnz 1b \n\t"\ | |
370 : "+a"(src), "+c"(dst), "+m"(h)\ | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6135
diff
changeset
|
371 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ |
3524 | 372 : "memory"\ |
373 );\ | |
374 }\ | |
375 \ | |
376 static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
377 QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ | |
378 }\ | |
379 \ | |
380 static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
381 QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ | |
382 }\ | |
383 \ | |
384 static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ | |
385 QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ | |
386 }\ | |
387 \ | |
388 static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
389 OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ | |
390 }\ | |
391 static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
392 OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ | |
393 OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ | |
394 }\ | |
395 \ | |
396 static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
397 OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ | |
398 }\ | |
399 static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
400 OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ | |
401 OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ | |
402 }\ | |
403 \ | |
404 static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
405 OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ | |
406 }\ | |
407 static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
408 OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ | |
409 OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ | |
410 }\ | |
411 \ | |
412 static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
413 OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ | |
414 OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
415 src += 8*srcStride;\ | |
416 dst += 8*dstStride;\ | |
417 OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ | |
418 OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ | |
419 }\ | |
420 | |
/*
 * Emits one static adapter ff_<OPNAME>cavs_qpel<SIZE>_<MC>_<MMX>() with the
 * three-argument (dst, src, stride) signature. It forwards to the filter
 * <OPNAME>cavs_qpel<SIZE>_<FILT>_<MMX>(), using the single stride for both
 * the destination and the source.
 */
#define CAVS_MC_WRAPPER(OPNAME, SIZE, MMX, MC, FILT) \
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _ ## MC ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _ ## FILT ## _ ## MMX(dst, src, stride, stride);\
}

/*
 * Generates the four motion-compensation entry points of one
 * (OPNAME, SIZE, MMX) combination:
 *   mc20 -> horizontal filter "h"
 *   mc01 -> vertical filter "v1"
 *   mc02 -> vertical filter "v2"
 *   mc03 -> vertical filter "v3"
 */
#define CAVS_MC(OPNAME, SIZE, MMX) \
CAVS_MC_WRAPPER(OPNAME, SIZE, MMX, mc20, h) \
CAVS_MC_WRAPPER(OPNAME, SIZE, MMX, mc01, v1)\
CAVS_MC_WRAPPER(OPNAME, SIZE, MMX, mc02, v2)\
CAVS_MC_WRAPPER(OPNAME, SIZE, MMX, mc03, v3)
437 | |
/*
 * Store macros passed as OP to the filter macros above. Each expands to
 * asm text operating on:
 *   a     register holding the freshly computed pixels
 *   b     destination memory operand
 *   temp  scratch register (unused by PUT_OP)
 *   size  suffix appended to "mov" (d and q are the ones used in this file)
 *
 * PUT_OP       writes a to b.
 * AVG_3DNOW_OP loads b into temp, averages it into a with pavgusb and
 *              writes a back to b.
 * AVG_MMX2_OP  same, using pavgb for the average.
 */
438 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" | |
439 #define AVG_3DNOW_OP(a,b,temp, size) \ | |
440 "mov" #size " " #b ", " #temp " \n\t"\ | |
441 "pavgusb " #temp ", " #a " \n\t"\ | |
442 "mov" #size " " #a ", " #b " \n\t" | |
443 #define AVG_MMX2_OP(a,b,temp, size) \ | |
444 "mov" #size " " #b ", " #temp " \n\t"\ | |
445 "pavgb " #temp ", " #a " \n\t"\ | |
446 "mov" #size " " #a ", " #b " \n\t" | |
447 | |
/* instantiate the put/avg filter families for the 3dnow and mmx2 suffixes */
448 QPEL_CAVS(put_, PUT_OP, 3dnow) | |
449 QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow) | |
450 QPEL_CAVS(put_, PUT_OP, mmx2) | |
451 QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2) | |
452 | |
/* three-argument mc20/mc01/mc02/mc03 entry points for every size and suffix */
453 CAVS_MC(put_, 8, 3dnow) | |
454 CAVS_MC(put_, 16,3dnow) | |
455 CAVS_MC(avg_, 8, 3dnow) | |
456 CAVS_MC(avg_, 16,3dnow) | |
457 CAVS_MC(put_, 8, mmx2) | |
458 CAVS_MC(put_, 16,mmx2) | |
459 CAVS_MC(avg_, 8, mmx2) | |
460 CAVS_MC(avg_, 16,mmx2) | |
461 | |
/* the mc00 functions are only declared here; no definition appears in this file */
462 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); | |
463 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); | |
464 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); | |
465 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); | |
466 | |
467 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) { | |
468 #define dspfunc(PFX, IDX, NUM) \ | |
469 c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ | |
470 c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \ | |
471 c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \ | |
472 c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \ | |
473 c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \ | |
474 | |
475 dspfunc(put_cavs_qpel, 0, 16); | |
476 dspfunc(put_cavs_qpel, 1, 8); | |
477 dspfunc(avg_cavs_qpel, 0, 16); | |
478 dspfunc(avg_cavs_qpel, 1, 8); | |
479 #undef dspfunc | |
480 c->cavs_idct8_add = cavs_idct8_add_mmx; | |
481 } | |
482 | |
483 void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) { | |
484 #define dspfunc(PFX, IDX, NUM) \ | |
485 c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ | |
486 c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ | |
487 c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ | |
488 c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \ | |
489 c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \ | |
490 | |
491 dspfunc(put_cavs_qpel, 0, 16); | |
492 dspfunc(put_cavs_qpel, 1, 8); | |
493 dspfunc(avg_cavs_qpel, 0, 16); | |
494 dspfunc(avg_cavs_qpel, 1, 8); | |
495 #undef dspfunc | |
496 c->cavs_idct8_add = cavs_idct8_add_mmx; | |
497 } |