annotate: x86/dsputilenc_mmx.c @ 12494:94eaea836bf4 (libavcodec)

Check avctx width/height more thoroughly (e.g. all values 0 except width
would have been accepted before). Also do not fail if they are invalid, but
instead override them to 0. This allows decoding e.g. MPEG video when only
the container values are corrupted. For encoding, a value of 0,0 of course
makes no sense, but it was allowed through before and will be caught by an
extra check in the encode function.

author:   reimar
date:     Wed, 15 Sep 2010 04:46:55 +0000
parents:  9fef0a8ddd63
children: c5ffa8b81f9c
/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"


static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
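
/* For reference, a scalar sketch of what get_pixels does (cf. the C version
 * in dsputil.c): read an 8x8 block of bytes and widen it to 16-bit DCT
 * coefficients. The MMX version above (and the SSE2 one below) does the
 * widening with punpcklbw/punpckhbw against a zeroed register. */
static inline void get_pixels_sketch(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[8*i + j] = pixels[j];
        pixels += line_size;
    }
}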

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm7, %%xmm7            \n\t"
        "movq (%0), %%xmm0              \n\t"
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "lea (%0,%2,4), %0              \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1       \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0, (%1)            \n\t"
        "movdqa %%xmm1, 16(%1)          \n\t"
        "movdqa %%xmm2, 32(%1)          \n\t"
        "movdqa %%xmm3, 48(%1)          \n\t"
        "movq (%0), %%xmm0              \n\t"
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1      \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0, 64(%1)          \n\t"
        "movdqa %%xmm1, 80(%1)          \n\t"
        "movdqa %%xmm2, 96(%1)          \n\t"
        "movdqa %%xmm3, 112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}

static int pix_sum16_mmx(uint8_t *pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    __asm__ volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                             pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
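
/* A scalar sketch of pix_norm1 (cf. the C version in dsputil.c): the sum of
 * squares of a 16x16 block, i.e. its energy. */
static inline int pix_norm1_sketch(const uint8_t *pix, int line_size)
{
    int i, j, s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}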

static int sse8_mmx(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
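
/* The psubusb/por sequence above exploits an identity on unsigned bytes:
 * psubusb computes max(a-b, 0), so one of the two saturated differences is
 * always zero and OR-ing them yields |a-b| without widening first. A scalar
 * sketch of the whole SSE (sum of squared errors) computation: */
static inline int sse_sketch(const uint8_t *pix1, const uint8_t *pix2,
                             int line_size, int w, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x]; /* |d| == (a -sat- b) | (b -sat- a) */
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum; /* w = 8 for sse8_mmx, w = 16 for sse16_mmx/sse16_sse2 */
}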

static int sse16_mmx(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
        "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
        "1:\n"
        "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
        "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
        "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
        "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"     /* pix1 += 2*line_size */
        "lea (%1,%4,2), %1\n"     /* pix2 += 2*line_size */

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"     /* shift hi qword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"     /* shift hi dword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((x86_reg)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}

static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h) {
    int tmp;
    uint8_t *pix= pix1;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
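
/* nsse ("noise preserving SSE") above biases plain SSE with the difference
 * in high-frequency energy between the two blocks: hf_noise8/hf_noise16
 * roughly sum the absolute vertical differences of horizontal gradients, so
 * a block that loses (or gains) texture such as film grain costs extra even
 * when its SSE is low. avctx->nsse_weight scales that penalty; 8 is the
 * fallback weight used when no context is available. */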

static int vsad_intra16_mmx(void *v, uint8_t *pix, uint8_t *dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t *pix, uint8_t *dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq -1(%1, %0), %%mm0         \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq -1(%2, %0), %%mm2         \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
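
/* For reference, a scalar sketch of the same median prediction (the pmaxub/
 * pminub sequence above computes mid_pred(L, T, L+T-LT) branchlessly): */
static inline void sub_hfyu_median_prediction_sketch(uint8_t *dst, const uint8_t *src1,
                                                     const uint8_t *src2, int w,
                                                     int *left, int *left_top)
{
    int i;
    uint8_t l = *left, lt = *left_top;
    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt = src1[i];      /* this column's top becomes the next top-left */
        l  = src2[i];      /* this column's value becomes the next left */
        dst[i] = l - pred;
    }
    *left     = l;
    *left_top = lt;
}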

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a" \n\t"\
    "mov"#m" "#p2", "#t" \n\t"\
    "punpcklbw "#a", "#t" \n\t"\
    "punpcklbw "#a", "#a" \n\t"\
    "psubw "#t", "#a" \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1 \n\t"\
        "add %4, %2 \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0 \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0 \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 " \n\t"\
    "paddw " #b2 ", " #a2 " \n\t"\
    "paddw " #b1 ", " #b1 " \n\t"\
    "paddw " #b2 ", " #b2 " \n\t"\
    "psubw " #a1 ", " #b1 " \n\t"\
    "psubw " #a2 ", " #b2 " \n\t"

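/* Each LBUTTERFLY2 invocation above performs two butterflies: (a, b) becomes
 * (a + b, b - a). The second paddw into b doubles it, so the final psubw
 * yields 2b - (a + b) = b - a without needing a temporary register. Three
 * rounds of such butterflies with the pairings in HADAMARD8 below implement
 * an 8-point Hadamard transform per column. */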
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum " \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1) \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2 \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"
#endif

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movq "#a", "#t" \n\t"\
    "psrlq $16, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshufw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7 \n\t"\
        "movq %%mm0, %%mm6 \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0 \n\t"\
        "movq %%mm0, 64(%1) \n\t"\
\
        LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0 \n\t"\
        "paddusw %%mm1, %%mm0 \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)    MMABS_MMX(a,z)
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)    MMABS_MMX2(a,z)
#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0 \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0 \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0 \n\t"\
    "pxor %%xmm1, %%xmm1 \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0 \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
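
/* A scalar sketch of ssd_int8_vs_int16 (cf. the C version in dsputil.c):
 * the sum of squared differences between a signed 8-bit and a 16-bit array.
 * The asm above sign-extends the bytes by unpacking them into the high half
 * of each word and shifting arithmetically right by 8. */
static inline int ssd_int8_vs_int16_sketch(const int8_t *pix1, const int16_t *pix2, int size)
{
    int i, sum = 0;
    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}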

#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"
/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & AV_CPU_FLAG_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
        }

        if (CONFIG_LPC && mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
        }

#if HAVE_SSSE3
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}