x86/dsputilenc_mmx.c @ 12197:fbf4d5b1b664 (annotate, libavcodec.hg)

Remove the FF_MM_SSE2/3 flags on CPUs where SSE2/3 code is generally not
faster than regular MMX code (the Core 1 is one example). Instead, set a new
flag, FF_MM_SSE2/3SLOW, which can be checked by particular SSE2/3 functions
that have been benchmarked specifically on such CPUs and shown to actually be
faster than their MMX counterparts.

In addition, use this flag to enable particular VP8 and LPC SSE2 functions
that are faster than their MMX counterparts.

Based on a patch by Loren Merritt <lorenm AT u washington edu>.

author:   rbultje
date:     Mon, 19 Jul 2010 22:38:23 +0000
parents:  f5ccf2e590d6
children: 3fc4c625b6f3
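
In practice the change turns one capability test into a two-tier one. A
minimal sketch of the dispatch pattern this enables, assuming the mm_flags /
DSPContext setup used in the init code at the bottom of this file (the
some_fast_fn names are illustrative only, not real FFmpeg symbols):

    /* Plain SSE2 functions stay behind FF_MM_SSE2, which is no longer set on
     * "slow SSE2" CPUs; a function that has been benchmarked as a win even on
     * those CPUs may additionally accept FF_MM_SSE2SLOW. */
    if (mm_flags & FF_MM_SSE2)                     /* fast-SSE2 CPUs only */
        c->some_fast_fn = some_fast_fn_sse2;       /* illustrative name */
    if (mm_flags & (FF_MM_SSE2 | FF_MM_SSE2SLOW))  /* also slow-SSE2 CPUs */
        c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
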

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"


static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
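
/* Reference C sketch (not part of the original file) of what get_pixels
 * computes: it widens an 8x8 block of unsigned bytes to signed 16-bit
 * DCTELEMs. The MMX loop above does two rows per iteration, using
 * punpcklbw/punpckhbw against a zeroed register to expand 8 bytes into
 * 8 words. */
static void get_pixels_c_ref(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i*8 + j] = pixels[j];
        pixels += line_size;
    }
}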

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm7, %%xmm7            \n\t"
        "movq (%0), %%xmm0              \n\t"
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "lea (%0,%2,4), %0              \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1       \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0, (%1)            \n\t"
        "movdqa %%xmm1, 16(%1)          \n\t"
        "movdqa %%xmm2, 32(%1)          \n\t"
        "movdqa %%xmm3, 48(%1)          \n\t"
        "movq (%0), %%xmm0              \n\t"
        "movq (%0, %2), %%xmm1          \n\t"
        "movq (%0, %2,2), %%xmm2        \n\t"
        "movq (%0, %3), %%xmm3          \n\t"
        "punpcklbw %%xmm7, %%xmm0       \n\t"
        "punpcklbw %%xmm7, %%xmm1      
 \n\t"
        "punpcklbw %%xmm7, %%xmm2       \n\t"
        "punpcklbw %%xmm7, %%xmm3       \n\t"
        "movdqa %%xmm0, 64(%1)          \n\t"
        "movdqa %%xmm1, 80(%1)          \n\t"
        "movdqa %%xmm2, 96(%1)          \n\t"
        "movdqa %%xmm3, 112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
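
/* Reference C sketch (not part of the original file): diff_pixels produces
 * the 16-bit difference s1 - s2 over an 8x8 block, which is the input to
 * the forward DCT of an inter-coded block. */
static void diff_pixels_c_ref(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i*8 + j] = s1[j] - s2[j];
        s1 += stride;
        s2 += stride;
    }
}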

static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}
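
/* Reference C sketch (not part of the original file): pix_sum16 adds up all
 * 256 pixels of a 16x16 block. Word-sized accumulators are safe here since
 * 256 * 255 < 65536; the asm folds the four word sums with psrlq/paddw and
 * the final "andl $0xFFFF" masks off the words folded in from above. */
static int pix_sum16_c_ref(uint8_t *pix, int line_size)
{
    int sum = 0, i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            sum += pix[j];
        pix += line_size;
    }
    return sum;
}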

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    __asm__ volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"        /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"       /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"       /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n"  /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n"  /* mm2 = [pix0-3] */

        "movq %%mm3,%%mm4\n"       /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n"  /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n"  /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"    /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"    /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"      /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                             pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"       /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
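
/* Reference C sketch (not part of the original file): pix_norm1 is the sum
 * of squared pixel values over a 16x16 block. pmaddwd squares pairs of
 * words and accumulates into dwords, which cannot overflow here
 * (256 * 255^2 < 2^31). */
static int pix_norm1_c_ref(uint8_t *pix, int line_size)
{
    int sum = 0, i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            sum += pix[j] * pix[j];
        pix += line_size;
    }
    return sum;
}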

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"         /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"         /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n"      /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n"      /* mm4 = pix2[1][0-7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"   /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"   /* mm3 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"       /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n"       /* pix2 += 2*line_size */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
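
/* Reference C sketch (not part of the original file): the sse* functions
 * return the sum of squared errors between two blocks. The psubusb/por pair
 * in the asm is the classic branch-free |a-b| for unsigned bytes:
 * (a -us b) | (b -us a), where -us is a saturating subtract. */
static int sse_c_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int w, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}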

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"         /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"         /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"        /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"        /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"   /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"   /* mm3 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n"      /* xmm0 = 0 */
        "pxor %%xmm7,%%xmm7\n"      /* xmm7 holds the sum */
        "1:\n"
        "movdqu (%0),%%xmm1\n"      /* xmm1 = pix1[0][0-15] */
        "movdqu (%1),%%xmm2\n"      /* xmm2 = pix2[0][0-15] */
        "movdqu (%0,%4),%%xmm3\n"   /* xmm3 = pix1[1][0-15] */
        "movdqu (%1,%4),%%xmm4\n"   /* xmm4 = pix2[1][0-15] */

        /* todo: xmm1-xmm2, xmm3-xmm4 */
        /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        /* now convert to 16-bit vectors so we can square them */
        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n" /* xmm1 now spread over (xmm1,xmm2) */
        "punpcklbw %%xmm0,%%xmm3\n" /* xmm3 now spread over (xmm3,xmm4) */

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
        "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((x86_reg)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n"
        "pcmpgtw %%mm2, %%mm1\n"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n"
        "pcmpgtw %%mm5, %%mm1\n"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n"
        "pcmpgtw %%mm2, %%mm1\n"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}
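
/* Reference C sketch (not part of the original file, semantics read off the
 * asm above): hf_noise8 sums |d(x,y) - d(x,y+1)| with
 * d(x,y) = pix[y][x] - pix[y][x+1], i.e. the vertical variation of the
 * horizontal gradient -- a cheap measure of high-frequency noise used by the
 * nsse comparison functions below. The asm gets its 7 horizontal diffs per
 * row from the psllq/psrlq byte shifts, and |x| from the
 * pcmpgtw/pxor/psubw sign-flip idiom. */
static int hf_noise8_c_ref(uint8_t *pix, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 7; x++) {
            int d0 = pix[x]             - pix[x + 1];
            int d1 = pix[x + line_size] - pix[x + line_size + 1];
            sum += FFABS(d0 - d1);
        }
        pix += line_size;
    }
    return sum;
}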

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n"
        "pcmpgtw %%mm2, %%mm1\n"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n"
        "pcmpgtw %%mm5, %%mm1\n"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n"
        "pcmpgtw %%mm2, %%mm1\n"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
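
/* Reference C sketch (not part of the original file): vsad_intra16 sums the
 * absolute vertical differences |pix[y][x] - pix[y+1][x]| over a 16-wide
 * block, h-1 row pairs in total. The MMX2 variant below replaces the
 * unpack/paddw ladder with a single psadbw per 8 bytes. */
static int vsad_intra16_c_ref(uint8_t *pix, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 16; x++)
            sum += FFABS(pix[x] - pix[x + line_size]);
        pix += line_size;
    }
    return sum;
}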

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
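
/* Reference C sketch (not part of the original file): the HuffYUV "median"
 * predictor. For each byte the prediction is mid_pred(L, T, L+T-LT), i.e.
 * the gradient L+T-LT clipped between min(L,T) and max(L,T), where L is the
 * pixel to the left, T above, LT above-left; the function stores
 * src2 - prediction. The pmaxub/pminub sequence above is exactly that clip,
 * working on all 8 bytes at once (modulo 256 arithmetic). */
static void sub_hfyu_median_prediction_c_ref(uint8_t *dst, const uint8_t *src1,
                                             const uint8_t *src2, int w,
                                             int *left, int *left_top)
{
    int i;
    uint8_t l = *left, lt = *left_top;
    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt = src1[i];              /* top-left for the next column */
        l  = src2[i];              /* left for the next column */
        dst[i] = l - pred;
    }
    *left_top = lt;
    *left     = l;
}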

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

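/* Reference C sketch (not part of the original file): LBUTTERFLY2 maps each
 * register pair (a,b) to (a+b, b-a), and HADAMARD8 applies three such stages
 * at distances 1, 2 and 4 -- an 8-point Walsh-Hadamard transform, run on
 * every 16-bit lane in parallel. */
static void hadamard8_1d_c_ref(int16_t x[8])
{
    int stage, i, j;
    for (stage = 1; stage < 8; stage <<= 1) {      /* stages: 1, 2, 4 */
        for (i = 0; i < 8; i += 2 * stage) {
            for (j = i; j < i + stage; j++) {
                int a = x[j], b = x[j + stage];
                x[j]         = a + b;
                x[j + stage] = b - a;
            }
        }
    }
}
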
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)           \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7           \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)           \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7           \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5            \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7            \n\t"\
        "movq %%mm0, %%mm6            \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)           \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2           \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0         \n\t"\
        "movq %%mm0, 64(%1)           \n\t"\
\
        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0        \n\t"\
        "paddusw %%mm1, %%mm0         \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

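/* Reference C sketch (not part of the original file): hadamard8_diff is the
 * classic SATD metric -- take the 8x8 difference block, apply the 2-D
 * Walsh-Hadamard transform (rows, then columns), and sum the absolute
 * transform coefficients. It uses the hadamard8_1d_c_ref sketch above;
 * int16_t suffices since |coeff| <= 255*64 after both passes. The asm
 * interleaves the transform with the transpose and keeps everything in
 * registers, returning sum&0xFFFF as noted in the HSUM FIXME. */
static int hadamard8_diff_c_ref(uint8_t *src1, uint8_t *src2, int stride)
{
    int16_t d[8][8];
    int i, j, sum = 0;
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            d[i][j] = src1[i*stride + j] - src2[i*stride + j];
    for (i = 0; i < 8; i++)            /* transform rows */
        hadamard8_1d_c_ref(d[i]);
    for (j = 0; j < 8; j++) {          /* transform columns */
        int16_t col[8];
        for (i = 0; i < 8; i++) col[i] = d[i][j];
        hadamard8_1d_c_ref(col);
        for (i = 0; i < 8; i++) d[i][j] = col[i];
    }
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            sum += FFABS(d[i][j]);
    return sum;
}
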
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4              \n"
        "1:                             \n"
        "sub $8, %0                     \n"
        "movq (%2,%0), %%mm2            \n"
        "movq (%3,%0,2), %%mm0          \n"
        "movq 8(%3,%0,2), %%mm1         \n"
        "punpckhbw %%mm2, %%mm3         \n"
        "punpcklbw %%mm2, %%mm2         \n"
        "psraw $8, %%mm3                \n"
        "psraw $8, %%mm2                \n"
        "psubw %%mm3, %%mm1             \n"
        "psubw %%mm2, %%mm0             \n"
        "pmaddwd %%mm1, %%mm1           \n"
        "pmaddwd %%mm0, %%mm0           \n"
        "paddd %%mm1, %%mm4             \n"
        "paddd %%mm0, %%mm4             \n"
        "jg 1b                          \n"
        "movq %%mm4, %%mm3              \n"
        "psrlq $32, %%mm3               \n"
        "paddd %%mm3, %%mm4             \n"
        "movd %%mm4, %1                 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
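
/* Reference C sketch (not part of the original file): sum of squared
 * differences between an int8 and an int16 array, used by the QNS basis
 * search below. The asm recovers sign-extended int8 values with
 * punpck{l,h}bw followed by psraw $8, which shifts any garbage bytes out. */
static int ssd_int8_vs_int16_c_ref(const int8_t *pix1, const int16_t *pix2, int size)
{
    int sum = 0, i;
    for (i = 0; i < size; i++) {
        int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}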

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
  pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
  pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
  pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "             \n\t"\
    "pmulhw " #s ", "#y "             \n\t"\
    "paddw " #o ", "#x "              \n\t"\
    "paddw " #o ", "#y "              \n\t"\
    "psraw $1, "#x "                  \n\t"\
    "psraw $1, "#y "                  \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "            \n\t"\
    "pmulhrw " #s ", "#y "            \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "           \n\t"\
    "pmulhrsw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & FF_MM_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
        }

        if (CONFIG_LPC && mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
        }

#if HAVE_SSSE3
        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}