x86/dsputilenc_mmx.c @ 8430:7768bdfd4f7b (libavcodec)

Rename libavcodec/i386/ --> libavcodec/x86/. It contains optimizations that are not specific to i386, and libavutil already uses this naming scheme.
author diego
date Mon, 22 Dec 2008 09:12:42 +0000
1 /*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23 */
24
25 #include "libavutil/x86_cpu.h"
26 #include "libavcodec/dsputil.h"
27 #include "libavcodec/mpegvideo.h"
28 #include "dsputil_mmx.h"
29
30
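/* Read an 8x8 block of unsigned bytes and zero-extend it into 64 DCTELEMs
 * (16-bit), two rows per loop iteration; line_size is the byte stride of
 * the source. */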
31 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
32 {
33 __asm__ volatile(
34 "mov $-128, %%"REG_a" \n\t"
35 "pxor %%mm7, %%mm7 \n\t"
36 ASMALIGN(4)
37 "1: \n\t"
38 "movq (%0), %%mm0 \n\t"
39 "movq (%0, %2), %%mm2 \n\t"
40 "movq %%mm0, %%mm1 \n\t"
41 "movq %%mm2, %%mm3 \n\t"
42 "punpcklbw %%mm7, %%mm0 \n\t"
43 "punpckhbw %%mm7, %%mm1 \n\t"
44 "punpcklbw %%mm7, %%mm2 \n\t"
45 "punpckhbw %%mm7, %%mm3 \n\t"
46 "movq %%mm0, (%1, %%"REG_a") \n\t"
47 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
48 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
49 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
50 "add %3, %0 \n\t"
51 "add $32, %%"REG_a" \n\t"
52 "js 1b \n\t"
53 : "+r" (pixels)
54 : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
55 : "%"REG_a
56 );
57 }
58
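/* Same as get_pixels_mmx, but fully unrolled with SSE2: each movq load picks
 * up one 8-pixel row, punpcklbw zero-extends it to 8 words, and the movdqa
 * stores require the destination block to be 16-byte aligned. */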
59 static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
60 {
61 __asm__ volatile(
62 "pxor %%xmm7, %%xmm7 \n\t"
63 "movq (%0), %%xmm0 \n\t"
64 "movq (%0, %2), %%xmm1 \n\t"
65 "movq (%0, %2,2), %%xmm2 \n\t"
66 "movq (%0, %3), %%xmm3 \n\t"
67 "lea (%0,%2,4), %0 \n\t"
68 "punpcklbw %%xmm7, %%xmm0 \n\t"
69 "punpcklbw %%xmm7, %%xmm1 \n\t"
70 "punpcklbw %%xmm7, %%xmm2 \n\t"
71 "punpcklbw %%xmm7, %%xmm3 \n\t"
72 "movdqa %%xmm0, (%1) \n\t"
73 "movdqa %%xmm1, 16(%1) \n\t"
74 "movdqa %%xmm2, 32(%1) \n\t"
75 "movdqa %%xmm3, 48(%1) \n\t"
76 "movq (%0), %%xmm0 \n\t"
77 "movq (%0, %2), %%xmm1 \n\t"
78 "movq (%0, %2,2), %%xmm2 \n\t"
79 "movq (%0, %3), %%xmm3 \n\t"
80 "punpcklbw %%xmm7, %%xmm0 \n\t"
81 "punpcklbw %%xmm7, %%xmm1 \n\t"
82 "punpcklbw %%xmm7, %%xmm2 \n\t"
83 "punpcklbw %%xmm7, %%xmm3 \n\t"
84 "movdqa %%xmm0, 64(%1) \n\t"
85 "movdqa %%xmm1, 80(%1) \n\t"
86 "movdqa %%xmm2, 96(%1) \n\t"
87 "movdqa %%xmm3, 112(%1) \n\t"
88 : "+r" (pixels)
89 : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
90 );
91 }
92
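/* block[i] = s1[i] - s2[i] for an 8x8 block of unsigned bytes, one row per
 * iteration, widened to 16-bit words so the difference cannot wrap. */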
93 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
94 {
95 __asm__ volatile(
96 "pxor %%mm7, %%mm7 \n\t"
97 "mov $-128, %%"REG_a" \n\t"
98 ASMALIGN(4)
99 "1: \n\t"
100 "movq (%0), %%mm0 \n\t"
101 "movq (%1), %%mm2 \n\t"
102 "movq %%mm0, %%mm1 \n\t"
103 "movq %%mm2, %%mm3 \n\t"
104 "punpcklbw %%mm7, %%mm0 \n\t"
105 "punpckhbw %%mm7, %%mm1 \n\t"
106 "punpcklbw %%mm7, %%mm2 \n\t"
107 "punpckhbw %%mm7, %%mm3 \n\t"
108 "psubw %%mm2, %%mm0 \n\t"
109 "psubw %%mm3, %%mm1 \n\t"
110 "movq %%mm0, (%2, %%"REG_a") \n\t"
111 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
112 "add %3, %0 \n\t"
113 "add %3, %1 \n\t"
114 "add $16, %%"REG_a" \n\t"
115 "jnz 1b \n\t"
116 : "+r" (s1), "+r" (s2)
117 : "r" (block+64), "r" ((x86_reg)stride)
118 : "%"REG_a
119 );
120 }
121
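/* Sum of all 256 pixels of a 16x16 block. Word accumulators are sufficient
 * because the maximum possible sum is 16*16*255 = 65280; the final
 * "andl $0xFFFF" just discards the unrelated high bits left over from the
 * horizontal add. */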
122 static int pix_sum16_mmx(uint8_t * pix, int line_size){
123 const int h=16;
124 int sum;
125 x86_reg index= -line_size*h;
126
127 __asm__ volatile(
128 "pxor %%mm7, %%mm7 \n\t"
129 "pxor %%mm6, %%mm6 \n\t"
130 "1: \n\t"
131 "movq (%2, %1), %%mm0 \n\t"
132 "movq (%2, %1), %%mm1 \n\t"
133 "movq 8(%2, %1), %%mm2 \n\t"
134 "movq 8(%2, %1), %%mm3 \n\t"
135 "punpcklbw %%mm7, %%mm0 \n\t"
136 "punpckhbw %%mm7, %%mm1 \n\t"
137 "punpcklbw %%mm7, %%mm2 \n\t"
138 "punpckhbw %%mm7, %%mm3 \n\t"
139 "paddw %%mm0, %%mm1 \n\t"
140 "paddw %%mm2, %%mm3 \n\t"
141 "paddw %%mm1, %%mm3 \n\t"
142 "paddw %%mm3, %%mm6 \n\t"
143 "add %3, %1 \n\t"
144 " js 1b \n\t"
145 "movq %%mm6, %%mm5 \n\t"
146 "psrlq $32, %%mm6 \n\t"
147 "paddw %%mm5, %%mm6 \n\t"
148 "movq %%mm6, %%mm5 \n\t"
149 "psrlq $16, %%mm6 \n\t"
150 "paddw %%mm5, %%mm6 \n\t"
151 "movd %%mm6, %0 \n\t"
152 "andl $0xFFFF, %0 \n\t"
153 : "=&r" (sum), "+r" (index)
154 : "r" (pix - index), "r" ((x86_reg)line_size)
155 );
156
157 return sum;
158 }
159
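/* Sum of squared pixel values of a 16x16 block: pmaddwd squares pairs of
 * zero-extended pixels and the 32-bit partial sums are accumulated in mm7. */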
160 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
161 int tmp;
162 __asm__ volatile (
163 "movl $16,%%ecx\n"
164 "pxor %%mm0,%%mm0\n"
165 "pxor %%mm7,%%mm7\n"
166 "1:\n"
167 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
168 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
169
170 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
171
172 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
173 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
174
175 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
176 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
177 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
178
179 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
180 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
181
182 "pmaddwd %%mm3,%%mm3\n"
183 "pmaddwd %%mm4,%%mm4\n"
184
185 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
186 pix2^2+pix3^2+pix6^2+pix7^2) */
187 "paddd %%mm3,%%mm4\n"
188 "paddd %%mm2,%%mm7\n"
189
190 "add %2, %0\n"
191 "paddd %%mm4,%%mm7\n"
192 "dec %%ecx\n"
193 "jnz 1b\n"
194
195 "movq %%mm7,%%mm1\n"
196 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
197 "paddd %%mm7,%%mm1\n"
198 "movd %%mm1,%1\n"
199 : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
200 return tmp;
201 }
202
203 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
204 int tmp;
205 __asm__ volatile (
206 "movl %4,%%ecx\n"
207 "shr $1,%%ecx\n"
208 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
209 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
210 "1:\n"
211 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
212 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
213 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
214 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
215
216 /* todo: mm1-mm2, mm3-mm4 */
217 /* algo: subtract mm1 from mm2 with saturation and vice versa */
218 /* OR the results to get absolute difference */
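       /* (for unsigned bytes a,b: psubusb gives max(a-b,0) and max(b-a,0);
          one of the two is always 0, so their OR equals |a-b|) */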
219 "movq %%mm1,%%mm5\n"
220 "movq %%mm3,%%mm6\n"
221 "psubusb %%mm2,%%mm1\n"
222 "psubusb %%mm4,%%mm3\n"
223 "psubusb %%mm5,%%mm2\n"
224 "psubusb %%mm6,%%mm4\n"
225
226 "por %%mm1,%%mm2\n"
227 "por %%mm3,%%mm4\n"
228
229 /* now convert to 16-bit vectors so we can square them */
230 "movq %%mm2,%%mm1\n"
231 "movq %%mm4,%%mm3\n"
232
233 "punpckhbw %%mm0,%%mm2\n"
234 "punpckhbw %%mm0,%%mm4\n"
235 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
236 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
237
238 "pmaddwd %%mm2,%%mm2\n"
239 "pmaddwd %%mm4,%%mm4\n"
240 "pmaddwd %%mm1,%%mm1\n"
241 "pmaddwd %%mm3,%%mm3\n"
242
243 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
244 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
245
246 "paddd %%mm2,%%mm1\n"
247 "paddd %%mm4,%%mm3\n"
248 "paddd %%mm1,%%mm7\n"
249 "paddd %%mm3,%%mm7\n"
250
251 "decl %%ecx\n"
252 "jnz 1b\n"
253
254 "movq %%mm7,%%mm1\n"
255 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
256 "paddd %%mm7,%%mm1\n"
257 "movd %%mm1,%2\n"
258 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
259 : "r" ((x86_reg)line_size) , "m" (h)
260 : "%ecx");
261 return tmp;
262 }
263
264 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
265 int tmp;
266 __asm__ volatile (
267 "movl %4,%%ecx\n"
268 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
269 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
270 "1:\n"
271 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
272 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
273 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
274 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
275
276 /* todo: mm1-mm2, mm3-mm4 */
277 /* algo: subtract mm1 from mm2 with saturation and vice versa */
278 /* OR the results to get absolute difference */
279 "movq %%mm1,%%mm5\n"
280 "movq %%mm3,%%mm6\n"
281 "psubusb %%mm2,%%mm1\n"
282 "psubusb %%mm4,%%mm3\n"
283 "psubusb %%mm5,%%mm2\n"
284 "psubusb %%mm6,%%mm4\n"
285
286 "por %%mm1,%%mm2\n"
287 "por %%mm3,%%mm4\n"
288
289 /* now convert to 16-bit vectors so we can square them */
290 "movq %%mm2,%%mm1\n"
291 "movq %%mm4,%%mm3\n"
292
293 "punpckhbw %%mm0,%%mm2\n"
294 "punpckhbw %%mm0,%%mm4\n"
295 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
296 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
297
298 "pmaddwd %%mm2,%%mm2\n"
299 "pmaddwd %%mm4,%%mm4\n"
300 "pmaddwd %%mm1,%%mm1\n"
301 "pmaddwd %%mm3,%%mm3\n"
302
303 "add %3,%0\n"
304 "add %3,%1\n"
305
306 "paddd %%mm2,%%mm1\n"
307 "paddd %%mm4,%%mm3\n"
308 "paddd %%mm1,%%mm7\n"
309 "paddd %%mm3,%%mm7\n"
310
311 "decl %%ecx\n"
312 "jnz 1b\n"
313
314 "movq %%mm7,%%mm1\n"
315 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
316 "paddd %%mm7,%%mm1\n"
317 "movd %%mm1,%2\n"
318 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
319 : "r" ((x86_reg)line_size) , "m" (h)
320 : "%ecx");
321 return tmp;
322 }
323
324 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
325 int tmp;
326 __asm__ volatile (
327 "shr $1,%2\n"
328 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
329 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
330 "1:\n"
331 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
332 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
333 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
334 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
335
336 /* todo: mm1-mm2, mm3-mm4 */
337 /* algo: subtract mm1 from mm2 with saturation and vice versa */
338 /* OR the results to get absolute difference */
339 "movdqa %%xmm1,%%xmm5\n"
340 "movdqa %%xmm3,%%xmm6\n"
341 "psubusb %%xmm2,%%xmm1\n"
342 "psubusb %%xmm4,%%xmm3\n"
343 "psubusb %%xmm5,%%xmm2\n"
344 "psubusb %%xmm6,%%xmm4\n"
345
346 "por %%xmm1,%%xmm2\n"
347 "por %%xmm3,%%xmm4\n"
348
349 /* now convert to 16-bit vectors so we can square them */
350 "movdqa %%xmm2,%%xmm1\n"
351 "movdqa %%xmm4,%%xmm3\n"
352
353 "punpckhbw %%xmm0,%%xmm2\n"
354 "punpckhbw %%xmm0,%%xmm4\n"
355 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
356 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
357
358 "pmaddwd %%xmm2,%%xmm2\n"
359 "pmaddwd %%xmm4,%%xmm4\n"
360 "pmaddwd %%xmm1,%%xmm1\n"
361 "pmaddwd %%xmm3,%%xmm3\n"
362
363 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
364 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
365
366 "paddd %%xmm2,%%xmm1\n"
367 "paddd %%xmm4,%%xmm3\n"
368 "paddd %%xmm1,%%xmm7\n"
369 "paddd %%xmm3,%%xmm7\n"
370
371 "decl %2\n"
372 "jnz 1b\n"
373
374 "movdqa %%xmm7,%%xmm1\n"
375 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
376 "paddd %%xmm1,%%xmm7\n"
377 "movdqa %%xmm7,%%xmm1\n"
378 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
379 "paddd %%xmm1,%%xmm7\n"
380 "movd %%xmm7,%3\n"
381 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
382 : "r" ((x86_reg)line_size));
383 return tmp;
384 }
385
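/* Rough high-frequency "noise" measure over an 8-pixel-wide strip: for each
 * pair of vertically adjacent rows p, q it accumulates the absolute change of
 * the horizontal gradient, i.e. sum |(p[x]-p[x+1]) - (q[x]-q[x+1])|, using the
 * pcmpgtw/pxor/psubw idiom for the absolute value. Used by the NSSE compare
 * functions below. */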
386 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
387 int tmp;
388 __asm__ volatile (
389 "movl %3,%%ecx\n"
390 "pxor %%mm7,%%mm7\n"
391 "pxor %%mm6,%%mm6\n"
392
393 "movq (%0),%%mm0\n"
394 "movq %%mm0, %%mm1\n"
395 "psllq $8, %%mm0\n"
396 "psrlq $8, %%mm1\n"
397 "psrlq $8, %%mm0\n"
398 "movq %%mm0, %%mm2\n"
399 "movq %%mm1, %%mm3\n"
400 "punpcklbw %%mm7,%%mm0\n"
401 "punpcklbw %%mm7,%%mm1\n"
402 "punpckhbw %%mm7,%%mm2\n"
403 "punpckhbw %%mm7,%%mm3\n"
404 "psubw %%mm1, %%mm0\n"
405 "psubw %%mm3, %%mm2\n"
406
407 "add %2,%0\n"
408
409 "movq (%0),%%mm4\n"
410 "movq %%mm4, %%mm1\n"
411 "psllq $8, %%mm4\n"
412 "psrlq $8, %%mm1\n"
413 "psrlq $8, %%mm4\n"
414 "movq %%mm4, %%mm5\n"
415 "movq %%mm1, %%mm3\n"
416 "punpcklbw %%mm7,%%mm4\n"
417 "punpcklbw %%mm7,%%mm1\n"
418 "punpckhbw %%mm7,%%mm5\n"
419 "punpckhbw %%mm7,%%mm3\n"
420 "psubw %%mm1, %%mm4\n"
421 "psubw %%mm3, %%mm5\n"
422 "psubw %%mm4, %%mm0\n"
423 "psubw %%mm5, %%mm2\n"
424 "pxor %%mm3, %%mm3\n"
425 "pxor %%mm1, %%mm1\n"
426 "pcmpgtw %%mm0, %%mm3\n\t"
427 "pcmpgtw %%mm2, %%mm1\n\t"
428 "pxor %%mm3, %%mm0\n"
429 "pxor %%mm1, %%mm2\n"
430 "psubw %%mm3, %%mm0\n"
431 "psubw %%mm1, %%mm2\n"
432 "paddw %%mm0, %%mm2\n"
433 "paddw %%mm2, %%mm6\n"
434
435 "add %2,%0\n"
436 "1:\n"
437
438 "movq (%0),%%mm0\n"
439 "movq %%mm0, %%mm1\n"
440 "psllq $8, %%mm0\n"
441 "psrlq $8, %%mm1\n"
442 "psrlq $8, %%mm0\n"
443 "movq %%mm0, %%mm2\n"
444 "movq %%mm1, %%mm3\n"
445 "punpcklbw %%mm7,%%mm0\n"
446 "punpcklbw %%mm7,%%mm1\n"
447 "punpckhbw %%mm7,%%mm2\n"
448 "punpckhbw %%mm7,%%mm3\n"
449 "psubw %%mm1, %%mm0\n"
450 "psubw %%mm3, %%mm2\n"
451 "psubw %%mm0, %%mm4\n"
452 "psubw %%mm2, %%mm5\n"
453 "pxor %%mm3, %%mm3\n"
454 "pxor %%mm1, %%mm1\n"
455 "pcmpgtw %%mm4, %%mm3\n\t"
456 "pcmpgtw %%mm5, %%mm1\n\t"
457 "pxor %%mm3, %%mm4\n"
458 "pxor %%mm1, %%mm5\n"
459 "psubw %%mm3, %%mm4\n"
460 "psubw %%mm1, %%mm5\n"
461 "paddw %%mm4, %%mm5\n"
462 "paddw %%mm5, %%mm6\n"
463
464 "add %2,%0\n"
465
466 "movq (%0),%%mm4\n"
467 "movq %%mm4, %%mm1\n"
468 "psllq $8, %%mm4\n"
469 "psrlq $8, %%mm1\n"
470 "psrlq $8, %%mm4\n"
471 "movq %%mm4, %%mm5\n"
472 "movq %%mm1, %%mm3\n"
473 "punpcklbw %%mm7,%%mm4\n"
474 "punpcklbw %%mm7,%%mm1\n"
475 "punpckhbw %%mm7,%%mm5\n"
476 "punpckhbw %%mm7,%%mm3\n"
477 "psubw %%mm1, %%mm4\n"
478 "psubw %%mm3, %%mm5\n"
479 "psubw %%mm4, %%mm0\n"
480 "psubw %%mm5, %%mm2\n"
481 "pxor %%mm3, %%mm3\n"
482 "pxor %%mm1, %%mm1\n"
483 "pcmpgtw %%mm0, %%mm3\n\t"
484 "pcmpgtw %%mm2, %%mm1\n\t"
485 "pxor %%mm3, %%mm0\n"
486 "pxor %%mm1, %%mm2\n"
487 "psubw %%mm3, %%mm0\n"
488 "psubw %%mm1, %%mm2\n"
489 "paddw %%mm0, %%mm2\n"
490 "paddw %%mm2, %%mm6\n"
491
492 "add %2,%0\n"
493 "subl $2, %%ecx\n"
494 " jnz 1b\n"
495
496 "movq %%mm6, %%mm0\n"
497 "punpcklwd %%mm7,%%mm0\n"
498 "punpckhwd %%mm7,%%mm6\n"
499 "paddd %%mm0, %%mm6\n"
500
501 "movq %%mm6,%%mm0\n"
502 "psrlq $32, %%mm6\n"
503 "paddd %%mm6,%%mm0\n"
504 "movd %%mm0,%1\n"
505 : "+r" (pix1), "=r"(tmp)
506 : "r" ((x86_reg)line_size) , "g" (h-2)
507 : "%ecx");
508 return tmp;
509 }
510
511 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
512 int tmp;
513 uint8_t * pix= pix1;
514 __asm__ volatile (
515 "movl %3,%%ecx\n"
516 "pxor %%mm7,%%mm7\n"
517 "pxor %%mm6,%%mm6\n"
518
519 "movq (%0),%%mm0\n"
520 "movq 1(%0),%%mm1\n"
521 "movq %%mm0, %%mm2\n"
522 "movq %%mm1, %%mm3\n"
523 "punpcklbw %%mm7,%%mm0\n"
524 "punpcklbw %%mm7,%%mm1\n"
525 "punpckhbw %%mm7,%%mm2\n"
526 "punpckhbw %%mm7,%%mm3\n"
527 "psubw %%mm1, %%mm0\n"
528 "psubw %%mm3, %%mm2\n"
529
530 "add %2,%0\n"
531
532 "movq (%0),%%mm4\n"
533 "movq 1(%0),%%mm1\n"
534 "movq %%mm4, %%mm5\n"
535 "movq %%mm1, %%mm3\n"
536 "punpcklbw %%mm7,%%mm4\n"
537 "punpcklbw %%mm7,%%mm1\n"
538 "punpckhbw %%mm7,%%mm5\n"
539 "punpckhbw %%mm7,%%mm3\n"
540 "psubw %%mm1, %%mm4\n"
541 "psubw %%mm3, %%mm5\n"
542 "psubw %%mm4, %%mm0\n"
543 "psubw %%mm5, %%mm2\n"
544 "pxor %%mm3, %%mm3\n"
545 "pxor %%mm1, %%mm1\n"
546 "pcmpgtw %%mm0, %%mm3\n\t"
547 "pcmpgtw %%mm2, %%mm1\n\t"
548 "pxor %%mm3, %%mm0\n"
549 "pxor %%mm1, %%mm2\n"
550 "psubw %%mm3, %%mm0\n"
551 "psubw %%mm1, %%mm2\n"
552 "paddw %%mm0, %%mm2\n"
553 "paddw %%mm2, %%mm6\n"
554
555 "add %2,%0\n"
556 "1:\n"
557
558 "movq (%0),%%mm0\n"
559 "movq 1(%0),%%mm1\n"
560 "movq %%mm0, %%mm2\n"
561 "movq %%mm1, %%mm3\n"
562 "punpcklbw %%mm7,%%mm0\n"
563 "punpcklbw %%mm7,%%mm1\n"
564 "punpckhbw %%mm7,%%mm2\n"
565 "punpckhbw %%mm7,%%mm3\n"
566 "psubw %%mm1, %%mm0\n"
567 "psubw %%mm3, %%mm2\n"
568 "psubw %%mm0, %%mm4\n"
569 "psubw %%mm2, %%mm5\n"
570 "pxor %%mm3, %%mm3\n"
571 "pxor %%mm1, %%mm1\n"
572 "pcmpgtw %%mm4, %%mm3\n\t"
573 "pcmpgtw %%mm5, %%mm1\n\t"
574 "pxor %%mm3, %%mm4\n"
575 "pxor %%mm1, %%mm5\n"
576 "psubw %%mm3, %%mm4\n"
577 "psubw %%mm1, %%mm5\n"
578 "paddw %%mm4, %%mm5\n"
579 "paddw %%mm5, %%mm6\n"
580
581 "add %2,%0\n"
582
583 "movq (%0),%%mm4\n"
584 "movq 1(%0),%%mm1\n"
585 "movq %%mm4, %%mm5\n"
586 "movq %%mm1, %%mm3\n"
587 "punpcklbw %%mm7,%%mm4\n"
588 "punpcklbw %%mm7,%%mm1\n"
589 "punpckhbw %%mm7,%%mm5\n"
590 "punpckhbw %%mm7,%%mm3\n"
591 "psubw %%mm1, %%mm4\n"
592 "psubw %%mm3, %%mm5\n"
593 "psubw %%mm4, %%mm0\n"
594 "psubw %%mm5, %%mm2\n"
595 "pxor %%mm3, %%mm3\n"
596 "pxor %%mm1, %%mm1\n"
597 "pcmpgtw %%mm0, %%mm3\n\t"
598 "pcmpgtw %%mm2, %%mm1\n\t"
599 "pxor %%mm3, %%mm0\n"
600 "pxor %%mm1, %%mm2\n"
601 "psubw %%mm3, %%mm0\n"
602 "psubw %%mm1, %%mm2\n"
603 "paddw %%mm0, %%mm2\n"
604 "paddw %%mm2, %%mm6\n"
605
606 "add %2,%0\n"
607 "subl $2, %%ecx\n"
608 " jnz 1b\n"
609
610 "movq %%mm6, %%mm0\n"
611 "punpcklwd %%mm7,%%mm0\n"
612 "punpckhwd %%mm7,%%mm6\n"
613 "paddd %%mm0, %%mm6\n"
614
615 "movq %%mm6,%%mm0\n"
616 "psrlq $32, %%mm6\n"
617 "paddd %%mm6,%%mm0\n"
618 "movd %%mm0,%1\n"
619 : "+r" (pix1), "=r"(tmp)
620 : "r" ((x86_reg)line_size) , "g" (h-2)
621 : "%ecx");
622 return tmp + hf_noise8_mmx(pix+8, line_size, h);
623 }
624
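/* NSSE ("noise preserving SSE") compare function: plain SSE between the two
 * blocks plus nsse_weight (or 8 when no context is available) times the
 * absolute difference of their hf_noise scores, so a candidate that loses
 * high-frequency texture is penalized even when its plain SSE is low. */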
625 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
626 MpegEncContext *c = p;
627 int score1, score2;
628
629 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
630 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
631 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
632
633 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
634 else return score1 + FFABS(score2)*8;
635 }
636
637 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
638 MpegEncContext *c = p;
639 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
640 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
641
642 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
643 else return score1 + FFABS(score2)*8;
644 }
645
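/* Vertical SAD of a 16-pixel-wide block against itself shifted down by one
 * row: sum |pix[y][x] - pix[y-1][x]|. The MMX2 version below does the same
 * with psadbw. Only the low 16 bits of the accumulator are valid, hence the
 * final mask. */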
646 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
647 int tmp;
648
649 assert( (((intptr_t)pix) & 7) == 0);
650 assert((line_size &7) ==0);
651
652 #define SUM(in0, in1, out0, out1) \
653 "movq (%0), %%mm2\n"\
654 "movq 8(%0), %%mm3\n"\
655 "add %2,%0\n"\
656 "movq %%mm2, " #out0 "\n"\
657 "movq %%mm3, " #out1 "\n"\
658 "psubusb " #in0 ", %%mm2\n"\
659 "psubusb " #in1 ", %%mm3\n"\
660 "psubusb " #out0 ", " #in0 "\n"\
661 "psubusb " #out1 ", " #in1 "\n"\
662 "por %%mm2, " #in0 "\n"\
663 "por %%mm3, " #in1 "\n"\
664 "movq " #in0 ", %%mm2\n"\
665 "movq " #in1 ", %%mm3\n"\
666 "punpcklbw %%mm7, " #in0 "\n"\
667 "punpcklbw %%mm7, " #in1 "\n"\
668 "punpckhbw %%mm7, %%mm2\n"\
669 "punpckhbw %%mm7, %%mm3\n"\
670 "paddw " #in1 ", " #in0 "\n"\
671 "paddw %%mm3, %%mm2\n"\
672 "paddw %%mm2, " #in0 "\n"\
673 "paddw " #in0 ", %%mm6\n"
674
675
676 __asm__ volatile (
677 "movl %3,%%ecx\n"
678 "pxor %%mm6,%%mm6\n"
679 "pxor %%mm7,%%mm7\n"
680 "movq (%0),%%mm0\n"
681 "movq 8(%0),%%mm1\n"
682 "add %2,%0\n"
683 "jmp 2f\n"
684 "1:\n"
685
686 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
687 "2:\n"
688 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
689
690 "subl $2, %%ecx\n"
691 "jnz 1b\n"
692
693 "movq %%mm6,%%mm0\n"
694 "psrlq $32, %%mm6\n"
695 "paddw %%mm6,%%mm0\n"
696 "movq %%mm0,%%mm6\n"
697 "psrlq $16, %%mm0\n"
698 "paddw %%mm6,%%mm0\n"
699 "movd %%mm0,%1\n"
700 : "+r" (pix), "=r"(tmp)
701 : "r" ((x86_reg)line_size) , "m" (h)
702 : "%ecx");
703 return tmp & 0xFFFF;
704 }
705 #undef SUM
706
707 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
708 int tmp;
709
710 assert( (((intptr_t)pix) & 7) == 0);
711 assert((line_size &7) ==0);
712
713 #define SUM(in0, in1, out0, out1) \
714 "movq (%0), " #out0 "\n"\
715 "movq 8(%0), " #out1 "\n"\
716 "add %2,%0\n"\
717 "psadbw " #out0 ", " #in0 "\n"\
718 "psadbw " #out1 ", " #in1 "\n"\
719 "paddw " #in1 ", " #in0 "\n"\
720 "paddw " #in0 ", %%mm6\n"
721
722 __asm__ volatile (
723 "movl %3,%%ecx\n"
724 "pxor %%mm6,%%mm6\n"
725 "pxor %%mm7,%%mm7\n"
726 "movq (%0),%%mm0\n"
727 "movq 8(%0),%%mm1\n"
728 "add %2,%0\n"
729 "jmp 2f\n"
730 "1:\n"
731
732 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
733 "2:\n"
734 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
735
736 "subl $2, %%ecx\n"
737 "jnz 1b\n"
738
739 "movd %%mm6,%1\n"
740 : "+r" (pix), "=r"(tmp)
741 : "r" ((x86_reg)line_size) , "m" (h)
742 : "%ecx");
743 return tmp;
744 }
745 #undef SUM
746
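/* Same vertical SAD, but computed on the difference pix1 - pix2. The psubb
 * result is XORed with 0x80 bytes to bias the signed byte difference into
 * unsigned range, so the psubusb/por absolute-difference trick in SUM() still
 * applies. */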
747 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
748 int tmp;
749
750 assert( (((intptr_t)pix1) & 7) == 0);
751 assert( (((intptr_t)pix2) & 7) == 0);
752 assert((line_size &7) ==0);
753
754 #define SUM(in0, in1, out0, out1) \
755 "movq (%0),%%mm2\n"\
756 "movq (%1)," #out0 "\n"\
757 "movq 8(%0),%%mm3\n"\
758 "movq 8(%1)," #out1 "\n"\
759 "add %3,%0\n"\
760 "add %3,%1\n"\
761 "psubb " #out0 ", %%mm2\n"\
762 "psubb " #out1 ", %%mm3\n"\
763 "pxor %%mm7, %%mm2\n"\
764 "pxor %%mm7, %%mm3\n"\
765 "movq %%mm2, " #out0 "\n"\
766 "movq %%mm3, " #out1 "\n"\
767 "psubusb " #in0 ", %%mm2\n"\
768 "psubusb " #in1 ", %%mm3\n"\
769 "psubusb " #out0 ", " #in0 "\n"\
770 "psubusb " #out1 ", " #in1 "\n"\
771 "por %%mm2, " #in0 "\n"\
772 "por %%mm3, " #in1 "\n"\
773 "movq " #in0 ", %%mm2\n"\
774 "movq " #in1 ", %%mm3\n"\
775 "punpcklbw %%mm7, " #in0 "\n"\
776 "punpcklbw %%mm7, " #in1 "\n"\
777 "punpckhbw %%mm7, %%mm2\n"\
778 "punpckhbw %%mm7, %%mm3\n"\
779 "paddw " #in1 ", " #in0 "\n"\
780 "paddw %%mm3, %%mm2\n"\
781 "paddw %%mm2, " #in0 "\n"\
782 "paddw " #in0 ", %%mm6\n"
783
784
785 __asm__ volatile (
786 "movl %4,%%ecx\n"
787 "pxor %%mm6,%%mm6\n"
788 "pcmpeqw %%mm7,%%mm7\n"
789 "psllw $15, %%mm7\n"
790 "packsswb %%mm7, %%mm7\n"
791 "movq (%0),%%mm0\n"
792 "movq (%1),%%mm2\n"
793 "movq 8(%0),%%mm1\n"
794 "movq 8(%1),%%mm3\n"
795 "add %3,%0\n"
796 "add %3,%1\n"
797 "psubb %%mm2, %%mm0\n"
798 "psubb %%mm3, %%mm1\n"
799 "pxor %%mm7, %%mm0\n"
800 "pxor %%mm7, %%mm1\n"
801 "jmp 2f\n"
802 "1:\n"
803
804 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
805 "2:\n"
806 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
807
808 "subl $2, %%ecx\n"
809 "jnz 1b\n"
810
811 "movq %%mm6,%%mm0\n"
812 "psrlq $32, %%mm6\n"
813 "paddw %%mm6,%%mm0\n"
814 "movq %%mm0,%%mm6\n"
815 "psrlq $16, %%mm0\n"
816 "paddw %%mm6,%%mm0\n"
817 "movd %%mm0,%2\n"
818 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
819 : "r" ((x86_reg)line_size) , "m" (h)
820 : "%ecx");
821 return tmp & 0x7FFF;
822 }
823 #undef SUM
824
825 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
826 int tmp;
827
828 assert( (((intptr_t)pix1) & 7) == 0);
829 assert( (((intptr_t)pix2) & 7) == 0);
830 assert((line_size &7) ==0);
831
832 #define SUM(in0, in1, out0, out1) \
833 "movq (%0)," #out0 "\n"\
834 "movq (%1),%%mm2\n"\
835 "movq 8(%0)," #out1 "\n"\
836 "movq 8(%1),%%mm3\n"\
837 "add %3,%0\n"\
838 "add %3,%1\n"\
839 "psubb %%mm2, " #out0 "\n"\
840 "psubb %%mm3, " #out1 "\n"\
841 "pxor %%mm7, " #out0 "\n"\
842 "pxor %%mm7, " #out1 "\n"\
843 "psadbw " #out0 ", " #in0 "\n"\
844 "psadbw " #out1 ", " #in1 "\n"\
845 "paddw " #in1 ", " #in0 "\n"\
846 "paddw " #in0 ", %%mm6\n"
847
848 __asm__ volatile (
849 "movl %4,%%ecx\n"
850 "pxor %%mm6,%%mm6\n"
851 "pcmpeqw %%mm7,%%mm7\n"
852 "psllw $15, %%mm7\n"
853 "packsswb %%mm7, %%mm7\n"
854 "movq (%0),%%mm0\n"
855 "movq (%1),%%mm2\n"
856 "movq 8(%0),%%mm1\n"
857 "movq 8(%1),%%mm3\n"
858 "add %3,%0\n"
859 "add %3,%1\n"
860 "psubb %%mm2, %%mm0\n"
861 "psubb %%mm3, %%mm1\n"
862 "pxor %%mm7, %%mm0\n"
863 "pxor %%mm7, %%mm1\n"
864 "jmp 2f\n"
865 "1:\n"
866
867 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
868 "2:\n"
869 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
870
871 "subl $2, %%ecx\n"
872 "jnz 1b\n"
873
874 "movd %%mm6,%2\n"
875 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
876 : "r" ((x86_reg)line_size) , "m" (h)
877 : "%ecx");
878 return tmp;
879 }
880 #undef SUM
881
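/* dst[i] = src1[i] - src2[i] for w bytes; the MMX loop handles 16 bytes per
 * iteration and the scalar loop afterwards picks up the remaining tail. */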
882 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
883 x86_reg i=0;
884 __asm__ volatile(
885 "1: \n\t"
886 "movq (%2, %0), %%mm0 \n\t"
887 "movq (%1, %0), %%mm1 \n\t"
888 "psubb %%mm0, %%mm1 \n\t"
889 "movq %%mm1, (%3, %0) \n\t"
890 "movq 8(%2, %0), %%mm0 \n\t"
891 "movq 8(%1, %0), %%mm1 \n\t"
892 "psubb %%mm0, %%mm1 \n\t"
893 "movq %%mm1, 8(%3, %0) \n\t"
894 "add $16, %0 \n\t"
895 "cmp %4, %0 \n\t"
896 " jb 1b \n\t"
897 : "+r" (i)
898 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
899 );
900 for(; i<w; i++)
901 dst[i+0] = src1[i+0]-src2[i+0];
902 }
903
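/* HuffYUV median prediction: with L = src2[i-1], T = src1[i], LT = src1[i-1],
 * store dst[i] = src2[i] - median(L, T, L+T-LT). The median of three is built
 * from pmaxub/pminub, and L+T-LT intentionally wraps modulo 256 like the C
 * reference. Element 0 and the left/left_top state are fixed up in C below. */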
904 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
905 x86_reg i=0;
906 uint8_t l, lt;
907
908 __asm__ volatile(
909 "1: \n\t"
910 "movq -1(%1, %0), %%mm0 \n\t" // LT
911 "movq (%1, %0), %%mm1 \n\t" // T
912 "movq -1(%2, %0), %%mm2 \n\t" // L
913 "movq (%2, %0), %%mm3 \n\t" // X
914 "movq %%mm2, %%mm4 \n\t" // L
915 "psubb %%mm0, %%mm2 \n\t"
916 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
917 "movq %%mm4, %%mm5 \n\t" // L
918 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
919 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
920 "pminub %%mm2, %%mm4 \n\t"
921 "pmaxub %%mm1, %%mm4 \n\t"
922 "psubb %%mm4, %%mm3 \n\t" // dst - pred
923 "movq %%mm3, (%3, %0) \n\t"
924 "add $8, %0 \n\t"
925 "cmp %4, %0 \n\t"
926 " jb 1b \n\t"
927 : "+r" (i)
928 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
929 );
930
931 l= *left;
932 lt= *left_top;
933
934 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
935
936 *left_top= src1[w-1];
937 *left = src2[w-1];
938 }
939
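/* Building blocks for the Hadamard (SATD) compare functions:
 * DIFF_PIXELS_1 computes p1 - p2 for one row of pixels as signed words; the
 * punpcklbw-with-itself trick makes the high-byte terms cancel in the psubw,
 * so no zero register is needed. DIFF_PIXELS_8 loads eight such difference
 * rows into eight registers, spilling one row through *temp. LBUTTERFLY2 maps
 * two register pairs (a,b) to (a+b, b-a); HADAMARD8 applies three butterfly
 * stages, i.e. an 8-point Hadamard transform across the registers. */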
940 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
941 "mov"#m" "#p1", "#a" \n\t"\
942 "mov"#m" "#p2", "#t" \n\t"\
943 "punpcklbw "#a", "#t" \n\t"\
944 "punpcklbw "#a", "#a" \n\t"\
945 "psubw "#t", "#a" \n\t"\
946
947 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
948 uint8_t *p1b=p1, *p2b=p2;\
949 __asm__ volatile(\
950 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
951 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
952 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
953 "add %4, %1 \n\t"\
954 "add %4, %2 \n\t"\
955 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
956 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
957 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
958 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
959 "mov"#m1" "#mm"0, %0 \n\t"\
960 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
961 "mov"#m1" %0, "#mm"0 \n\t"\
962 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
963 : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
964 );\
965 }
966 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
967
968 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
969 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
970
971 #define LBUTTERFLY2(a1,b1,a2,b2)\
972 "paddw " #b1 ", " #a1 " \n\t"\
973 "paddw " #b2 ", " #a2 " \n\t"\
974 "paddw " #b1 ", " #b1 " \n\t"\
975 "paddw " #b2 ", " #b2 " \n\t"\
976 "psubw " #a1 ", " #b1 " \n\t"\
977 "psubw " #a2 ", " #b2 " \n\t"
978
979 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
980 LBUTTERFLY2(m0, m1, m2, m3)\
981 LBUTTERFLY2(m4, m5, m6, m7)\
982 LBUTTERFLY2(m0, m2, m1, m3)\
983 LBUTTERFLY2(m4, m6, m5, m7)\
984 LBUTTERFLY2(m0, m4, m1, m5)\
985 LBUTTERFLY2(m2, m6, m3, m7)\
986
987 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
988
989 #define MMABS_MMX(a,z)\
990 "pxor " #z ", " #z " \n\t"\
991 "pcmpgtw " #a ", " #z " \n\t"\
992 "pxor " #z ", " #a " \n\t"\
993 "psubw " #z ", " #a " \n\t"
994
995 #define MMABS_MMX2(a,z)\
996 "pxor " #z ", " #z " \n\t"\
997 "psubw " #a ", " #z " \n\t"\
998 "pmaxsw " #z ", " #a " \n\t"
999
1000 #define MMABS_SSSE3(a,z)\
1001 "pabsw " #a ", " #a " \n\t"
1002
1003 #define MMABS_SUM(a,z, sum)\
1004 MMABS(a,z)\
1005 "paddusw " #a ", " #sum " \n\t"
1006
1007 #define MMABS_SUM_8x8_NOSPILL\
1008 MMABS(%%xmm0, %%xmm8)\
1009 MMABS(%%xmm1, %%xmm9)\
1010 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1011 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1012 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1013 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1014 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1015 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1016 "paddusw %%xmm1, %%xmm0 \n\t"
1017
1018 #ifdef ARCH_X86_64
1019 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1020 #else
1021 #define MMABS_SUM_8x8_SSE2\
1022 "movdqa %%xmm7, (%1) \n\t"\
1023 MMABS(%%xmm0, %%xmm7)\
1024 MMABS(%%xmm1, %%xmm7)\
1025 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1026 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1027 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1028 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1029 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1030 "movdqa (%1), %%xmm2 \n\t"\
1031 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1032 "paddusw %%xmm1, %%xmm0 \n\t"
1033 #endif
1034
1035 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1036 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1037 * and it's even more unlikely not to have any alternative MVs/modes with lower cost. */
1038 #define HSUM_MMX(a, t, dst)\
1039 "movq "#a", "#t" \n\t"\
1040 "psrlq $32, "#a" \n\t"\
1041 "paddusw "#t", "#a" \n\t"\
1042 "movq "#a", "#t" \n\t"\
1043 "psrlq $16, "#a" \n\t"\
1044 "paddusw "#t", "#a" \n\t"\
1045 "movd "#a", "#dst" \n\t"\
1046
1047 #define HSUM_MMX2(a, t, dst)\
1048 "pshufw $0x0E, "#a", "#t" \n\t"\
1049 "paddusw "#t", "#a" \n\t"\
1050 "pshufw $0x01, "#a", "#t" \n\t"\
1051 "paddusw "#t", "#a" \n\t"\
1052 "movd "#a", "#dst" \n\t"\
1053
1054 #define HSUM_SSE2(a, t, dst)\
1055 "movhlps "#a", "#t" \n\t"\
1056 "paddusw "#t", "#a" \n\t"\
1057 "pshuflw $0x0E, "#a", "#t" \n\t"\
1058 "paddusw "#t", "#a" \n\t"\
1059 "pshuflw $0x01, "#a", "#t" \n\t"\
1060 "paddusw "#t", "#a" \n\t"\
1061 "movd "#a", "#dst" \n\t"\
1062
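/* hadamard8_diff_* computes the SATD of an 8x8 difference block: a
 * two-dimensional 8-point Hadamard transform of the difference, then the sum
 * of absolute values of all coefficients. The MMX version only has eight
 * 64-bit registers, so it works on two 4x8 halves and spills intermediates
 * into temp[]; WRAPPER8_16_SQ then builds the 16x16 variant from the 8x8 one.
 * The result is masked to 16 bits, matching the saturating accumulation noted
 * in the FIXME above. */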
1063 #define HADAMARD8_DIFF_MMX(cpu) \
1064 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1065 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1066 int sum;\
1067 \
1068 assert(h==8);\
1069 \
1070 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1071 \
1072 __asm__ volatile(\
1073 HADAMARD48\
1074 \
1075 "movq %%mm7, 96(%1) \n\t"\
1076 \
1077 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1078 STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
1079 \
1080 "movq 96(%1), %%mm7 \n\t"\
1081 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1082 STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
1083 \
1084 : "=r" (sum)\
1085 : "r"(temp)\
1086 );\
1087 \
1088 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1089 \
1090 __asm__ volatile(\
1091 HADAMARD48\
1092 \
1093 "movq %%mm7, 96(%1) \n\t"\
1094 \
1095 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1096 STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
1097 \
1098 "movq 96(%1), %%mm7 \n\t"\
1099 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1100 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1101 "movq %%mm6, %%mm7 \n\t"\
1102 "movq %%mm0, %%mm6 \n\t"\
1103 \
1104 LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
1105 \
1106 HADAMARD48\
1107 "movq %%mm7, 64(%1) \n\t"\
1108 MMABS(%%mm0, %%mm7)\
1109 MMABS(%%mm1, %%mm7)\
1110 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1111 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1112 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1113 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1114 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1115 "movq 64(%1), %%mm2 \n\t"\
1116 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1117 "paddusw %%mm1, %%mm0 \n\t"\
1118 "movq %%mm0, 64(%1) \n\t"\
1119 \
1120 LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
1121 LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
1122 \
1123 HADAMARD48\
1124 "movq %%mm7, (%1) \n\t"\
1125 MMABS(%%mm0, %%mm7)\
1126 MMABS(%%mm1, %%mm7)\
1127 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1128 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1129 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1130 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1131 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1132 "movq (%1), %%mm2 \n\t"\
1133 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1134 "paddusw 64(%1), %%mm0 \n\t"\
1135 "paddusw %%mm1, %%mm0 \n\t"\
1136 \
1137 HSUM(%%mm0, %%mm1, %0)\
1138 \
1139 : "=r" (sum)\
1140 : "r"(temp)\
1141 );\
1142 return sum&0xFFFF;\
1143 }\
1144 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1145
1146 #define HADAMARD8_DIFF_SSE2(cpu) \
1147 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1148 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1149 int sum;\
1150 \
1151 assert(h==8);\
1152 \
1153 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1154 \
1155 __asm__ volatile(\
1156 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1157 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1158 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1159 MMABS_SUM_8x8\
1160 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1161 : "=r" (sum)\
1162 : "r"(temp)\
1163 );\
1164 return sum&0xFFFF;\
1165 }\
1166 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1167
1168 #define MMABS(a,z) MMABS_MMX(a,z)
1169 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1170 HADAMARD8_DIFF_MMX(mmx)
1171 #undef MMABS
1172 #undef HSUM
1173
1174 #define MMABS(a,z) MMABS_MMX2(a,z)
1175 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1176 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1177 HADAMARD8_DIFF_MMX(mmx2)
1178 HADAMARD8_DIFF_SSE2(sse2)
1179 #undef MMABS
1180 #undef MMABS_SUM_8x8
1181 #undef HSUM
1182
1183 #ifdef HAVE_SSSE3
1184 #define MMABS(a,z) MMABS_SSSE3(a,z)
1185 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1186 HADAMARD8_DIFF_SSE2(ssse3)
1187 #undef MMABS
1188 #undef MMABS_SUM_8x8
1189 #endif
1190
1191 #define DCT_SAD4(m,mm,o)\
1192 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1193 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1194 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1195 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1196 MMABS_SUM(mm##2, mm##6, mm##0)\
1197 MMABS_SUM(mm##3, mm##7, mm##1)\
1198 MMABS_SUM(mm##4, mm##6, mm##0)\
1199 MMABS_SUM(mm##5, mm##7, mm##1)\
1200
1201 #define DCT_SAD_MMX\
1202 "pxor %%mm0, %%mm0 \n\t"\
1203 "pxor %%mm1, %%mm1 \n\t"\
1204 DCT_SAD4(q, %%mm, 0)\
1205 DCT_SAD4(q, %%mm, 8)\
1206 DCT_SAD4(q, %%mm, 64)\
1207 DCT_SAD4(q, %%mm, 72)\
1208 "paddusw %%mm1, %%mm0 \n\t"\
1209 HSUM(%%mm0, %%mm1, %0)
1210
1211 #define DCT_SAD_SSE2\
1212 "pxor %%xmm0, %%xmm0 \n\t"\
1213 "pxor %%xmm1, %%xmm1 \n\t"\
1214 DCT_SAD4(dqa, %%xmm, 0)\
1215 DCT_SAD4(dqa, %%xmm, 64)\
1216 "paddusw %%xmm1, %%xmm0 \n\t"\
1217 HSUM(%%xmm0, %%xmm1, %0)
1218
1219 #define DCT_SAD_FUNC(cpu) \
1220 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1221 int sum;\
1222 __asm__ volatile(\
1223 DCT_SAD\
1224 :"=r"(sum)\
1225 :"r"(block)\
1226 );\
1227 return sum&0xFFFF;\
1228 }
1229
1230 #define DCT_SAD DCT_SAD_MMX
1231 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1232 #define MMABS(a,z) MMABS_MMX(a,z)
1233 DCT_SAD_FUNC(mmx)
1234 #undef MMABS
1235 #undef HSUM
1236
1237 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1238 #define MMABS(a,z) MMABS_MMX2(a,z)
1239 DCT_SAD_FUNC(mmx2)
1240 #undef HSUM
1241 #undef DCT_SAD
1242
1243 #define DCT_SAD DCT_SAD_SSE2
1244 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1245 DCT_SAD_FUNC(sse2)
1246 #undef MMABS
1247
1248 #ifdef HAVE_SSSE3
1249 #define MMABS(a,z) MMABS_SSSE3(a,z)
1250 DCT_SAD_FUNC(ssse3)
1251 #undef MMABS
1252 #endif
1253 #undef HSUM
1254 #undef DCT_SAD
1255
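/* Sum of squared differences between an int8_t and an int16_t array; size
 * must be a multiple of 8. The punpck + psraw $8 pairs place each int8 value
 * in the high byte of a word and arithmetic-shift it back down, sign-extending
 * it without a separate unpack register; pmaddwd accumulates the squared
 * differences as 32-bit sums. */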
1256 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1257 int sum;
1258 x86_reg i=size;
1259 __asm__ volatile(
1260 "pxor %%mm4, %%mm4 \n"
1261 "1: \n"
1262 "sub $8, %0 \n"
1263 "movq (%2,%0), %%mm2 \n"
1264 "movq (%3,%0,2), %%mm0 \n"
1265 "movq 8(%3,%0,2), %%mm1 \n"
1266 "punpckhbw %%mm2, %%mm3 \n"
1267 "punpcklbw %%mm2, %%mm2 \n"
1268 "psraw $8, %%mm3 \n"
1269 "psraw $8, %%mm2 \n"
1270 "psubw %%mm3, %%mm1 \n"
1271 "psubw %%mm2, %%mm0 \n"
1272 "pmaddwd %%mm1, %%mm1 \n"
1273 "pmaddwd %%mm0, %%mm0 \n"
1274 "paddd %%mm1, %%mm4 \n"
1275 "paddd %%mm0, %%mm4 \n"
1276 "jg 1b \n"
1277 "movq %%mm4, %%mm3 \n"
1278 "psrlq $32, %%mm3 \n"
1279 "paddd %%mm3, %%mm4 \n"
1280 "movd %%mm4, %1 \n"
1281 :"+r"(i), "=r"(sum)
1282 :"r"(pix1), "r"(pix2)
1283 );
1284 return sum;
1285 }
1286
1287 #define PHADDD(a, t)\
1288 "movq "#a", "#t" \n\t"\
1289 "psrlq $32, "#a" \n\t"\
1290 "paddd "#t", "#a" \n\t"
1291 /*
1292 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
1293 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
1294 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
1295 */
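/* Three PMULHRW flavors are defined so dsputil_mmx_qns_template.c can be
   instantiated for plain MMX (pmulhw plus an explicit add-one-and-shift to
   approximate the rounding multiply), 3DNow! (pmulhrw) and SSSE3 (pmulhrsw);
   SET_RND and SCALE_OFFSET are what the template uses to absorb the remaining
   differences in rounding constant and scale between the variants. */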
1296 #define PMULHRW(x, y, s, o)\
1297 "pmulhw " #s ", "#x " \n\t"\
1298 "pmulhw " #s ", "#y " \n\t"\
1299 "paddw " #o ", "#x " \n\t"\
1300 "paddw " #o ", "#y " \n\t"\
1301 "psraw $1, "#x " \n\t"\
1302 "psraw $1, "#y " \n\t"
1303 #define DEF(x) x ## _mmx
1304 #define SET_RND MOVQ_WONE
1305 #define SCALE_OFFSET 1
1306
1307 #include "dsputil_mmx_qns_template.c"
1308
1309 #undef DEF
1310 #undef SET_RND
1311 #undef SCALE_OFFSET
1312 #undef PMULHRW
1313
1314 #define DEF(x) x ## _3dnow
1315 #define SET_RND(x)
1316 #define SCALE_OFFSET 0
1317 #define PMULHRW(x, y, s, o)\
1318 "pmulhrw " #s ", "#x " \n\t"\
1319 "pmulhrw " #s ", "#y " \n\t"
1320
1321 #include "dsputil_mmx_qns_template.c"
1322
1323 #undef DEF
1324 #undef SET_RND
1325 #undef SCALE_OFFSET
1326 #undef PMULHRW
1327
1328 #ifdef HAVE_SSSE3
1329 #undef PHADDD
1330 #define DEF(x) x ## _ssse3
1331 #define SET_RND(x)
1332 #define SCALE_OFFSET -1
1333 #define PHADDD(a, t)\
1334 "pshufw $0x0E, "#a", "#t" \n\t"\
1335 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
1336 #define PMULHRW(x, y, s, o)\
1337 "pmulhrsw " #s ", "#x " \n\t"\
1338 "pmulhrsw " #s ", "#y " \n\t"
1339
1340 #include "dsputil_mmx_qns_template.c"
1341
1342 #undef DEF
1343 #undef SET_RND
1344 #undef SCALE_OFFSET
1345 #undef PMULHRW
1346 #undef PHADDD
1347 #endif //HAVE_SSSE3
1348
1349
1350 /* FLAC specific */
1351 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
1352 double *autoc);
1353
1354
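/* Install the encoder-side DSP functions according to the detected CPU
 * features; more specific versions (MMX2, SSE2, SSSE3, 3DNow!) override the
 * plain MMX ones, and approximations that would break bit-exact operation are
 * only installed when CODEC_FLAG_BITEXACT is not set. */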
1355 void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
1356 {
1357 if (mm_flags & FF_MM_MMX) {
1358 const int dct_algo = avctx->dct_algo;
1359 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1360 if(mm_flags & FF_MM_SSE2){
1361 c->fdct = ff_fdct_sse2;
1362 }else if(mm_flags & FF_MM_MMXEXT){
1363 c->fdct = ff_fdct_mmx2;
1364 }else{
1365 c->fdct = ff_fdct_mmx;
1366 }
1367 }
1368
1369 c->get_pixels = get_pixels_mmx;
1370 c->diff_pixels = diff_pixels_mmx;
1371 c->pix_sum = pix_sum16_mmx;
1372
1373 c->diff_bytes= diff_bytes_mmx;
1374 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
1375
1376 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1377 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1378
1379 c->pix_norm1 = pix_norm1_mmx;
1380 c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
1381 c->sse[1] = sse8_mmx;
1382 c->vsad[4]= vsad_intra16_mmx;
1383
1384 c->nsse[0] = nsse16_mmx;
1385 c->nsse[1] = nsse8_mmx;
1386 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1387 c->vsad[0] = vsad16_mmx;
1388 }
1389
1390 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1391 c->try_8x8basis= try_8x8basis_mmx;
1392 }
1393 c->add_8x8basis= add_8x8basis_mmx;
1394
1395 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
1396
1397
1398 if (mm_flags & FF_MM_MMXEXT) {
1399 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1400 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
1401 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1402 c->vsad[4]= vsad_intra16_mmx2;
1403
1404 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1405 c->vsad[0] = vsad16_mmx2;
1406 }
1407
1408 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1409 }
1410
1411 if(mm_flags & FF_MM_SSE2){
1412 c->get_pixels = get_pixels_sse2;
1413 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
1414 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
1415 c->hadamard8_diff[1]= hadamard8_diff_sse2;
1416 if (ENABLE_FLAC_ENCODER)
1417 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
1418 }
1419
1420 #ifdef HAVE_SSSE3
1421 if(mm_flags & FF_MM_SSSE3){
1422 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1423 c->try_8x8basis= try_8x8basis_ssse3;
1424 }
1425 c->add_8x8basis= add_8x8basis_ssse3;
1426 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
1427 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
1428 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
1429 }
1430 #endif
1431
1432 if(mm_flags & FF_MM_3DNOW){
1433 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1434 c->try_8x8basis= try_8x8basis_3dnow;
1435 }
1436 c->add_8x8basis= add_8x8basis_3dnow;
1437 }
1438 }
1439
1440 dsputil_init_pix_mmx(c, avctx);
1441 }