Mercurial > libavcodec.hg
annotate x86/motion_est_mmx.c @ 9300:6a1aacfa3043 libavcodec
Slightly simplify part of ipvideo_decode_block_opcode_0x8
author | reimar |
---|---|
date | Tue, 31 Mar 2009 13:30:42 +0000 |
parents | 04423b2f6e0b |
children | 7f594601d5e9 |
rev | line source |
---|---|
8430 | 1 /* |
2 * MMX optimized motion estimation | |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8430
diff
changeset
|
3 * Copyright (c) 2001 Fabrice Bellard |
8430 | 4 * Copyright (c) 2002-2004 Michael Niedermayer |
5 * | |
6 * mostly by Michael Niedermayer <michaelni@gmx.at> | |
7 * | |
8 * This file is part of FFmpeg. | |
9 * | |
10 * FFmpeg is free software; you can redistribute it and/or | |
11 * modify it under the terms of the GNU Lesser General Public | |
12 * License as published by the Free Software Foundation; either | |
13 * version 2.1 of the License, or (at your option) any later version. | |
14 * | |
15 * FFmpeg is distributed in the hope that it will be useful, | |
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 * Lesser General Public License for more details. | |
19 * | |
20 * You should have received a copy of the GNU Lesser General Public | |
21 * License along with FFmpeg; if not, write to the Free Software | |
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 */ | |
24 | |
25 #include "libavutil/x86_cpu.h" | |
26 #include "libavcodec/dsputil.h" | |
27 | |
28 DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={ | |
29 0x0000000000000000ULL, | |
30 0x0001000100010001ULL, | |
31 0x0002000200020002ULL, | |
32 }; | |
33 | |
34 DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; | |
35 | |
36 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) | |
37 { | |
38 x86_reg len= -(stride*h); | |
39 __asm__ volatile( | |
40 ASMALIGN(4) | |
41 "1: \n\t" | |
42 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
43 "movq (%2, %%"REG_a"), %%mm2 \n\t" | |
44 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
45 "add %3, %%"REG_a" \n\t" | |
46 "psubusb %%mm0, %%mm2 \n\t" | |
47 "psubusb %%mm4, %%mm0 \n\t" | |
48 "movq (%1, %%"REG_a"), %%mm1 \n\t" | |
49 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
50 "movq (%2, %%"REG_a"), %%mm5 \n\t" | |
51 "psubusb %%mm1, %%mm3 \n\t" | |
52 "psubusb %%mm5, %%mm1 \n\t" | |
53 "por %%mm2, %%mm0 \n\t" | |
54 "por %%mm1, %%mm3 \n\t" | |
55 "movq %%mm0, %%mm1 \n\t" | |
56 "movq %%mm3, %%mm2 \n\t" | |
57 "punpcklbw %%mm7, %%mm0 \n\t" | |
58 "punpckhbw %%mm7, %%mm1 \n\t" | |
59 "punpcklbw %%mm7, %%mm3 \n\t" | |
60 "punpckhbw %%mm7, %%mm2 \n\t" | |
61 "paddw %%mm1, %%mm0 \n\t" | |
62 "paddw %%mm3, %%mm2 \n\t" | |
63 "paddw %%mm2, %%mm0 \n\t" | |
64 "paddw %%mm0, %%mm6 \n\t" | |
65 "add %3, %%"REG_a" \n\t" | |
66 " js 1b \n\t" | |
67 : "+a" (len) | |
68 : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) | |
69 ); | |
70 } | |
71 | |
72 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) | |
73 { | |
74 __asm__ volatile( | |
75 ASMALIGN(4) | |
76 "1: \n\t" | |
77 "movq (%1), %%mm0 \n\t" | |
78 "movq (%1, %3), %%mm1 \n\t" | |
79 "psadbw (%2), %%mm0 \n\t" | |
80 "psadbw (%2, %3), %%mm1 \n\t" | |
81 "paddw %%mm0, %%mm6 \n\t" | |
82 "paddw %%mm1, %%mm6 \n\t" | |
83 "lea (%1,%3,2), %1 \n\t" | |
84 "lea (%2,%3,2), %2 \n\t" | |
85 "sub $2, %0 \n\t" | |
86 " jg 1b \n\t" | |
87 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
88 : "r" ((x86_reg)stride) | |
89 ); | |
90 } | |
91 | |
92 static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) | |
93 { | |
94 int ret; | |
95 __asm__ volatile( | |
96 "pxor %%xmm6, %%xmm6 \n\t" | |
97 ASMALIGN(4) | |
98 "1: \n\t" | |
99 "movdqu (%1), %%xmm0 \n\t" | |
100 "movdqu (%1, %3), %%xmm1 \n\t" | |
101 "psadbw (%2), %%xmm0 \n\t" | |
102 "psadbw (%2, %3), %%xmm1 \n\t" | |
103 "paddw %%xmm0, %%xmm6 \n\t" | |
104 "paddw %%xmm1, %%xmm6 \n\t" | |
105 "lea (%1,%3,2), %1 \n\t" | |
106 "lea (%2,%3,2), %2 \n\t" | |
107 "sub $2, %0 \n\t" | |
108 " jg 1b \n\t" | |
109 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
110 : "r" ((x86_reg)stride) | |
111 ); | |
112 __asm__ volatile( | |
113 "movhlps %%xmm6, %%xmm0 \n\t" | |
114 "paddw %%xmm0, %%xmm6 \n\t" | |
115 "movd %%xmm6, %0 \n\t" | |
116 : "=r"(ret) | |
117 ); | |
118 return ret; | |
119 } | |
120 | |
121 static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) | |
122 { | |
123 __asm__ volatile( | |
124 ASMALIGN(4) | |
125 "1: \n\t" | |
126 "movq (%1), %%mm0 \n\t" | |
127 "movq (%1, %3), %%mm1 \n\t" | |
128 "pavgb 1(%1), %%mm0 \n\t" | |
129 "pavgb 1(%1, %3), %%mm1 \n\t" | |
130 "psadbw (%2), %%mm0 \n\t" | |
131 "psadbw (%2, %3), %%mm1 \n\t" | |
132 "paddw %%mm0, %%mm6 \n\t" | |
133 "paddw %%mm1, %%mm6 \n\t" | |
134 "lea (%1,%3,2), %1 \n\t" | |
135 "lea (%2,%3,2), %2 \n\t" | |
136 "sub $2, %0 \n\t" | |
137 " jg 1b \n\t" | |
138 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
139 : "r" ((x86_reg)stride) | |
140 ); | |
141 } | |
142 | |
143 static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) | |
144 { | |
145 __asm__ volatile( | |
146 "movq (%1), %%mm0 \n\t" | |
147 "add %3, %1 \n\t" | |
148 ASMALIGN(4) | |
149 "1: \n\t" | |
150 "movq (%1), %%mm1 \n\t" | |
151 "movq (%1, %3), %%mm2 \n\t" | |
152 "pavgb %%mm1, %%mm0 \n\t" | |
153 "pavgb %%mm2, %%mm1 \n\t" | |
154 "psadbw (%2), %%mm0 \n\t" | |
155 "psadbw (%2, %3), %%mm1 \n\t" | |
156 "paddw %%mm0, %%mm6 \n\t" | |
157 "paddw %%mm1, %%mm6 \n\t" | |
158 "movq %%mm2, %%mm0 \n\t" | |
159 "lea (%1,%3,2), %1 \n\t" | |
160 "lea (%2,%3,2), %2 \n\t" | |
161 "sub $2, %0 \n\t" | |
162 " jg 1b \n\t" | |
163 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
164 : "r" ((x86_reg)stride) | |
165 ); | |
166 } | |
167 | |
168 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) | |
169 { | |
170 __asm__ volatile( | |
171 "movq "MANGLE(bone)", %%mm5 \n\t" | |
172 "movq (%1), %%mm0 \n\t" | |
173 "pavgb 1(%1), %%mm0 \n\t" | |
174 "add %3, %1 \n\t" | |
175 ASMALIGN(4) | |
176 "1: \n\t" | |
177 "movq (%1), %%mm1 \n\t" | |
178 "movq (%1,%3), %%mm2 \n\t" | |
179 "pavgb 1(%1), %%mm1 \n\t" | |
180 "pavgb 1(%1,%3), %%mm2 \n\t" | |
181 "psubusb %%mm5, %%mm1 \n\t" | |
182 "pavgb %%mm1, %%mm0 \n\t" | |
183 "pavgb %%mm2, %%mm1 \n\t" | |
184 "psadbw (%2), %%mm0 \n\t" | |
185 "psadbw (%2,%3), %%mm1 \n\t" | |
186 "paddw %%mm0, %%mm6 \n\t" | |
187 "paddw %%mm1, %%mm6 \n\t" | |
188 "movq %%mm2, %%mm0 \n\t" | |
189 "lea (%1,%3,2), %1 \n\t" | |
190 "lea (%2,%3,2), %2 \n\t" | |
191 "sub $2, %0 \n\t" | |
192 " jg 1b \n\t" | |
193 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
194 : "r" ((x86_reg)stride) | |
195 ); | |
196 } | |
197 | |
198 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) | |
199 { | |
200 x86_reg len= -(stride*h); | |
201 __asm__ volatile( | |
202 ASMALIGN(4) | |
203 "1: \n\t" | |
204 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
205 "movq (%2, %%"REG_a"), %%mm1 \n\t" | |
206 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
207 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
208 "punpcklbw %%mm7, %%mm0 \n\t" | |
209 "punpcklbw %%mm7, %%mm1 \n\t" | |
210 "punpckhbw %%mm7, %%mm2 \n\t" | |
211 "punpckhbw %%mm7, %%mm3 \n\t" | |
212 "paddw %%mm0, %%mm1 \n\t" | |
213 "paddw %%mm2, %%mm3 \n\t" | |
214 "movq (%3, %%"REG_a"), %%mm4 \n\t" | |
215 "movq (%3, %%"REG_a"), %%mm2 \n\t" | |
216 "paddw %%mm5, %%mm1 \n\t" | |
217 "paddw %%mm5, %%mm3 \n\t" | |
218 "psrlw $1, %%mm1 \n\t" | |
219 "psrlw $1, %%mm3 \n\t" | |
220 "packuswb %%mm3, %%mm1 \n\t" | |
221 "psubusb %%mm1, %%mm4 \n\t" | |
222 "psubusb %%mm2, %%mm1 \n\t" | |
223 "por %%mm4, %%mm1 \n\t" | |
224 "movq %%mm1, %%mm0 \n\t" | |
225 "punpcklbw %%mm7, %%mm0 \n\t" | |
226 "punpckhbw %%mm7, %%mm1 \n\t" | |
227 "paddw %%mm1, %%mm0 \n\t" | |
228 "paddw %%mm0, %%mm6 \n\t" | |
229 "add %4, %%"REG_a" \n\t" | |
230 " js 1b \n\t" | |
231 : "+a" (len) | |
232 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) | |
233 ); | |
234 } | |
235 | |
236 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) | |
237 { | |
238 x86_reg len= -(stride*h); | |
239 __asm__ volatile( | |
240 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
241 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
242 "movq %%mm0, %%mm1 \n\t" | |
243 "movq %%mm2, %%mm3 \n\t" | |
244 "punpcklbw %%mm7, %%mm0 \n\t" | |
245 "punpckhbw %%mm7, %%mm1 \n\t" | |
246 "punpcklbw %%mm7, %%mm2 \n\t" | |
247 "punpckhbw %%mm7, %%mm3 \n\t" | |
248 "paddw %%mm2, %%mm0 \n\t" | |
249 "paddw %%mm3, %%mm1 \n\t" | |
250 ASMALIGN(4) | |
251 "1: \n\t" | |
252 "movq (%2, %%"REG_a"), %%mm2 \n\t" | |
253 "movq 1(%2, %%"REG_a"), %%mm4 \n\t" | |
254 "movq %%mm2, %%mm3 \n\t" | |
255 "movq %%mm4, %%mm5 \n\t" | |
256 "punpcklbw %%mm7, %%mm2 \n\t" | |
257 "punpckhbw %%mm7, %%mm3 \n\t" | |
258 "punpcklbw %%mm7, %%mm4 \n\t" | |
259 "punpckhbw %%mm7, %%mm5 \n\t" | |
260 "paddw %%mm4, %%mm2 \n\t" | |
261 "paddw %%mm5, %%mm3 \n\t" | |
262 "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" | |
263 "paddw %%mm2, %%mm0 \n\t" | |
264 "paddw %%mm3, %%mm1 \n\t" | |
265 "paddw %%mm5, %%mm0 \n\t" | |
266 "paddw %%mm5, %%mm1 \n\t" | |
267 "movq (%3, %%"REG_a"), %%mm4 \n\t" | |
268 "movq (%3, %%"REG_a"), %%mm5 \n\t" | |
269 "psrlw $2, %%mm0 \n\t" | |
270 "psrlw $2, %%mm1 \n\t" | |
271 "packuswb %%mm1, %%mm0 \n\t" | |
272 "psubusb %%mm0, %%mm4 \n\t" | |
273 "psubusb %%mm5, %%mm0 \n\t" | |
274 "por %%mm4, %%mm0 \n\t" | |
275 "movq %%mm0, %%mm4 \n\t" | |
276 "punpcklbw %%mm7, %%mm0 \n\t" | |
277 "punpckhbw %%mm7, %%mm4 \n\t" | |
278 "paddw %%mm0, %%mm6 \n\t" | |
279 "paddw %%mm4, %%mm6 \n\t" | |
280 "movq %%mm2, %%mm0 \n\t" | |
281 "movq %%mm3, %%mm1 \n\t" | |
282 "add %4, %%"REG_a" \n\t" | |
283 " js 1b \n\t" | |
284 : "+a" (len) | |
285 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) | |
286 ); | |
287 } | |
288 | |
/*
 * Horizontally add the four 16-bit partial sums held in mm6 and return the
 * total, truncated to 16 bits. mm0 and mm6 are overwritten.
 */
static inline int sum_mmx(void)
{
    int packed;

    __asm__ volatile(
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t" /* bring words 2,3 down to 0,1 */
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t" /* bring word 1 down to 0 */
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (packed)
    );

    /* Only the lowest word holds the complete sum. */
    return packed & 0xFFFF;
}
304 | |
/*
 * Return the low 32 bits of the mm6 accumulator; the mmx2 helpers add
 * psadbw results into it, so no horizontal reduction is done here.
 */
static inline int sum_mmx2(void)
{
    int total;

    __asm__ volatile(
        "movd %%mm6, %0                 \n\t"
        : "=r" (total)
    );

    return total;
}
314 | |
/*
 * Horizontal half-pel SAD for plain MMX: average blk1 with the block that
 * starts one byte to its right, via sad8_2_mmx.
 */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    uint8_t *right = blk1 + 1;

    sad8_2_mmx(blk1, right, blk2, stride, h);
}
/*
 * Vertical half-pel SAD for plain MMX: average blk1 with the block that
 * starts one row (stride bytes) below it, via sad8_2_mmx.
 */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    uint8_t *below = blk1 + stride;

    sad8_2_mmx(blk1, below, blk2, stride, h);
}
323 | |
324 | |
325 #define PIX_SAD(suf)\ | |
326 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
327 {\ | |
328 assert(h==8);\ | |
329 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ | |
330 "pxor %%mm6, %%mm6 \n\t":);\ | |
331 \ | |
332 sad8_1_ ## suf(blk1, blk2, stride, 8);\ | |
333 \ | |
334 return sum_ ## suf();\ | |
335 }\ | |
336 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
337 {\ | |
338 assert(h==8);\ | |
339 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ | |
340 "pxor %%mm6, %%mm6 \n\t"\ | |
341 "movq %0, %%mm5 \n\t"\ | |
342 :: "m"(round_tab[1]) \ | |
343 );\ | |
344 \ | |
345 sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ | |
346 \ | |
347 return sum_ ## suf();\ | |
348 }\ | |
349 \ | |
350 static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
351 {\ | |
352 assert(h==8);\ | |
353 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ | |
354 "pxor %%mm6, %%mm6 \n\t"\ | |
355 "movq %0, %%mm5 \n\t"\ | |
356 :: "m"(round_tab[1]) \ | |
357 );\ | |
358 \ | |
359 sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ | |
360 \ | |
361 return sum_ ## suf();\ | |
362 }\ | |
363 \ | |
364 static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
365 {\ | |
366 assert(h==8);\ | |
367 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ | |
368 "pxor %%mm6, %%mm6 \n\t"\ | |
369 ::);\ | |
370 \ | |
371 sad8_4_ ## suf(blk1, blk2, stride, 8);\ | |
372 \ | |
373 return sum_ ## suf();\ | |
374 }\ | |
375 \ | |
376 static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
377 {\ | |
378 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ | |
379 "pxor %%mm6, %%mm6 \n\t":);\ | |
380 \ | |
381 sad8_1_ ## suf(blk1 , blk2 , stride, h);\ | |
382 sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ | |
383 \ | |
384 return sum_ ## suf();\ | |
385 }\ | |
386 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
387 {\ | |
388 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ | |
389 "pxor %%mm6, %%mm6 \n\t"\ | |
390 "movq %0, %%mm5 \n\t"\ | |
391 :: "m"(round_tab[1]) \ | |
392 );\ | |
393 \ | |
394 sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ | |
395 sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ | |
396 \ | |
397 return sum_ ## suf();\ | |
398 }\ | |
399 static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
400 {\ | |
401 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ | |
402 "pxor %%mm6, %%mm6 \n\t"\ | |
403 "movq %0, %%mm5 \n\t"\ | |
404 :: "m"(round_tab[1]) \ | |
405 );\ | |
406 \ | |
407 sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ | |
408 sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ | |
409 \ | |
410 return sum_ ## suf();\ | |
411 }\ | |
412 static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
413 {\ | |
414 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ | |
415 "pxor %%mm6, %%mm6 \n\t"\ | |
416 ::);\ | |
417 \ | |
418 sad8_4_ ## suf(blk1 , blk2 , stride, h);\ | |
419 sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ | |
420 \ | |
421 return sum_ ## suf();\ | |
422 }\ | |
423 | |
424 PIX_SAD(mmx) | |
425 PIX_SAD(mmx2) | |
426 | |
427 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) | |
428 { | |
429 if (mm_flags & FF_MM_MMX) { | |
430 c->pix_abs[0][0] = sad16_mmx; | |
431 c->pix_abs[0][1] = sad16_x2_mmx; | |
432 c->pix_abs[0][2] = sad16_y2_mmx; | |
433 c->pix_abs[0][3] = sad16_xy2_mmx; | |
434 c->pix_abs[1][0] = sad8_mmx; | |
435 c->pix_abs[1][1] = sad8_x2_mmx; | |
436 c->pix_abs[1][2] = sad8_y2_mmx; | |
437 c->pix_abs[1][3] = sad8_xy2_mmx; | |
438 | |
439 c->sad[0]= sad16_mmx; | |
440 c->sad[1]= sad8_mmx; | |
441 } | |
442 if (mm_flags & FF_MM_MMXEXT) { | |
443 c->pix_abs[0][0] = sad16_mmx2; | |
444 c->pix_abs[1][0] = sad8_mmx2; | |
445 | |
446 c->sad[0]= sad16_mmx2; | |
447 c->sad[1]= sad8_mmx2; | |
448 | |
449 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |
450 c->pix_abs[0][1] = sad16_x2_mmx2; | |
451 c->pix_abs[0][2] = sad16_y2_mmx2; | |
452 c->pix_abs[0][3] = sad16_xy2_mmx2; | |
453 c->pix_abs[1][1] = sad8_x2_mmx2; | |
454 c->pix_abs[1][2] = sad8_y2_mmx2; | |
455 c->pix_abs[1][3] = sad8_xy2_mmx2; | |
456 } | |
457 } | |
458 if ((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)) { | |
459 c->sad[0]= sad16_sse2; | |
460 } | |
461 } |