Mercurial > libavcodec.hg
annotate x86/dsputil_mmx_avg_template.c @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | 11c5a87497d3 |
children |
rev | line source |
---|---|
8430 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8430
diff
changeset
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard |
8430 | 4 * Copyright (c) 2002-2004 Michael Niedermayer |
5 * | |
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |
8 * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
9 * | |
10 * This file is part of FFmpeg. | |
11 * | |
12 * FFmpeg is free software; you can redistribute it and/or | |
13 * modify it under the terms of the GNU Lesser General Public | |
14 * License as published by the Free Software Foundation; either | |
15 * version 2.1 of the License, or (at your option) any later version. | |
16 * | |
17 * FFmpeg is distributed in the hope that it will be useful, | |
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 * Lesser General Public License for more details. | |
21 * | |
22 * You should have received a copy of the GNU Lesser General Public | |
23 * License along with FFmpeg; if not, write to the Free Software | |
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25 */ | |
26 | |
27 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm | |
28 clobber bug - now it will work with 2.95.2 and also with -fPIC | |
29 */ | |
30 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
31 { | |
32 __asm__ volatile( | |
33 "lea (%3, %3), %%"REG_a" \n\t" | |
34 "1: \n\t" | |
35 "movq (%1), %%mm0 \n\t" | |
36 "movq (%1, %3), %%mm1 \n\t" | |
37 PAVGB" 1(%1), %%mm0 \n\t" | |
38 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
39 "movq %%mm0, (%2) \n\t" | |
40 "movq %%mm1, (%2, %3) \n\t" | |
41 "add %%"REG_a", %1 \n\t" | |
42 "add %%"REG_a", %2 \n\t" | |
43 "movq (%1), %%mm0 \n\t" | |
44 "movq (%1, %3), %%mm1 \n\t" | |
45 PAVGB" 1(%1), %%mm0 \n\t" | |
46 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
47 "add %%"REG_a", %1 \n\t" | |
48 "movq %%mm0, (%2) \n\t" | |
49 "movq %%mm1, (%2, %3) \n\t" | |
50 "add %%"REG_a", %2 \n\t" | |
51 "subl $4, %0 \n\t" | |
52 "jnz 1b \n\t" | |
53 :"+g"(h), "+S"(pixels), "+D"(block) | |
54 :"r" ((x86_reg)line_size) | |
55 :"%"REG_a, "memory"); | |
56 } | |
57 | |
58 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
59 { | |
60 __asm__ volatile( | |
61 "testl $1, %0 \n\t" | |
62 " jz 1f \n\t" | |
63 "movd (%1), %%mm0 \n\t" | |
64 "movd (%2), %%mm1 \n\t" | |
65 "add %4, %1 \n\t" | |
66 "add $4, %2 \n\t" | |
67 PAVGB" %%mm1, %%mm0 \n\t" | |
68 "movd %%mm0, (%3) \n\t" | |
69 "add %5, %3 \n\t" | |
70 "decl %0 \n\t" | |
71 "1: \n\t" | |
72 "movd (%1), %%mm0 \n\t" | |
73 "add %4, %1 \n\t" | |
74 "movd (%1), %%mm1 \n\t" | |
75 "movd (%2), %%mm2 \n\t" | |
76 "movd 4(%2), %%mm3 \n\t" | |
77 "add %4, %1 \n\t" | |
78 PAVGB" %%mm2, %%mm0 \n\t" | |
79 PAVGB" %%mm3, %%mm1 \n\t" | |
80 "movd %%mm0, (%3) \n\t" | |
81 "add %5, %3 \n\t" | |
82 "movd %%mm1, (%3) \n\t" | |
83 "add %5, %3 \n\t" | |
84 "movd (%1), %%mm0 \n\t" | |
85 "add %4, %1 \n\t" | |
86 "movd (%1), %%mm1 \n\t" | |
87 "movd 8(%2), %%mm2 \n\t" | |
88 "movd 12(%2), %%mm3 \n\t" | |
89 "add %4, %1 \n\t" | |
90 PAVGB" %%mm2, %%mm0 \n\t" | |
91 PAVGB" %%mm3, %%mm1 \n\t" | |
92 "movd %%mm0, (%3) \n\t" | |
93 "add %5, %3 \n\t" | |
94 "movd %%mm1, (%3) \n\t" | |
95 "add %5, %3 \n\t" | |
96 "add $16, %2 \n\t" | |
97 "subl $4, %0 \n\t" | |
98 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
99 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 100 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
101 #else | |
102 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
103 #endif | |
104 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
105 :"memory"); | |
106 } | |
107 | |
108 | |
109 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
110 { | |
111 __asm__ volatile( | |
112 "testl $1, %0 \n\t" | |
113 " jz 1f \n\t" | |
114 "movq (%1), %%mm0 \n\t" | |
115 "movq (%2), %%mm1 \n\t" | |
116 "add %4, %1 \n\t" | |
117 "add $8, %2 \n\t" | |
118 PAVGB" %%mm1, %%mm0 \n\t" | |
119 "movq %%mm0, (%3) \n\t" | |
120 "add %5, %3 \n\t" | |
121 "decl %0 \n\t" | |
122 "1: \n\t" | |
123 "movq (%1), %%mm0 \n\t" | |
124 "add %4, %1 \n\t" | |
125 "movq (%1), %%mm1 \n\t" | |
126 "add %4, %1 \n\t" | |
127 PAVGB" (%2), %%mm0 \n\t" | |
128 PAVGB" 8(%2), %%mm1 \n\t" | |
129 "movq %%mm0, (%3) \n\t" | |
130 "add %5, %3 \n\t" | |
131 "movq %%mm1, (%3) \n\t" | |
132 "add %5, %3 \n\t" | |
133 "movq (%1), %%mm0 \n\t" | |
134 "add %4, %1 \n\t" | |
135 "movq (%1), %%mm1 \n\t" | |
136 "add %4, %1 \n\t" | |
137 PAVGB" 16(%2), %%mm0 \n\t" | |
138 PAVGB" 24(%2), %%mm1 \n\t" | |
139 "movq %%mm0, (%3) \n\t" | |
140 "add %5, %3 \n\t" | |
141 "movq %%mm1, (%3) \n\t" | |
142 "add %5, %3 \n\t" | |
143 "add $32, %2 \n\t" | |
144 "subl $4, %0 \n\t" | |
145 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
146 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 147 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
148 #else | |
149 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
150 #endif | |
151 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
152 :"memory"); | |
153 //the following should be used, though better not with gcc ... | |
154 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
155 :"r"(src1Stride), "r"(dstStride) | |
156 :"memory");*/ | |
157 } | |
158 | |
159 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
160 { | |
161 __asm__ volatile( | |
162 "pcmpeqb %%mm6, %%mm6 \n\t" | |
163 "testl $1, %0 \n\t" | |
164 " jz 1f \n\t" | |
165 "movq (%1), %%mm0 \n\t" | |
166 "movq (%2), %%mm1 \n\t" | |
167 "add %4, %1 \n\t" | |
168 "add $8, %2 \n\t" | |
169 "pxor %%mm6, %%mm0 \n\t" | |
170 "pxor %%mm6, %%mm1 \n\t" | |
171 PAVGB" %%mm1, %%mm0 \n\t" | |
172 "pxor %%mm6, %%mm0 \n\t" | |
173 "movq %%mm0, (%3) \n\t" | |
174 "add %5, %3 \n\t" | |
175 "decl %0 \n\t" | |
176 "1: \n\t" | |
177 "movq (%1), %%mm0 \n\t" | |
178 "add %4, %1 \n\t" | |
179 "movq (%1), %%mm1 \n\t" | |
180 "add %4, %1 \n\t" | |
181 "movq (%2), %%mm2 \n\t" | |
182 "movq 8(%2), %%mm3 \n\t" | |
183 "pxor %%mm6, %%mm0 \n\t" | |
184 "pxor %%mm6, %%mm1 \n\t" | |
185 "pxor %%mm6, %%mm2 \n\t" | |
186 "pxor %%mm6, %%mm3 \n\t" | |
187 PAVGB" %%mm2, %%mm0 \n\t" | |
188 PAVGB" %%mm3, %%mm1 \n\t" | |
189 "pxor %%mm6, %%mm0 \n\t" | |
190 "pxor %%mm6, %%mm1 \n\t" | |
191 "movq %%mm0, (%3) \n\t" | |
192 "add %5, %3 \n\t" | |
193 "movq %%mm1, (%3) \n\t" | |
194 "add %5, %3 \n\t" | |
195 "movq (%1), %%mm0 \n\t" | |
196 "add %4, %1 \n\t" | |
197 "movq (%1), %%mm1 \n\t" | |
198 "add %4, %1 \n\t" | |
199 "movq 16(%2), %%mm2 \n\t" | |
200 "movq 24(%2), %%mm3 \n\t" | |
201 "pxor %%mm6, %%mm0 \n\t" | |
202 "pxor %%mm6, %%mm1 \n\t" | |
203 "pxor %%mm6, %%mm2 \n\t" | |
204 "pxor %%mm6, %%mm3 \n\t" | |
205 PAVGB" %%mm2, %%mm0 \n\t" | |
206 PAVGB" %%mm3, %%mm1 \n\t" | |
207 "pxor %%mm6, %%mm0 \n\t" | |
208 "pxor %%mm6, %%mm1 \n\t" | |
209 "movq %%mm0, (%3) \n\t" | |
210 "add %5, %3 \n\t" | |
211 "movq %%mm1, (%3) \n\t" | |
212 "add %5, %3 \n\t" | |
213 "add $32, %2 \n\t" | |
214 "subl $4, %0 \n\t" | |
215 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
216 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 217 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
218 #else | |
219 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
220 #endif | |
221 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
222 :"memory"); | |
223 //the following should be used, though better not with gcc ... | |
224 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
225 :"r"(src1Stride), "r"(dstStride) | |
226 :"memory");*/ | |
227 } | |
228 | |
229 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
230 { | |
231 __asm__ volatile( | |
232 "testl $1, %0 \n\t" | |
233 " jz 1f \n\t" | |
234 "movd (%1), %%mm0 \n\t" | |
235 "movd (%2), %%mm1 \n\t" | |
236 "add %4, %1 \n\t" | |
237 "add $4, %2 \n\t" | |
238 PAVGB" %%mm1, %%mm0 \n\t" | |
239 PAVGB" (%3), %%mm0 \n\t" | |
240 "movd %%mm0, (%3) \n\t" | |
241 "add %5, %3 \n\t" | |
242 "decl %0 \n\t" | |
243 "1: \n\t" | |
244 "movd (%1), %%mm0 \n\t" | |
245 "add %4, %1 \n\t" | |
246 "movd (%1), %%mm1 \n\t" | |
247 "add %4, %1 \n\t" | |
248 PAVGB" (%2), %%mm0 \n\t" | |
249 PAVGB" 4(%2), %%mm1 \n\t" | |
250 PAVGB" (%3), %%mm0 \n\t" | |
251 "movd %%mm0, (%3) \n\t" | |
252 "add %5, %3 \n\t" | |
253 PAVGB" (%3), %%mm1 \n\t" | |
254 "movd %%mm1, (%3) \n\t" | |
255 "add %5, %3 \n\t" | |
256 "movd (%1), %%mm0 \n\t" | |
257 "add %4, %1 \n\t" | |
258 "movd (%1), %%mm1 \n\t" | |
259 "add %4, %1 \n\t" | |
260 PAVGB" 8(%2), %%mm0 \n\t" | |
261 PAVGB" 12(%2), %%mm1 \n\t" | |
262 PAVGB" (%3), %%mm0 \n\t" | |
263 "movd %%mm0, (%3) \n\t" | |
264 "add %5, %3 \n\t" | |
265 PAVGB" (%3), %%mm1 \n\t" | |
266 "movd %%mm1, (%3) \n\t" | |
267 "add %5, %3 \n\t" | |
268 "add $16, %2 \n\t" | |
269 "subl $4, %0 \n\t" | |
270 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
271 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 272 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
273 #else | |
274 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
275 #endif | |
276 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
277 :"memory"); | |
278 } | |
279 | |
280 | |
281 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
282 { | |
283 __asm__ volatile( | |
284 "testl $1, %0 \n\t" | |
285 " jz 1f \n\t" | |
286 "movq (%1), %%mm0 \n\t" | |
287 "movq (%2), %%mm1 \n\t" | |
288 "add %4, %1 \n\t" | |
289 "add $8, %2 \n\t" | |
290 PAVGB" %%mm1, %%mm0 \n\t" | |
291 PAVGB" (%3), %%mm0 \n\t" | |
292 "movq %%mm0, (%3) \n\t" | |
293 "add %5, %3 \n\t" | |
294 "decl %0 \n\t" | |
295 "1: \n\t" | |
296 "movq (%1), %%mm0 \n\t" | |
297 "add %4, %1 \n\t" | |
298 "movq (%1), %%mm1 \n\t" | |
299 "add %4, %1 \n\t" | |
300 PAVGB" (%2), %%mm0 \n\t" | |
301 PAVGB" 8(%2), %%mm1 \n\t" | |
302 PAVGB" (%3), %%mm0 \n\t" | |
303 "movq %%mm0, (%3) \n\t" | |
304 "add %5, %3 \n\t" | |
305 PAVGB" (%3), %%mm1 \n\t" | |
306 "movq %%mm1, (%3) \n\t" | |
307 "add %5, %3 \n\t" | |
308 "movq (%1), %%mm0 \n\t" | |
309 "add %4, %1 \n\t" | |
310 "movq (%1), %%mm1 \n\t" | |
311 "add %4, %1 \n\t" | |
312 PAVGB" 16(%2), %%mm0 \n\t" | |
313 PAVGB" 24(%2), %%mm1 \n\t" | |
314 PAVGB" (%3), %%mm0 \n\t" | |
315 "movq %%mm0, (%3) \n\t" | |
316 "add %5, %3 \n\t" | |
317 PAVGB" (%3), %%mm1 \n\t" | |
318 "movq %%mm1, (%3) \n\t" | |
319 "add %5, %3 \n\t" | |
320 "add $32, %2 \n\t" | |
321 "subl $4, %0 \n\t" | |
322 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
323 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 324 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
325 #else | |
326 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
327 #endif | |
328 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
329 :"memory"); | |
330 //the following should be used, though better not with gcc ... | |
331 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
332 :"r"(src1Stride), "r"(dstStride) | |
333 :"memory");*/ | |
334 } | |
335 | |
336 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
337 { | |
338 __asm__ volatile( | |
339 "lea (%3, %3), %%"REG_a" \n\t" | |
340 "1: \n\t" | |
341 "movq (%1), %%mm0 \n\t" | |
342 "movq (%1, %3), %%mm1 \n\t" | |
343 "movq 8(%1), %%mm2 \n\t" | |
344 "movq 8(%1, %3), %%mm3 \n\t" | |
345 PAVGB" 1(%1), %%mm0 \n\t" | |
346 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
347 PAVGB" 9(%1), %%mm2 \n\t" | |
348 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
349 "movq %%mm0, (%2) \n\t" | |
350 "movq %%mm1, (%2, %3) \n\t" | |
351 "movq %%mm2, 8(%2) \n\t" | |
352 "movq %%mm3, 8(%2, %3) \n\t" | |
353 "add %%"REG_a", %1 \n\t" | |
354 "add %%"REG_a", %2 \n\t" | |
355 "movq (%1), %%mm0 \n\t" | |
356 "movq (%1, %3), %%mm1 \n\t" | |
357 "movq 8(%1), %%mm2 \n\t" | |
358 "movq 8(%1, %3), %%mm3 \n\t" | |
359 PAVGB" 1(%1), %%mm0 \n\t" | |
360 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
361 PAVGB" 9(%1), %%mm2 \n\t" | |
362 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
363 "add %%"REG_a", %1 \n\t" | |
364 "movq %%mm0, (%2) \n\t" | |
365 "movq %%mm1, (%2, %3) \n\t" | |
366 "movq %%mm2, 8(%2) \n\t" | |
367 "movq %%mm3, 8(%2, %3) \n\t" | |
368 "add %%"REG_a", %2 \n\t" | |
369 "subl $4, %0 \n\t" | |
370 "jnz 1b \n\t" | |
371 :"+g"(h), "+S"(pixels), "+D"(block) | |
372 :"r" ((x86_reg)line_size) | |
373 :"%"REG_a, "memory"); | |
374 } | |
375 | |
376 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
377 { | |
378 __asm__ volatile( | |
379 "testl $1, %0 \n\t" | |
380 " jz 1f \n\t" | |
381 "movq (%1), %%mm0 \n\t" | |
382 "movq 8(%1), %%mm1 \n\t" | |
383 PAVGB" (%2), %%mm0 \n\t" | |
384 PAVGB" 8(%2), %%mm1 \n\t" | |
385 "add %4, %1 \n\t" | |
386 "add $16, %2 \n\t" | |
387 "movq %%mm0, (%3) \n\t" | |
388 "movq %%mm1, 8(%3) \n\t" | |
389 "add %5, %3 \n\t" | |
390 "decl %0 \n\t" | |
391 "1: \n\t" | |
392 "movq (%1), %%mm0 \n\t" | |
393 "movq 8(%1), %%mm1 \n\t" | |
394 "add %4, %1 \n\t" | |
395 PAVGB" (%2), %%mm0 \n\t" | |
396 PAVGB" 8(%2), %%mm1 \n\t" | |
397 "movq %%mm0, (%3) \n\t" | |
398 "movq %%mm1, 8(%3) \n\t" | |
399 "add %5, %3 \n\t" | |
400 "movq (%1), %%mm0 \n\t" | |
401 "movq 8(%1), %%mm1 \n\t" | |
402 "add %4, %1 \n\t" | |
403 PAVGB" 16(%2), %%mm0 \n\t" | |
404 PAVGB" 24(%2), %%mm1 \n\t" | |
405 "movq %%mm0, (%3) \n\t" | |
406 "movq %%mm1, 8(%3) \n\t" | |
407 "add %5, %3 \n\t" | |
408 "add $32, %2 \n\t" | |
409 "subl $2, %0 \n\t" | |
410 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
411 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 412 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
413 #else | |
414 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
415 #endif | |
416 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
417 :"memory"); | |
418 //the following should be used, though better not with gcc ... | |
419 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
420 :"r"(src1Stride), "r"(dstStride) | |
421 :"memory");*/ | |
422 } | |
423 | |
424 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
425 { | |
426 __asm__ volatile( | |
427 "testl $1, %0 \n\t" | |
428 " jz 1f \n\t" | |
429 "movq (%1), %%mm0 \n\t" | |
430 "movq 8(%1), %%mm1 \n\t" | |
431 PAVGB" (%2), %%mm0 \n\t" | |
432 PAVGB" 8(%2), %%mm1 \n\t" | |
433 "add %4, %1 \n\t" | |
434 "add $16, %2 \n\t" | |
435 PAVGB" (%3), %%mm0 \n\t" | |
436 PAVGB" 8(%3), %%mm1 \n\t" | |
437 "movq %%mm0, (%3) \n\t" | |
438 "movq %%mm1, 8(%3) \n\t" | |
439 "add %5, %3 \n\t" | |
440 "decl %0 \n\t" | |
441 "1: \n\t" | |
442 "movq (%1), %%mm0 \n\t" | |
443 "movq 8(%1), %%mm1 \n\t" | |
444 "add %4, %1 \n\t" | |
445 PAVGB" (%2), %%mm0 \n\t" | |
446 PAVGB" 8(%2), %%mm1 \n\t" | |
447 PAVGB" (%3), %%mm0 \n\t" | |
448 PAVGB" 8(%3), %%mm1 \n\t" | |
449 "movq %%mm0, (%3) \n\t" | |
450 "movq %%mm1, 8(%3) \n\t" | |
451 "add %5, %3 \n\t" | |
452 "movq (%1), %%mm0 \n\t" | |
453 "movq 8(%1), %%mm1 \n\t" | |
454 "add %4, %1 \n\t" | |
455 PAVGB" 16(%2), %%mm0 \n\t" | |
456 PAVGB" 24(%2), %%mm1 \n\t" | |
457 PAVGB" (%3), %%mm0 \n\t" | |
458 PAVGB" 8(%3), %%mm1 \n\t" | |
459 "movq %%mm0, (%3) \n\t" | |
460 "movq %%mm1, 8(%3) \n\t" | |
461 "add %5, %3 \n\t" | |
462 "add $32, %2 \n\t" | |
463 "subl $2, %0 \n\t" | |
464 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
465 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 466 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
467 #else | |
468 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
469 #endif | |
470 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
471 :"memory"); | |
472 //the following should be used, though better not with gcc ... | |
473 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
474 :"r"(src1Stride), "r"(dstStride) | |
475 :"memory");*/ | |
476 } | |
477 | |
478 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
479 { | |
480 __asm__ volatile( | |
481 "pcmpeqb %%mm6, %%mm6 \n\t" | |
482 "testl $1, %0 \n\t" | |
483 " jz 1f \n\t" | |
484 "movq (%1), %%mm0 \n\t" | |
485 "movq 8(%1), %%mm1 \n\t" | |
486 "movq (%2), %%mm2 \n\t" | |
487 "movq 8(%2), %%mm3 \n\t" | |
488 "pxor %%mm6, %%mm0 \n\t" | |
489 "pxor %%mm6, %%mm1 \n\t" | |
490 "pxor %%mm6, %%mm2 \n\t" | |
491 "pxor %%mm6, %%mm3 \n\t" | |
492 PAVGB" %%mm2, %%mm0 \n\t" | |
493 PAVGB" %%mm3, %%mm1 \n\t" | |
494 "pxor %%mm6, %%mm0 \n\t" | |
495 "pxor %%mm6, %%mm1 \n\t" | |
496 "add %4, %1 \n\t" | |
497 "add $16, %2 \n\t" | |
498 "movq %%mm0, (%3) \n\t" | |
499 "movq %%mm1, 8(%3) \n\t" | |
500 "add %5, %3 \n\t" | |
501 "decl %0 \n\t" | |
502 "1: \n\t" | |
503 "movq (%1), %%mm0 \n\t" | |
504 "movq 8(%1), %%mm1 \n\t" | |
505 "add %4, %1 \n\t" | |
506 "movq (%2), %%mm2 \n\t" | |
507 "movq 8(%2), %%mm3 \n\t" | |
508 "pxor %%mm6, %%mm0 \n\t" | |
509 "pxor %%mm6, %%mm1 \n\t" | |
510 "pxor %%mm6, %%mm2 \n\t" | |
511 "pxor %%mm6, %%mm3 \n\t" | |
512 PAVGB" %%mm2, %%mm0 \n\t" | |
513 PAVGB" %%mm3, %%mm1 \n\t" | |
514 "pxor %%mm6, %%mm0 \n\t" | |
515 "pxor %%mm6, %%mm1 \n\t" | |
516 "movq %%mm0, (%3) \n\t" | |
517 "movq %%mm1, 8(%3) \n\t" | |
518 "add %5, %3 \n\t" | |
519 "movq (%1), %%mm0 \n\t" | |
520 "movq 8(%1), %%mm1 \n\t" | |
521 "add %4, %1 \n\t" | |
522 "movq 16(%2), %%mm2 \n\t" | |
523 "movq 24(%2), %%mm3 \n\t" | |
524 "pxor %%mm6, %%mm0 \n\t" | |
525 "pxor %%mm6, %%mm1 \n\t" | |
526 "pxor %%mm6, %%mm2 \n\t" | |
527 "pxor %%mm6, %%mm3 \n\t" | |
528 PAVGB" %%mm2, %%mm0 \n\t" | |
529 PAVGB" %%mm3, %%mm1 \n\t" | |
530 "pxor %%mm6, %%mm0 \n\t" | |
531 "pxor %%mm6, %%mm1 \n\t" | |
532 "movq %%mm0, (%3) \n\t" | |
533 "movq %%mm1, 8(%3) \n\t" | |
534 "add %5, %3 \n\t" | |
535 "add $32, %2 \n\t" | |
536 "subl $2, %0 \n\t" | |
537 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
538 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 539 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
540 #else | |
541 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
542 #endif | |
543 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
544 :"memory"); | |
545 //the following should be used, though better not with gcc ... | |
546 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
547 :"r"(src1Stride), "r"(dstStride) | |
548 :"memory");*/ | |
549 } | |
550 | |
551 /* GL: this function does incorrect rounding if overflow */ | |
552 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
553 { | |
554 MOVQ_BONE(mm6); | |
555 __asm__ volatile( | |
556 "lea (%3, %3), %%"REG_a" \n\t" | |
557 "1: \n\t" | |
558 "movq (%1), %%mm0 \n\t" | |
559 "movq (%1, %3), %%mm2 \n\t" | |
560 "movq 1(%1), %%mm1 \n\t" | |
561 "movq 1(%1, %3), %%mm3 \n\t" | |
562 "add %%"REG_a", %1 \n\t" | |
563 "psubusb %%mm6, %%mm0 \n\t" | |
564 "psubusb %%mm6, %%mm2 \n\t" | |
565 PAVGB" %%mm1, %%mm0 \n\t" | |
566 PAVGB" %%mm3, %%mm2 \n\t" | |
567 "movq %%mm0, (%2) \n\t" | |
568 "movq %%mm2, (%2, %3) \n\t" | |
569 "movq (%1), %%mm0 \n\t" | |
570 "movq 1(%1), %%mm1 \n\t" | |
571 "movq (%1, %3), %%mm2 \n\t" | |
572 "movq 1(%1, %3), %%mm3 \n\t" | |
573 "add %%"REG_a", %2 \n\t" | |
574 "add %%"REG_a", %1 \n\t" | |
575 "psubusb %%mm6, %%mm0 \n\t" | |
576 "psubusb %%mm6, %%mm2 \n\t" | |
577 PAVGB" %%mm1, %%mm0 \n\t" | |
578 PAVGB" %%mm3, %%mm2 \n\t" | |
579 "movq %%mm0, (%2) \n\t" | |
580 "movq %%mm2, (%2, %3) \n\t" | |
581 "add %%"REG_a", %2 \n\t" | |
582 "subl $4, %0 \n\t" | |
583 "jnz 1b \n\t" | |
584 :"+g"(h), "+S"(pixels), "+D"(block) | |
585 :"r" ((x86_reg)line_size) | |
586 :"%"REG_a, "memory"); | |
587 } | |
588 | |
11826
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
589 static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
590 { |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
591 __asm__ volatile ( |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
592 "pcmpeqb %%mm6, %%mm6 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
593 "1: \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
594 "movq (%1), %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
595 "movq (%1, %3), %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
596 "movq 1(%1), %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
597 "movq 1(%1, %3), %%mm3 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
598 "pxor %%mm6, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
599 "pxor %%mm6, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
600 "pxor %%mm6, %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
601 "pxor %%mm6, %%mm3 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
602 PAVGB" %%mm1, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
603 PAVGB" %%mm3, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
604 "pxor %%mm6, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
605 "pxor %%mm6, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
606 "movq %%mm0, (%2) \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
607 "movq %%mm2, (%2, %3) \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
608 "movq (%1, %3,2), %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
609 "movq 1(%1, %3,2), %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
610 "movq (%1, %4), %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
611 "movq 1(%1, %4), %%mm3 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
612 "pxor %%mm6, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
613 "pxor %%mm6, %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
614 "pxor %%mm6, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
615 "pxor %%mm6, %%mm3 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
616 PAVGB" %%mm1, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
617 PAVGB" %%mm3, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
618 "pxor %%mm6, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
619 "pxor %%mm6, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
620 "movq %%mm0, (%2, %3,2) \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
621 "movq %%mm2, (%2, %4) \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
622 "lea (%1, %3,4), %1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
623 "lea (%2, %3,4), %2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
624 "subl $4, %0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
625 "jg 1b \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
626 : "+g"(h), "+r"(pixels), "+r"(block) |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
627 : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size) |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
628 : "memory" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
629 ); |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
630 } |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
631 |
8430 | 632 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
633 { | |
634 __asm__ volatile( | |
635 "lea (%3, %3), %%"REG_a" \n\t" | |
636 "movq (%1), %%mm0 \n\t" | |
637 "sub %3, %2 \n\t" | |
638 "1: \n\t" | |
639 "movq (%1, %3), %%mm1 \n\t" | |
640 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
641 "add %%"REG_a", %1 \n\t" | |
642 PAVGB" %%mm1, %%mm0 \n\t" | |
643 PAVGB" %%mm2, %%mm1 \n\t" | |
644 "movq %%mm0, (%2, %3) \n\t" | |
645 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
646 "movq (%1, %3), %%mm1 \n\t" | |
647 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
648 "add %%"REG_a", %2 \n\t" | |
649 "add %%"REG_a", %1 \n\t" | |
650 PAVGB" %%mm1, %%mm2 \n\t" | |
651 PAVGB" %%mm0, %%mm1 \n\t" | |
652 "movq %%mm2, (%2, %3) \n\t" | |
653 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
654 "add %%"REG_a", %2 \n\t" | |
655 "subl $4, %0 \n\t" | |
656 "jnz 1b \n\t" | |
657 :"+g"(h), "+S"(pixels), "+D" (block) | |
658 :"r" ((x86_reg)line_size) | |
659 :"%"REG_a, "memory"); | |
660 } | |
661 | |
662 /* GL: this function does incorrect rounding if overflow */ | |
663 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
664 { | |
665 MOVQ_BONE(mm6); | |
666 __asm__ volatile( | |
667 "lea (%3, %3), %%"REG_a" \n\t" | |
668 "movq (%1), %%mm0 \n\t" | |
669 "sub %3, %2 \n\t" | |
670 "1: \n\t" | |
671 "movq (%1, %3), %%mm1 \n\t" | |
672 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
673 "add %%"REG_a", %1 \n\t" | |
674 "psubusb %%mm6, %%mm1 \n\t" | |
675 PAVGB" %%mm1, %%mm0 \n\t" | |
676 PAVGB" %%mm2, %%mm1 \n\t" | |
677 "movq %%mm0, (%2, %3) \n\t" | |
678 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
679 "movq (%1, %3), %%mm1 \n\t" | |
680 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
681 "add %%"REG_a", %2 \n\t" | |
682 "add %%"REG_a", %1 \n\t" | |
683 "psubusb %%mm6, %%mm1 \n\t" | |
684 PAVGB" %%mm1, %%mm2 \n\t" | |
685 PAVGB" %%mm0, %%mm1 \n\t" | |
686 "movq %%mm2, (%2, %3) \n\t" | |
687 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
688 "add %%"REG_a", %2 \n\t" | |
689 "subl $4, %0 \n\t" | |
690 "jnz 1b \n\t" | |
691 :"+g"(h), "+S"(pixels), "+D" (block) | |
692 :"r" ((x86_reg)line_size) | |
693 :"%"REG_a, "memory"); | |
694 } | |
695 | |
11826
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
696 static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
697 { |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
698 __asm__ volatile ( |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
699 "movq (%1), %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
700 "pcmpeqb %%mm6, %%mm6 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
701 "add %3, %1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
702 "pxor %%mm6, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
703 "1: \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
704 "movq (%1), %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
705 "movq (%1, %3), %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
706 "pxor %%mm6, %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
707 "pxor %%mm6, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
708 PAVGB" %%mm1, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
709 PAVGB" %%mm2, %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
710 "pxor %%mm6, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
711 "pxor %%mm6, %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
712 "movq %%mm0, (%2) \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
713 "movq %%mm1, (%2, %3) \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
714 "movq (%1, %3,2), %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
715 "movq (%1, %4), %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
716 "pxor %%mm6, %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
717 "pxor %%mm6, %%mm0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
718 PAVGB" %%mm1, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
719 PAVGB" %%mm0, %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
720 "pxor %%mm6, %%mm2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
721 "pxor %%mm6, %%mm1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
722 "movq %%mm2, (%2, %3,2) \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
723 "movq %%mm1, (%2, %4) \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
724 "lea (%1, %3,4), %1 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
725 "lea (%2, %3,4), %2 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
726 "subl $4, %0 \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
727 "jg 1b \n\t" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
728 :"+g"(h), "+r"(pixels), "+r" (block) |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
729 :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size) |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
730 :"memory" |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
731 ); |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
732 } |
11c5a87497d3
Add bitexact versions of put_no_rnd_pixels8 _x2 and _y2 for vp3/theora
conrad
parents:
10325
diff
changeset
|
733 |
8430 | 734 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
735 { | |
736 __asm__ volatile( | |
737 "lea (%3, %3), %%"REG_a" \n\t" | |
738 "1: \n\t" | |
739 "movq (%2), %%mm0 \n\t" | |
740 "movq (%2, %3), %%mm1 \n\t" | |
741 PAVGB" (%1), %%mm0 \n\t" | |
742 PAVGB" (%1, %3), %%mm1 \n\t" | |
743 "movq %%mm0, (%2) \n\t" | |
744 "movq %%mm1, (%2, %3) \n\t" | |
745 "add %%"REG_a", %1 \n\t" | |
746 "add %%"REG_a", %2 \n\t" | |
747 "movq (%2), %%mm0 \n\t" | |
748 "movq (%2, %3), %%mm1 \n\t" | |
749 PAVGB" (%1), %%mm0 \n\t" | |
750 PAVGB" (%1, %3), %%mm1 \n\t" | |
751 "add %%"REG_a", %1 \n\t" | |
752 "movq %%mm0, (%2) \n\t" | |
753 "movq %%mm1, (%2, %3) \n\t" | |
754 "add %%"REG_a", %2 \n\t" | |
755 "subl $4, %0 \n\t" | |
756 "jnz 1b \n\t" | |
757 :"+g"(h), "+S"(pixels), "+D"(block) | |
758 :"r" ((x86_reg)line_size) | |
759 :"%"REG_a, "memory"); | |
760 } | |
761 | |
762 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
763 { | |
764 __asm__ volatile( | |
765 "lea (%3, %3), %%"REG_a" \n\t" | |
766 "1: \n\t" | |
767 "movq (%1), %%mm0 \n\t" | |
768 "movq (%1, %3), %%mm2 \n\t" | |
769 PAVGB" 1(%1), %%mm0 \n\t" | |
770 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
771 PAVGB" (%2), %%mm0 \n\t" | |
772 PAVGB" (%2, %3), %%mm2 \n\t" | |
773 "add %%"REG_a", %1 \n\t" | |
774 "movq %%mm0, (%2) \n\t" | |
775 "movq %%mm2, (%2, %3) \n\t" | |
776 "movq (%1), %%mm0 \n\t" | |
777 "movq (%1, %3), %%mm2 \n\t" | |
778 PAVGB" 1(%1), %%mm0 \n\t" | |
779 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
780 "add %%"REG_a", %2 \n\t" | |
781 "add %%"REG_a", %1 \n\t" | |
782 PAVGB" (%2), %%mm0 \n\t" | |
783 PAVGB" (%2, %3), %%mm2 \n\t" | |
784 "movq %%mm0, (%2) \n\t" | |
785 "movq %%mm2, (%2, %3) \n\t" | |
786 "add %%"REG_a", %2 \n\t" | |
787 "subl $4, %0 \n\t" | |
788 "jnz 1b \n\t" | |
789 :"+g"(h), "+S"(pixels), "+D"(block) | |
790 :"r" ((x86_reg)line_size) | |
791 :"%"REG_a, "memory"); | |
792 } | |
793 | |
794 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
795 { | |
796 __asm__ volatile( | |
797 "lea (%3, %3), %%"REG_a" \n\t" | |
798 "movq (%1), %%mm0 \n\t" | |
799 "sub %3, %2 \n\t" | |
800 "1: \n\t" | |
801 "movq (%1, %3), %%mm1 \n\t" | |
802 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
803 "add %%"REG_a", %1 \n\t" | |
804 PAVGB" %%mm1, %%mm0 \n\t" | |
805 PAVGB" %%mm2, %%mm1 \n\t" | |
806 "movq (%2, %3), %%mm3 \n\t" | |
807 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
808 PAVGB" %%mm3, %%mm0 \n\t" | |
809 PAVGB" %%mm4, %%mm1 \n\t" | |
810 "movq %%mm0, (%2, %3) \n\t" | |
811 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
812 "movq (%1, %3), %%mm1 \n\t" | |
813 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
814 PAVGB" %%mm1, %%mm2 \n\t" | |
815 PAVGB" %%mm0, %%mm1 \n\t" | |
816 "add %%"REG_a", %2 \n\t" | |
817 "add %%"REG_a", %1 \n\t" | |
818 "movq (%2, %3), %%mm3 \n\t" | |
819 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
820 PAVGB" %%mm3, %%mm2 \n\t" | |
821 PAVGB" %%mm4, %%mm1 \n\t" | |
822 "movq %%mm2, (%2, %3) \n\t" | |
823 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
824 "add %%"REG_a", %2 \n\t" | |
825 "subl $4, %0 \n\t" | |
826 "jnz 1b \n\t" | |
827 :"+g"(h), "+S"(pixels), "+D"(block) | |
828 :"r" ((x86_reg)line_size) | |
829 :"%"REG_a, "memory"); | |
830 } | |
831 | |
832 /* Note this is not correctly rounded, but this function is only | |
833 * used for B-frames so it does not matter. */ | |
834 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
835 { | |
836 MOVQ_BONE(mm6); | |
837 __asm__ volatile( | |
838 "lea (%3, %3), %%"REG_a" \n\t" | |
839 "movq (%1), %%mm0 \n\t" | |
840 PAVGB" 1(%1), %%mm0 \n\t" | |
841 ASMALIGN(3) | |
842 "1: \n\t" | |
843 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
844 "movq (%1, %3), %%mm1 \n\t" | |
845 "psubusb %%mm6, %%mm2 \n\t" | |
846 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
847 PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" | |
848 "add %%"REG_a", %1 \n\t" | |
849 PAVGB" %%mm1, %%mm0 \n\t" | |
850 PAVGB" %%mm2, %%mm1 \n\t" | |
851 PAVGB" (%2), %%mm0 \n\t" | |
852 PAVGB" (%2, %3), %%mm1 \n\t" | |
853 "movq %%mm0, (%2) \n\t" | |
854 "movq %%mm1, (%2, %3) \n\t" | |
855 "movq (%1, %3), %%mm1 \n\t" | |
856 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
857 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
858 PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" | |
859 "add %%"REG_a", %2 \n\t" | |
860 "add %%"REG_a", %1 \n\t" | |
861 PAVGB" %%mm1, %%mm2 \n\t" | |
862 PAVGB" %%mm0, %%mm1 \n\t" | |
863 PAVGB" (%2), %%mm2 \n\t" | |
864 PAVGB" (%2, %3), %%mm1 \n\t" | |
865 "movq %%mm2, (%2) \n\t" | |
866 "movq %%mm1, (%2, %3) \n\t" | |
867 "add %%"REG_a", %2 \n\t" | |
868 "subl $4, %0 \n\t" | |
869 "jnz 1b \n\t" | |
870 :"+g"(h), "+S"(pixels), "+D"(block) | |
871 :"r" ((x86_reg)line_size) | |
872 :"%"REG_a, "memory"); | |
873 } | |
874 | |
875 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
876 { | |
877 do { | |
878 __asm__ volatile( | |
879 "movd (%1), %%mm0 \n\t" | |
880 "movd (%1, %2), %%mm1 \n\t" | |
881 "movd (%1, %2, 2), %%mm2 \n\t" | |
882 "movd (%1, %3), %%mm3 \n\t" | |
883 PAVGB" (%0), %%mm0 \n\t" | |
884 PAVGB" (%0, %2), %%mm1 \n\t" | |
885 PAVGB" (%0, %2, 2), %%mm2 \n\t" | |
886 PAVGB" (%0, %3), %%mm3 \n\t" | |
887 "movd %%mm0, (%1) \n\t" | |
888 "movd %%mm1, (%1, %2) \n\t" | |
889 "movd %%mm2, (%1, %2, 2) \n\t" | |
890 "movd %%mm3, (%1, %3) \n\t" | |
891 ::"S"(pixels), "D"(block), | |
892 "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size) | |
893 :"memory"); | |
894 block += 4*line_size; | |
895 pixels += 4*line_size; | |
896 h -= 4; | |
897 } while(h > 0); | |
898 } | |
899 | |
//FIXME the following could be optimized too ...
901 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
902 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); | |
903 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
904 } | |
905 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
906 DEF(put_pixels8_y2)(block , pixels , line_size, h); | |
907 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
908 } | |
909 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
910 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); | |
911 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
912 } | |
913 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
914 DEF(avg_pixels8)(block , pixels , line_size, h); | |
915 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
916 } | |
917 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
918 DEF(avg_pixels8_x2)(block , pixels , line_size, h); | |
919 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
920 } | |
921 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
922 DEF(avg_pixels8_y2)(block , pixels , line_size, h); | |
923 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
924 } | |
925 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
926 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); | |
927 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
928 } | |
929 | |
/**
 * Generates OPNAME##2tap_qpel16_l3 and OPNAME##2tap_qpel8_l3.
 *
 * For every row the generated functions compute
 *     PAVGB(PAVGB(src[off1], src[off2]), src[0])
 * over 16 (two quadwords) or 8 bytes and store the result at dst.
 * STORE_OP(mem, reg) must be defined before the macro is expanded; it is
 * emitted right before the store and lets the caller blend the result with
 * the current destination contents (or do nothing).
 *
 * Asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src,
 * %5 = stride.  The destination is addressed as (%1,%4), so only src has
 * to be advanced per row.  The loop decrements h once per row and exits
 * when it reaches zero, so h must be at least 1.
 */
#define QPEL_2TAP_L3(OPNAME) \
static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
    __asm__ volatile(\
        "1:                    \n\t"\
        "movq   (%1,%2), %%mm0 \n\t"\
        "movq  8(%1,%2), %%mm1 \n\t"\
        PAVGB"  (%1,%3), %%mm0 \n\t"\
        PAVGB" 8(%1,%3), %%mm1 \n\t"\
        PAVGB"  (%1),    %%mm0 \n\t"\
        PAVGB" 8(%1),    %%mm1 \n\t"\
        STORE_OP( (%1,%4),%%mm0)\
        STORE_OP(8(%1,%4),%%mm1)\
        "movq  %%mm0,  (%1,%4) \n\t"\
        "movq  %%mm1, 8(%1,%4) \n\t"\
        "add   %5, %1          \n\t"\
        "decl  %0              \n\t"\
        "jnz   1b              \n\t"\
        :"+g"(h), "+r"(src)\
        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
        :"memory"\
    );\
}\
static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
    __asm__ volatile(\
        "1:                    \n\t"\
        "movq   (%1,%2), %%mm0 \n\t"\
        PAVGB"  (%1,%3), %%mm0 \n\t"\
        PAVGB"  (%1),    %%mm0 \n\t"\
        STORE_OP((%1,%4),%%mm0)\
        "movq  %%mm0,  (%1,%4) \n\t"\
        "add   %5, %1          \n\t"\
        "decl  %0              \n\t"\
        "jnz   1b              \n\t"\
        :"+g"(h), "+r"(src)\
        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
        :"memory"\
    );\
}
970 | |
971 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" | |
972 QPEL_2TAP_L3(avg_) | |
973 #undef STORE_OP | |
974 #define STORE_OP(a,b) | |
975 QPEL_2TAP_L3(put_) | |
976 #undef STORE_OP | |
977 #undef QPEL_2TAP_L3 |