Mercurial > libavcodec.hg
annotate x86/dsputil_mmx_avg_template.c @ 11352:6e0af2cfdcfe libavcodec
Do MC and IDCT in coding (Hilbert) order
This increases the slice size to 64 pixels, due to having to decode an
entire chroma superblock row per slice.
This can be up to 6% slower depending on clip and CPU, but is necessary
for future optimizations that gain significantly more than was lost.
author | conrad |
---|---|
date | Wed, 03 Mar 2010 23:27:40 +0000 |
parents | 36b60aa6bc75 |
children | 11c5a87497d3 |
rev | line source |
---|---|
8430 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8430
diff
changeset
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard |
8430 | 4 * Copyright (c) 2002-2004 Michael Niedermayer |
5 * | |
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |
8 * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
9 * | |
10 * This file is part of FFmpeg. | |
11 * | |
12 * FFmpeg is free software; you can redistribute it and/or | |
13 * modify it under the terms of the GNU Lesser General Public | |
14 * License as published by the Free Software Foundation; either | |
15 * version 2.1 of the License, or (at your option) any later version. | |
16 * | |
17 * FFmpeg is distributed in the hope that it will be useful, | |
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 * Lesser General Public License for more details. | |
21 * | |
22 * You should have received a copy of the GNU Lesser General Public | |
23 * License along with FFmpeg; if not, write to the Free Software | |
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25 */ | |
26 | |
27 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm | |
28 clobber bug - now it will work with 2.95.2 and also with -fPIC | |
29 */ | |
/*
 * Horizontal "put" for an 8-byte-wide block: every output row is
 * PAVGB(pixels[x], pixels[x + 1]) over 8 bytes, stored to block.
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size and used to step both pointers, so
 * source and destination share the same stride.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 */
30 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
31 { | |
32 __asm__ volatile( | |
33 "lea (%3, %3), %%"REG_a" \n\t" | |
34 "1: \n\t" | |
/* rows 0 and 1 */
35 "movq (%1), %%mm0 \n\t" | |
36 "movq (%1, %3), %%mm1 \n\t" | |
37 PAVGB" 1(%1), %%mm0 \n\t" | |
38 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
39 "movq %%mm0, (%2) \n\t" | |
40 "movq %%mm1, (%2, %3) \n\t" | |
41 "add %%"REG_a", %1 \n\t" | |
42 "add %%"REG_a", %2 \n\t" | |
/* rows 2 and 3 */
43 "movq (%1), %%mm0 \n\t" | |
44 "movq (%1, %3), %%mm1 \n\t" | |
45 PAVGB" 1(%1), %%mm0 \n\t" | |
46 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
47 "add %%"REG_a", %1 \n\t" | |
48 "movq %%mm0, (%2) \n\t" | |
49 "movq %%mm1, (%2, %3) \n\t" | |
50 "add %%"REG_a", %2 \n\t" | |
51 "subl $4, %0 \n\t" | |
52 "jnz 1b \n\t" | |
53 :"+g"(h), "+S"(pixels), "+D"(block) | |
54 :"r" ((x86_reg)line_size) | |
55 :"%"REG_a, "memory"); | |
56 } | |
57 | |
/*
 * 4-byte-wide "put" of the average of two sources:
 * dst row = PAVGB(src1 row, src2 row).
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * src1 advances by src1Stride per row and dst by dstStride; src2 is read as
 * tightly packed 4-byte rows (offsets 0, 4, 8, 12, then the pointer moves
 * on by 16).
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * If h is odd, a single row is emitted first and h is decremented.  The main
 * loop then emits 4 rows per pass and exits only when h reaches exactly
 * zero, so the count entering the loop must be a non-zero multiple of 4.
 * h is a memory operand when HAVE_EBX_AVAILABLE is 0, otherwise it is held
 * in the "b" register.
 */
58 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
59 { | |
60 __asm__ volatile( | |
/* optional leading row when h is odd */
61 "testl $1, %0 \n\t" | |
62 " jz 1f \n\t" | |
63 "movd (%1), %%mm0 \n\t" | |
64 "movd (%2), %%mm1 \n\t" | |
65 "add %4, %1 \n\t" | |
66 "add $4, %2 \n\t" | |
67 PAVGB" %%mm1, %%mm0 \n\t" | |
68 "movd %%mm0, (%3) \n\t" | |
69 "add %5, %3 \n\t" | |
70 "decl %0 \n\t" | |
/* main loop: 4 rows per pass */
71 "1: \n\t" | |
72 "movd (%1), %%mm0 \n\t" | |
73 "add %4, %1 \n\t" | |
74 "movd (%1), %%mm1 \n\t" | |
75 "movd (%2), %%mm2 \n\t" | |
76 "movd 4(%2), %%mm3 \n\t" | |
77 "add %4, %1 \n\t" | |
78 PAVGB" %%mm2, %%mm0 \n\t" | |
79 PAVGB" %%mm3, %%mm1 \n\t" | |
80 "movd %%mm0, (%3) \n\t" | |
81 "add %5, %3 \n\t" | |
82 "movd %%mm1, (%3) \n\t" | |
83 "add %5, %3 \n\t" | |
84 "movd (%1), %%mm0 \n\t" | |
85 "add %4, %1 \n\t" | |
86 "movd (%1), %%mm1 \n\t" | |
87 "movd 8(%2), %%mm2 \n\t" | |
88 "movd 12(%2), %%mm3 \n\t" | |
89 "add %4, %1 \n\t" | |
90 PAVGB" %%mm2, %%mm0 \n\t" | |
91 PAVGB" %%mm3, %%mm1 \n\t" | |
92 "movd %%mm0, (%3) \n\t" | |
93 "add %5, %3 \n\t" | |
94 "movd %%mm1, (%3) \n\t" | |
95 "add %5, %3 \n\t" | |
96 "add $16, %2 \n\t" | |
97 "subl $4, %0 \n\t" | |
98 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
99 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 100 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
101 #else | |
102 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
103 #endif | |
104 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
105 :"memory"); | |
106 } | |
107 | |
108 | |
/*
 * 8-byte-wide "put" of the average of two sources:
 * dst row = PAVGB(src1 row, src2 row).
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * src1 advances by src1Stride per row and dst by dstStride; src2 is read as
 * tightly packed 8-byte rows (offsets 0, 8, 16, 24, then the pointer moves
 * on by 32).
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * If h is odd, a single row is emitted first and h is decremented.  The main
 * loop then emits 4 rows per pass and exits only when h reaches exactly
 * zero, so the count entering the loop must be a non-zero multiple of 4.
 * h is a memory operand when HAVE_EBX_AVAILABLE is 0, otherwise it is held
 * in the "b" register.
 */
109 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
110 { | |
111 __asm__ volatile( | |
/* optional leading row when h is odd */
112 "testl $1, %0 \n\t" | |
113 " jz 1f \n\t" | |
114 "movq (%1), %%mm0 \n\t" | |
115 "movq (%2), %%mm1 \n\t" | |
116 "add %4, %1 \n\t" | |
117 "add $8, %2 \n\t" | |
118 PAVGB" %%mm1, %%mm0 \n\t" | |
119 "movq %%mm0, (%3) \n\t" | |
120 "add %5, %3 \n\t" | |
121 "decl %0 \n\t" | |
/* main loop: 4 rows per pass */
122 "1: \n\t" | |
123 "movq (%1), %%mm0 \n\t" | |
124 "add %4, %1 \n\t" | |
125 "movq (%1), %%mm1 \n\t" | |
126 "add %4, %1 \n\t" | |
127 PAVGB" (%2), %%mm0 \n\t" | |
128 PAVGB" 8(%2), %%mm1 \n\t" | |
129 "movq %%mm0, (%3) \n\t" | |
130 "add %5, %3 \n\t" | |
131 "movq %%mm1, (%3) \n\t" | |
132 "add %5, %3 \n\t" | |
133 "movq (%1), %%mm0 \n\t" | |
134 "add %4, %1 \n\t" | |
135 "movq (%1), %%mm1 \n\t" | |
136 "add %4, %1 \n\t" | |
137 PAVGB" 16(%2), %%mm0 \n\t" | |
138 PAVGB" 24(%2), %%mm1 \n\t" | |
139 "movq %%mm0, (%3) \n\t" | |
140 "add %5, %3 \n\t" | |
141 "movq %%mm1, (%3) \n\t" | |
142 "add %5, %3 \n\t" | |
143 "add $32, %2 \n\t" | |
144 "subl $4, %0 \n\t" | |
145 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
146 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 147 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
148 #else | |
149 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
150 #endif | |
151 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
152 :"memory"); | |
153 //the following should be used, though better not with gcc ... | |
154 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
155 :"r"(src1Stride), "r"(dstStride) | |
156 :"memory");*/ | |
157 } | |
158 | |
/*
 * "No rounding" variant of the 8-byte-wide two-source put.
 * mm6 is set to all ones (pcmpeqb of a register with itself); each input is
 * XORed with mm6 (bitwise complement) before PAVGB and the result is
 * complemented again before the store.
 * PAVGB is a macro defined outside this file.  NOTE(review): assuming it is
 * a round-up packed byte average, the complement trick turns it into a
 * round-down average -- confirm against the including file.
 *
 * src1 advances by src1Stride per row and dst by dstStride; src2 is read as
 * tightly packed 8-byte rows (offsets 0, 8, 16, 24, then +32).
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * If h is odd, a single row is emitted first and h is decremented.  The main
 * loop then emits 4 rows per pass and exits only when h reaches exactly
 * zero, so the count entering the loop must be a non-zero multiple of 4.
 * mm6 is modified but, like the other MMX registers used, is not listed as
 * a clobber.
 */
159 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
160 { | |
161 __asm__ volatile( | |
162 "pcmpeqb %%mm6, %%mm6 \n\t" | |
/* optional leading row when h is odd */
163 "testl $1, %0 \n\t" | |
164 " jz 1f \n\t" | |
165 "movq (%1), %%mm0 \n\t" | |
166 "movq (%2), %%mm1 \n\t" | |
167 "add %4, %1 \n\t" | |
168 "add $8, %2 \n\t" | |
169 "pxor %%mm6, %%mm0 \n\t" | |
170 "pxor %%mm6, %%mm1 \n\t" | |
171 PAVGB" %%mm1, %%mm0 \n\t" | |
172 "pxor %%mm6, %%mm0 \n\t" | |
173 "movq %%mm0, (%3) \n\t" | |
174 "add %5, %3 \n\t" | |
175 "decl %0 \n\t" | |
/* main loop: 4 rows per pass */
176 "1: \n\t" | |
177 "movq (%1), %%mm0 \n\t" | |
178 "add %4, %1 \n\t" | |
179 "movq (%1), %%mm1 \n\t" | |
180 "add %4, %1 \n\t" | |
181 "movq (%2), %%mm2 \n\t" | |
182 "movq 8(%2), %%mm3 \n\t" | |
183 "pxor %%mm6, %%mm0 \n\t" | |
184 "pxor %%mm6, %%mm1 \n\t" | |
185 "pxor %%mm6, %%mm2 \n\t" | |
186 "pxor %%mm6, %%mm3 \n\t" | |
187 PAVGB" %%mm2, %%mm0 \n\t" | |
188 PAVGB" %%mm3, %%mm1 \n\t" | |
189 "pxor %%mm6, %%mm0 \n\t" | |
190 "pxor %%mm6, %%mm1 \n\t" | |
191 "movq %%mm0, (%3) \n\t" | |
192 "add %5, %3 \n\t" | |
193 "movq %%mm1, (%3) \n\t" | |
194 "add %5, %3 \n\t" | |
195 "movq (%1), %%mm0 \n\t" | |
196 "add %4, %1 \n\t" | |
197 "movq (%1), %%mm1 \n\t" | |
198 "add %4, %1 \n\t" | |
199 "movq 16(%2), %%mm2 \n\t" | |
200 "movq 24(%2), %%mm3 \n\t" | |
201 "pxor %%mm6, %%mm0 \n\t" | |
202 "pxor %%mm6, %%mm1 \n\t" | |
203 "pxor %%mm6, %%mm2 \n\t" | |
204 "pxor %%mm6, %%mm3 \n\t" | |
205 PAVGB" %%mm2, %%mm0 \n\t" | |
206 PAVGB" %%mm3, %%mm1 \n\t" | |
207 "pxor %%mm6, %%mm0 \n\t" | |
208 "pxor %%mm6, %%mm1 \n\t" | |
209 "movq %%mm0, (%3) \n\t" | |
210 "add %5, %3 \n\t" | |
211 "movq %%mm1, (%3) \n\t" | |
212 "add %5, %3 \n\t" | |
213 "add $32, %2 \n\t" | |
214 "subl $4, %0 \n\t" | |
215 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
216 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 217 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
218 #else | |
219 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
220 #endif | |
221 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
222 :"memory"); | |
223 //the following should be used, though better not with gcc ... | |
224 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
225 :"r"(src1Stride), "r"(dstStride) | |
226 :"memory");*/ | |
227 } | |
228 | |
/*
 * 4-byte-wide "avg" of two sources into dst:
 * dst row = PAVGB(PAVGB(src1 row, src2 row), dst row).
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * src1 advances by src1Stride per row and dst by dstStride; src2 is read as
 * tightly packed 4-byte rows (offsets 0, 4, 8, 12, then +16).
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * If h is odd, a single row is emitted first and h is decremented.  The main
 * loop then emits 4 rows per pass and exits only when h reaches exactly
 * zero, so the count entering the loop must be a non-zero multiple of 4.
 * NOTE(review): the PAVGB forms with a memory operand access a full MMX
 * operand's worth of memory at src2/dst even though only 4 bytes are stored
 * by movd -- confirm callers' buffers tolerate that.
 */
229 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
230 { | |
231 __asm__ volatile( | |
/* optional leading row when h is odd */
232 "testl $1, %0 \n\t" | |
233 " jz 1f \n\t" | |
234 "movd (%1), %%mm0 \n\t" | |
235 "movd (%2), %%mm1 \n\t" | |
236 "add %4, %1 \n\t" | |
237 "add $4, %2 \n\t" | |
238 PAVGB" %%mm1, %%mm0 \n\t" | |
239 PAVGB" (%3), %%mm0 \n\t" | |
240 "movd %%mm0, (%3) \n\t" | |
241 "add %5, %3 \n\t" | |
242 "decl %0 \n\t" | |
/* main loop: 4 rows per pass */
243 "1: \n\t" | |
244 "movd (%1), %%mm0 \n\t" | |
245 "add %4, %1 \n\t" | |
246 "movd (%1), %%mm1 \n\t" | |
247 "add %4, %1 \n\t" | |
248 PAVGB" (%2), %%mm0 \n\t" | |
249 PAVGB" 4(%2), %%mm1 \n\t" | |
250 PAVGB" (%3), %%mm0 \n\t" | |
251 "movd %%mm0, (%3) \n\t" | |
252 "add %5, %3 \n\t" | |
253 PAVGB" (%3), %%mm1 \n\t" | |
254 "movd %%mm1, (%3) \n\t" | |
255 "add %5, %3 \n\t" | |
256 "movd (%1), %%mm0 \n\t" | |
257 "add %4, %1 \n\t" | |
258 "movd (%1), %%mm1 \n\t" | |
259 "add %4, %1 \n\t" | |
260 PAVGB" 8(%2), %%mm0 \n\t" | |
261 PAVGB" 12(%2), %%mm1 \n\t" | |
262 PAVGB" (%3), %%mm0 \n\t" | |
263 "movd %%mm0, (%3) \n\t" | |
264 "add %5, %3 \n\t" | |
265 PAVGB" (%3), %%mm1 \n\t" | |
266 "movd %%mm1, (%3) \n\t" | |
267 "add %5, %3 \n\t" | |
268 "add $16, %2 \n\t" | |
269 "subl $4, %0 \n\t" | |
270 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
271 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 272 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
273 #else | |
274 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
275 #endif | |
276 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
277 :"memory"); | |
278 } | |
279 | |
280 | |
/*
 * 8-byte-wide "avg" of two sources into dst:
 * dst row = PAVGB(PAVGB(src1 row, src2 row), dst row).
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * src1 advances by src1Stride per row and dst by dstStride; src2 is read as
 * tightly packed 8-byte rows (offsets 0, 8, 16, 24, then +32).
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * If h is odd, a single row is emitted first and h is decremented.  The main
 * loop then emits 4 rows per pass and exits only when h reaches exactly
 * zero, so the count entering the loop must be a non-zero multiple of 4.
 */
281 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
282 { | |
283 __asm__ volatile( | |
/* optional leading row when h is odd */
284 "testl $1, %0 \n\t" | |
285 " jz 1f \n\t" | |
286 "movq (%1), %%mm0 \n\t" | |
287 "movq (%2), %%mm1 \n\t" | |
288 "add %4, %1 \n\t" | |
289 "add $8, %2 \n\t" | |
290 PAVGB" %%mm1, %%mm0 \n\t" | |
291 PAVGB" (%3), %%mm0 \n\t" | |
292 "movq %%mm0, (%3) \n\t" | |
293 "add %5, %3 \n\t" | |
294 "decl %0 \n\t" | |
/* main loop: 4 rows per pass */
295 "1: \n\t" | |
296 "movq (%1), %%mm0 \n\t" | |
297 "add %4, %1 \n\t" | |
298 "movq (%1), %%mm1 \n\t" | |
299 "add %4, %1 \n\t" | |
300 PAVGB" (%2), %%mm0 \n\t" | |
301 PAVGB" 8(%2), %%mm1 \n\t" | |
302 PAVGB" (%3), %%mm0 \n\t" | |
303 "movq %%mm0, (%3) \n\t" | |
304 "add %5, %3 \n\t" | |
305 PAVGB" (%3), %%mm1 \n\t" | |
306 "movq %%mm1, (%3) \n\t" | |
307 "add %5, %3 \n\t" | |
308 "movq (%1), %%mm0 \n\t" | |
309 "add %4, %1 \n\t" | |
310 "movq (%1), %%mm1 \n\t" | |
311 "add %4, %1 \n\t" | |
312 PAVGB" 16(%2), %%mm0 \n\t" | |
313 PAVGB" 24(%2), %%mm1 \n\t" | |
314 PAVGB" (%3), %%mm0 \n\t" | |
315 "movq %%mm0, (%3) \n\t" | |
316 "add %5, %3 \n\t" | |
317 PAVGB" (%3), %%mm1 \n\t" | |
318 "movq %%mm1, (%3) \n\t" | |
319 "add %5, %3 \n\t" | |
320 "add $32, %2 \n\t" | |
321 "subl $4, %0 \n\t" | |
322 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
323 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 324 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
325 #else | |
326 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
327 #endif | |
328 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
329 :"memory"); | |
330 //the following should be used, though better not with gcc ... | |
331 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
332 :"r"(src1Stride), "r"(dstStride) | |
333 :"memory");*/ | |
334 } | |
335 | |
/*
 * Horizontal "put" for a 16-byte-wide block: every output row is
 * PAVGB(pixels[x], pixels[x + 1]) over 16 bytes (two 8-byte halves),
 * stored to block.
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size and used to step both pointers, so
 * source and destination share the same stride.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 */
336 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
337 { | |
338 __asm__ volatile( | |
339 "lea (%3, %3), %%"REG_a" \n\t" | |
340 "1: \n\t" | |
/* rows 0 and 1: mm0/mm1 = left halves, mm2/mm3 = right halves */
341 "movq (%1), %%mm0 \n\t" | |
342 "movq (%1, %3), %%mm1 \n\t" | |
343 "movq 8(%1), %%mm2 \n\t" | |
344 "movq 8(%1, %3), %%mm3 \n\t" | |
345 PAVGB" 1(%1), %%mm0 \n\t" | |
346 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
347 PAVGB" 9(%1), %%mm2 \n\t" | |
348 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
349 "movq %%mm0, (%2) \n\t" | |
350 "movq %%mm1, (%2, %3) \n\t" | |
351 "movq %%mm2, 8(%2) \n\t" | |
352 "movq %%mm3, 8(%2, %3) \n\t" | |
353 "add %%"REG_a", %1 \n\t" | |
354 "add %%"REG_a", %2 \n\t" | |
/* rows 2 and 3 */
355 "movq (%1), %%mm0 \n\t" | |
356 "movq (%1, %3), %%mm1 \n\t" | |
357 "movq 8(%1), %%mm2 \n\t" | |
358 "movq 8(%1, %3), %%mm3 \n\t" | |
359 PAVGB" 1(%1), %%mm0 \n\t" | |
360 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
361 PAVGB" 9(%1), %%mm2 \n\t" | |
362 PAVGB" 9(%1, %3), %%mm3 \n\t" | |
363 "add %%"REG_a", %1 \n\t" | |
364 "movq %%mm0, (%2) \n\t" | |
365 "movq %%mm1, (%2, %3) \n\t" | |
366 "movq %%mm2, 8(%2) \n\t" | |
367 "movq %%mm3, 8(%2, %3) \n\t" | |
368 "add %%"REG_a", %2 \n\t" | |
369 "subl $4, %0 \n\t" | |
370 "jnz 1b \n\t" | |
371 :"+g"(h), "+S"(pixels), "+D"(block) | |
372 :"r" ((x86_reg)line_size) | |
373 :"%"REG_a, "memory"); | |
374 } | |
375 | |
/*
 * 16-byte-wide "put" of the average of two sources:
 * dst row = PAVGB(src1 row, src2 row), done as two 8-byte halves.
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * src1 advances by src1Stride per row and dst by dstStride; src2 is read as
 * tightly packed 16-byte rows (offsets 0/8 and 16/24, then +32).
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * If h is odd, a single row is emitted first and h is decremented.  The main
 * loop then emits 2 rows per pass and exits only when h reaches exactly
 * zero, so the count entering the loop must be a non-zero multiple of 2.
 */
376 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
377 { | |
378 __asm__ volatile( | |
/* optional leading row when h is odd */
379 "testl $1, %0 \n\t" | |
380 " jz 1f \n\t" | |
381 "movq (%1), %%mm0 \n\t" | |
382 "movq 8(%1), %%mm1 \n\t" | |
383 PAVGB" (%2), %%mm0 \n\t" | |
384 PAVGB" 8(%2), %%mm1 \n\t" | |
385 "add %4, %1 \n\t" | |
386 "add $16, %2 \n\t" | |
387 "movq %%mm0, (%3) \n\t" | |
388 "movq %%mm1, 8(%3) \n\t" | |
389 "add %5, %3 \n\t" | |
390 "decl %0 \n\t" | |
/* main loop: 2 rows per pass */
391 "1: \n\t" | |
392 "movq (%1), %%mm0 \n\t" | |
393 "movq 8(%1), %%mm1 \n\t" | |
394 "add %4, %1 \n\t" | |
395 PAVGB" (%2), %%mm0 \n\t" | |
396 PAVGB" 8(%2), %%mm1 \n\t" | |
397 "movq %%mm0, (%3) \n\t" | |
398 "movq %%mm1, 8(%3) \n\t" | |
399 "add %5, %3 \n\t" | |
400 "movq (%1), %%mm0 \n\t" | |
401 "movq 8(%1), %%mm1 \n\t" | |
402 "add %4, %1 \n\t" | |
403 PAVGB" 16(%2), %%mm0 \n\t" | |
404 PAVGB" 24(%2), %%mm1 \n\t" | |
405 "movq %%mm0, (%3) \n\t" | |
406 "movq %%mm1, 8(%3) \n\t" | |
407 "add %5, %3 \n\t" | |
408 "add $32, %2 \n\t" | |
409 "subl $2, %0 \n\t" | |
410 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
411 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 412 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
413 #else | |
414 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
415 #endif | |
416 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
417 :"memory"); | |
418 //the following should be used, though better not with gcc ... | |
419 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
420 :"r"(src1Stride), "r"(dstStride) | |
421 :"memory");*/ | |
422 } | |
423 | |
/*
 * 16-byte-wide "avg" of two sources into dst:
 * dst row = PAVGB(PAVGB(src1 row, src2 row), dst row), done as two 8-byte
 * halves.
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * src1 advances by src1Stride per row and dst by dstStride; src2 is read as
 * tightly packed 16-byte rows (offsets 0/8 and 16/24, then +32).
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * If h is odd, a single row is emitted first and h is decremented.  The main
 * loop then emits 2 rows per pass and exits only when h reaches exactly
 * zero, so the count entering the loop must be a non-zero multiple of 2.
 */
424 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
425 { | |
426 __asm__ volatile( | |
/* optional leading row when h is odd */
427 "testl $1, %0 \n\t" | |
428 " jz 1f \n\t" | |
429 "movq (%1), %%mm0 \n\t" | |
430 "movq 8(%1), %%mm1 \n\t" | |
431 PAVGB" (%2), %%mm0 \n\t" | |
432 PAVGB" 8(%2), %%mm1 \n\t" | |
433 "add %4, %1 \n\t" | |
434 "add $16, %2 \n\t" | |
435 PAVGB" (%3), %%mm0 \n\t" | |
436 PAVGB" 8(%3), %%mm1 \n\t" | |
437 "movq %%mm0, (%3) \n\t" | |
438 "movq %%mm1, 8(%3) \n\t" | |
439 "add %5, %3 \n\t" | |
440 "decl %0 \n\t" | |
/* main loop: 2 rows per pass */
441 "1: \n\t" | |
442 "movq (%1), %%mm0 \n\t" | |
443 "movq 8(%1), %%mm1 \n\t" | |
444 "add %4, %1 \n\t" | |
445 PAVGB" (%2), %%mm0 \n\t" | |
446 PAVGB" 8(%2), %%mm1 \n\t" | |
447 PAVGB" (%3), %%mm0 \n\t" | |
448 PAVGB" 8(%3), %%mm1 \n\t" | |
449 "movq %%mm0, (%3) \n\t" | |
450 "movq %%mm1, 8(%3) \n\t" | |
451 "add %5, %3 \n\t" | |
452 "movq (%1), %%mm0 \n\t" | |
453 "movq 8(%1), %%mm1 \n\t" | |
454 "add %4, %1 \n\t" | |
455 PAVGB" 16(%2), %%mm0 \n\t" | |
456 PAVGB" 24(%2), %%mm1 \n\t" | |
457 PAVGB" (%3), %%mm0 \n\t" | |
458 PAVGB" 8(%3), %%mm1 \n\t" | |
459 "movq %%mm0, (%3) \n\t" | |
460 "movq %%mm1, 8(%3) \n\t" | |
461 "add %5, %3 \n\t" | |
462 "add $32, %2 \n\t" | |
463 "subl $2, %0 \n\t" | |
464 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
465 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 466 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
467 #else | |
468 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
469 #endif | |
470 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
471 :"memory"); | |
472 //the following should be used, though better not with gcc ... | |
473 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
474 :"r"(src1Stride), "r"(dstStride) | |
475 :"memory");*/ | |
476 } | |
477 | |
/*
 * "No rounding" variant of the 16-byte-wide two-source put.
 * mm6 is set to all ones (pcmpeqb of a register with itself); each input is
 * XORed with mm6 (bitwise complement) before PAVGB and the result is
 * complemented again before the store.
 * PAVGB is a macro defined outside this file.  NOTE(review): assuming it is
 * a round-up packed byte average, the complement trick turns it into a
 * round-down average -- confirm against the including file.
 *
 * src1 advances by src1Stride per row and dst by dstStride; src2 is read as
 * tightly packed 16-byte rows (offsets 0/8 and 16/24, then +32).
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * If h is odd, a single row is emitted first and h is decremented.  The main
 * loop then emits 2 rows per pass and exits only when h reaches exactly
 * zero, so the count entering the loop must be a non-zero multiple of 2.
 */
478 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
479 { | |
480 __asm__ volatile( | |
481 "pcmpeqb %%mm6, %%mm6 \n\t" | |
/* optional leading row when h is odd */
482 "testl $1, %0 \n\t" | |
483 " jz 1f \n\t" | |
484 "movq (%1), %%mm0 \n\t" | |
485 "movq 8(%1), %%mm1 \n\t" | |
486 "movq (%2), %%mm2 \n\t" | |
487 "movq 8(%2), %%mm3 \n\t" | |
488 "pxor %%mm6, %%mm0 \n\t" | |
489 "pxor %%mm6, %%mm1 \n\t" | |
490 "pxor %%mm6, %%mm2 \n\t" | |
491 "pxor %%mm6, %%mm3 \n\t" | |
492 PAVGB" %%mm2, %%mm0 \n\t" | |
493 PAVGB" %%mm3, %%mm1 \n\t" | |
494 "pxor %%mm6, %%mm0 \n\t" | |
495 "pxor %%mm6, %%mm1 \n\t" | |
496 "add %4, %1 \n\t" | |
497 "add $16, %2 \n\t" | |
498 "movq %%mm0, (%3) \n\t" | |
499 "movq %%mm1, 8(%3) \n\t" | |
500 "add %5, %3 \n\t" | |
501 "decl %0 \n\t" | |
/* main loop: 2 rows per pass */
502 "1: \n\t" | |
503 "movq (%1), %%mm0 \n\t" | |
504 "movq 8(%1), %%mm1 \n\t" | |
505 "add %4, %1 \n\t" | |
506 "movq (%2), %%mm2 \n\t" | |
507 "movq 8(%2), %%mm3 \n\t" | |
508 "pxor %%mm6, %%mm0 \n\t" | |
509 "pxor %%mm6, %%mm1 \n\t" | |
510 "pxor %%mm6, %%mm2 \n\t" | |
511 "pxor %%mm6, %%mm3 \n\t" | |
512 PAVGB" %%mm2, %%mm0 \n\t" | |
513 PAVGB" %%mm3, %%mm1 \n\t" | |
514 "pxor %%mm6, %%mm0 \n\t" | |
515 "pxor %%mm6, %%mm1 \n\t" | |
516 "movq %%mm0, (%3) \n\t" | |
517 "movq %%mm1, 8(%3) \n\t" | |
518 "add %5, %3 \n\t" | |
519 "movq (%1), %%mm0 \n\t" | |
520 "movq 8(%1), %%mm1 \n\t" | |
521 "add %4, %1 \n\t" | |
522 "movq 16(%2), %%mm2 \n\t" | |
523 "movq 24(%2), %%mm3 \n\t" | |
524 "pxor %%mm6, %%mm0 \n\t" | |
525 "pxor %%mm6, %%mm1 \n\t" | |
526 "pxor %%mm6, %%mm2 \n\t" | |
527 "pxor %%mm6, %%mm3 \n\t" | |
528 PAVGB" %%mm2, %%mm0 \n\t" | |
529 PAVGB" %%mm3, %%mm1 \n\t" | |
530 "pxor %%mm6, %%mm0 \n\t" | |
531 "pxor %%mm6, %%mm1 \n\t" | |
532 "movq %%mm0, (%3) \n\t" | |
533 "movq %%mm1, 8(%3) \n\t" | |
534 "add %5, %3 \n\t" | |
535 "add $32, %2 \n\t" | |
536 "subl $2, %0 \n\t" | |
537 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
8629
diff
changeset
|
538 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 539 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
540 #else | |
541 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
542 #endif | |
543 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
544 :"memory"); | |
545 //the following should be used, though better not with gcc ... | |
546 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) | |
547 :"r"(src1Stride), "r"(dstStride) | |
548 :"memory");*/ | |
549 } | |
550 | |
551 /* GL: this function does incorrect rounding if overflow */ | |
/*
 * Horizontal "no rounding" put for an 8-byte-wide block.
 * For each row the left sample vector pixels[x] has mm6 subtracted with
 * unsigned saturation (psubusb) and is then PAVGB-averaged with
 * pixels[x + 1]; the result is stored to block.
 * MOVQ_BONE and PAVGB are macros defined outside this file.
 * NOTE(review): MOVQ_BONE(mm6) presumably fills mm6 with 0x01 bytes, making
 * the subtraction cancel PAVGB's round-up except where the saturation
 * clamps at zero -- confirm against the macro definitions.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size and used to step both pointers.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 * The asm relies on mm6 still holding the value set by MOVQ_BONE.
 */
552 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
553 { | |
554 MOVQ_BONE(mm6); | |
555 __asm__ volatile( | |
556 "lea (%3, %3), %%"REG_a" \n\t" | |
557 "1: \n\t" | |
/* rows 0 and 1: mm0/mm2 = pixels[x], mm1/mm3 = pixels[x + 1] */
558 "movq (%1), %%mm0 \n\t" | |
559 "movq (%1, %3), %%mm2 \n\t" | |
560 "movq 1(%1), %%mm1 \n\t" | |
561 "movq 1(%1, %3), %%mm3 \n\t" | |
562 "add %%"REG_a", %1 \n\t" | |
563 "psubusb %%mm6, %%mm0 \n\t" | |
564 "psubusb %%mm6, %%mm2 \n\t" | |
565 PAVGB" %%mm1, %%mm0 \n\t" | |
566 PAVGB" %%mm3, %%mm2 \n\t" | |
567 "movq %%mm0, (%2) \n\t" | |
568 "movq %%mm2, (%2, %3) \n\t" | |
/* rows 2 and 3 */
569 "movq (%1), %%mm0 \n\t" | |
570 "movq 1(%1), %%mm1 \n\t" | |
571 "movq (%1, %3), %%mm2 \n\t" | |
572 "movq 1(%1, %3), %%mm3 \n\t" | |
573 "add %%"REG_a", %2 \n\t" | |
574 "add %%"REG_a", %1 \n\t" | |
575 "psubusb %%mm6, %%mm0 \n\t" | |
576 "psubusb %%mm6, %%mm2 \n\t" | |
577 PAVGB" %%mm1, %%mm0 \n\t" | |
578 PAVGB" %%mm3, %%mm2 \n\t" | |
579 "movq %%mm0, (%2) \n\t" | |
580 "movq %%mm2, (%2, %3) \n\t" | |
581 "add %%"REG_a", %2 \n\t" | |
582 "subl $4, %0 \n\t" | |
583 "jnz 1b \n\t" | |
584 :"+g"(h), "+S"(pixels), "+D"(block) | |
585 :"r" ((x86_reg)line_size) | |
586 :"%"REG_a, "memory"); | |
587 } | |
588 | |
/*
 * Vertical "put" for an 8-byte-wide block: output row n is
 * PAVGB(source row n, source row n + 1), stored to block.
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size.  block is biased by -line_size up
 * front so that stores can use the (%2, %3) and (%2, REG_a) addressing
 * forms.  The most recently loaded source row is carried across loop passes
 * in mm0, so h + 1 source rows are read in total.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 */
589 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
590 { | |
591 __asm__ volatile( | |
592 "lea (%3, %3), %%"REG_a" \n\t" | |
593 "movq (%1), %%mm0 \n\t" | |
594 "sub %3, %2 \n\t" | |
595 "1: \n\t" | |
596 "movq (%1, %3), %%mm1 \n\t" | |
597 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
598 "add %%"REG_a", %1 \n\t" | |
599 PAVGB" %%mm1, %%mm0 \n\t" | |
600 PAVGB" %%mm2, %%mm1 \n\t" | |
601 "movq %%mm0, (%2, %3) \n\t" | |
602 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
603 "movq (%1, %3), %%mm1 \n\t" | |
604 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
605 "add %%"REG_a", %2 \n\t" | |
606 "add %%"REG_a", %1 \n\t" | |
607 PAVGB" %%mm1, %%mm2 \n\t" | |
608 PAVGB" %%mm0, %%mm1 \n\t" | |
609 "movq %%mm2, (%2, %3) \n\t" | |
610 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
611 "add %%"REG_a", %2 \n\t" | |
612 "subl $4, %0 \n\t" | |
613 "jnz 1b \n\t" | |
614 :"+g"(h), "+S"(pixels), "+D" (block) | |
615 :"r" ((x86_reg)line_size) | |
616 :"%"REG_a, "memory"); | |
617 } | |
618 | |
619 /* GL: this function does incorrect rounding if overflow */ | |
/*
 * Vertical "no rounding" put for an 8-byte-wide block: output row n is
 * PAVGB(source row n, source row n + 1), where every odd-numbered source row
 * (1, 3, ...) has mm6 subtracted with unsigned saturation (psubusb) before
 * it takes part in its two averages.
 * MOVQ_BONE and PAVGB are macros defined outside this file.
 * NOTE(review): MOVQ_BONE(mm6) presumably fills mm6 with 0x01 bytes so the
 * subtraction offsets PAVGB's round-up -- confirm against the macro
 * definitions.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size.  block is biased by -line_size up
 * front so that stores can use the (%2, %3) and (%2, REG_a) addressing
 * forms.  The most recently loaded source row is carried across loop passes
 * in mm0, so h + 1 source rows are read in total.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 */
620 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
621 { | |
622 MOVQ_BONE(mm6); | |
623 __asm__ volatile( | |
624 "lea (%3, %3), %%"REG_a" \n\t" | |
625 "movq (%1), %%mm0 \n\t" | |
626 "sub %3, %2 \n\t" | |
627 "1: \n\t" | |
628 "movq (%1, %3), %%mm1 \n\t" | |
629 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
630 "add %%"REG_a", %1 \n\t" | |
631 "psubusb %%mm6, %%mm1 \n\t" | |
632 PAVGB" %%mm1, %%mm0 \n\t" | |
633 PAVGB" %%mm2, %%mm1 \n\t" | |
634 "movq %%mm0, (%2, %3) \n\t" | |
635 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
636 "movq (%1, %3), %%mm1 \n\t" | |
637 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
638 "add %%"REG_a", %2 \n\t" | |
639 "add %%"REG_a", %1 \n\t" | |
640 "psubusb %%mm6, %%mm1 \n\t" | |
641 PAVGB" %%mm1, %%mm2 \n\t" | |
642 PAVGB" %%mm0, %%mm1 \n\t" | |
643 "movq %%mm2, (%2, %3) \n\t" | |
644 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
645 "add %%"REG_a", %2 \n\t" | |
646 "subl $4, %0 \n\t" | |
647 "jnz 1b \n\t" | |
648 :"+g"(h), "+S"(pixels), "+D" (block) | |
649 :"r" ((x86_reg)line_size) | |
650 :"%"REG_a, "memory"); | |
651 } | |
652 | |
/*
 * 8-byte-wide "avg": block row = PAVGB(block row, pixels row).
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size and used to step both pointers, so
 * source and destination share the same stride.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 */
653 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
654 { | |
655 __asm__ volatile( | |
656 "lea (%3, %3), %%"REG_a" \n\t" | |
657 "1: \n\t" | |
/* rows 0 and 1: load destination, average with source, store back */
658 "movq (%2), %%mm0 \n\t" | |
659 "movq (%2, %3), %%mm1 \n\t" | |
660 PAVGB" (%1), %%mm0 \n\t" | |
661 PAVGB" (%1, %3), %%mm1 \n\t" | |
662 "movq %%mm0, (%2) \n\t" | |
663 "movq %%mm1, (%2, %3) \n\t" | |
664 "add %%"REG_a", %1 \n\t" | |
665 "add %%"REG_a", %2 \n\t" | |
/* rows 2 and 3 */
666 "movq (%2), %%mm0 \n\t" | |
667 "movq (%2, %3), %%mm1 \n\t" | |
668 PAVGB" (%1), %%mm0 \n\t" | |
669 PAVGB" (%1, %3), %%mm1 \n\t" | |
670 "add %%"REG_a", %1 \n\t" | |
671 "movq %%mm0, (%2) \n\t" | |
672 "movq %%mm1, (%2, %3) \n\t" | |
673 "add %%"REG_a", %2 \n\t" | |
674 "subl $4, %0 \n\t" | |
675 "jnz 1b \n\t" | |
676 :"+g"(h), "+S"(pixels), "+D"(block) | |
677 :"r" ((x86_reg)line_size) | |
678 :"%"REG_a, "memory"); | |
679 } | |
680 | |
/*
 * Horizontal "avg" for an 8-byte-wide block:
 * block row = PAVGB(PAVGB(pixels[x], pixels[x + 1]), block row).
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size and used to step both pointers, so
 * source and destination share the same stride.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 */
681 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
682 { | |
683 __asm__ volatile( | |
684 "lea (%3, %3), %%"REG_a" \n\t" | |
685 "1: \n\t" | |
/* rows 0 and 1 */
686 "movq (%1), %%mm0 \n\t" | |
687 "movq (%1, %3), %%mm2 \n\t" | |
688 PAVGB" 1(%1), %%mm0 \n\t" | |
689 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
690 PAVGB" (%2), %%mm0 \n\t" | |
691 PAVGB" (%2, %3), %%mm2 \n\t" | |
692 "add %%"REG_a", %1 \n\t" | |
693 "movq %%mm0, (%2) \n\t" | |
694 "movq %%mm2, (%2, %3) \n\t" | |
/* rows 2 and 3 */
695 "movq (%1), %%mm0 \n\t" | |
696 "movq (%1, %3), %%mm2 \n\t" | |
697 PAVGB" 1(%1), %%mm0 \n\t" | |
698 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
699 "add %%"REG_a", %2 \n\t" | |
700 "add %%"REG_a", %1 \n\t" | |
701 PAVGB" (%2), %%mm0 \n\t" | |
702 PAVGB" (%2, %3), %%mm2 \n\t" | |
703 "movq %%mm0, (%2) \n\t" | |
704 "movq %%mm2, (%2, %3) \n\t" | |
705 "add %%"REG_a", %2 \n\t" | |
706 "subl $4, %0 \n\t" | |
707 "jnz 1b \n\t" | |
708 :"+g"(h), "+S"(pixels), "+D"(block) | |
709 :"r" ((x86_reg)line_size) | |
710 :"%"REG_a, "memory"); | |
711 } | |
712 | |
/*
 * Vertical "avg" for an 8-byte-wide block:
 * block row n = PAVGB(PAVGB(source row n, source row n + 1), block row n).
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size.  block is biased by -line_size up
 * front so that destination accesses can use the (%2, %3) and (%2, REG_a)
 * addressing forms; the existing destination rows are loaded into mm3/mm4.
 * The most recently loaded source row is carried across loop passes in mm0,
 * so h + 1 source rows are read in total.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 */
713 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
714 { | |
715 __asm__ volatile( | |
716 "lea (%3, %3), %%"REG_a" \n\t" | |
717 "movq (%1), %%mm0 \n\t" | |
718 "sub %3, %2 \n\t" | |
719 "1: \n\t" | |
720 "movq (%1, %3), %%mm1 \n\t" | |
721 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
722 "add %%"REG_a", %1 \n\t" | |
723 PAVGB" %%mm1, %%mm0 \n\t" | |
724 PAVGB" %%mm2, %%mm1 \n\t" | |
725 "movq (%2, %3), %%mm3 \n\t" | |
726 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
727 PAVGB" %%mm3, %%mm0 \n\t" | |
728 PAVGB" %%mm4, %%mm1 \n\t" | |
729 "movq %%mm0, (%2, %3) \n\t" | |
730 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
731 "movq (%1, %3), %%mm1 \n\t" | |
732 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
733 PAVGB" %%mm1, %%mm2 \n\t" | |
734 PAVGB" %%mm0, %%mm1 \n\t" | |
735 "add %%"REG_a", %2 \n\t" | |
736 "add %%"REG_a", %1 \n\t" | |
737 "movq (%2, %3), %%mm3 \n\t" | |
738 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
739 PAVGB" %%mm3, %%mm2 \n\t" | |
740 PAVGB" %%mm4, %%mm1 \n\t" | |
741 "movq %%mm2, (%2, %3) \n\t" | |
742 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
743 "add %%"REG_a", %2 \n\t" | |
744 "subl $4, %0 \n\t" | |
745 "jnz 1b \n\t" | |
746 :"+g"(h), "+S"(pixels), "+D"(block) | |
747 :"r" ((x86_reg)line_size) | |
748 :"%"REG_a, "memory"); | |
749 } | |
750 | |
751 /* Note this is not correctly rounded, but this function is only | |
752 * used for B-frames so it does not matter. */ | |
/*
 * Diagonal (x + y) "avg" for an 8-byte-wide block, built from cascaded
 * PAVGB operations: each source row is first averaged horizontally
 * (pixels[x] with pixels[x + 1]), two consecutive horizontal results are
 * then averaged vertically, and that value is averaged with the existing
 * block row before being stored.  Because the averages are cascaded the
 * result is an approximation of a true four-sample average.
 * In each loop pass one of the four newly loaded source rows (the one at
 * pixels + 2 * line_size on entry) has mm6 subtracted with unsigned
 * saturation before its horizontal average.
 * MOVQ_BONE, ASMALIGN and PAVGB are macros defined outside this file.
 * NOTE(review): MOVQ_BONE(mm6) presumably fills mm6 with 0x01 bytes, used
 * here as a partial rounding correction -- confirm against the macro
 * definitions.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is loaded with 2 * line_size.  The horizontal average of the most
 * recent source row is carried across loop passes in mm0, so h + 1 source
 * rows are read in total.
 * Four rows are produced per loop pass and the loop exits only when h
 * reaches exactly zero, so h must be a positive multiple of 4.
 */
753 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
754 { | |
755 MOVQ_BONE(mm6); | |
756 __asm__ volatile( | |
757 "lea (%3, %3), %%"REG_a" \n\t" | |
758 "movq (%1), %%mm0 \n\t" | |
759 PAVGB" 1(%1), %%mm0 \n\t" | |
760 ASMALIGN(3) | |
761 "1: \n\t" | |
762 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
763 "movq (%1, %3), %%mm1 \n\t" | |
764 "psubusb %%mm6, %%mm2 \n\t" | |
765 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
766 PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" | |
767 "add %%"REG_a", %1 \n\t" | |
768 PAVGB" %%mm1, %%mm0 \n\t" | |
769 PAVGB" %%mm2, %%mm1 \n\t" | |
770 PAVGB" (%2), %%mm0 \n\t" | |
771 PAVGB" (%2, %3), %%mm1 \n\t" | |
772 "movq %%mm0, (%2) \n\t" | |
773 "movq %%mm1, (%2, %3) \n\t" | |
774 "movq (%1, %3), %%mm1 \n\t" | |
775 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
776 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
777 PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" | |
778 "add %%"REG_a", %2 \n\t" | |
779 "add %%"REG_a", %1 \n\t" | |
780 PAVGB" %%mm1, %%mm2 \n\t" | |
781 PAVGB" %%mm0, %%mm1 \n\t" | |
782 PAVGB" (%2), %%mm2 \n\t" | |
783 PAVGB" (%2, %3), %%mm1 \n\t" | |
784 "movq %%mm2, (%2) \n\t" | |
785 "movq %%mm1, (%2, %3) \n\t" | |
786 "add %%"REG_a", %2 \n\t" | |
787 "subl $4, %0 \n\t" | |
788 "jnz 1b \n\t" | |
789 :"+g"(h), "+S"(pixels), "+D"(block) | |
790 :"r" ((x86_reg)line_size) | |
791 :"%"REG_a, "memory"); | |
792 } | |
793 | |
/*
 * 4-byte-wide "avg": block row = PAVGB(block row, pixels row).
 * PAVGB is a macro defined outside this file (presumably a rounding packed
 * unsigned byte average -- confirm against the including file).
 *
 * Unlike the 8-wide routines, the row loop is written in C: each asm
 * statement handles 4 rows (at offsets 0, line_size, 2 * line_size and
 * 3 * line_size), after which block and pixels are advanced by
 * 4 * line_size and h is reduced by 4.  It is a do/while loop, so 4 rows
 * are always processed at least once, and it stops as soon as h <= 0.
 *
 * Asm operands (inputs only): %0 = pixels, %1 = block, %2 = line_size,
 * %3 = 3 * line_size.  The destination is written through %1 and covered
 * by the "memory" clobber.
 * NOTE(review): the PAVGB forms with a memory operand access a full MMX
 * operand's worth of memory at pixels even though only 4 bytes per row are
 * stored -- confirm callers' buffers tolerate that.
 */
794 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
795 { | |
796 do { | |
797 __asm__ volatile( | |
/* load 4 destination rows */
798 "movd (%1), %%mm0 \n\t" | |
799 "movd (%1, %2), %%mm1 \n\t" | |
800 "movd (%1, %2, 2), %%mm2 \n\t" | |
801 "movd (%1, %3), %%mm3 \n\t" | |
/* average each with the matching source row */
802 PAVGB" (%0), %%mm0 \n\t" | |
803 PAVGB" (%0, %2), %%mm1 \n\t" | |
804 PAVGB" (%0, %2, 2), %%mm2 \n\t" | |
805 PAVGB" (%0, %3), %%mm3 \n\t" | |
/* write the 4 rows back */
806 "movd %%mm0, (%1) \n\t" | |
807 "movd %%mm1, (%1, %2) \n\t" | |
808 "movd %%mm2, (%1, %2, 2) \n\t" | |
809 "movd %%mm3, (%1, %3) \n\t" | |
810 ::"S"(pixels), "D"(block), | |
811 "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size) | |
812 :"memory"); | |
813 block += 4*line_size; | |
814 pixels += 4*line_size; | |
815 h -= 4; | |
816 } while(h > 0); | |
817 } | |
818 | |
819 //FIXME the following could be optimized too ... | |
/* 16-byte-wide variant: runs the 8-wide put_no_rnd_pixels8_x2 routine twice,
 * once on the left half and once on the half starting 8 bytes further in
 * both block and pixels, with the same line_size and h. */
820 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
821 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); | |
822 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); | |
823 } | |
/* 16-byte-wide variant: runs the 8-wide put_pixels8_y2 routine twice, once
 * on the left half and once on the half starting 8 bytes further in both
 * block and pixels, with the same line_size and h. */
824 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
825 DEF(put_pixels8_y2)(block , pixels , line_size, h); | |
826 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); | |
827 } | |
/* 16-byte-wide variant: runs the 8-wide put_no_rnd_pixels8_y2 routine twice,
 * once on the left half and once on the half starting 8 bytes further in
 * both block and pixels, with the same line_size and h. */
828 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
829 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); | |
830 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); | |
831 } | |
/* 16-byte-wide variant: runs the 8-wide avg_pixels8 routine twice, once on
 * the left half and once on the half starting 8 bytes further in both block
 * and pixels, with the same line_size and h. */
832 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
833 DEF(avg_pixels8)(block , pixels , line_size, h); | |
834 DEF(avg_pixels8)(block+8, pixels+8, line_size, h); | |
835 } | |
/* 16-byte-wide variant: runs the 8-wide avg_pixels8_x2 routine twice, once
 * on the left half and once on the half starting 8 bytes further in both
 * block and pixels, with the same line_size and h. */
836 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
837 DEF(avg_pixels8_x2)(block , pixels , line_size, h); | |
838 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); | |
839 } | |
/* 16-byte-wide variant: runs the 8-wide avg_pixels8_y2 routine twice, once
 * on the left half and once on the half starting 8 bytes further in both
 * block and pixels, with the same line_size and h. */
840 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
841 DEF(avg_pixels8_y2)(block , pixels , line_size, h); | |
842 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); | |
843 } | |
/* 16-byte-wide variant: runs the 8-wide avg_pixels8_xy2 routine twice, once
 * on the left half and once on the half starting 8 bytes further in both
 * block and pixels, with the same line_size and h. */
844 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
845 DEF(avg_pixels8_xy2)(block , pixels , line_size, h); | |
846 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); | |
847 } | |
848 | |
/*
 * QPEL_2TAP_L3(OPNAME) defines two functions, <OPNAME>2tap_qpel16_l3 and
 * <OPNAME>2tap_qpel8_l3 (16- and 8-byte-wide rows respectively).
 *
 * For each of h rows they compute
 *     PAVGB(PAVGB(src[off1 + x], src[off2 + x]), src[x])
 * and store the result to dst.  PAVGB is a macro defined outside this file
 * (presumably a rounding packed unsigned byte average -- confirm against
 * the including file).
 *
 * STORE_OP(mem, reg) must be defined before the macro is expanded; it is
 * emitted immediately before the store and lets the "avg_" flavour blend
 * the result with the current destination contents.
 *
 * Asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src,
 * %5 = stride.  The destination is addressed as src + (dst - src), so only
 * src is advanced and both buffers step by the same stride.
 * One row is produced per loop pass (decl/jnz), so h must be at least 1.
 */
849 #define QPEL_2TAP_L3(OPNAME) \ | |
850 static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ | |
851 __asm__ volatile(\ | |
852 "1: \n\t"\ | |
853 "movq (%1,%2), %%mm0 \n\t"\ | |
854 "movq 8(%1,%2), %%mm1 \n\t"\ | |
855 PAVGB" (%1,%3), %%mm0 \n\t"\ | |
856 PAVGB" 8(%1,%3), %%mm1 \n\t"\ | |
857 PAVGB" (%1), %%mm0 \n\t"\ | |
858 PAVGB" 8(%1), %%mm1 \n\t"\ | |
859 STORE_OP( (%1,%4),%%mm0)\ | |
860 STORE_OP(8(%1,%4),%%mm1)\ | |
861 "movq %%mm0, (%1,%4) \n\t"\ | |
862 "movq %%mm1, 8(%1,%4) \n\t"\ | |
863 "add %5, %1 \n\t"\ | |
864 "decl %0 \n\t"\ | |
865 "jnz 1b \n\t"\ | |
866 :"+g"(h), "+r"(src)\ | |
867 :"r"((x86_reg)off1), "r"((x86_reg)off2),\ | |
868 "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ | |
869 :"memory"\ | |
870 );\ | |
871 }\ | |
872 static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ | |
873 __asm__ volatile(\ | |
874 "1: \n\t"\ | |
875 "movq (%1,%2), %%mm0 \n\t"\ | |
876 PAVGB" (%1,%3), %%mm0 \n\t"\ | |
877 PAVGB" (%1), %%mm0 \n\t"\ | |
878 STORE_OP((%1,%4),%%mm0)\ | |
879 "movq %%mm0, (%1,%4) \n\t"\ | |
880 "add %5, %1 \n\t"\ | |
881 "decl %0 \n\t"\ | |
882 "jnz 1b \n\t"\ | |
883 :"+g"(h), "+r"(src)\ | |
884 :"r"((x86_reg)off1), "r"((x86_reg)off2),\ | |
885 "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ | |
886 :"memory"\ | |
887 );\ | |
888 } | |
889 | |
/* "avg_" flavour: STORE_OP expands to a PAVGB of the destination memory
 * into the result register before it is stored. */
890 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" | |
891 QPEL_2TAP_L3(avg_) | |
892 #undef STORE_OP | |
/* "put_" flavour: STORE_OP expands to nothing, so the result is stored
 * without reading the destination. */
893 #define STORE_OP(a,b) | |
894 QPEL_2TAP_L3(put_) | |
895 #undef STORE_OP | |
896 #undef QPEL_2TAP_L3 |