Mercurial > libavcodec.hg
annotate x86/dsputil_mmx_rnd_template.c @ 8760:31138c296ac6 libavcodec
ff_add_hfyu_median_prediction_mmx2
overall ffvhuff decoding speedup: 28% on core2, 25% on k8.
author | lorenm |
---|---|
date | Sun, 08 Feb 2009 17:45:30 +0000 |
parents | 04423b2f6e0b |
children | 41245484dc0b |
rev | line source |
---|---|
8430 | 1 /* |
2 * DSP utils mmx functions are compiled twice for rnd/no_rnd | |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8430
diff
changeset
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard |
8430 | 4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> |
5 * | |
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |
8 * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
9 * | |
10 * This file is part of FFmpeg. | |
11 * | |
12 * FFmpeg is free software; you can redistribute it and/or | |
13 * modify it under the terms of the GNU Lesser General Public | |
14 * License as published by the Free Software Foundation; either | |
15 * version 2.1 of the License, or (at your option) any later version. | |
16 * | |
17 * FFmpeg is distributed in the hope that it will be useful, | |
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 * Lesser General Public License for more details. | |
21 * | |
22 * You should have received a copy of the GNU Lesser General Public | |
23 * License along with FFmpeg; if not, write to the Free Software | |
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25 */ | |
26 | |
27 // put_pixels | |
/*
 * put pixels8_x2: for every row, combines the 8 bytes at pixels with the
 * 8 bytes at pixels+1 through PAVGBP and writes the 8-byte result to block.
 * MOVQ_BFE and PAVGBP are macros defined outside this file; presumably
 * PAVGBP is a pairwise byte average whose rounding follows the rnd/no_rnd
 * build -- TODO confirm against the macro definitions.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; 9 bytes are read per row (offsets 0..8)
 * line_size - byte stride used for both source and destination
 * h         - row count; each loop pass handles 4 rows and the loop exits
 *             only when h reaches exactly 0, so h should be a positive
 *             multiple of 4
 */
28 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
29 { | |
30 MOVQ_BFE(mm6); | |
31 __asm__ volatile( | |
// Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// REG_a = 2 * line_size, the pointer step for one pair of rows.
32 "lea (%3, %3), %%"REG_a" \n\t" | |
33 ASMALIGN(3) | |
34 "1: \n\t" | |
// Rows 0 and 1: load each row at offset 0 and offset 1.
35 "movq (%1), %%mm0 \n\t" | |
36 "movq 1(%1), %%mm1 \n\t" | |
37 "movq (%1, %3), %%mm2 \n\t" | |
38 "movq 1(%1, %3), %%mm3 \n\t" | |
39 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
40 "movq %%mm4, (%2) \n\t" | |
41 "movq %%mm5, (%2, %3) \n\t" | |
42 "add %%"REG_a", %1 \n\t" | |
43 "add %%"REG_a", %2 \n\t" | |
// Rows 2 and 3: same sequence after stepping both pointers two rows.
44 "movq (%1), %%mm0 \n\t" | |
45 "movq 1(%1), %%mm1 \n\t" | |
46 "movq (%1, %3), %%mm2 \n\t" | |
47 "movq 1(%1, %3), %%mm3 \n\t" | |
48 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
49 "movq %%mm4, (%2) \n\t" | |
50 "movq %%mm5, (%2, %3) \n\t" | |
51 "add %%"REG_a", %1 \n\t" | |
52 "add %%"REG_a", %2 \n\t" | |
// Four rows done; loop until the counter is exactly zero.
53 "subl $4, %0 \n\t" | |
54 "jnz 1b \n\t" | |
55 :"+g"(h), "+S"(pixels), "+D"(block) | |
56 :"r"((x86_reg)line_size) | |
57 :REG_a, "memory"); | |
58 } | |
59 | |
/*
 * put pixels8_l2: for every row, combines 8 bytes from src1 with 8 bytes
 * from src2 (PAVGB / PAVGBP, macros defined outside this file; presumably
 * byte averages -- TODO confirm) and writes the result to dst.
 *
 * dst        - destination; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; read as packed rows, 8 bytes apart
 * dstStride  - byte stride of dst
 * src1Stride - byte stride of src1
 * h          - row count; if h is odd one row is handled up front, then the
 *              loop handles 4 rows per pass and exits only when the counter
 *              reaches exactly 0, so the remaining count should be a
 *              positive multiple of 4
 */
60 static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
61 { | |
62 MOVQ_BFE(mm6); | |
63 __asm__ volatile( | |
// Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
//           %4 = src1Stride, %5 = dstStride.
// Odd h: process a single row first, otherwise jump straight to the loop.
64 "testl $1, %0 \n\t" | |
65 " jz 1f \n\t" | |
66 "movq (%1), %%mm0 \n\t" | |
67 "movq (%2), %%mm1 \n\t" | |
68 "add %4, %1 \n\t" | |
69 "add $8, %2 \n\t" | |
70 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) | |
71 "movq %%mm4, (%3) \n\t" | |
72 "add %5, %3 \n\t" | |
73 "decl %0 \n\t" | |
74 ASMALIGN(3) | |
75 "1: \n\t" | |
// Rows 0 and 1 of this pass: src2 offsets 0 and 8.
76 "movq (%1), %%mm0 \n\t" | |
77 "movq (%2), %%mm1 \n\t" | |
78 "add %4, %1 \n\t" | |
79 "movq (%1), %%mm2 \n\t" | |
80 "movq 8(%2), %%mm3 \n\t" | |
81 "add %4, %1 \n\t" | |
82 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
83 "movq %%mm4, (%3) \n\t" | |
84 "add %5, %3 \n\t" | |
85 "movq %%mm5, (%3) \n\t" | |
86 "add %5, %3 \n\t" | |
// Rows 2 and 3 of this pass: src2 offsets 16 and 24, then src2 += 32.
87 "movq (%1), %%mm0 \n\t" | |
88 "movq 16(%2), %%mm1 \n\t" | |
89 "add %4, %1 \n\t" | |
90 "movq (%1), %%mm2 \n\t" | |
91 "movq 24(%2), %%mm3 \n\t" | |
92 "add %4, %1 \n\t" | |
93 "add $32, %2 \n\t" | |
94 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
95 "movq %%mm4, (%3) \n\t" | |
96 "add %5, %3 \n\t" | |
97 "movq %%mm5, (%3) \n\t" | |
98 "add %5, %3 \n\t" | |
99 "subl $4, %0 \n\t" | |
100 "jnz 1b \n\t" | |
// The row counter is a memory operand under PIC and lives in the b register
// otherwise.
101 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used | |
102 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
103 #else | |
104 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
105 #endif | |
106 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
107 :"memory"); | |
108 } | |
109 | |
/*
 * put pixels16_x2: 16-byte-wide counterpart of the 8-byte x2 routine. For
 * every row it combines the bytes at pixels with the bytes at pixels+1
 * through PAVGBP (macro defined outside this file; presumably a pairwise
 * byte average -- TODO confirm) and writes 16 bytes to block, handled as
 * two 8-byte halves at offsets 0 and 8.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; 17 bytes are read per row (offsets 0..16)
 * line_size - byte stride used for both source and destination
 * h         - row count; each loop pass handles 4 rows and the loop exits
 *             only when h reaches exactly 0, so h should be a positive
 *             multiple of 4
 */
110 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
111 { | |
112 MOVQ_BFE(mm6); | |
113 __asm__ volatile( | |
// Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// REG_a = 2 * line_size, the pointer step for one pair of rows.
114 "lea (%3, %3), %%"REG_a" \n\t" | |
115 ASMALIGN(3) | |
116 "1: \n\t" | |
// Rows 0 and 1, left 8 bytes.
117 "movq (%1), %%mm0 \n\t" | |
118 "movq 1(%1), %%mm1 \n\t" | |
119 "movq (%1, %3), %%mm2 \n\t" | |
120 "movq 1(%1, %3), %%mm3 \n\t" | |
121 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
122 "movq %%mm4, (%2) \n\t" | |
123 "movq %%mm5, (%2, %3) \n\t" | |
// Rows 0 and 1, right 8 bytes.
124 "movq 8(%1), %%mm0 \n\t" | |
125 "movq 9(%1), %%mm1 \n\t" | |
126 "movq 8(%1, %3), %%mm2 \n\t" | |
127 "movq 9(%1, %3), %%mm3 \n\t" | |
128 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
129 "movq %%mm4, 8(%2) \n\t" | |
130 "movq %%mm5, 8(%2, %3) \n\t" | |
131 "add %%"REG_a", %1 \n\t" | |
132 "add %%"REG_a", %2 \n\t" | |
// Rows 2 and 3, left 8 bytes.
133 "movq (%1), %%mm0 \n\t" | |
134 "movq 1(%1), %%mm1 \n\t" | |
135 "movq (%1, %3), %%mm2 \n\t" | |
136 "movq 1(%1, %3), %%mm3 \n\t" | |
137 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
138 "movq %%mm4, (%2) \n\t" | |
139 "movq %%mm5, (%2, %3) \n\t" | |
// Rows 2 and 3, right 8 bytes.
140 "movq 8(%1), %%mm0 \n\t" | |
141 "movq 9(%1), %%mm1 \n\t" | |
142 "movq 8(%1, %3), %%mm2 \n\t" | |
143 "movq 9(%1, %3), %%mm3 \n\t" | |
144 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
145 "movq %%mm4, 8(%2) \n\t" | |
146 "movq %%mm5, 8(%2, %3) \n\t" | |
147 "add %%"REG_a", %1 \n\t" | |
148 "add %%"REG_a", %2 \n\t" | |
// Four rows done; loop until the counter is exactly zero.
149 "subl $4, %0 \n\t" | |
150 "jnz 1b \n\t" | |
151 :"+g"(h), "+S"(pixels), "+D"(block) | |
152 :"r"((x86_reg)line_size) | |
153 :REG_a, "memory"); | |
154 } | |
155 | |
/*
 * put pixels16_l2: for every row, combines 16 bytes from src1 with 16 bytes
 * from src2 through PAVGBP (macro defined outside this file; presumably a
 * pairwise byte average -- TODO confirm) and writes 16 bytes to dst.
 *
 * dst        - destination; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; read as packed rows, 16 bytes apart
 * dstStride  - byte stride of dst
 * src1Stride - byte stride of src1
 * h          - row count; if h is odd one row is handled up front, then the
 *              loop handles 2 rows per pass and exits only when the counter
 *              reaches exactly 0, so the remaining count should be a
 *              positive multiple of 2
 */
156 static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
157 { | |
158 MOVQ_BFE(mm6); | |
159 __asm__ volatile( | |
// Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
//           %4 = src1Stride, %5 = dstStride.
// Odd h: process a single 16-byte row first, otherwise jump to the loop.
160 "testl $1, %0 \n\t" | |
161 " jz 1f \n\t" | |
162 "movq (%1), %%mm0 \n\t" | |
163 "movq (%2), %%mm1 \n\t" | |
164 "movq 8(%1), %%mm2 \n\t" | |
165 "movq 8(%2), %%mm3 \n\t" | |
166 "add %4, %1 \n\t" | |
167 "add $16, %2 \n\t" | |
168 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
169 "movq %%mm4, (%3) \n\t" | |
170 "movq %%mm5, 8(%3) \n\t" | |
171 "add %5, %3 \n\t" | |
172 "decl %0 \n\t" | |
173 ASMALIGN(3) | |
174 "1: \n\t" | |
// First row of the pass: src2 offsets 0 and 8.
175 "movq (%1), %%mm0 \n\t" | |
176 "movq (%2), %%mm1 \n\t" | |
177 "movq 8(%1), %%mm2 \n\t" | |
178 "movq 8(%2), %%mm3 \n\t" | |
179 "add %4, %1 \n\t" | |
180 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
181 "movq %%mm4, (%3) \n\t" | |
182 "movq %%mm5, 8(%3) \n\t" | |
183 "add %5, %3 \n\t" | |
// Second row of the pass: src2 offsets 16 and 24, then src2 += 32.
184 "movq (%1), %%mm0 \n\t" | |
185 "movq 16(%2), %%mm1 \n\t" | |
186 "movq 8(%1), %%mm2 \n\t" | |
187 "movq 24(%2), %%mm3 \n\t" | |
188 "add %4, %1 \n\t" | |
189 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
190 "movq %%mm4, (%3) \n\t" | |
191 "movq %%mm5, 8(%3) \n\t" | |
192 "add %5, %3 \n\t" | |
193 "add $32, %2 \n\t" | |
194 "subl $2, %0 \n\t" | |
195 "jnz 1b \n\t" | |
// The row counter is a memory operand under PIC and lives in the b register
// otherwise.
196 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used | |
197 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
198 #else | |
199 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
200 #endif | |
201 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
202 :"memory"); | |
203 } | |
204 | |
/*
 * put pixels8_y2: for every output row, combines 8 bytes of a source row
 * with the 8 bytes directly below it (one line_size further) through PAVGBP
 * (macro defined outside this file; presumably a pairwise byte average --
 * TODO confirm) and writes the result to block.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; h + 1 rows are read because each output row needs
 *             the row beneath it
 * line_size - byte stride used for both source and destination
 * h         - row count; each loop pass handles 4 rows and the loop exits
 *             only when h reaches exactly 0, so h should be a positive
 *             multiple of 4
 */
205 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
206 { | |
207 MOVQ_BFE(mm6); | |
208 __asm__ volatile( | |
// Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// REG_a = 2 * line_size; mm0 is primed with the first source row.
209 "lea (%3, %3), %%"REG_a" \n\t" | |
210 "movq (%1), %%mm0 \n\t" | |
211 ASMALIGN(3) | |
212 "1: \n\t" | |
// Load the next two rows; pair (row1,row0) and (row2,row1).
213 "movq (%1, %3), %%mm1 \n\t" | |
214 "movq (%1, %%"REG_a"),%%mm2 \n\t" | |
215 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |
216 "movq %%mm4, (%2) \n\t" | |
217 "movq %%mm5, (%2, %3) \n\t" | |
218 "add %%"REG_a", %1 \n\t" | |
219 "add %%"REG_a", %2 \n\t" | |
// Same again with mm2 carrying the previous row; the last row loaded ends
// up in mm0, ready for the next pass.
220 "movq (%1, %3), %%mm1 \n\t" | |
221 "movq (%1, %%"REG_a"),%%mm0 \n\t" | |
222 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |
223 "movq %%mm4, (%2) \n\t" | |
224 "movq %%mm5, (%2, %3) \n\t" | |
225 "add %%"REG_a", %1 \n\t" | |
226 "add %%"REG_a", %2 \n\t" | |
227 "subl $4, %0 \n\t" | |
228 "jnz 1b \n\t" | |
229 :"+g"(h), "+S"(pixels), "+D"(block) | |
230 :"r"((x86_reg)line_size) | |
231 :REG_a, "memory"); | |
232 } | |
233 | |
/*
 * put pixels8_xy2: each output byte is computed from four source bytes:
 * the byte at the same position, its right neighbour, and the two bytes
 * directly below those. Bytes are widened to 16-bit words, the four values
 * and the constant in mm6 are summed, the sum is shifted right by 2 and the
 * words are packed back to bytes with unsigned saturation.
 *
 * block     - destination; indexed by a running byte offset held in REG_a
 * pixels    - source; h + 1 rows of 9 bytes are read
 * line_size - byte stride used for both source and destination
 * h         - row count; each loop pass handles 2 rows and the loop exits
 *             only when h reaches exactly 0, so h should be a positive
 *             multiple of 2
 */
234 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
235 { | |
236 MOVQ_ZERO(mm7); | |
237 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
238 __asm__ volatile( | |
// Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// Prologue: horizontal pair sums of the first row, kept as words in
// mm4 (low four pixels) and mm5 (high four pixels).
239 "movq (%1), %%mm0 \n\t" | |
240 "movq 1(%1), %%mm4 \n\t" | |
241 "movq %%mm0, %%mm1 \n\t" | |
242 "movq %%mm4, %%mm5 \n\t" | |
243 "punpcklbw %%mm7, %%mm0 \n\t" | |
244 "punpcklbw %%mm7, %%mm4 \n\t" | |
245 "punpckhbw %%mm7, %%mm1 \n\t" | |
246 "punpckhbw %%mm7, %%mm5 \n\t" | |
247 "paddusw %%mm0, %%mm4 \n\t" | |
248 "paddusw %%mm1, %%mm5 \n\t" | |
// REG_a = 0 is the running row offset; the source pointer is moved one row
// ahead so (%1, REG_a) is the row below the one written at (%2, REG_a).
249 "xor %%"REG_a", %%"REG_a" \n\t" | |
250 "add %3, %1 \n\t" | |
251 ASMALIGN(3) | |
252 "1: \n\t" | |
// Horizontal pair sums of the next row into mm0/mm1.
253 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
254 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
255 "movq %%mm0, %%mm1 \n\t" | |
256 "movq %%mm2, %%mm3 \n\t" | |
257 "punpcklbw %%mm7, %%mm0 \n\t" | |
258 "punpcklbw %%mm7, %%mm2 \n\t" | |
259 "punpckhbw %%mm7, %%mm1 \n\t" | |
260 "punpckhbw %%mm7, %%mm3 \n\t" | |
261 "paddusw %%mm2, %%mm0 \n\t" | |
262 "paddusw %%mm3, %%mm1 \n\t" | |
// previous-row sums + mm6 + this-row sums, >> 2, pack to bytes, store.
263 "paddusw %%mm6, %%mm4 \n\t" | |
264 "paddusw %%mm6, %%mm5 \n\t" | |
265 "paddusw %%mm0, %%mm4 \n\t" | |
266 "paddusw %%mm1, %%mm5 \n\t" | |
267 "psrlw $2, %%mm4 \n\t" | |
268 "psrlw $2, %%mm5 \n\t" | |
269 "packuswb %%mm5, %%mm4 \n\t" | |
270 "movq %%mm4, (%2, %%"REG_a") \n\t" | |
271 "add %3, %%"REG_a" \n\t" | |
272 | |
// Second row of the pass: same steps with the register roles swapped, so
// mm4/mm5 again hold the newest row sums when the loop repeats.
273 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |
274 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |
275 "movq %%mm2, %%mm3 \n\t" | |
276 "movq %%mm4, %%mm5 \n\t" | |
277 "punpcklbw %%mm7, %%mm2 \n\t" | |
278 "punpcklbw %%mm7, %%mm4 \n\t" | |
279 "punpckhbw %%mm7, %%mm3 \n\t" | |
280 "punpckhbw %%mm7, %%mm5 \n\t" | |
281 "paddusw %%mm2, %%mm4 \n\t" | |
282 "paddusw %%mm3, %%mm5 \n\t" | |
283 "paddusw %%mm6, %%mm0 \n\t" | |
284 "paddusw %%mm6, %%mm1 \n\t" | |
285 "paddusw %%mm4, %%mm0 \n\t" | |
286 "paddusw %%mm5, %%mm1 \n\t" | |
287 "psrlw $2, %%mm0 \n\t" | |
288 "psrlw $2, %%mm1 \n\t" | |
289 "packuswb %%mm1, %%mm0 \n\t" | |
290 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
291 "add %3, %%"REG_a" \n\t" | |
292 | |
293 "subl $2, %0 \n\t" | |
294 "jnz 1b \n\t" | |
// block is an input-only operand: it is never advanced, only indexed.
295 :"+g"(h), "+S"(pixels) | |
296 :"D"(block), "r"((x86_reg)line_size) | |
297 :REG_a, "memory"); | |
298 } | |
299 | |
300 // avg_pixels | |
/*
 * avg pixels4: for every row, loads 4 bytes from block and 4 bytes from
 * pixels, combines them with PAVGB (macro defined outside this file;
 * presumably a byte average -- TODO confirm) and writes the 4-byte result
 * back to block.
 *
 * block     - destination, also read as one of the two inputs
 * pixels    - source
 * line_size - byte stride applied to both pointers after each row
 * h         - row count; the body runs before --h is first tested, so h
 *             must be at least 1
 */
301 static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
302 { | |
303 MOVQ_BFE(mm6); | |
304 JUMPALIGN(); | |
305 do { | |
306 __asm__ volatile( | |
// %0 = *block (read and written), %1 = *pixels.
307 "movd %0, %%mm0 \n\t" | |
308 "movd %1, %%mm1 \n\t" | |
309 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
310 "movd %%mm2, %0 \n\t" | |
311 :"+m"(*block) | |
312 :"m"(*pixels) | |
313 :"memory"); | |
314 pixels += line_size; | |
315 block += line_size; | |
316 } | |
317 while (--h); | |
318 } | |
319 | |
320 // in case more speed is needed - unrolling would certainly help | |
/*
 * avg pixels8: for every row, loads 8 bytes from block and 8 bytes from
 * pixels, combines them with PAVGB (macro defined outside this file;
 * presumably a byte average -- TODO confirm) and writes the 8-byte result
 * back to block.
 *
 * block     - destination, also read as one of the two inputs
 * pixels    - source
 * line_size - byte stride applied to both pointers after each row
 * h         - row count; the body runs before --h is first tested, so h
 *             must be at least 1
 */
321 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
322 { | |
323 MOVQ_BFE(mm6); | |
324 JUMPALIGN(); | |
325 do { | |
326 __asm__ volatile( | |
// %0 = *block (read and written), %1 = *pixels.
327 "movq %0, %%mm0 \n\t" | |
328 "movq %1, %%mm1 \n\t" | |
329 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
330 "movq %%mm2, %0 \n\t" | |
331 :"+m"(*block) | |
332 :"m"(*pixels) | |
333 :"memory"); | |
334 pixels += line_size; | |
335 block += line_size; | |
336 } | |
337 while (--h); | |
338 } | |
339 | |
/*
 * avg pixels16: for every row, combines 16 bytes of block with 16 bytes of
 * pixels using PAVGB (macro defined outside this file; presumably a byte
 * average -- TODO confirm) and writes the result back to block. The row is
 * handled as two 8-byte halves.
 *
 * block     - destination, also read as one of the two inputs
 * pixels    - source
 * line_size - byte stride applied to both pointers after each row
 * h         - row count; the body runs before --h is first tested, so h
 *             must be at least 1
 */
340 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
341 { | |
342 MOVQ_BFE(mm6); | |
343 JUMPALIGN(); | |
344 do { | |
345 __asm__ volatile( | |
// %0 = *block (read and written), %1 = *pixels. Left 8 bytes first.
346 "movq %0, %%mm0 \n\t" | |
347 "movq %1, %%mm1 \n\t" | |
348 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
349 "movq %%mm2, %0 \n\t" | |
// Right 8 bytes: "8%0" / "8%1" put a displacement of 8 in front of the
// memory operand text, so this relies on the operand being printed in a
// form that accepts a leading displacement.
350 "movq 8%0, %%mm0 \n\t" | |
351 "movq 8%1, %%mm1 \n\t" | |
352 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
353 "movq %%mm2, 8%0 \n\t" | |
354 :"+m"(*block) | |
355 :"m"(*pixels) | |
356 :"memory"); | |
357 pixels += line_size; | |
358 block += line_size; | |
359 } | |
360 while (--h); | |
361 } | |
362 | |
/*
 * avg pixels8_x2: for every row, combines the 8 bytes at pixels with the
 * 8 bytes at pixels+1 using PAVGB, then combines that result with the
 * 8 bytes already in block and writes it back. PAVGB is a macro defined
 * outside this file; presumably a byte average -- TODO confirm.
 *
 * block     - destination, also read as an input
 * pixels    - source; 9 bytes are read per row (offsets 0..8)
 * line_size - byte stride applied to both pointers after each row
 * h         - row count; the body runs before --h is first tested, so h
 *             must be at least 1
 */
363 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
364 { | |
365 MOVQ_BFE(mm6); | |
366 JUMPALIGN(); | |
367 do { | |
368 __asm__ volatile( | |
// %0 = *block (read and written), %1 = *pixels; "1%1" is pixels + 1.
369 "movq %1, %%mm0 \n\t" | |
370 "movq 1%1, %%mm1 \n\t" | |
371 "movq %0, %%mm3 \n\t" | |
// First PAVGB: the two source positions -> mm2.
// Second PAVGB: existing destination (mm3) with mm2 -> mm0.
372 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
373 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
374 "movq %%mm0, %0 \n\t" | |
375 :"+m"(*block) | |
376 :"m"(*pixels) | |
377 :"memory"); | |
378 pixels += line_size; | |
379 block += line_size; | |
380 } while (--h); | |
381 } | |
382 | |
/*
 * avg pixels8_l2: for every row, combines 8 bytes from src1 with 8 bytes
 * from src2 using PAVGB, then combines that result with the 8 bytes already
 * in dst and writes it back. PAVGB is a macro defined outside this file;
 * presumably a byte average -- TODO confirm.
 *
 * dst        - destination, also read as an input; advanced by dstStride
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; advanced by 8 bytes per row (packed rows)
 * dstStride  - byte stride of dst
 * src1Stride - byte stride of src1
 * h          - row count; the body runs before --h is first tested, so h
 *              must be at least 1
 */
383 static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
384 { | |
385 MOVQ_BFE(mm6); | |
386 JUMPALIGN(); | |
387 do { | |
388 __asm__ volatile( | |
// %0 = *dst (read and written), %1 = *src1, %2 = *src2.
389 "movq %1, %%mm0 \n\t" | |
390 "movq %2, %%mm1 \n\t" | |
391 "movq %0, %%mm3 \n\t" | |
// First PAVGB: src1 with src2 -> mm2.
// Second PAVGB: existing destination (mm3) with mm2 -> mm0.
392 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
393 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
394 "movq %%mm0, %0 \n\t" | |
395 :"+m"(*dst) | |
396 :"m"(*src1), "m"(*src2) | |
397 :"memory"); | |
398 dst += dstStride; | |
399 src1 += src1Stride; | |
400 src2 += 8; | |
401 } while (--h); | |
402 } | |
403 | |
/*
 * avg pixels16_x2: 16-byte-wide counterpart of the 8-byte avg x2 routine.
 * For every row it combines the bytes at pixels with the bytes at pixels+1
 * using PAVGB, combines that with the bytes already in block, and writes
 * the result back; the row is handled as two 8-byte halves. PAVGB is a
 * macro defined outside this file; presumably a byte average -- TODO
 * confirm.
 *
 * block     - destination, also read as an input
 * pixels    - source; 17 bytes are read per row (offsets 0..16)
 * line_size - byte stride applied to both pointers after each row
 * h         - row count; the body runs before --h is first tested, so h
 *             must be at least 1
 */
404 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
405 { | |
406 MOVQ_BFE(mm6); | |
407 JUMPALIGN(); | |
408 do { | |
409 __asm__ volatile( | |
// %0 = *block (read and written), %1 = *pixels. Left 8 bytes first.
410 "movq %1, %%mm0 \n\t" | |
411 "movq 1%1, %%mm1 \n\t" | |
412 "movq %0, %%mm3 \n\t" | |
413 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
414 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
415 "movq %%mm0, %0 \n\t" | |
// Right 8 bytes: source offsets 8 and 9, destination offset 8.
416 "movq 8%1, %%mm0 \n\t" | |
417 "movq 9%1, %%mm1 \n\t" | |
418 "movq 8%0, %%mm3 \n\t" | |
419 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
420 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
421 "movq %%mm0, 8%0 \n\t" | |
422 :"+m"(*block) | |
423 :"m"(*pixels) | |
424 :"memory"); | |
425 pixels += line_size; | |
426 block += line_size; | |
427 } while (--h); | |
428 } | |
429 | |
/*
 * avg pixels16_l2: for every row, combines 16 bytes from src1 with 16 bytes
 * from src2 using PAVGB, then combines that result with the 16 bytes
 * already in dst and writes it back; the row is handled as two 8-byte
 * halves. PAVGB is a macro defined outside this file; presumably a byte
 * average -- TODO confirm.
 *
 * dst        - destination, also read as an input; advanced by dstStride
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; advanced by 16 bytes per row (packed rows)
 * dstStride  - byte stride of dst
 * src1Stride - byte stride of src1
 * h          - row count; the body runs before --h is first tested, so h
 *              must be at least 1
 */
430 static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
431 { | |
432 MOVQ_BFE(mm6); | |
433 JUMPALIGN(); | |
434 do { | |
435 __asm__ volatile( | |
// %0 = *dst (read and written), %1 = *src1, %2 = *src2. Left 8 bytes first.
436 "movq %1, %%mm0 \n\t" | |
437 "movq %2, %%mm1 \n\t" | |
438 "movq %0, %%mm3 \n\t" | |
439 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
440 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
441 "movq %%mm0, %0 \n\t" | |
// Right 8 bytes: offset 8 on all three operands.
442 "movq 8%1, %%mm0 \n\t" | |
443 "movq 8%2, %%mm1 \n\t" | |
444 "movq 8%0, %%mm3 \n\t" | |
445 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
446 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
447 "movq %%mm0, 8%0 \n\t" | |
448 :"+m"(*dst) | |
449 :"m"(*src1), "m"(*src2) | |
450 :"memory"); | |
451 dst += dstStride; | |
452 src1 += src1Stride; | |
453 src2 += 16; | |
454 } while (--h); | |
455 } | |
456 | |
/*
 * avg pixels8_y2: for every output row, combines 8 bytes of a source row
 * with the 8 bytes directly below it through PAVGBP, then combines that
 * with the 8 bytes already in block through PAVGB and writes the result
 * back. PAVGB / PAVGBP are macros defined outside this file; presumably
 * byte averages -- TODO confirm.
 *
 * block     - destination, also read as an input; advanced by line_size
 * pixels    - source; h + 1 rows are read because each output row needs
 *             the row beneath it
 * line_size - byte stride used for both source and destination
 * h         - row count; each loop pass handles 4 rows and the loop exits
 *             only when h reaches exactly 0, so h should be a positive
 *             multiple of 4
 */
457 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
458 { | |
459 MOVQ_BFE(mm6); | |
460 __asm__ volatile( | |
// Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// REG_a = 2 * line_size; mm0 is primed with the first source row.
461 "lea (%3, %3), %%"REG_a" \n\t" | |
462 "movq (%1), %%mm0 \n\t" | |
463 ASMALIGN(3) | |
464 "1: \n\t" | |
// Vertical step for two rows: pairs (row1,row0) -> mm4, (row2,row1) -> mm5.
465 "movq (%1, %3), %%mm1 \n\t" | |
466 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
467 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |
// Blend each result with the existing destination row before storing.
468 "movq (%2), %%mm3 \n\t" | |
469 PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) | |
470 "movq (%2, %3), %%mm3 \n\t" | |
471 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) | |
472 "movq %%mm0, (%2) \n\t" | |
473 "movq %%mm1, (%2, %3) \n\t" | |
474 "add %%"REG_a", %1 \n\t" | |
475 "add %%"REG_a", %2 \n\t" | |
476 | |
// Next two rows: mm2 carries the previous source row; the last row loaded
// ends up in mm0, ready for the next pass.
477 "movq (%1, %3), %%mm1 \n\t" | |
478 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
479 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |
480 "movq (%2), %%mm3 \n\t" | |
481 PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) | |
482 "movq (%2, %3), %%mm3 \n\t" | |
483 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) | |
484 "movq %%mm2, (%2) \n\t" | |
485 "movq %%mm1, (%2, %3) \n\t" | |
486 "add %%"REG_a", %1 \n\t" | |
487 "add %%"REG_a", %2 \n\t" | |
488 | |
489 "subl $4, %0 \n\t" | |
490 "jnz 1b \n\t" | |
491 :"+g"(h), "+S"(pixels), "+D"(block) | |
492 :"r"((x86_reg)line_size) | |
493 :REG_a, "memory"); | |
494 } | |
495 | |
496 // this routine is 'slightly' suboptimal but mostly unused | |
/*
 * avg pixels8_xy2: computes the same four-byte combination as the put xy2
 * routine (byte, right neighbour, and the two bytes below, widened to
 * words, summed with the constant in mm6, shifted right by 2 and packed
 * with unsigned saturation), then combines that with the 8 bytes already
 * in block through PAVGB and writes the result back. PAVGB is a macro
 * defined outside this file; presumably a byte average -- TODO confirm.
 *
 * block     - destination, also read as an input; indexed by a running
 *             byte offset held in REG_a
 * pixels    - source; h + 1 rows of 9 bytes are read
 * line_size - byte stride used for both source and destination
 * h         - row count; each loop pass handles 2 rows and the loop exits
 *             only when h reaches exactly 0, so h should be a positive
 *             multiple of 2
 */
497 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
498 { | |
499 MOVQ_ZERO(mm7); | |
500 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
501 __asm__ volatile( | |
// Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
// Prologue: horizontal pair sums of the first row as words in mm4/mm5.
502 "movq (%1), %%mm0 \n\t" | |
503 "movq 1(%1), %%mm4 \n\t" | |
504 "movq %%mm0, %%mm1 \n\t" | |
505 "movq %%mm4, %%mm5 \n\t" | |
506 "punpcklbw %%mm7, %%mm0 \n\t" | |
507 "punpcklbw %%mm7, %%mm4 \n\t" | |
508 "punpckhbw %%mm7, %%mm1 \n\t" | |
509 "punpckhbw %%mm7, %%mm5 \n\t" | |
510 "paddusw %%mm0, %%mm4 \n\t" | |
511 "paddusw %%mm1, %%mm5 \n\t" | |
// REG_a = 0 is the running row offset; the source pointer is moved one row
// ahead so (%1, REG_a) is the row below the one written at (%2, REG_a).
512 "xor %%"REG_a", %%"REG_a" \n\t" | |
513 "add %3, %1 \n\t" | |
514 ASMALIGN(3) | |
515 "1: \n\t" | |
// Horizontal pair sums of the next row into mm0/mm1.
516 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
517 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
518 "movq %%mm0, %%mm1 \n\t" | |
519 "movq %%mm2, %%mm3 \n\t" | |
520 "punpcklbw %%mm7, %%mm0 \n\t" | |
521 "punpcklbw %%mm7, %%mm2 \n\t" | |
522 "punpckhbw %%mm7, %%mm1 \n\t" | |
523 "punpckhbw %%mm7, %%mm3 \n\t" | |
524 "paddusw %%mm2, %%mm0 \n\t" | |
525 "paddusw %%mm3, %%mm1 \n\t" | |
// previous-row sums + mm6 + this-row sums, >> 2.
526 "paddusw %%mm6, %%mm4 \n\t" | |
527 "paddusw %%mm6, %%mm5 \n\t" | |
528 "paddusw %%mm0, %%mm4 \n\t" | |
529 "paddusw %%mm1, %%mm5 \n\t" | |
530 "psrlw $2, %%mm4 \n\t" | |
531 "psrlw $2, %%mm5 \n\t" | |
// Fetch the existing destination row and pack the new result to bytes.
532 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
533 "packuswb %%mm5, %%mm4 \n\t" | |
// pcmpeqd sets every bit of mm2; paddb then doubles each byte, leaving 0xFE
// in every byte. mm2 is passed to PAVGB in the slot where the other
// routines pass the MOVQ_BFE register, since mm6 holds the SET_RND value.
534 "pcmpeqd %%mm2, %%mm2 \n\t" | |
535 "paddb %%mm2, %%mm2 \n\t" | |
536 PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) | |
537 "movq %%mm5, (%2, %%"REG_a") \n\t" | |
538 "add %3, %%"REG_a" \n\t" | |
539 | |
// Second row of the pass: same steps with the register roles swapped, so
// mm4/mm5 again hold the newest row sums when the loop repeats.
540 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |
541 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |
542 "movq %%mm2, %%mm3 \n\t" | |
543 "movq %%mm4, %%mm5 \n\t" | |
544 "punpcklbw %%mm7, %%mm2 \n\t" | |
545 "punpcklbw %%mm7, %%mm4 \n\t" | |
546 "punpckhbw %%mm7, %%mm3 \n\t" | |
547 "punpckhbw %%mm7, %%mm5 \n\t" | |
548 "paddusw %%mm2, %%mm4 \n\t" | |
549 "paddusw %%mm3, %%mm5 \n\t" | |
550 "paddusw %%mm6, %%mm0 \n\t" | |
551 "paddusw %%mm6, %%mm1 \n\t" | |
552 "paddusw %%mm4, %%mm0 \n\t" | |
553 "paddusw %%mm5, %%mm1 \n\t" | |
554 "psrlw $2, %%mm0 \n\t" | |
555 "psrlw $2, %%mm1 \n\t" | |
556 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
557 "packuswb %%mm1, %%mm0 \n\t" | |
// Rebuild the 0xFE byte mask in mm2 (it was reused as scratch above).
558 "pcmpeqd %%mm2, %%mm2 \n\t" | |
559 "paddb %%mm2, %%mm2 \n\t" | |
560 PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) | |
561 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
562 "add %3, %%"REG_a" \n\t" | |
563 | |
564 "subl $2, %0 \n\t" | |
565 "jnz 1b \n\t" | |
// block is an input-only operand: it is never advanced, only indexed.
566 :"+g"(h), "+S"(pixels) | |
567 :"D"(block), "r"((x86_reg)line_size) | |
568 :REG_a, "memory"); | |
569 } | |
570 | |
571 //FIXME optimize | |
572 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
573 DEF(put, pixels8_y2)(block , pixels , line_size, h); | |
574 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); | |
575 } | |
576 | |
577 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
578 DEF(put, pixels8_xy2)(block , pixels , line_size, h); | |
579 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); | |
580 } | |
581 | |
582 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
583 DEF(avg, pixels8_y2)(block , pixels , line_size, h); | |
584 DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); | |
585 } | |
586 | |
587 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
588 DEF(avg, pixels8_xy2)(block , pixels , line_size, h); | |
589 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); | |
590 } |