Mercurial > libavcodec.hg
annotate x86/dsputil_mmx_rnd_template.c @ 12483:0159a19bfff7 libavcodec
aacdec: Rework channel mapping compatibility hacks.
For a PCE-based configuration, map the channels solely based on tags.
For an indexed configuration, map the channels solely based on position.
This works with all known exotic samples including al17, elem_id0, bad_concat,
and lfe_is_sce.
author | alexc |
---|---|
date | Fri, 10 Sep 2010 18:01:48 +0000 |
parents | 36b60aa6bc75 |
children |
rev | line source |
---|---|
8430 | 1 /* |
2 * DSP utils mmx functions are compiled twice for rnd/no_rnd | |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8430
diff
changeset
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard |
8430 | 4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> |
5 * | |
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |
8 * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
9 * | |
10 * This file is part of FFmpeg. | |
11 * | |
12 * FFmpeg is free software; you can redistribute it and/or | |
13 * modify it under the terms of the GNU Lesser General Public | |
14 * License as published by the Free Software Foundation; either | |
15 * version 2.1 of the License, or (at your option) any later version. | |
16 * | |
17 * FFmpeg is distributed in the hope that it will be useful, | |
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 * Lesser General Public License for more details. | |
21 * | |
22 * You should have received a copy of the GNU Lesser General Public | |
23 * License along with FFmpeg; if not, write to the Free Software | |
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25 */ | |
26 | |
27 // put_pixels | |
/*
 * put, 8 bytes wide, horizontal half-pel: every output row is produced from
 * the source row read at byte offset 0 and again at byte offset 1, combined
 * by PAVGBP and written to block.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a is loaded with 2 * line_size and used to step both pointers.
 *
 * block     - destination; advanced by 2 * line_size after each row pair
 * pixels    - source; advanced in step with block; 9 bytes are read per row
 * line_size - byte stride used for both source and destination
 * h         - row count; one loop pass handles 4 rows and the loop only ends
 *             when the counter hits exactly zero, so h must be a positive
 *             multiple of 4
 *
 * NOTE(review): MOVQ_BFE and PAVGBP are defined outside this view. PAVGBP
 * presumably writes the rnd/no_rnd byte average of (mm0, mm1) to mm4 and of
 * (mm2, mm3) to mm5, using mm6 as a constant -- confirm against the macros.
 */
28 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
29 { | |
30 MOVQ_BFE(mm6); | |
31 __asm__ volatile( | |
32 "lea (%3, %3), %%"REG_a" \n\t" | |
33 ASMALIGN(3) | |
34 "1: \n\t" | |
// rows 0 and 1 of this pass
35 "movq (%1), %%mm0 \n\t" | |
36 "movq 1(%1), %%mm1 \n\t" | |
37 "movq (%1, %3), %%mm2 \n\t" | |
38 "movq 1(%1, %3), %%mm3 \n\t" | |
39 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
40 "movq %%mm4, (%2) \n\t" | |
41 "movq %%mm5, (%2, %3) \n\t" | |
42 "add %%"REG_a", %1 \n\t" | |
43 "add %%"REG_a", %2 \n\t" | |
// rows 2 and 3 of this pass (same sequence, unrolled)
44 "movq (%1), %%mm0 \n\t" | |
45 "movq 1(%1), %%mm1 \n\t" | |
46 "movq (%1, %3), %%mm2 \n\t" | |
47 "movq 1(%1, %3), %%mm3 \n\t" | |
48 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
49 "movq %%mm4, (%2) \n\t" | |
50 "movq %%mm5, (%2, %3) \n\t" | |
51 "add %%"REG_a", %1 \n\t" | |
52 "add %%"REG_a", %2 \n\t" | |
53 "subl $4, %0 \n\t" | |
54 "jnz 1b \n\t" | |
55 :"+g"(h), "+S"(pixels), "+D"(block) | |
56 :"r"((x86_reg)line_size) | |
57 :REG_a, "memory"); | |
58 } | |
59 | |
/*
 * put, 8 bytes wide, from two sources: each dst row is produced by combining
 * one 8-byte row of src1 with one 8-byte row of src2 (PAVGB / PAVGBP).
 *
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * dst        - destination; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; read as tightly packed 8-byte rows
 *              (offsets 0, 8, 16, 24, then the pointer moves on by 32)
 * dstStride  - byte stride of dst
 * src1Stride - byte stride of src1
 * h          - row count. If bit 0 is set, one row is handled up front and h
 *              is decremented. The 4-row loop is then entered unconditionally
 *              and ends only when the counter hits exactly zero, so the
 *              remaining count must be a positive multiple of 4.
 *
 * NOTE(review): MOVQ_BFE, PAVGB and PAVGBP are defined outside this view;
 * they presumably implement the rnd/no_rnd byte average -- confirm.
 */
60 static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
61 { | |
62 MOVQ_BFE(mm6); | |
63 __asm__ volatile( | |
// odd-h prologue: handle a single row, then fall into the main loop
64 "testl $1, %0 \n\t" | |
65 " jz 1f \n\t" | |
66 "movq (%1), %%mm0 \n\t" | |
67 "movq (%2), %%mm1 \n\t" | |
68 "add %4, %1 \n\t" | |
69 "add $8, %2 \n\t" | |
70 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) | |
71 "movq %%mm4, (%3) \n\t" | |
72 "add %5, %3 \n\t" | |
73 "decl %0 \n\t" | |
74 ASMALIGN(3) | |
75 "1: \n\t" | |
// rows 0 and 1 of this pass
76 "movq (%1), %%mm0 \n\t" | |
77 "movq (%2), %%mm1 \n\t" | |
78 "add %4, %1 \n\t" | |
79 "movq (%1), %%mm2 \n\t" | |
80 "movq 8(%2), %%mm3 \n\t" | |
81 "add %4, %1 \n\t" | |
82 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
83 "movq %%mm4, (%3) \n\t" | |
84 "add %5, %3 \n\t" | |
85 "movq %%mm5, (%3) \n\t" | |
86 "add %5, %3 \n\t" | |
// rows 2 and 3 of this pass
87 "movq (%1), %%mm0 \n\t" | |
88 "movq 16(%2), %%mm1 \n\t" | |
89 "add %4, %1 \n\t" | |
90 "movq (%1), %%mm2 \n\t" | |
91 "movq 24(%2), %%mm3 \n\t" | |
92 "add %4, %1 \n\t" | |
93 "add $32, %2 \n\t" | |
94 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
95 "movq %%mm4, (%3) \n\t" | |
96 "add %5, %3 \n\t" | |
97 "movq %%mm5, (%3) \n\t" | |
98 "add %5, %3 \n\t" | |
99 "subl $4, %0 \n\t" | |
100 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
9445
diff
changeset
|
// without a usable ebx the row counter lives in memory instead of a register
101 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 102 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
103 #else | |
104 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
105 #endif | |
106 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
107 :"memory"); | |
108 } | |
109 | |
/*
 * put, 16 bytes wide, horizontal half-pel. Same scheme as the 8-byte-wide
 * x2 routine, but every row pair is processed twice: once for bytes 0..7
 * (source offsets 0 and 1) and once for bytes 8..15 (source offsets 8 and 9).
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a is loaded with 2 * line_size and used to step both pointers.
 *
 * block     - destination; advanced by 2 * line_size after each row pair
 * pixels    - source; advanced in step with block; 17 bytes are read per row
 * line_size - byte stride used for both source and destination
 * h         - row count; one loop pass handles 4 rows and the loop only ends
 *             when the counter hits exactly zero, so h must be a positive
 *             multiple of 4
 *
 * NOTE(review): MOVQ_BFE and PAVGBP are defined outside this view and
 * presumably implement the rnd/no_rnd byte average -- confirm.
 */
110 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
111 { | |
112 MOVQ_BFE(mm6); | |
113 __asm__ volatile( | |
114 "lea (%3, %3), %%"REG_a" \n\t" | |
115 ASMALIGN(3) | |
116 "1: \n\t" | |
// rows 0 and 1, left 8 bytes
117 "movq (%1), %%mm0 \n\t" | |
118 "movq 1(%1), %%mm1 \n\t" | |
119 "movq (%1, %3), %%mm2 \n\t" | |
120 "movq 1(%1, %3), %%mm3 \n\t" | |
121 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
122 "movq %%mm4, (%2) \n\t" | |
123 "movq %%mm5, (%2, %3) \n\t" | |
// rows 0 and 1, right 8 bytes
124 "movq 8(%1), %%mm0 \n\t" | |
125 "movq 9(%1), %%mm1 \n\t" | |
126 "movq 8(%1, %3), %%mm2 \n\t" | |
127 "movq 9(%1, %3), %%mm3 \n\t" | |
128 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
129 "movq %%mm4, 8(%2) \n\t" | |
130 "movq %%mm5, 8(%2, %3) \n\t" | |
131 "add %%"REG_a", %1 \n\t" | |
132 "add %%"REG_a", %2 \n\t" | |
// rows 2 and 3, left 8 bytes
133 "movq (%1), %%mm0 \n\t" | |
134 "movq 1(%1), %%mm1 \n\t" | |
135 "movq (%1, %3), %%mm2 \n\t" | |
136 "movq 1(%1, %3), %%mm3 \n\t" | |
137 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
138 "movq %%mm4, (%2) \n\t" | |
139 "movq %%mm5, (%2, %3) \n\t" | |
// rows 2 and 3, right 8 bytes
140 "movq 8(%1), %%mm0 \n\t" | |
141 "movq 9(%1), %%mm1 \n\t" | |
142 "movq 8(%1, %3), %%mm2 \n\t" | |
143 "movq 9(%1, %3), %%mm3 \n\t" | |
144 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
145 "movq %%mm4, 8(%2) \n\t" | |
146 "movq %%mm5, 8(%2, %3) \n\t" | |
147 "add %%"REG_a", %1 \n\t" | |
148 "add %%"REG_a", %2 \n\t" | |
149 "subl $4, %0 \n\t" | |
150 "jnz 1b \n\t" | |
151 :"+g"(h), "+S"(pixels), "+D"(block) | |
152 :"r"((x86_reg)line_size) | |
153 :REG_a, "memory"); | |
154 } | |
155 | |
/*
 * put, 16 bytes wide, from two sources: each dst row is produced by
 * combining one 16-byte row of src1 with one 16-byte row of src2 (PAVGBP on
 * the left and right 8-byte halves at once).
 *
 * Asm operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
 *               %4 = src1Stride, %5 = dstStride.
 *
 * dst        - destination; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; read as tightly packed 16-byte rows
 *              (offsets 0/8 and 16/24, then the pointer moves on by 32)
 * dstStride  - byte stride of dst
 * src1Stride - byte stride of src1
 * h          - row count. If bit 0 is set, one row is handled up front and h
 *              is decremented. The 2-row loop is then entered unconditionally
 *              and ends only when the counter hits exactly zero, so the
 *              remaining count must be a positive multiple of 2.
 *
 * NOTE(review): MOVQ_BFE and PAVGBP are defined outside this view and
 * presumably implement the rnd/no_rnd byte average -- confirm.
 */
156 static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
157 { | |
158 MOVQ_BFE(mm6); | |
159 __asm__ volatile( | |
// odd-h prologue: handle a single 16-byte row, then fall into the main loop
160 "testl $1, %0 \n\t" | |
161 " jz 1f \n\t" | |
162 "movq (%1), %%mm0 \n\t" | |
163 "movq (%2), %%mm1 \n\t" | |
164 "movq 8(%1), %%mm2 \n\t" | |
165 "movq 8(%2), %%mm3 \n\t" | |
166 "add %4, %1 \n\t" | |
167 "add $16, %2 \n\t" | |
168 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
169 "movq %%mm4, (%3) \n\t" | |
170 "movq %%mm5, 8(%3) \n\t" | |
171 "add %5, %3 \n\t" | |
172 "decl %0 \n\t" | |
173 ASMALIGN(3) | |
174 "1: \n\t" | |
// row 0 of this pass (src2 bytes 0..15)
175 "movq (%1), %%mm0 \n\t" | |
176 "movq (%2), %%mm1 \n\t" | |
177 "movq 8(%1), %%mm2 \n\t" | |
178 "movq 8(%2), %%mm3 \n\t" | |
179 "add %4, %1 \n\t" | |
180 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
181 "movq %%mm4, (%3) \n\t" | |
182 "movq %%mm5, 8(%3) \n\t" | |
183 "add %5, %3 \n\t" | |
// row 1 of this pass (src2 bytes 16..31)
184 "movq (%1), %%mm0 \n\t" | |
185 "movq 16(%2), %%mm1 \n\t" | |
186 "movq 8(%1), %%mm2 \n\t" | |
187 "movq 24(%2), %%mm3 \n\t" | |
188 "add %4, %1 \n\t" | |
189 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
190 "movq %%mm4, (%3) \n\t" | |
191 "movq %%mm5, 8(%3) \n\t" | |
192 "add %5, %3 \n\t" | |
193 "add $32, %2 \n\t" | |
194 "subl $2, %0 \n\t" | |
195 "jnz 1b \n\t" | |
10325
36b60aa6bc75
Replace several #ifdef PIC with the more obvious and correct
reimar
parents:
9445
diff
changeset
|
// without a usable ebx the row counter lives in memory instead of a register
196 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
8430 | 197 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
198 #else | |
199 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
200 #endif | |
201 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) | |
202 :"memory"); | |
203 } | |
204 | |
/*
 * put, 8 bytes wide, vertical half-pel: every output row is produced from a
 * source row and the row directly below it, combined by PAVGBP.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a is loaded with 2 * line_size.
 *
 * The previously loaded bottom row is kept in a register (mm0, then mm2,
 * then mm0 again) so each source row is loaded only once. Rows up to index h
 * are read, i.e. one more source row than the number of rows written.
 *
 * block     - destination; advanced by 2 * line_size after each row pair
 * pixels    - source; advanced in step with block
 * line_size - byte stride used for both source and destination
 * h         - row count; one loop pass handles 4 rows and the loop only ends
 *             when the counter hits exactly zero, so h must be a positive
 *             multiple of 4
 *
 * NOTE(review): MOVQ_BFE and PAVGBP are defined outside this view and
 * presumably implement the rnd/no_rnd byte average -- confirm.
 */
205 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
206 { | |
207 MOVQ_BFE(mm6); | |
208 __asm__ volatile( | |
209 "lea (%3, %3), %%"REG_a" \n\t" | |
210 "movq (%1), %%mm0 \n\t" | |
211 ASMALIGN(3) | |
212 "1: \n\t" | |
// mm0 = row r, mm1 = row r+1, mm2 = row r+2
213 "movq (%1, %3), %%mm1 \n\t" | |
214 "movq (%1, %%"REG_a"),%%mm2 \n\t" | |
215 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |
216 "movq %%mm4, (%2) \n\t" | |
217 "movq %%mm5, (%2, %3) \n\t" | |
218 "add %%"REG_a", %1 \n\t" | |
219 "add %%"REG_a", %2 \n\t" | |
// mm2 = row r+2 (kept), mm1 = row r+3, mm0 = row r+4 (carried to next pass)
220 "movq (%1, %3), %%mm1 \n\t" | |
221 "movq (%1, %%"REG_a"),%%mm0 \n\t" | |
222 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |
223 "movq %%mm4, (%2) \n\t" | |
224 "movq %%mm5, (%2, %3) \n\t" | |
225 "add %%"REG_a", %1 \n\t" | |
226 "add %%"REG_a", %2 \n\t" | |
227 "subl $4, %0 \n\t" | |
228 "jnz 1b \n\t" | |
229 :"+g"(h), "+S"(pixels), "+D"(block) | |
230 :"r"((x86_reg)line_size) | |
231 :REG_a, "memory"); | |
232 } | |
233 | |
/*
 * put, 8 bytes wide, diagonal (x and y) half-pel. For every output byte the
 * code forms, in 16-bit lanes,
 *     (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + mm6) >> 2
 * and packs the result back to bytes.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is a running byte offset, starting at 0 and growing by line_size per
 * row. pixels is advanced by one line before the loop, so (%1, REG_a) is the
 * source row below the output row stored at (%2, REG_a).
 *
 * Registers: mm7 is the zero used to widen bytes to words; mm6 is the
 * rounding term installed by SET_RND. The horizontal pair sums of the
 * previous row are carried in mm4/mm5 (low/high half), then in mm0/mm1, with
 * the roles swapping every row.
 *
 * block     - destination; not modified, addressed through REG_a
 * pixels    - source; rows up to index h are read, 9 bytes each
 * line_size - byte stride used for both source and destination
 * h         - row count; one loop pass handles 2 rows and the loop only ends
 *             when the counter hits exactly zero, so h must be a positive
 *             multiple of 2
 */
234 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
235 { | |
236 MOVQ_ZERO(mm7); | |
237 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
238 __asm__ volatile( | |
// prologue: mm4/mm5 = p[0][x] + p[0][x+1] as words (low/high 4 pixels)
239 "movq (%1), %%mm0 \n\t" | |
240 "movq 1(%1), %%mm4 \n\t" | |
241 "movq %%mm0, %%mm1 \n\t" | |
242 "movq %%mm4, %%mm5 \n\t" | |
243 "punpcklbw %%mm7, %%mm0 \n\t" | |
244 "punpcklbw %%mm7, %%mm4 \n\t" | |
245 "punpckhbw %%mm7, %%mm1 \n\t" | |
246 "punpckhbw %%mm7, %%mm5 \n\t" | |
247 "paddusw %%mm0, %%mm4 \n\t" | |
248 "paddusw %%mm1, %%mm5 \n\t" | |
249 "xor %%"REG_a", %%"REG_a" \n\t" | |
250 "add %3, %1 \n\t" | |
251 ASMALIGN(3) | |
252 "1: \n\t" | |
// first row of the pass: new pair sums go to mm0/mm1, output built in mm4/mm5
253 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
254 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
255 "movq %%mm0, %%mm1 \n\t" | |
256 "movq %%mm2, %%mm3 \n\t" | |
257 "punpcklbw %%mm7, %%mm0 \n\t" | |
258 "punpcklbw %%mm7, %%mm2 \n\t" | |
259 "punpckhbw %%mm7, %%mm1 \n\t" | |
260 "punpckhbw %%mm7, %%mm3 \n\t" | |
261 "paddusw %%mm2, %%mm0 \n\t" | |
262 "paddusw %%mm3, %%mm1 \n\t" | |
263 "paddusw %%mm6, %%mm4 \n\t" | |
264 "paddusw %%mm6, %%mm5 \n\t" | |
265 "paddusw %%mm0, %%mm4 \n\t" | |
266 "paddusw %%mm1, %%mm5 \n\t" | |
267 "psrlw $2, %%mm4 \n\t" | |
268 "psrlw $2, %%mm5 \n\t" | |
269 "packuswb %%mm5, %%mm4 \n\t" | |
270 "movq %%mm4, (%2, %%"REG_a") \n\t" | |
271 "add %3, %%"REG_a" \n\t" | |
272 | |
// second row of the pass: same steps with the register roles exchanged
273 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |
274 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |
275 "movq %%mm2, %%mm3 \n\t" | |
276 "movq %%mm4, %%mm5 \n\t" | |
277 "punpcklbw %%mm7, %%mm2 \n\t" | |
278 "punpcklbw %%mm7, %%mm4 \n\t" | |
279 "punpckhbw %%mm7, %%mm3 \n\t" | |
280 "punpckhbw %%mm7, %%mm5 \n\t" | |
281 "paddusw %%mm2, %%mm4 \n\t" | |
282 "paddusw %%mm3, %%mm5 \n\t" | |
283 "paddusw %%mm6, %%mm0 \n\t" | |
284 "paddusw %%mm6, %%mm1 \n\t" | |
285 "paddusw %%mm4, %%mm0 \n\t" | |
286 "paddusw %%mm5, %%mm1 \n\t" | |
287 "psrlw $2, %%mm0 \n\t" | |
288 "psrlw $2, %%mm1 \n\t" | |
289 "packuswb %%mm1, %%mm0 \n\t" | |
290 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
291 "add %3, %%"REG_a" \n\t" | |
292 | |
293 "subl $2, %0 \n\t" | |
294 "jnz 1b \n\t" | |
295 :"+g"(h), "+S"(pixels) | |
296 :"D"(block), "r"((x86_reg)line_size) | |
297 :REG_a, "memory"); | |
298 } | |
299 | |
300 // avg_pixels | |
/*
 * avg, 4 bytes wide: for each row, loads 4 bytes from block and 4 bytes from
 * pixels, combines them with OP_AVG and writes the 4-byte result back to
 * block.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both buffers
 * h         - row count; the do/while body runs before the first test, so h
 *             must be at least 1
 *
 * NOTE(review): MOVQ_BFE and OP_AVG are defined outside this view. The
 * changeset text embedded in this dump suggests OP_AVG averages with the
 * existing destination as (dst+pix+1)>>1 -- confirm against the macro.
 */
301 static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
302 { | |
303 MOVQ_BFE(mm6); | |
304 JUMPALIGN(); | |
305 do { | |
306 __asm__ volatile( | |
// %0 = *block (read-modify-write), %1 = *pixels
307 "movd %0, %%mm0 \n\t" | |
308 "movd %1, %%mm1 \n\t" | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
309 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) |
8430 | 310 "movd %%mm2, %0 \n\t" |
311 :"+m"(*block) | |
312 :"m"(*pixels) | |
313 :"memory"); | |
314 pixels += line_size; | |
315 block += line_size; | |
316 } | |
317 while (--h); | |
318 } | |
319 | |
320 // in case more speed is needed - unrolling would certainly help | |
/*
 * avg, 8 bytes wide: for each row, loads 8 bytes from block and 8 bytes from
 * pixels, combines them with OP_AVG and writes the 8-byte result back to
 * block.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both buffers
 * h         - row count; the do/while body runs before the first test, so h
 *             must be at least 1
 *
 * NOTE(review): MOVQ_BFE and OP_AVG are defined outside this view. The
 * changeset text embedded in this dump suggests OP_AVG averages with the
 * existing destination as (dst+pix+1)>>1 -- confirm against the macro.
 */
321 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
322 { | |
323 MOVQ_BFE(mm6); | |
324 JUMPALIGN(); | |
325 do { | |
326 __asm__ volatile( | |
// %0 = *block (read-modify-write), %1 = *pixels
327 "movq %0, %%mm0 \n\t" | |
328 "movq %1, %%mm1 \n\t" | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
329 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) |
8430 | 330 "movq %%mm2, %0 \n\t" |
331 :"+m"(*block) | |
332 :"m"(*pixels) | |
333 :"memory"); | |
334 pixels += line_size; | |
335 block += line_size; | |
336 } | |
337 while (--h); | |
338 } | |
339 | |
/*
 * avg, 16 bytes wide: for each row, combines block with pixels through
 * OP_AVG, first for bytes 0..7 and then for bytes 8..15, writing both halves
 * back to block.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both buffers
 * h         - row count; the do/while body runs before the first test, so h
 *             must be at least 1
 *
 * NOTE(review): the second half is addressed by textually prefixing the
 * memory operand with 8 (8%0, 8%1). That only forms a valid address when the
 * compiler prints the operand as a plain register-indirect reference without
 * a displacement -- confirm this holds for the supported compilers.
 * NOTE(review): MOVQ_BFE and OP_AVG are defined outside this view; OP_AVG
 * presumably averages with the existing destination -- confirm.
 */
340 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
341 { | |
342 MOVQ_BFE(mm6); | |
343 JUMPALIGN(); | |
344 do { | |
345 __asm__ volatile( | |
// bytes 0..7
346 "movq %0, %%mm0 \n\t" | |
347 "movq %1, %%mm1 \n\t" | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
348 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) |
8430 | 349 "movq %%mm2, %0 \n\t" |
// bytes 8..15
350 "movq 8%0, %%mm0 \n\t" | |
351 "movq 8%1, %%mm1 \n\t" | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
352 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) |
8430 | 353 "movq %%mm2, 8%0 \n\t" |
354 :"+m"(*block) | |
355 :"m"(*pixels) | |
356 :"memory"); | |
357 pixels += line_size; | |
358 block += line_size; | |
359 } | |
360 while (--h); | |
361 } | |
362 | |
/*
 * avg, 8 bytes wide, horizontal half-pel: for each row, PAVGB combines the
 * source row read at byte offset 0 with the same row read at byte offset 1
 * (result in mm2); OP_AVG then combines that with the current contents of
 * block and the result is written back to block.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; 9 bytes are read per row; advanced by line_size per row
 * line_size - byte stride used for both buffers
 * h         - row count; the do/while body runs before the first test, so h
 *             must be at least 1
 *
 * NOTE(review): the offset-1 load is written as 1%1, which relies on the
 * operand being printed as a plain register-indirect reference -- confirm.
 * NOTE(review): MOVQ_BFE, PAVGB and OP_AVG are defined outside this view.
 */
363 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
364 { | |
365 MOVQ_BFE(mm6); | |
366 JUMPALIGN(); | |
367 do { | |
368 __asm__ volatile( | |
// %0 = *block (read-modify-write), %1 = *pixels
369 "movq %1, %%mm0 \n\t" | |
370 "movq 1%1, %%mm1 \n\t" | |
371 "movq %0, %%mm3 \n\t" | |
372 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
373 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
8430 | 374 "movq %%mm0, %0 \n\t" |
375 :"+m"(*block) | |
376 :"m"(*pixels) | |
377 :"memory"); | |
378 pixels += line_size; | |
379 block += line_size; | |
380 } while (--h); | |
381 } | |
382 | |
/*
 * avg, 8 bytes wide, from two sources: for each row, PAVGB combines 8 bytes
 * of src1 with 8 bytes of src2 (result in mm2); OP_AVG then combines that
 * with the current contents of dst and the result is written back to dst.
 *
 * dst        - destination, read and written; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; advanced by 8 bytes per row (packed rows)
 * dstStride  - byte stride of dst
 * src1Stride - byte stride of src1
 * h          - row count; the do/while body runs before the first test, so h
 *              must be at least 1
 *
 * NOTE(review): MOVQ_BFE, PAVGB and OP_AVG are defined outside this view.
 */
383 static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
384 { | |
385 MOVQ_BFE(mm6); | |
386 JUMPALIGN(); | |
387 do { | |
388 __asm__ volatile( | |
// %0 = *dst (read-modify-write), %1 = *src1, %2 = *src2
389 "movq %1, %%mm0 \n\t" | |
390 "movq %2, %%mm1 \n\t" | |
391 "movq %0, %%mm3 \n\t" | |
392 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
393 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
8430 | 394 "movq %%mm0, %0 \n\t" |
395 :"+m"(*dst) | |
396 :"m"(*src1), "m"(*src2) | |
397 :"memory"); | |
398 dst += dstStride; | |
399 src1 += src1Stride; | |
400 src2 += 8; | |
401 } while (--h); | |
402 } | |
403 | |
/*
 * avg, 16 bytes wide, horizontal half-pel: for each row and for each 8-byte
 * half, PAVGB combines the source read at offset k with the source read at
 * offset k+1 (k = 0 and k = 8), OP_AVG combines that with the current
 * contents of block, and the result is written back to block.
 *
 * block     - destination, read and written; advanced by line_size per row
 * pixels    - source; 17 bytes are read per row; advanced by line_size per row
 * line_size - byte stride used for both buffers
 * h         - row count; the do/while body runs before the first test, so h
 *             must be at least 1
 *
 * NOTE(review): displacements are written as textual prefixes (1%1, 8%1,
 * 9%1, 8%0), which relies on the operands being printed as plain
 * register-indirect references -- confirm.
 * NOTE(review): MOVQ_BFE, PAVGB and OP_AVG are defined outside this view.
 */
404 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
405 { | |
406 MOVQ_BFE(mm6); | |
407 JUMPALIGN(); | |
408 do { | |
409 __asm__ volatile( | |
// bytes 0..7
410 "movq %1, %%mm0 \n\t" | |
411 "movq 1%1, %%mm1 \n\t" | |
412 "movq %0, %%mm3 \n\t" | |
413 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
414 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
8430 | 415 "movq %%mm0, %0 \n\t" |
// bytes 8..15
416 "movq 8%1, %%mm0 \n\t" | |
417 "movq 9%1, %%mm1 \n\t" | |
418 "movq 8%0, %%mm3 \n\t" | |
419 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
420 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
8430 | 421 "movq %%mm0, 8%0 \n\t" |
422 :"+m"(*block) | |
423 :"m"(*pixels) | |
424 :"memory"); | |
425 pixels += line_size; | |
426 block += line_size; | |
427 } while (--h); | |
428 } | |
429 | |
/*
 * avg, 16 bytes wide, from two sources: for each row and for each 8-byte
 * half, PAVGB combines src1 with src2, OP_AVG combines that with the current
 * contents of dst, and the result is written back to dst.
 *
 * dst        - destination, read and written; advanced by dstStride per row
 * src1       - first source; advanced by src1Stride per row
 * src2       - second source; advanced by 16 bytes per row (packed rows)
 * dstStride  - byte stride of dst
 * src1Stride - byte stride of src1
 * h          - row count; the do/while body runs before the first test, so h
 *              must be at least 1
 *
 * NOTE(review): the second half is addressed with textual prefixes (8%0,
 * 8%1, 8%2), which relies on the operands being printed as plain
 * register-indirect references -- confirm.
 * NOTE(review): MOVQ_BFE, PAVGB and OP_AVG are defined outside this view.
 */
430 static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | |
431 { | |
432 MOVQ_BFE(mm6); | |
433 JUMPALIGN(); | |
434 do { | |
435 __asm__ volatile( | |
// bytes 0..7
436 "movq %1, %%mm0 \n\t" | |
437 "movq %2, %%mm1 \n\t" | |
438 "movq %0, %%mm3 \n\t" | |
439 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
440 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
8430 | 441 "movq %%mm0, %0 \n\t" |
// bytes 8..15
442 "movq 8%1, %%mm0 \n\t" | |
443 "movq 8%2, %%mm1 \n\t" | |
444 "movq 8%0, %%mm3 \n\t" | |
445 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
446 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
8430 | 447 "movq %%mm0, 8%0 \n\t" |
448 :"+m"(*dst) | |
449 :"m"(*src1), "m"(*src2) | |
450 :"memory"); | |
451 dst += dstStride; | |
452 src1 += src1Stride; | |
453 src2 += 16; | |
454 } while (--h); | |
455 } | |
456 | |
/*
 * avg, 8 bytes wide, vertical half-pel: PAVGBP combines each source row with
 * the row directly below it; OP_AVG then combines that with the current
 * contents of block, and the result is written back to block.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a is loaded with 2 * line_size.
 *
 * The bottom source row of each pair is carried over in a register (mm2
 * after the first half of a pass, mm0 after the second half) so that each
 * source row is loaded only once. Rows up to index h are read, i.e. one more
 * source row than the number of rows written.
 *
 * block     - destination, read and written; advanced by 2 * line_size per
 *             row pair
 * pixels    - source; advanced in step with block
 * line_size - byte stride used for both buffers
 * h         - row count; one loop pass handles 4 rows and the loop only ends
 *             when the counter hits exactly zero, so h must be a positive
 *             multiple of 4
 *
 * NOTE(review): MOVQ_BFE, PAVGBP and OP_AVG are defined outside this view.
 */
457 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
458 { | |
459 MOVQ_BFE(mm6); | |
460 __asm__ volatile( | |
461 "lea (%3, %3), %%"REG_a" \n\t" | |
462 "movq (%1), %%mm0 \n\t" | |
463 ASMALIGN(3) | |
464 "1: \n\t" | |
// mm0 = row r, mm1 = row r+1, mm2 = row r+2
465 "movq (%1, %3), %%mm1 \n\t" | |
466 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
467 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |
468 "movq (%2), %%mm3 \n\t" | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
469 OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6) |
8430 | 470 "movq (%2, %3), %%mm3 \n\t" |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
471 OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) |
8430 | 472 "movq %%mm0, (%2) \n\t" |
473 "movq %%mm1, (%2, %3) \n\t" | |
474 "add %%"REG_a", %1 \n\t" | |
475 "add %%"REG_a", %2 \n\t" | |
476 | |
// mm2 = row r+2 (kept), mm1 = row r+3, mm0 = row r+4 (carried to next pass)
477 "movq (%1, %3), %%mm1 \n\t" | |
478 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
479 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |
480 "movq (%2), %%mm3 \n\t" | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
481 OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6) |
8430 | 482 "movq (%2, %3), %%mm3 \n\t" |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
483 OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) |
8430 | 484 "movq %%mm2, (%2) \n\t" |
485 "movq %%mm1, (%2, %3) \n\t" | |
486 "add %%"REG_a", %1 \n\t" | |
487 "add %%"REG_a", %2 \n\t" | |
488 | |
489 "subl $4, %0 \n\t" | |
490 "jnz 1b \n\t" | |
491 :"+g"(h), "+S"(pixels), "+D"(block) | |
492 :"r"((x86_reg)line_size) | |
493 :REG_a, "memory"); | |
494 } | |
495 | |
496 // this routine is 'slightly' suboptimal but mostly unused | |
/*
 * avg, 8 bytes wide, diagonal (x and y) half-pel. For every output byte the
 * code forms, in 16-bit lanes,
 *     (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + mm6) >> 2
 * packs it back to bytes, combines it with the current contents of block via
 * OP_AVG and writes the result back to block.
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 * REG_a is a running byte offset, starting at 0 and growing by line_size per
 * row. pixels is advanced by one line before the loop, so (%1, REG_a) is the
 * source row below the output row at (%2, REG_a).
 *
 * Registers: mm7 is the zero used to widen bytes to words; mm6 is the
 * rounding term installed by SET_RND. Because mm6 is occupied, the constant
 * handed to OP_AVG is rebuilt in mm2 before each use: pcmpeqd sets all bits,
 * and paddb of the register with itself leaves 0xFE in every byte.
 *
 * block     - destination, read and written; not modified as a pointer
 * pixels    - source; rows up to index h are read, 9 bytes each
 * line_size - byte stride used for both buffers
 * h         - row count; one loop pass handles 2 rows and the loop only ends
 *             when the counter hits exactly zero, so h must be a positive
 *             multiple of 2
 *
 * NOTE(review): OP_AVG, SET_RND and MOVQ_ZERO are defined outside this view.
 */
497 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
498 { | |
499 MOVQ_ZERO(mm7); | |
500 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
501 __asm__ volatile( | |
// prologue: mm4/mm5 = p[0][x] + p[0][x+1] as words (low/high 4 pixels)
502 "movq (%1), %%mm0 \n\t" | |
503 "movq 1(%1), %%mm4 \n\t" | |
504 "movq %%mm0, %%mm1 \n\t" | |
505 "movq %%mm4, %%mm5 \n\t" | |
506 "punpcklbw %%mm7, %%mm0 \n\t" | |
507 "punpcklbw %%mm7, %%mm4 \n\t" | |
508 "punpckhbw %%mm7, %%mm1 \n\t" | |
509 "punpckhbw %%mm7, %%mm5 \n\t" | |
510 "paddusw %%mm0, %%mm4 \n\t" | |
511 "paddusw %%mm1, %%mm5 \n\t" | |
512 "xor %%"REG_a", %%"REG_a" \n\t" | |
513 "add %3, %1 \n\t" | |
514 ASMALIGN(3) | |
515 "1: \n\t" | |
// first row of the pass: new pair sums go to mm0/mm1, output built in mm4
516 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
517 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
518 "movq %%mm0, %%mm1 \n\t" | |
519 "movq %%mm2, %%mm3 \n\t" | |
520 "punpcklbw %%mm7, %%mm0 \n\t" | |
521 "punpcklbw %%mm7, %%mm2 \n\t" | |
522 "punpckhbw %%mm7, %%mm1 \n\t" | |
523 "punpckhbw %%mm7, %%mm3 \n\t" | |
524 "paddusw %%mm2, %%mm0 \n\t" | |
525 "paddusw %%mm3, %%mm1 \n\t" | |
526 "paddusw %%mm6, %%mm4 \n\t" | |
527 "paddusw %%mm6, %%mm5 \n\t" | |
528 "paddusw %%mm0, %%mm4 \n\t" | |
529 "paddusw %%mm1, %%mm5 \n\t" | |
530 "psrlw $2, %%mm4 \n\t" | |
531 "psrlw $2, %%mm5 \n\t" | |
532 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
533 "packuswb %%mm5, %%mm4 \n\t" | |
// mm2 = 0xFE in every byte (all ones, then doubled per byte)
534 "pcmpeqd %%mm2, %%mm2 \n\t" | |
535 "paddb %%mm2, %%mm2 \n\t" | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
536 OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2) |
8430 | 537 "movq %%mm5, (%2, %%"REG_a") \n\t" |
538 "add %3, %%"REG_a" \n\t" | |
539 | |
// second row of the pass: same steps with the register roles exchanged
540 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |
541 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |
542 "movq %%mm2, %%mm3 \n\t" | |
543 "movq %%mm4, %%mm5 \n\t" | |
544 "punpcklbw %%mm7, %%mm2 \n\t" | |
545 "punpcklbw %%mm7, %%mm4 \n\t" | |
546 "punpckhbw %%mm7, %%mm3 \n\t" | |
547 "punpckhbw %%mm7, %%mm5 \n\t" | |
548 "paddusw %%mm2, %%mm4 \n\t" | |
549 "paddusw %%mm3, %%mm5 \n\t" | |
550 "paddusw %%mm6, %%mm0 \n\t" | |
551 "paddusw %%mm6, %%mm1 \n\t" | |
552 "paddusw %%mm4, %%mm0 \n\t" | |
553 "paddusw %%mm5, %%mm1 \n\t" | |
554 "psrlw $2, %%mm0 \n\t" | |
555 "psrlw $2, %%mm1 \n\t" | |
556 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
557 "packuswb %%mm1, %%mm0 \n\t" | |
558 "pcmpeqd %%mm2, %%mm2 \n\t" | |
559 "paddb %%mm2, %%mm2 \n\t" | |
9445
41245484dc0b
avg_ pixel functions need to use (dst+pix+1)>>1 to average with existing
conrad
parents:
8629
diff
changeset
|
560 OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2) |
8430 | 561 "movq %%mm1, (%2, %%"REG_a") \n\t" |
562 "add %3, %%"REG_a" \n\t" | |
563 | |
564 "subl $2, %0 \n\t" | |
565 "jnz 1b \n\t" | |
566 :"+g"(h), "+S"(pixels) | |
567 :"D"(block), "r"((x86_reg)line_size) | |
568 :REG_a, "memory"); | |
569 } | |
570 | |
571 //FIXME optimize | |
/*
 * put, 16 bytes wide, vertical half-pel: runs the 8-byte-wide y2 routine
 * twice, once on the left half and once 8 bytes to the right in both
 * buffers, with the same line_size and h.
 */
572 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
573 DEF(put, pixels8_y2)(block , pixels , line_size, h); | |
574 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); | |
575 } | |
576 | |
/*
 * put, 16 bytes wide, diagonal half-pel: runs the 8-byte-wide xy2 routine
 * twice, once on the left half and once 8 bytes to the right in both
 * buffers, with the same line_size and h.
 */
577 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
578 DEF(put, pixels8_xy2)(block , pixels , line_size, h); | |
579 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); | |
580 } | |
581 | |
/*
 * avg, 16 bytes wide, vertical half-pel: runs the 8-byte-wide avg y2 routine
 * twice, once on the left half and once 8 bytes to the right in both
 * buffers, with the same line_size and h.
 */
582 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
583 DEF(avg, pixels8_y2)(block , pixels , line_size, h); | |
584 DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); | |
585 } | |
586 | |
/*
 * avg, 16 bytes wide, diagonal half-pel: runs the 8-byte-wide avg xy2
 * routine twice, once on the left half and once 8 bytes to the right in both
 * buffers, with the same line_size and h.
 */
587 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | |
588 DEF(avg, pixels8_xy2)(block , pixels , line_size, h); | |
589 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); | |
590 } |