8430
|
1 /*
|
|
2 * DSP utils mmx functions are compiled twice for rnd/no_rnd
|
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
|
|
4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
|
|
5 *
|
|
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
|
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
|
|
8 * and improved by Zdenek Kabelac <kabi@users.sf.net>
|
|
9 *
|
|
10 * This file is part of FFmpeg.
|
|
11 *
|
|
12 * FFmpeg is free software; you can redistribute it and/or
|
|
13 * modify it under the terms of the GNU Lesser General Public
|
|
14 * License as published by the Free Software Foundation; either
|
|
15 * version 2.1 of the License, or (at your option) any later version.
|
|
16 *
|
|
17 * FFmpeg is distributed in the hope that it will be useful,
|
|
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
20 * Lesser General Public License for more details.
|
|
21 *
|
|
22 * You should have received a copy of the GNU Lesser General Public
|
|
23 * License along with FFmpeg; if not, write to the Free Software
|
|
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
25 */
|
|
26
|
|
27 // put_pixels
|
|
28 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
29 {
|
|
30 MOVQ_BFE(mm6);
|
|
31 __asm__ volatile(
|
|
32 "lea (%3, %3), %%"REG_a" \n\t"
|
|
33 ASMALIGN(3)
|
|
34 "1: \n\t"
|
|
35 "movq (%1), %%mm0 \n\t"
|
|
36 "movq 1(%1), %%mm1 \n\t"
|
|
37 "movq (%1, %3), %%mm2 \n\t"
|
|
38 "movq 1(%1, %3), %%mm3 \n\t"
|
|
39 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
40 "movq %%mm4, (%2) \n\t"
|
|
41 "movq %%mm5, (%2, %3) \n\t"
|
|
42 "add %%"REG_a", %1 \n\t"
|
|
43 "add %%"REG_a", %2 \n\t"
|
|
44 "movq (%1), %%mm0 \n\t"
|
|
45 "movq 1(%1), %%mm1 \n\t"
|
|
46 "movq (%1, %3), %%mm2 \n\t"
|
|
47 "movq 1(%1, %3), %%mm3 \n\t"
|
|
48 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
49 "movq %%mm4, (%2) \n\t"
|
|
50 "movq %%mm5, (%2, %3) \n\t"
|
|
51 "add %%"REG_a", %1 \n\t"
|
|
52 "add %%"REG_a", %2 \n\t"
|
|
53 "subl $4, %0 \n\t"
|
|
54 "jnz 1b \n\t"
|
|
55 :"+g"(h), "+S"(pixels), "+D"(block)
|
|
56 :"r"((x86_reg)line_size)
|
|
57 :REG_a, "memory");
|
|
58 }
|
|
59
|
|
60 static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
|
61 {
|
|
62 MOVQ_BFE(mm6);
|
|
63 __asm__ volatile(
|
|
64 "testl $1, %0 \n\t"
|
|
65 " jz 1f \n\t"
|
|
66 "movq (%1), %%mm0 \n\t"
|
|
67 "movq (%2), %%mm1 \n\t"
|
|
68 "add %4, %1 \n\t"
|
|
69 "add $8, %2 \n\t"
|
|
70 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
|
|
71 "movq %%mm4, (%3) \n\t"
|
|
72 "add %5, %3 \n\t"
|
|
73 "decl %0 \n\t"
|
|
74 ASMALIGN(3)
|
|
75 "1: \n\t"
|
|
76 "movq (%1), %%mm0 \n\t"
|
|
77 "movq (%2), %%mm1 \n\t"
|
|
78 "add %4, %1 \n\t"
|
|
79 "movq (%1), %%mm2 \n\t"
|
|
80 "movq 8(%2), %%mm3 \n\t"
|
|
81 "add %4, %1 \n\t"
|
|
82 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
83 "movq %%mm4, (%3) \n\t"
|
|
84 "add %5, %3 \n\t"
|
|
85 "movq %%mm5, (%3) \n\t"
|
|
86 "add %5, %3 \n\t"
|
|
87 "movq (%1), %%mm0 \n\t"
|
|
88 "movq 16(%2), %%mm1 \n\t"
|
|
89 "add %4, %1 \n\t"
|
|
90 "movq (%1), %%mm2 \n\t"
|
|
91 "movq 24(%2), %%mm3 \n\t"
|
|
92 "add %4, %1 \n\t"
|
|
93 "add $32, %2 \n\t"
|
|
94 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
95 "movq %%mm4, (%3) \n\t"
|
|
96 "add %5, %3 \n\t"
|
|
97 "movq %%mm5, (%3) \n\t"
|
|
98 "add %5, %3 \n\t"
|
|
99 "subl $4, %0 \n\t"
|
|
100 "jnz 1b \n\t"
|
|
101 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
|
|
102 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
|
103 #else
|
|
104 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
|
105 #endif
|
|
106 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
|
107 :"memory");
|
|
108 }
|
|
109
|
|
110 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
111 {
|
|
112 MOVQ_BFE(mm6);
|
|
113 __asm__ volatile(
|
|
114 "lea (%3, %3), %%"REG_a" \n\t"
|
|
115 ASMALIGN(3)
|
|
116 "1: \n\t"
|
|
117 "movq (%1), %%mm0 \n\t"
|
|
118 "movq 1(%1), %%mm1 \n\t"
|
|
119 "movq (%1, %3), %%mm2 \n\t"
|
|
120 "movq 1(%1, %3), %%mm3 \n\t"
|
|
121 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
122 "movq %%mm4, (%2) \n\t"
|
|
123 "movq %%mm5, (%2, %3) \n\t"
|
|
124 "movq 8(%1), %%mm0 \n\t"
|
|
125 "movq 9(%1), %%mm1 \n\t"
|
|
126 "movq 8(%1, %3), %%mm2 \n\t"
|
|
127 "movq 9(%1, %3), %%mm3 \n\t"
|
|
128 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
129 "movq %%mm4, 8(%2) \n\t"
|
|
130 "movq %%mm5, 8(%2, %3) \n\t"
|
|
131 "add %%"REG_a", %1 \n\t"
|
|
132 "add %%"REG_a", %2 \n\t"
|
|
133 "movq (%1), %%mm0 \n\t"
|
|
134 "movq 1(%1), %%mm1 \n\t"
|
|
135 "movq (%1, %3), %%mm2 \n\t"
|
|
136 "movq 1(%1, %3), %%mm3 \n\t"
|
|
137 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
138 "movq %%mm4, (%2) \n\t"
|
|
139 "movq %%mm5, (%2, %3) \n\t"
|
|
140 "movq 8(%1), %%mm0 \n\t"
|
|
141 "movq 9(%1), %%mm1 \n\t"
|
|
142 "movq 8(%1, %3), %%mm2 \n\t"
|
|
143 "movq 9(%1, %3), %%mm3 \n\t"
|
|
144 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
145 "movq %%mm4, 8(%2) \n\t"
|
|
146 "movq %%mm5, 8(%2, %3) \n\t"
|
|
147 "add %%"REG_a", %1 \n\t"
|
|
148 "add %%"REG_a", %2 \n\t"
|
|
149 "subl $4, %0 \n\t"
|
|
150 "jnz 1b \n\t"
|
|
151 :"+g"(h), "+S"(pixels), "+D"(block)
|
|
152 :"r"((x86_reg)line_size)
|
|
153 :REG_a, "memory");
|
|
154 }
|
|
155
|
|
156 static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
|
157 {
|
|
158 MOVQ_BFE(mm6);
|
|
159 __asm__ volatile(
|
|
160 "testl $1, %0 \n\t"
|
|
161 " jz 1f \n\t"
|
|
162 "movq (%1), %%mm0 \n\t"
|
|
163 "movq (%2), %%mm1 \n\t"
|
|
164 "movq 8(%1), %%mm2 \n\t"
|
|
165 "movq 8(%2), %%mm3 \n\t"
|
|
166 "add %4, %1 \n\t"
|
|
167 "add $16, %2 \n\t"
|
|
168 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
169 "movq %%mm4, (%3) \n\t"
|
|
170 "movq %%mm5, 8(%3) \n\t"
|
|
171 "add %5, %3 \n\t"
|
|
172 "decl %0 \n\t"
|
|
173 ASMALIGN(3)
|
|
174 "1: \n\t"
|
|
175 "movq (%1), %%mm0 \n\t"
|
|
176 "movq (%2), %%mm1 \n\t"
|
|
177 "movq 8(%1), %%mm2 \n\t"
|
|
178 "movq 8(%2), %%mm3 \n\t"
|
|
179 "add %4, %1 \n\t"
|
|
180 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
181 "movq %%mm4, (%3) \n\t"
|
|
182 "movq %%mm5, 8(%3) \n\t"
|
|
183 "add %5, %3 \n\t"
|
|
184 "movq (%1), %%mm0 \n\t"
|
|
185 "movq 16(%2), %%mm1 \n\t"
|
|
186 "movq 8(%1), %%mm2 \n\t"
|
|
187 "movq 24(%2), %%mm3 \n\t"
|
|
188 "add %4, %1 \n\t"
|
|
189 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
|
190 "movq %%mm4, (%3) \n\t"
|
|
191 "movq %%mm5, 8(%3) \n\t"
|
|
192 "add %5, %3 \n\t"
|
|
193 "add $32, %2 \n\t"
|
|
194 "subl $2, %0 \n\t"
|
|
195 "jnz 1b \n\t"
|
|
196 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
|
|
197 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
|
198 #else
|
|
199 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
|
200 #endif
|
|
201 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
|
202 :"memory");
|
|
203 }
|
|
204
|
|
205 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
206 {
|
|
207 MOVQ_BFE(mm6);
|
|
208 __asm__ volatile(
|
|
209 "lea (%3, %3), %%"REG_a" \n\t"
|
|
210 "movq (%1), %%mm0 \n\t"
|
|
211 ASMALIGN(3)
|
|
212 "1: \n\t"
|
|
213 "movq (%1, %3), %%mm1 \n\t"
|
|
214 "movq (%1, %%"REG_a"),%%mm2 \n\t"
|
|
215 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
|
216 "movq %%mm4, (%2) \n\t"
|
|
217 "movq %%mm5, (%2, %3) \n\t"
|
|
218 "add %%"REG_a", %1 \n\t"
|
|
219 "add %%"REG_a", %2 \n\t"
|
|
220 "movq (%1, %3), %%mm1 \n\t"
|
|
221 "movq (%1, %%"REG_a"),%%mm0 \n\t"
|
|
222 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
|
223 "movq %%mm4, (%2) \n\t"
|
|
224 "movq %%mm5, (%2, %3) \n\t"
|
|
225 "add %%"REG_a", %1 \n\t"
|
|
226 "add %%"REG_a", %2 \n\t"
|
|
227 "subl $4, %0 \n\t"
|
|
228 "jnz 1b \n\t"
|
|
229 :"+g"(h), "+S"(pixels), "+D"(block)
|
|
230 :"r"((x86_reg)line_size)
|
|
231 :REG_a, "memory");
|
|
232 }
|
|
233
|
|
234 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
235 {
|
|
236 MOVQ_ZERO(mm7);
|
|
237 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
|
238 __asm__ volatile(
|
|
239 "movq (%1), %%mm0 \n\t"
|
|
240 "movq 1(%1), %%mm4 \n\t"
|
|
241 "movq %%mm0, %%mm1 \n\t"
|
|
242 "movq %%mm4, %%mm5 \n\t"
|
|
243 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
244 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
245 "punpckhbw %%mm7, %%mm1 \n\t"
|
|
246 "punpckhbw %%mm7, %%mm5 \n\t"
|
|
247 "paddusw %%mm0, %%mm4 \n\t"
|
|
248 "paddusw %%mm1, %%mm5 \n\t"
|
|
249 "xor %%"REG_a", %%"REG_a" \n\t"
|
|
250 "add %3, %1 \n\t"
|
|
251 ASMALIGN(3)
|
|
252 "1: \n\t"
|
|
253 "movq (%1, %%"REG_a"), %%mm0 \n\t"
|
|
254 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
|
255 "movq %%mm0, %%mm1 \n\t"
|
|
256 "movq %%mm2, %%mm3 \n\t"
|
|
257 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
258 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
259 "punpckhbw %%mm7, %%mm1 \n\t"
|
|
260 "punpckhbw %%mm7, %%mm3 \n\t"
|
|
261 "paddusw %%mm2, %%mm0 \n\t"
|
|
262 "paddusw %%mm3, %%mm1 \n\t"
|
|
263 "paddusw %%mm6, %%mm4 \n\t"
|
|
264 "paddusw %%mm6, %%mm5 \n\t"
|
|
265 "paddusw %%mm0, %%mm4 \n\t"
|
|
266 "paddusw %%mm1, %%mm5 \n\t"
|
|
267 "psrlw $2, %%mm4 \n\t"
|
|
268 "psrlw $2, %%mm5 \n\t"
|
|
269 "packuswb %%mm5, %%mm4 \n\t"
|
|
270 "movq %%mm4, (%2, %%"REG_a") \n\t"
|
|
271 "add %3, %%"REG_a" \n\t"
|
|
272
|
|
273 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
|
274 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
|
275 "movq %%mm2, %%mm3 \n\t"
|
|
276 "movq %%mm4, %%mm5 \n\t"
|
|
277 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
278 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
279 "punpckhbw %%mm7, %%mm3 \n\t"
|
|
280 "punpckhbw %%mm7, %%mm5 \n\t"
|
|
281 "paddusw %%mm2, %%mm4 \n\t"
|
|
282 "paddusw %%mm3, %%mm5 \n\t"
|
|
283 "paddusw %%mm6, %%mm0 \n\t"
|
|
284 "paddusw %%mm6, %%mm1 \n\t"
|
|
285 "paddusw %%mm4, %%mm0 \n\t"
|
|
286 "paddusw %%mm5, %%mm1 \n\t"
|
|
287 "psrlw $2, %%mm0 \n\t"
|
|
288 "psrlw $2, %%mm1 \n\t"
|
|
289 "packuswb %%mm1, %%mm0 \n\t"
|
|
290 "movq %%mm0, (%2, %%"REG_a") \n\t"
|
|
291 "add %3, %%"REG_a" \n\t"
|
|
292
|
|
293 "subl $2, %0 \n\t"
|
|
294 "jnz 1b \n\t"
|
|
295 :"+g"(h), "+S"(pixels)
|
|
296 :"D"(block), "r"((x86_reg)line_size)
|
|
297 :REG_a, "memory");
|
|
298 }
|
|
299
|
|
300 // avg_pixels
|
|
301 static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
302 {
|
|
303 MOVQ_BFE(mm6);
|
|
304 JUMPALIGN();
|
|
305 do {
|
|
306 __asm__ volatile(
|
|
307 "movd %0, %%mm0 \n\t"
|
|
308 "movd %1, %%mm1 \n\t"
|
|
309 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
310 "movd %%mm2, %0 \n\t"
|
|
311 :"+m"(*block)
|
|
312 :"m"(*pixels)
|
|
313 :"memory");
|
|
314 pixels += line_size;
|
|
315 block += line_size;
|
|
316 }
|
|
317 while (--h);
|
|
318 }
|
|
319
|
|
// in case more speed is needed - unrolling would certainly help
|
|
321 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
322 {
|
|
323 MOVQ_BFE(mm6);
|
|
324 JUMPALIGN();
|
|
325 do {
|
|
326 __asm__ volatile(
|
|
327 "movq %0, %%mm0 \n\t"
|
|
328 "movq %1, %%mm1 \n\t"
|
|
329 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
330 "movq %%mm2, %0 \n\t"
|
|
331 :"+m"(*block)
|
|
332 :"m"(*pixels)
|
|
333 :"memory");
|
|
334 pixels += line_size;
|
|
335 block += line_size;
|
|
336 }
|
|
337 while (--h);
|
|
338 }
|
|
339
|
|
340 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
341 {
|
|
342 MOVQ_BFE(mm6);
|
|
343 JUMPALIGN();
|
|
344 do {
|
|
345 __asm__ volatile(
|
|
346 "movq %0, %%mm0 \n\t"
|
|
347 "movq %1, %%mm1 \n\t"
|
|
348 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
349 "movq %%mm2, %0 \n\t"
|
|
350 "movq 8%0, %%mm0 \n\t"
|
|
351 "movq 8%1, %%mm1 \n\t"
|
|
352 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
353 "movq %%mm2, 8%0 \n\t"
|
|
354 :"+m"(*block)
|
|
355 :"m"(*pixels)
|
|
356 :"memory");
|
|
357 pixels += line_size;
|
|
358 block += line_size;
|
|
359 }
|
|
360 while (--h);
|
|
361 }
|
|
362
|
|
363 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
364 {
|
|
365 MOVQ_BFE(mm6);
|
|
366 JUMPALIGN();
|
|
367 do {
|
|
368 __asm__ volatile(
|
|
369 "movq %1, %%mm0 \n\t"
|
|
370 "movq 1%1, %%mm1 \n\t"
|
|
371 "movq %0, %%mm3 \n\t"
|
|
372 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
373 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
374 "movq %%mm0, %0 \n\t"
|
|
375 :"+m"(*block)
|
|
376 :"m"(*pixels)
|
|
377 :"memory");
|
|
378 pixels += line_size;
|
|
379 block += line_size;
|
|
380 } while (--h);
|
|
381 }
|
|
382
|
|
383 static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
|
384 {
|
|
385 MOVQ_BFE(mm6);
|
|
386 JUMPALIGN();
|
|
387 do {
|
|
388 __asm__ volatile(
|
|
389 "movq %1, %%mm0 \n\t"
|
|
390 "movq %2, %%mm1 \n\t"
|
|
391 "movq %0, %%mm3 \n\t"
|
|
392 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
393 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
394 "movq %%mm0, %0 \n\t"
|
|
395 :"+m"(*dst)
|
|
396 :"m"(*src1), "m"(*src2)
|
|
397 :"memory");
|
|
398 dst += dstStride;
|
|
399 src1 += src1Stride;
|
|
400 src2 += 8;
|
|
401 } while (--h);
|
|
402 }
|
|
403
|
|
404 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
405 {
|
|
406 MOVQ_BFE(mm6);
|
|
407 JUMPALIGN();
|
|
408 do {
|
|
409 __asm__ volatile(
|
|
410 "movq %1, %%mm0 \n\t"
|
|
411 "movq 1%1, %%mm1 \n\t"
|
|
412 "movq %0, %%mm3 \n\t"
|
|
413 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
414 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
415 "movq %%mm0, %0 \n\t"
|
|
416 "movq 8%1, %%mm0 \n\t"
|
|
417 "movq 9%1, %%mm1 \n\t"
|
|
418 "movq 8%0, %%mm3 \n\t"
|
|
419 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
420 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
421 "movq %%mm0, 8%0 \n\t"
|
|
422 :"+m"(*block)
|
|
423 :"m"(*pixels)
|
|
424 :"memory");
|
|
425 pixels += line_size;
|
|
426 block += line_size;
|
|
427 } while (--h);
|
|
428 }
|
|
429
|
|
430 static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
|
431 {
|
|
432 MOVQ_BFE(mm6);
|
|
433 JUMPALIGN();
|
|
434 do {
|
|
435 __asm__ volatile(
|
|
436 "movq %1, %%mm0 \n\t"
|
|
437 "movq %2, %%mm1 \n\t"
|
|
438 "movq %0, %%mm3 \n\t"
|
|
439 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
440 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
441 "movq %%mm0, %0 \n\t"
|
|
442 "movq 8%1, %%mm0 \n\t"
|
|
443 "movq 8%2, %%mm1 \n\t"
|
|
444 "movq 8%0, %%mm3 \n\t"
|
|
445 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
|
446 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
|
|
447 "movq %%mm0, 8%0 \n\t"
|
|
448 :"+m"(*dst)
|
|
449 :"m"(*src1), "m"(*src2)
|
|
450 :"memory");
|
|
451 dst += dstStride;
|
|
452 src1 += src1Stride;
|
|
453 src2 += 16;
|
|
454 } while (--h);
|
|
455 }
|
|
456
|
|
457 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
458 {
|
|
459 MOVQ_BFE(mm6);
|
|
460 __asm__ volatile(
|
|
461 "lea (%3, %3), %%"REG_a" \n\t"
|
|
462 "movq (%1), %%mm0 \n\t"
|
|
463 ASMALIGN(3)
|
|
464 "1: \n\t"
|
|
465 "movq (%1, %3), %%mm1 \n\t"
|
|
466 "movq (%1, %%"REG_a"), %%mm2 \n\t"
|
|
467 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
|
468 "movq (%2), %%mm3 \n\t"
|
|
469 PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
|
|
470 "movq (%2, %3), %%mm3 \n\t"
|
|
471 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
|
|
472 "movq %%mm0, (%2) \n\t"
|
|
473 "movq %%mm1, (%2, %3) \n\t"
|
|
474 "add %%"REG_a", %1 \n\t"
|
|
475 "add %%"REG_a", %2 \n\t"
|
|
476
|
|
477 "movq (%1, %3), %%mm1 \n\t"
|
|
478 "movq (%1, %%"REG_a"), %%mm0 \n\t"
|
|
479 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
|
480 "movq (%2), %%mm3 \n\t"
|
|
481 PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
|
|
482 "movq (%2, %3), %%mm3 \n\t"
|
|
483 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
|
|
484 "movq %%mm2, (%2) \n\t"
|
|
485 "movq %%mm1, (%2, %3) \n\t"
|
|
486 "add %%"REG_a", %1 \n\t"
|
|
487 "add %%"REG_a", %2 \n\t"
|
|
488
|
|
489 "subl $4, %0 \n\t"
|
|
490 "jnz 1b \n\t"
|
|
491 :"+g"(h), "+S"(pixels), "+D"(block)
|
|
492 :"r"((x86_reg)line_size)
|
|
493 :REG_a, "memory");
|
|
494 }
|
|
495
|
|
496 // this routine is 'slightly' suboptimal but mostly unused
|
|
497 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
498 {
|
|
499 MOVQ_ZERO(mm7);
|
|
500 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
|
501 __asm__ volatile(
|
|
502 "movq (%1), %%mm0 \n\t"
|
|
503 "movq 1(%1), %%mm4 \n\t"
|
|
504 "movq %%mm0, %%mm1 \n\t"
|
|
505 "movq %%mm4, %%mm5 \n\t"
|
|
506 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
507 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
508 "punpckhbw %%mm7, %%mm1 \n\t"
|
|
509 "punpckhbw %%mm7, %%mm5 \n\t"
|
|
510 "paddusw %%mm0, %%mm4 \n\t"
|
|
511 "paddusw %%mm1, %%mm5 \n\t"
|
|
512 "xor %%"REG_a", %%"REG_a" \n\t"
|
|
513 "add %3, %1 \n\t"
|
|
514 ASMALIGN(3)
|
|
515 "1: \n\t"
|
|
516 "movq (%1, %%"REG_a"), %%mm0 \n\t"
|
|
517 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
|
518 "movq %%mm0, %%mm1 \n\t"
|
|
519 "movq %%mm2, %%mm3 \n\t"
|
|
520 "punpcklbw %%mm7, %%mm0 \n\t"
|
|
521 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
522 "punpckhbw %%mm7, %%mm1 \n\t"
|
|
523 "punpckhbw %%mm7, %%mm3 \n\t"
|
|
524 "paddusw %%mm2, %%mm0 \n\t"
|
|
525 "paddusw %%mm3, %%mm1 \n\t"
|
|
526 "paddusw %%mm6, %%mm4 \n\t"
|
|
527 "paddusw %%mm6, %%mm5 \n\t"
|
|
528 "paddusw %%mm0, %%mm4 \n\t"
|
|
529 "paddusw %%mm1, %%mm5 \n\t"
|
|
530 "psrlw $2, %%mm4 \n\t"
|
|
531 "psrlw $2, %%mm5 \n\t"
|
|
532 "movq (%2, %%"REG_a"), %%mm3 \n\t"
|
|
533 "packuswb %%mm5, %%mm4 \n\t"
|
|
534 "pcmpeqd %%mm2, %%mm2 \n\t"
|
|
535 "paddb %%mm2, %%mm2 \n\t"
|
|
536 PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
|
|
537 "movq %%mm5, (%2, %%"REG_a") \n\t"
|
|
538 "add %3, %%"REG_a" \n\t"
|
|
539
|
|
540 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
|
541 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
|
542 "movq %%mm2, %%mm3 \n\t"
|
|
543 "movq %%mm4, %%mm5 \n\t"
|
|
544 "punpcklbw %%mm7, %%mm2 \n\t"
|
|
545 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
546 "punpckhbw %%mm7, %%mm3 \n\t"
|
|
547 "punpckhbw %%mm7, %%mm5 \n\t"
|
|
548 "paddusw %%mm2, %%mm4 \n\t"
|
|
549 "paddusw %%mm3, %%mm5 \n\t"
|
|
550 "paddusw %%mm6, %%mm0 \n\t"
|
|
551 "paddusw %%mm6, %%mm1 \n\t"
|
|
552 "paddusw %%mm4, %%mm0 \n\t"
|
|
553 "paddusw %%mm5, %%mm1 \n\t"
|
|
554 "psrlw $2, %%mm0 \n\t"
|
|
555 "psrlw $2, %%mm1 \n\t"
|
|
556 "movq (%2, %%"REG_a"), %%mm3 \n\t"
|
|
557 "packuswb %%mm1, %%mm0 \n\t"
|
|
558 "pcmpeqd %%mm2, %%mm2 \n\t"
|
|
559 "paddb %%mm2, %%mm2 \n\t"
|
|
560 PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
|
|
561 "movq %%mm1, (%2, %%"REG_a") \n\t"
|
|
562 "add %3, %%"REG_a" \n\t"
|
|
563
|
|
564 "subl $2, %0 \n\t"
|
|
565 "jnz 1b \n\t"
|
|
566 :"+g"(h), "+S"(pixels)
|
|
567 :"D"(block), "r"((x86_reg)line_size)
|
|
568 :REG_a, "memory");
|
|
569 }
|
|
570
|
|
571 //FIXME optimize
|
|
572 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
|
|
573 DEF(put, pixels8_y2)(block , pixels , line_size, h);
|
|
574 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
|
|
575 }
|
|
576
|
|
577 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
|
|
578 DEF(put, pixels8_xy2)(block , pixels , line_size, h);
|
|
579 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
|
580 }
|
|
581
|
|
582 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
|
|
583 DEF(avg, pixels8_y2)(block , pixels , line_size, h);
|
|
584 DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
|
|
585 }
|
|
586
|
|
587 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
|
|
588 DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
|
|
589 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
|
590 }
|