comparison libmpeg2/motion_comp_altivec.c @ 9857:89b48bc6c441

Importing libmpeg2 from mpeg2dec-0.3.1
author arpi
date Sun, 06 Apr 2003 16:41:49 +0000
parents
children f0e14d641160
comparison
equal deleted inserted replaced
9856:08496327b7ec 9857:89b48bc6c441
1 /*
2 * motion_comp_altivec.c
3 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
5 *
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
7 * See http://libmpeg2.sourceforge.net/ for updates.
8 *
9 * mpeg2dec is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * mpeg2dec is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24 #ifndef __ALTIVEC__
25
26 #include "config.h"
27
28 #ifdef ARCH_PPC
29
30 #include <inttypes.h>
31
32 #include "mpeg2.h"
33 #include "mpeg2_internal.h"
34
35 /*
36 * The asm code is generated with:
37 *
38 * gcc-2.95 -fvec -D__ALTIVEC__ -O9 -fomit-frame-pointer -mregnames -S
39 * motion_comp_altivec.c
40 *
41 * sed 's/.L/._L/g' motion_comp_altivec.s |
42 * awk '{args=""; len=split ($2, arg, ",");
43 * for (i=1; i<=len; i++) { a=arg[i]; if (i<len) a=a",";
44 * args = args sprintf ("%-6s", a) }
45 * printf ("\t\"\t%-16s%-24s\\n\"\n", $1, args) }' |
46 * unexpand -a
47 */
48
/* MC_put_o_16: copy a 16-pixel-wide, 'height'-row block from ref to dest
 * with no half-pel interpolation; handles a misaligned source via
 * lvsl/vperm realignment and writes two rows per loop iteration.
 * Arguments arrive per the PPC SVR4 ABI: r3 = dest, r4 = ref,
 * r5 = stride, r6 = height.
 * NOTE(review): machine-generated asm (see recipe comment at top of file);
 * it declares no inputs/outputs/clobbers, so r0/r9/ctr and v0-v13 are
 * clobbered silently -- only safe with the exact compiler setup used to
 * generate it.  Assumes height is even and >= 4 (verify against callers).
 */
static void MC_put_o_16_altivec (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    asm (" \n"
	" srawi %r6, %r6, 1 \n"
	" li %r9, 15 \n"
	" addi %r6, %r6, -1 \n"
	" lvsl %v12, 0, %r4 \n"
	" mtctr %r6 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v0, %r9, %r4 \n"
	" add %r0, %r5, %r5 \n"
	" vperm %v13, %v1, %v0, %v12 \n"
	" add %r4, %r4, %r5 \n"
	"._L6: \n"
	" li %r9, 15 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v0, %r9, %r4 \n"
	" stvx %v13, 0, %r3 \n"
	" vperm %v13, %v1, %v0, %v12 \n"
	" add %r4, %r4, %r5 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v0, %r9, %r4 \n"
	" stvx %v13, %r5, %r3 \n"
	" vperm %v13, %v1, %v0, %v12 \n"
	" add %r4, %r4, %r5 \n"
	" add %r3, %r3, %r0 \n"
	" bdnz ._L6 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v1, 0, %r4 \n"
	" stvx %v13, 0, %r3 \n"
	" vperm %v13, %v1, %v0, %v12 \n"
	" stvx %v13, %r5, %r3 \n"
	);
}
84
/* MC_put_o_8: copy an 8-pixel-wide block, no interpolation.  Each row is
 * written as two 4-byte stvewx element stores; separate vperm masks are
 * built (vmrghb + vpkuhum) for even rows (alignment of ref) and odd rows
 * (alignment of ref + stride).  Args per PPC ABI: r3 = dest, r4 = ref,
 * r5 = stride, r6 = height.
 * NOTE(review): generated asm with no clobber list declared -- do not
 * recompile under a different setup without verifying.
 */
static void MC_put_o_8_altivec (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    asm (" \n"
	" lvsl %v12, 0, %r4 \n"
	" lvsl %v1, %r5, %r4 \n"
	" vmrghb %v12, %v12, %v12 \n"
	" srawi %r6, %r6, 1 \n"
	" li %r9, 7 \n"
	" vmrghb %v1, %v1, %v1 \n"
	" addi %r6, %r6, -1 \n"
	" vpkuhum %v10, %v12, %v12 \n"
	" lvx %v13, 0, %r4 \n"
	" mtctr %r6 \n"
	" vpkuhum %v11, %v1, %v1 \n"
	" lvx %v0, %r9, %r4 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v12, %v13, %v0, %v10 \n"
	"._L11: \n"
	" li %r9, 7 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" stvewx %v12, 0, %r3 \n"
	" li %r9, 4 \n"
	" vperm %v1, %v13, %v0, %v11 \n"
	" stvewx %v12, %r9, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" li %r9, 7 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" add %r3, %r3, %r5 \n"
	" stvewx %v1, 0, %r3 \n"
	" vperm %v12, %v13, %v0, %v10 \n"
	" li %r9, 4 \n"
	" stvewx %v1, %r9, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" add %r3, %r3, %r5 \n"
	" bdnz ._L11 \n"
	" li %r9, 7 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" stvewx %v12, 0, %r3 \n"
	" li %r9, 4 \n"
	" vperm %v1, %v13, %v0, %v11 \n"
	" stvewx %v12, %r9, %r3 \n"
	" add %r3, %r3, %r5 \n"
	" stvewx %v1, 0, %r3 \n"
	" stvewx %v1, %r9, %r3 \n"
	);
}
135
/* MC_put_x_16: horizontal half-pel copy, 16 wide.  Each output byte is the
 * rounded average (vavgub) of src[x] and src[x+1]; the two shifted source
 * vectors come from one load pair permuted with lvsl masks differing by 1
 * (vaddubm with splat(1)).  Args per PPC ABI: r3 = dest, r4 = ref,
 * r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared.
 */
static void MC_put_x_16_altivec (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    asm (" \n"
	" lvsl %v11, 0, %r4 \n"
	" vspltisb %v0, 1 \n"
	" li %r9, 16 \n"
	" lvx %v12, 0, %r4 \n"
	" vaddubm %v10, %v11, %v0 \n"
	" lvx %v13, %r9, %r4 \n"
	" srawi %r6, %r6, 1 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v1, %v12, %v13, %v10 \n"
	" vperm %v0, %v12, %v13, %v11 \n"
	" mtctr %r6 \n"
	" add %r0, %r5, %r5 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v0, %v1 \n"
	"._L16: \n"
	" li %r9, 16 \n"
	" lvx %v12, 0, %r4 \n"
	" lvx %v13, %r9, %r4 \n"
	" stvx %v0, 0, %r3 \n"
	" vperm %v1, %v12, %v13, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v0, %v12, %v13, %v11 \n"
	" lvx %v12, 0, %r4 \n"
	" lvx %v13, %r9, %r4 \n"
	" vavgub %v0, %v0, %v1 \n"
	" stvx %v0, %r5, %r3 \n"
	" vperm %v1, %v12, %v13, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v0, %v12, %v13, %v11 \n"
	" add %r3, %r3, %r0 \n"
	" vavgub %v0, %v0, %v1 \n"
	" bdnz ._L16 \n"
	" lvx %v13, %r9, %r4 \n"
	" lvx %v12, 0, %r4 \n"
	" stvx %v0, 0, %r3 \n"
	" vperm %v1, %v12, %v13, %v10 \n"
	" vperm %v0, %v12, %v13, %v11 \n"
	" vavgub %v0, %v0, %v1 \n"
	" stvx %v0, %r5, %r3 \n"
	);
}
181
/* MC_put_x_8: horizontal half-pel copy, 8 wide.  Rounded average (vavgub)
 * of src[x] and src[x+1]; 8-byte permute masks are built per row parity
 * (vmrghb + vpkuhum on lvsl results), with the "+1" masks derived via
 * vaddubm with splat(1).  Rows are stored as two stvewx word stores.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared.
 */
static void MC_put_x_8_altivec (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    asm (" \n"
	" lvsl %v0, 0, %r4 \n"
	" vspltisb %v13, 1 \n"
	" lvsl %v10, %r5, %r4 \n"
	" vmrghb %v0, %v0, %v0 \n"
	" li %r9, 8 \n"
	" lvx %v11, 0, %r4 \n"
	" vmrghb %v10, %v10, %v10 \n"
	" vpkuhum %v8, %v0, %v0 \n"
	" lvx %v12, %r9, %r4 \n"
	" srawi %r6, %r6, 1 \n"
	" vpkuhum %v9, %v10, %v10 \n"
	" vaddubm %v7, %v8, %v13 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v1, %v11, %v12, %v8 \n"
	" mtctr %r6 \n"
	" vaddubm %v13, %v9, %v13 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v0, %v11, %v12, %v7 \n"
	" vavgub %v0, %v1, %v0 \n"
	"._L21: \n"
	" li %r9, 8 \n"
	" lvx %v12, %r9, %r4 \n"
	" lvx %v11, 0, %r4 \n"
	" stvewx %v0, 0, %r3 \n"
	" li %r9, 4 \n"
	" vperm %v1, %v11, %v12, %v13 \n"
	" stvewx %v0, %r9, %r3 \n"
	" vperm %v0, %v11, %v12, %v9 \n"
	" add %r4, %r4, %r5 \n"
	" li %r9, 8 \n"
	" lvx %v12, %r9, %r4 \n"
	" vavgub %v10, %v0, %v1 \n"
	" lvx %v11, 0, %r4 \n"
	" add %r3, %r3, %r5 \n"
	" stvewx %v10, 0, %r3 \n"
	" vperm %v1, %v11, %v12, %v7 \n"
	" vperm %v0, %v11, %v12, %v8 \n"
	" li %r9, 4 \n"
	" stvewx %v10, %r9, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v0, %v1 \n"
	" add %r3, %r3, %r5 \n"
	" bdnz ._L21 \n"
	" li %r9, 8 \n"
	" lvx %v12, %r9, %r4 \n"
	" lvx %v11, 0, %r4 \n"
	" stvewx %v0, 0, %r3 \n"
	" li %r9, 4 \n"
	" vperm %v1, %v11, %v12, %v13 \n"
	" stvewx %v0, %r9, %r3 \n"
	" vperm %v0, %v11, %v12, %v9 \n"
	" add %r3, %r3, %r5 \n"
	" vavgub %v10, %v0, %v1 \n"
	" stvewx %v10, 0, %r3 \n"
	" stvewx %v10, %r9, %r3 \n"
	);
}
243
/* MC_put_y_16: vertical half-pel copy, 16 wide.  Each output row is the
 * rounded average (vavgub) of source rows y and y+1; the previous row is
 * kept in a register so each source row is loaded only once.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared.
 */
static void MC_put_y_16_altivec (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    asm (" \n"
	" li %r9, 15 \n"
	" lvsl %v10, 0, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r9, %r4 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v12, %v13, %v1, %v10 \n"
	" srawi %r6, %r6, 1 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r9, %r4 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v11, %v13, %v1, %v10 \n"
	" mtctr %r6 \n"
	" add %r0, %r5, %r5 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v12, %v11 \n"
	"._L26: \n"
	" li %r9, 15 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r9, %r4 \n"
	" stvx %v0, 0, %r3 \n"
	" vperm %v12, %v13, %v1, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r9, %r4 \n"
	" vavgub %v0, %v12, %v11 \n"
	" stvx %v0, %r5, %r3 \n"
	" vperm %v11, %v13, %v1, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" add %r3, %r3, %r0 \n"
	" vavgub %v0, %v12, %v11 \n"
	" bdnz ._L26 \n"
	" lvx %v1, %r9, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" stvx %v0, 0, %r3 \n"
	" vperm %v12, %v13, %v1, %v10 \n"
	" vavgub %v0, %v12, %v11 \n"
	" stvx %v0, %r5, %r3 \n"
	);
}
287
/* MC_put_y_8: vertical half-pel copy, 8 wide.  Rounded average (vavgub) of
 * rows y and y+1; separate 8-byte permute masks per row parity (built via
 * vmrghb + vpkuhum), rows stored as two stvewx word stores.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared.
 */
static void MC_put_y_8_altivec (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    asm (" \n"
	" lvsl %v13, 0, %r4 \n"
	" lvsl %v11, %r5, %r4 \n"
	" vmrghb %v13, %v13, %v13 \n"
	" li %r9, 7 \n"
	" lvx %v12, 0, %r4 \n"
	" vmrghb %v11, %v11, %v11 \n"
	" lvx %v1, %r9, %r4 \n"
	" vpkuhum %v9, %v13, %v13 \n"
	" add %r4, %r4, %r5 \n"
	" vpkuhum %v10, %v11, %v11 \n"
	" vperm %v13, %v12, %v1, %v9 \n"
	" srawi %r6, %r6, 1 \n"
	" lvx %v12, 0, %r4 \n"
	" lvx %v1, %r9, %r4 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v11, %v12, %v1, %v10 \n"
	" mtctr %r6 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v13, %v11 \n"
	"._L31: \n"
	" li %r9, 7 \n"
	" lvx %v1, %r9, %r4 \n"
	" lvx %v12, 0, %r4 \n"
	" stvewx %v0, 0, %r3 \n"
	" li %r9, 4 \n"
	" vperm %v13, %v12, %v1, %v9 \n"
	" stvewx %v0, %r9, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v13, %v11 \n"
	" li %r9, 7 \n"
	" lvx %v1, %r9, %r4 \n"
	" lvx %v12, 0, %r4 \n"
	" add %r3, %r3, %r5 \n"
	" stvewx %v0, 0, %r3 \n"
	" vperm %v11, %v12, %v1, %v10 \n"
	" li %r9, 4 \n"
	" stvewx %v0, %r9, %r3 \n"
	" vavgub %v0, %v13, %v11 \n"
	" add %r4, %r4, %r5 \n"
	" add %r3, %r3, %r5 \n"
	" bdnz ._L31 \n"
	" li %r9, 7 \n"
	" lvx %v1, %r9, %r4 \n"
	" lvx %v12, 0, %r4 \n"
	" stvewx %v0, 0, %r3 \n"
	" li %r9, 4 \n"
	" vperm %v13, %v12, %v1, %v9 \n"
	" stvewx %v0, %r9, %r3 \n"
	" add %r3, %r3, %r5 \n"
	" vavgub %v0, %v13, %v11 \n"
	" stvewx %v0, 0, %r3 \n"
	" stvewx %v0, %r9, %r3 \n"
	);
}
346
/* MC_put_xy_16: two-dimensional half-pel copy, 16 wide.  Output is the
 * average of the 2x2 neighbourhood; computed as avg(avg(top pair),
 * avg(bottom pair)) with a vxor/vor/vand correction term (splat(1) masked
 * by the carry bits) subtracted so the result appears to match the exact
 * (a+b+c+d+2)>>2 rounding -- NOTE(review): verify the correction trick
 * against the C reference implementation.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * Generated asm, no operand/clobber list declared.
 */
static void MC_put_xy_16_altivec (uint8_t * dest, const uint8_t * ref,
				  int stride, int height)
{
    asm (" \n"
	" lvsl %v5, 0, %r4 \n"
	" vspltisb %v3, 1 \n"
	" li %r9, 16 \n"
	" lvx %v1, 0, %r4 \n"
	" vaddubm %v4, %v5, %v3 \n"
	" lvx %v0, %r9, %r4 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v10, %v1, %v0, %v4 \n"
	" srawi %r6, %r6, 1 \n"
	" vperm %v11, %v1, %v0, %v5 \n"
	" addi %r6, %r6, -1 \n"
	" lvx %v1, 0, %r4 \n"
	" mtctr %r6 \n"
	" lvx %v0, %r9, %r4 \n"
	" vavgub %v9, %v11, %v10 \n"
	" vxor %v8, %v11, %v10 \n"
	" add %r0, %r5, %r5 \n"
	" vperm %v10, %v1, %v0, %v4 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v11, %v1, %v0, %v5 \n"
	" vxor %v6, %v11, %v10 \n"
	" vavgub %v7, %v11, %v10 \n"
	" vor %v0, %v8, %v6 \n"
	" vxor %v13, %v9, %v7 \n"
	" vand %v0, %v3, %v0 \n"
	" vavgub %v1, %v9, %v7 \n"
	" vand %v0, %v0, %v13 \n"
	" vsububm %v13, %v1, %v0 \n"
	"._L36: \n"
	" li %r9, 16 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v0, %r9, %r4 \n"
	" stvx %v13, 0, %r3 \n"
	" vperm %v10, %v1, %v0, %v4 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v11, %v1, %v0, %v5 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v0, %r9, %r4 \n"
	" vavgub %v9, %v11, %v10 \n"
	" vxor %v8, %v11, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v10, %v1, %v0, %v4 \n"
	" vavgub %v12, %v9, %v7 \n"
	" vperm %v11, %v1, %v0, %v5 \n"
	" vor %v13, %v8, %v6 \n"
	" vxor %v0, %v9, %v7 \n"
	" vxor %v6, %v11, %v10 \n"
	" vand %v13, %v3, %v13 \n"
	" vavgub %v7, %v11, %v10 \n"
	" vor %v1, %v8, %v6 \n"
	" vand %v13, %v13, %v0 \n"
	" vxor %v0, %v9, %v7 \n"
	" vand %v1, %v3, %v1 \n"
	" vsububm %v13, %v12, %v13 \n"
	" vand %v1, %v1, %v0 \n"
	" stvx %v13, %r5, %r3 \n"
	" vavgub %v0, %v9, %v7 \n"
	" add %r3, %r3, %r0 \n"
	" vsububm %v13, %v0, %v1 \n"
	" bdnz ._L36 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v1, 0, %r4 \n"
	" stvx %v13, 0, %r3 \n"
	" vperm %v10, %v1, %v0, %v4 \n"
	" vperm %v11, %v1, %v0, %v5 \n"
	" vxor %v8, %v11, %v10 \n"
	" vavgub %v9, %v11, %v10 \n"
	" vor %v0, %v8, %v6 \n"
	" vxor %v13, %v9, %v7 \n"
	" vand %v0, %v3, %v0 \n"
	" vavgub %v1, %v9, %v7 \n"
	" vand %v0, %v0, %v13 \n"
	" vsububm %v13, %v1, %v0 \n"
	" stvx %v13, %r5, %r3 \n"
	);
}
427
/* MC_put_xy_8: two-dimensional half-pel copy, 8 wide.  Same
 * avg-of-averages with vxor/vor/vand rounding correction as the 16-wide
 * version, using 8-byte permute masks per row parity (vmrghb + vpkuhum)
 * and stvewx word stores.  NOTE(review): verify the rounding trick
 * against the C reference implementation.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * Generated asm, no operand/clobber list declared.
 */
static void MC_put_xy_8_altivec (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    asm (" \n"
	" lvsl %v4, 0, %r4 \n"
	" vspltisb %v3, 1 \n"
	" lvsl %v5, %r5, %r4 \n"
	" vmrghb %v4, %v4, %v4 \n"
	" li %r9, 8 \n"
	" vmrghb %v5, %v5, %v5 \n"
	" lvx %v1, 0, %r4 \n"
	" vpkuhum %v4, %v4, %v4 \n"
	" lvx %v0, %r9, %r4 \n"
	" vpkuhum %v5, %v5, %v5 \n"
	" add %r4, %r4, %r5 \n"
	" vaddubm %v2, %v4, %v3 \n"
	" vperm %v11, %v1, %v0, %v4 \n"
	" srawi %r6, %r6, 1 \n"
	" vaddubm %v19, %v5, %v3 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v10, %v1, %v0, %v2 \n"
	" mtctr %r6 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v0, %r9, %r4 \n"
	" vavgub %v9, %v11, %v10 \n"
	" vxor %v8, %v11, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v10, %v1, %v0, %v19 \n"
	" vperm %v11, %v1, %v0, %v5 \n"
	" vxor %v6, %v11, %v10 \n"
	" vavgub %v7, %v11, %v10 \n"
	" vor %v0, %v8, %v6 \n"
	" vxor %v13, %v9, %v7 \n"
	" vand %v0, %v3, %v0 \n"
	" vavgub %v1, %v9, %v7 \n"
	" vand %v0, %v0, %v13 \n"
	" vsububm %v13, %v1, %v0 \n"
	"._L41: \n"
	" li %r9, 8 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v1, 0, %r4 \n"
	" stvewx %v13, 0, %r3 \n"
	" li %r9, 4 \n"
	" vperm %v10, %v1, %v0, %v2 \n"
	" stvewx %v13, %r9, %r3 \n"
	" vperm %v11, %v1, %v0, %v4 \n"
	" add %r4, %r4, %r5 \n"
	" li %r9, 8 \n"
	" vavgub %v9, %v11, %v10 \n"
	" lvx %v0, %r9, %r4 \n"
	" vxor %v8, %v11, %v10 \n"
	" lvx %v1, 0, %r4 \n"
	" vavgub %v12, %v9, %v7 \n"
	" vor %v13, %v8, %v6 \n"
	" add %r3, %r3, %r5 \n"
	" vperm %v10, %v1, %v0, %v19 \n"
	" li %r9, 4 \n"
	" vperm %v11, %v1, %v0, %v5 \n"
	" vand %v13, %v3, %v13 \n"
	" add %r4, %r4, %r5 \n"
	" vxor %v0, %v9, %v7 \n"
	" vxor %v6, %v11, %v10 \n"
	" vavgub %v7, %v11, %v10 \n"
	" vor %v1, %v8, %v6 \n"
	" vand %v13, %v13, %v0 \n"
	" vxor %v0, %v9, %v7 \n"
	" vand %v1, %v3, %v1 \n"
	" vsububm %v13, %v12, %v13 \n"
	" vand %v1, %v1, %v0 \n"
	" stvewx %v13, 0, %r3 \n"
	" vavgub %v0, %v9, %v7 \n"
	" stvewx %v13, %r9, %r3 \n"
	" add %r3, %r3, %r5 \n"
	" vsububm %v13, %v0, %v1 \n"
	" bdnz ._L41 \n"
	" li %r9, 8 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v1, 0, %r4 \n"
	" stvewx %v13, 0, %r3 \n"
	" vperm %v10, %v1, %v0, %v2 \n"
	" li %r9, 4 \n"
	" vperm %v11, %v1, %v0, %v4 \n"
	" stvewx %v13, %r9, %r3 \n"
	" add %r3, %r3, %r5 \n"
	" vxor %v8, %v11, %v10 \n"
	" vavgub %v9, %v11, %v10 \n"
	" vor %v0, %v8, %v6 \n"
	" vxor %v13, %v9, %v7 \n"
	" vand %v0, %v3, %v0 \n"
	" vavgub %v1, %v9, %v7 \n"
	" vand %v0, %v0, %v13 \n"
	" vsububm %v13, %v1, %v0 \n"
	" stvewx %v13, 0, %r3 \n"
	" stvewx %v13, %r9, %r3 \n"
	);
}
524
/* MC_avg_o_16: like MC_put_o_16 but averages into the destination:
 * each output row is vavgub(current dest row, realigned ref row) --
 * the "avg" forms accumulate a second prediction into dest.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared; also
 * clobbers r11 inside the loop and reuses it after bdnz.
 */
static void MC_avg_o_16_altivec (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    asm (" \n"
	" li %r9, 15 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvsl %v11, 0, %r4 \n"
	" lvx %v1, 0, %r4 \n"
	" srawi %r6, %r6, 1 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v0, %v1, %v0, %v11 \n"
	" lvx %v13, 0, %r3 \n"
	" mtctr %r6 \n"
	" add %r9, %r5, %r5 \n"
	" vavgub %v12, %v13, %v0 \n"
	" add %r4, %r4, %r5 \n"
	"._L46: \n"
	" li %r11, 15 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v0, %r11, %r4 \n"
	" lvx %v13, %r5, %r3 \n"
	" vperm %v0, %v1, %v0, %v11 \n"
	" stvx %v12, 0, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v12, %v13, %v0 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v0, %r11, %r4 \n"
	" lvx %v13, %r9, %r3 \n"
	" vperm %v0, %v1, %v0, %v11 \n"
	" stvx %v12, %r5, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v12, %v13, %v0 \n"
	" add %r3, %r3, %r9 \n"
	" bdnz ._L46 \n"
	" lvx %v0, %r11, %r4 \n"
	" lvx %v1, 0, %r4 \n"
	" lvx %v13, %r5, %r3 \n"
	" vperm %v0, %v1, %v0, %v11 \n"
	" stvx %v12, 0, %r3 \n"
	" vavgub %v12, %v13, %v0 \n"
	" stvx %v12, %r5, %r3 \n"
	);
}
568
/* MC_avg_o_8: 8-wide average-into-destination, no interpolation:
 * dest row = vavgub(dest row, realigned ref row).  Per-parity 8-byte
 * permute masks (vmrghb + vpkuhum); two stvewx word stores per row.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared.
 */
static void MC_avg_o_8_altivec (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    asm (" \n"
	" lvsl %v12, 0, %r4 \n"
	" li %r9, 7 \n"
	" vmrghb %v12, %v12, %v12 \n"
	" lvsl %v1, %r5, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" vpkuhum %v9, %v12, %v12 \n"
	" lvx %v0, %r9, %r4 \n"
	" srawi %r6, %r6, 1 \n"
	" vmrghb %v1, %v1, %v1 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v0, %v13, %v0, %v9 \n"
	" lvx %v11, 0, %r3 \n"
	" mtctr %r6 \n"
	" vpkuhum %v10, %v1, %v1 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v12, %v11, %v0 \n"
	"._L51: \n"
	" li %r9, 7 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v11, %r5, %r3 \n"
	" stvewx %v12, 0, %r3 \n"
	" vperm %v0, %v13, %v0, %v10 \n"
	" li %r9, 4 \n"
	" stvewx %v12, %r9, %r3 \n"
	" vavgub %v1, %v11, %v0 \n"
	" add %r4, %r4, %r5 \n"
	" li %r9, 7 \n"
	" lvx %v0, %r9, %r4 \n"
	" add %r3, %r3, %r5 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v11, %r5, %r3 \n"
	" stvewx %v1, 0, %r3 \n"
	" vperm %v0, %v13, %v0, %v9 \n"
	" li %r9, 4 \n"
	" stvewx %v1, %r9, %r3 \n"
	" vavgub %v12, %v11, %v0 \n"
	" add %r4, %r4, %r5 \n"
	" add %r3, %r3, %r5 \n"
	" bdnz ._L51 \n"
	" li %r9, 7 \n"
	" lvx %v0, %r9, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v11, %r5, %r3 \n"
	" stvewx %v12, 0, %r3 \n"
	" vperm %v0, %v13, %v0, %v10 \n"
	" li %r9, 4 \n"
	" stvewx %v12, %r9, %r3 \n"
	" vavgub %v1, %v11, %v0 \n"
	" add %r3, %r3, %r5 \n"
	" stvewx %v1, 0, %r3 \n"
	" stvewx %v1, %r9, %r3 \n"
	);
}
627
/* MC_avg_x_16: horizontal half-pel with averaging into dest:
 * dest row = vavgub(dest row, vavgub(src[x], src[x+1])).
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared;
 * r11 is set inside the loop and reused after bdnz.
 */
static void MC_avg_x_16_altivec (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    asm (" \n"
	" lvsl %v8, 0, %r4 \n"
	" vspltisb %v0, 1 \n"
	" li %r9, 16 \n"
	" lvx %v12, %r9, %r4 \n"
	" vaddubm %v7, %v8, %v0 \n"
	" lvx %v11, 0, %r4 \n"
	" srawi %r6, %r6, 1 \n"
	" vperm %v1, %v11, %v12, %v7 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v0, %v11, %v12, %v8 \n"
	" lvx %v9, 0, %r3 \n"
	" mtctr %r6 \n"
	" add %r9, %r5, %r5 \n"
	" vavgub %v0, %v0, %v1 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v10, %v9, %v0 \n"
	"._L56: \n"
	" li %r11, 16 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v12, %r11, %r4 \n"
	" lvx %v9, %r5, %r3 \n"
	" stvx %v10, 0, %r3 \n"
	" vperm %v0, %v11, %v12, %v7 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v1, %v11, %v12, %v8 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v12, %r11, %r4 \n"
	" vavgub %v1, %v1, %v0 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v13, %v11, %v12, %v7 \n"
	" vavgub %v10, %v9, %v1 \n"
	" vperm %v0, %v11, %v12, %v8 \n"
	" lvx %v9, %r9, %r3 \n"
	" stvx %v10, %r5, %r3 \n"
	" vavgub %v0, %v0, %v13 \n"
	" add %r3, %r3, %r9 \n"
	" vavgub %v10, %v9, %v0 \n"
	" bdnz ._L56 \n"
	" lvx %v12, %r11, %r4 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v9, %r5, %r3 \n"
	" vperm %v1, %v11, %v12, %v7 \n"
	" stvx %v10, 0, %r3 \n"
	" vperm %v0, %v11, %v12, %v8 \n"
	" vavgub %v0, %v0, %v1 \n"
	" vavgub %v10, %v9, %v0 \n"
	" stvx %v10, %r5, %r3 \n"
	);
}
681
/* MC_avg_x_8: 8-wide horizontal half-pel with averaging into dest:
 * dest row = vavgub(dest row, vavgub(src[x], src[x+1])).  Per-parity
 * 8-byte permute masks; two stvewx word stores per row.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared.
 */
static void MC_avg_x_8_altivec (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    asm (" \n"
	" lvsl %v10, 0, %r4 \n"
	" vspltisb %v13, 1 \n"
	" li %r9, 8 \n"
	" vmrghb %v10, %v10, %v10 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v12, %r9, %r4 \n"
	" vpkuhum %v7, %v10, %v10 \n"
	" srawi %r6, %r6, 1 \n"
	" lvsl %v10, %r5, %r4 \n"
	" vaddubm %v6, %v7, %v13 \n"
	" vperm %v0, %v11, %v12, %v7 \n"
	" addi %r6, %r6, -1 \n"
	" vmrghb %v10, %v10, %v10 \n"
	" lvx %v9, 0, %r3 \n"
	" mtctr %r6 \n"
	" vperm %v1, %v11, %v12, %v6 \n"
	" add %r4, %r4, %r5 \n"
	" vpkuhum %v8, %v10, %v10 \n"
	" vavgub %v0, %v0, %v1 \n"
	" vaddubm %v13, %v8, %v13 \n"
	" vavgub %v10, %v9, %v0 \n"
	"._L61: \n"
	" li %r9, 8 \n"
	" lvx %v12, %r9, %r4 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v9, %r5, %r3 \n"
	" stvewx %v10, 0, %r3 \n"
	" vperm %v1, %v11, %v12, %v13 \n"
	" vperm %v0, %v11, %v12, %v8 \n"
	" li %r9, 4 \n"
	" stvewx %v10, %r9, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v0, %v1 \n"
	" li %r9, 8 \n"
	" lvx %v12, %r9, %r4 \n"
	" vavgub %v10, %v9, %v0 \n"
	" lvx %v11, 0, %r4 \n"
	" add %r3, %r3, %r5 \n"
	" vperm %v1, %v11, %v12, %v6 \n"
	" lvx %v9, %r5, %r3 \n"
	" vperm %v0, %v11, %v12, %v7 \n"
	" stvewx %v10, 0, %r3 \n"
	" li %r9, 4 \n"
	" vavgub %v0, %v0, %v1 \n"
	" stvewx %v10, %r9, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" add %r3, %r3, %r5 \n"
	" vavgub %v10, %v9, %v0 \n"
	" bdnz ._L61 \n"
	" li %r9, 8 \n"
	" lvx %v12, %r9, %r4 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v9, %r5, %r3 \n"
	" vperm %v1, %v11, %v12, %v13 \n"
	" stvewx %v10, 0, %r3 \n"
	" vperm %v0, %v11, %v12, %v8 \n"
	" li %r9, 4 \n"
	" stvewx %v10, %r9, %r3 \n"
	" vavgub %v0, %v0, %v1 \n"
	" add %r3, %r3, %r5 \n"
	" vavgub %v10, %v9, %v0 \n"
	" stvewx %v10, 0, %r3 \n"
	" stvewx %v10, %r9, %r3 \n"
	);
}
751
/* MC_avg_y_16: vertical half-pel with averaging into dest:
 * dest row = vavgub(dest row, vavgub(src row y, src row y+1)).
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared.
 */
static void MC_avg_y_16_altivec (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    asm (" \n"
	" li %r9, 15 \n"
	" lvx %v1, %r9, %r4 \n"
	" lvsl %v9, 0, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v11, %v13, %v1, %v9 \n"
	" li %r11, 15 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r11, %r4 \n"
	" srawi %r6, %r6, 1 \n"
	" vperm %v10, %v13, %v1, %v9 \n"
	" addi %r6, %r6, -1 \n"
	" lvx %v12, 0, %r3 \n"
	" mtctr %r6 \n"
	" vavgub %v0, %v11, %v10 \n"
	" add %r9, %r5, %r5 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v12, %v0 \n"
	"._L66: \n"
	" li %r11, 15 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r11, %r4 \n"
	" lvx %v12, %r5, %r3 \n"
	" vperm %v11, %v13, %v1, %v9 \n"
	" stvx %v0, 0, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v11, %v10 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r11, %r4 \n"
	" vavgub %v0, %v12, %v0 \n"
	" add %r4, %r4, %r5 \n"
	" lvx %v12, %r9, %r3 \n"
	" vperm %v10, %v13, %v1, %v9 \n"
	" stvx %v0, %r5, %r3 \n"
	" vavgub %v0, %v11, %v10 \n"
	" add %r3, %r3, %r9 \n"
	" vavgub %v0, %v12, %v0 \n"
	" bdnz ._L66 \n"
	" lvx %v1, %r11, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v12, %r5, %r3 \n"
	" vperm %v11, %v13, %v1, %v9 \n"
	" stvx %v0, 0, %r3 \n"
	" vavgub %v0, %v11, %v10 \n"
	" vavgub %v0, %v12, %v0 \n"
	" stvx %v0, %r5, %r3 \n"
	);
}
804
/* MC_avg_y_8: 8-wide vertical half-pel with averaging into dest:
 * dest row = vavgub(dest row, vavgub(src row y, src row y+1)).
 * Per-parity 8-byte permute masks; two stvewx word stores per row.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * NOTE(review): generated asm, no operand/clobber list declared.
 */
static void MC_avg_y_8_altivec (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    asm (" \n"
	" lvsl %v12, 0, %r4 \n"
	" lvsl %v9, %r5, %r4 \n"
	" vmrghb %v12, %v12, %v12 \n"
	" li %r9, 7 \n"
	" lvx %v11, 0, %r4 \n"
	" vmrghb %v9, %v9, %v9 \n"
	" lvx %v13, %r9, %r4 \n"
	" vpkuhum %v7, %v12, %v12 \n"
	" add %r4, %r4, %r5 \n"
	" vpkuhum %v8, %v9, %v9 \n"
	" vperm %v12, %v11, %v13, %v7 \n"
	" srawi %r6, %r6, 1 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v13, %r9, %r4 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v9, %v11, %v13, %v8 \n"
	" lvx %v10, 0, %r3 \n"
	" mtctr %r6 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v0, %v12, %v9 \n"
	" vavgub %v1, %v10, %v0 \n"
	"._L71: \n"
	" li %r9, 7 \n"
	" lvx %v13, %r9, %r4 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v10, %r5, %r3 \n"
	" stvewx %v1, 0, %r3 \n"
	" vperm %v12, %v11, %v13, %v7 \n"
	" li %r9, 4 \n"
	" stvewx %v1, %r9, %r3 \n"
	" vavgub %v0, %v12, %v9 \n"
	" add %r4, %r4, %r5 \n"
	" li %r9, 7 \n"
	" vavgub %v1, %v10, %v0 \n"
	" lvx %v13, %r9, %r4 \n"
	" lvx %v11, 0, %r4 \n"
	" add %r3, %r3, %r5 \n"
	" vperm %v9, %v11, %v13, %v8 \n"
	" lvx %v10, %r5, %r3 \n"
	" stvewx %v1, 0, %r3 \n"
	" vavgub %v0, %v12, %v9 \n"
	" li %r9, 4 \n"
	" stvewx %v1, %r9, %r3 \n"
	" add %r4, %r4, %r5 \n"
	" vavgub %v1, %v10, %v0 \n"
	" add %r3, %r3, %r5 \n"
	" bdnz ._L71 \n"
	" li %r9, 7 \n"
	" lvx %v13, %r9, %r4 \n"
	" lvx %v11, 0, %r4 \n"
	" lvx %v10, %r5, %r3 \n"
	" vperm %v12, %v11, %v13, %v7 \n"
	" stvewx %v1, 0, %r3 \n"
	" li %r9, 4 \n"
	" vavgub %v0, %v12, %v9 \n"
	" stvewx %v1, %r9, %r3 \n"
	" add %r3, %r3, %r5 \n"
	" vavgub %v1, %v10, %v0 \n"
	" stvewx %v1, 0, %r3 \n"
	" stvewx %v1, %r9, %r3 \n"
	);
}
871
/* MC_avg_xy_16: two-dimensional half-pel with averaging into dest.
 * The 2x2 interpolation uses the same avg-of-averages plus vxor/vor/vand
 * rounding correction as MC_put_xy_16, then vavgub's the result with the
 * current dest row.  NOTE(review): verify the rounding trick against the
 * C reference implementation.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * Generated asm, no operand/clobber list declared; r11 is reused
 * after bdnz.
 */
static void MC_avg_xy_16_altivec (uint8_t * dest, const uint8_t * ref,
				  int stride, int height)
{
    asm (" \n"
	" lvsl %v4, 0, %r4 \n"
	" vspltisb %v2, 1 \n"
	" li %r9, 16 \n"
	" lvx %v1, %r9, %r4 \n"
	" vaddubm %v3, %v4, %v2 \n"
	" lvx %v13, 0, %r4 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v10, %v13, %v1, %v3 \n"
	" li %r11, 16 \n"
	" vperm %v11, %v13, %v1, %v4 \n"
	" srawi %r6, %r6, 1 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r11, %r4 \n"
	" vavgub %v9, %v11, %v10 \n"
	" vxor %v8, %v11, %v10 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v10, %v13, %v1, %v3 \n"
	" lvx %v6, 0, %r3 \n"
	" mtctr %r6 \n"
	" vperm %v11, %v13, %v1, %v4 \n"
	" add %r9, %r5, %r5 \n"
	" add %r4, %r4, %r5 \n"
	" vxor %v5, %v11, %v10 \n"
	" vavgub %v7, %v11, %v10 \n"
	" vor %v1, %v8, %v5 \n"
	" vxor %v13, %v9, %v7 \n"
	" vand %v1, %v2, %v1 \n"
	" vavgub %v0, %v9, %v7 \n"
	" vand %v1, %v1, %v13 \n"
	" vsububm %v0, %v0, %v1 \n"
	" vavgub %v12, %v6, %v0 \n"
	"._L76: \n"
	" li %r11, 16 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r11, %r4 \n"
	" lvx %v6, %r5, %r3 \n"
	" stvx %v12, 0, %r3 \n"
	" vperm %v10, %v13, %v1, %v3 \n"
	" vperm %v11, %v13, %v1, %v4 \n"
	" add %r4, %r4, %r5 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v1, %r11, %r4 \n"
	" vavgub %v9, %v11, %v10 \n"
	" vxor %v8, %v11, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v10, %v13, %v1, %v3 \n"
	" vavgub %v12, %v9, %v7 \n"
	" vperm %v11, %v13, %v1, %v4 \n"
	" vor %v0, %v8, %v5 \n"
	" vxor %v13, %v9, %v7 \n"
	" vxor %v5, %v11, %v10 \n"
	" vand %v0, %v2, %v0 \n"
	" vavgub %v7, %v11, %v10 \n"
	" vor %v1, %v8, %v5 \n"
	" vand %v0, %v0, %v13 \n"
	" vand %v1, %v2, %v1 \n"
	" vxor %v13, %v9, %v7 \n"
	" vsububm %v12, %v12, %v0 \n"
	" vand %v1, %v1, %v13 \n"
	" vavgub %v0, %v9, %v7 \n"
	" vavgub %v12, %v6, %v12 \n"
	" lvx %v6, %r9, %r3 \n"
	" vsububm %v0, %v0, %v1 \n"
	" stvx %v12, %r5, %r3 \n"
	" vavgub %v12, %v6, %v0 \n"
	" add %r3, %r3, %r9 \n"
	" bdnz ._L76 \n"
	" lvx %v1, %r11, %r4 \n"
	" lvx %v13, 0, %r4 \n"
	" lvx %v6, %r5, %r3 \n"
	" vperm %v10, %v13, %v1, %v3 \n"
	" stvx %v12, 0, %r3 \n"
	" vperm %v11, %v13, %v1, %v4 \n"
	" vxor %v8, %v11, %v10 \n"
	" vavgub %v9, %v11, %v10 \n"
	" vor %v0, %v8, %v5 \n"
	" vxor %v13, %v9, %v7 \n"
	" vand %v0, %v2, %v0 \n"
	" vavgub %v1, %v9, %v7 \n"
	" vand %v0, %v0, %v13 \n"
	" vsububm %v1, %v1, %v0 \n"
	" vavgub %v12, %v6, %v1 \n"
	" stvx %v12, %r5, %r3 \n"
	);
}
961
/* MC_avg_xy_8: 8-wide two-dimensional half-pel with averaging into dest.
 * Same interpolation scheme as MC_avg_xy_16 but with per-parity 8-byte
 * permute masks (vmrghb + vpkuhum) and stvewx word stores.
 * NOTE(review): verify the vxor/vor/vand rounding correction against the
 * C reference implementation.
 * Args per PPC ABI: r3 = dest, r4 = ref, r5 = stride, r6 = height.
 * Generated asm, no operand/clobber list declared.
 */
static void MC_avg_xy_8_altivec (uint8_t * dest, const uint8_t * ref,
				 int stride, int height)
{
    asm (" \n"
	" lvsl %v2, 0, %r4 \n"
	" vspltisb %v19, 1 \n"
	" lvsl %v3, %r5, %r4 \n"
	" vmrghb %v2, %v2, %v2 \n"
	" li %r9, 8 \n"
	" vmrghb %v3, %v3, %v3 \n"
	" lvx %v9, 0, %r4 \n"
	" vpkuhum %v2, %v2, %v2 \n"
	" lvx %v1, %r9, %r4 \n"
	" vpkuhum %v3, %v3, %v3 \n"
	" add %r4, %r4, %r5 \n"
	" vaddubm %v18, %v2, %v19 \n"
	" vperm %v11, %v9, %v1, %v2 \n"
	" srawi %r6, %r6, 1 \n"
	" vaddubm %v17, %v3, %v19 \n"
	" addi %r6, %r6, -1 \n"
	" vperm %v10, %v9, %v1, %v18 \n"
	" lvx %v4, 0, %r3 \n"
	" mtctr %r6 \n"
	" lvx %v1, %r9, %r4 \n"
	" lvx %v9, 0, %r4 \n"
	" vavgub %v8, %v11, %v10 \n"
	" vxor %v7, %v11, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" vperm %v10, %v9, %v1, %v17 \n"
	" vperm %v11, %v9, %v1, %v3 \n"
	" vxor %v5, %v11, %v10 \n"
	" vavgub %v6, %v11, %v10 \n"
	" vor %v1, %v7, %v5 \n"
	" vxor %v13, %v8, %v6 \n"
	" vand %v1, %v19, %v1 \n"
	" vavgub %v0, %v8, %v6 \n"
	" vand %v1, %v1, %v13 \n"
	" vsububm %v0, %v0, %v1 \n"
	" vavgub %v13, %v4, %v0 \n"
	"._L81: \n"
	" li %r9, 8 \n"
	" lvx %v1, %r9, %r4 \n"
	" lvx %v9, 0, %r4 \n"
	" lvx %v4, %r5, %r3 \n"
	" stvewx %v13, 0, %r3 \n"
	" vperm %v10, %v9, %v1, %v18 \n"
	" vperm %v11, %v9, %v1, %v2 \n"
	" li %r9, 4 \n"
	" stvewx %v13, %r9, %r3 \n"
	" vxor %v7, %v11, %v10 \n"
	" add %r4, %r4, %r5 \n"
	" li %r9, 8 \n"
	" vavgub %v8, %v11, %v10 \n"
	" lvx %v1, %r9, %r4 \n"
	" vor %v0, %v7, %v5 \n"
	" lvx %v9, 0, %r4 \n"
	" vxor %v12, %v8, %v6 \n"
	" vand %v0, %v19, %v0 \n"
	" add %r3, %r3, %r5 \n"
	" vperm %v10, %v9, %v1, %v17 \n"
	" vavgub %v13, %v8, %v6 \n"
	" li %r9, 4 \n"
	" vperm %v11, %v9, %v1, %v3 \n"
	" vand %v0, %v0, %v12 \n"
	" add %r4, %r4, %r5 \n"
	" vxor %v5, %v11, %v10 \n"
	" vavgub %v6, %v11, %v10 \n"
	" vor %v1, %v7, %v5 \n"
	" vsububm %v13, %v13, %v0 \n"
	" vxor %v0, %v8, %v6 \n"
	" vand %v1, %v19, %v1 \n"
	" vavgub %v13, %v4, %v13 \n"
	" vand %v1, %v1, %v0 \n"
	" lvx %v4, %r5, %r3 \n"
	" vavgub %v0, %v8, %v6 \n"
	" stvewx %v13, 0, %r3 \n"
	" stvewx %v13, %r9, %r3 \n"
	" vsububm %v0, %v0, %v1 \n"
	" add %r3, %r3, %r5 \n"
	" vavgub %v13, %v4, %v0 \n"
	" bdnz ._L81 \n"
	" li %r9, 8 \n"
	" lvx %v1, %r9, %r4 \n"
	" lvx %v9, 0, %r4 \n"
	" lvx %v4, %r5, %r3 \n"
	" vperm %v10, %v9, %v1, %v18 \n"
	" stvewx %v13, 0, %r3 \n"
	" vperm %v11, %v9, %v1, %v2 \n"
	" li %r9, 4 \n"
	" stvewx %v13, %r9, %r3 \n"
	" vxor %v7, %v11, %v10 \n"
	" add %r3, %r3, %r5 \n"
	" vavgub %v8, %v11, %v10 \n"
	" vor %v0, %v7, %v5 \n"
	" vxor %v13, %v8, %v6 \n"
	" vand %v0, %v19, %v0 \n"
	" vavgub %v1, %v8, %v6 \n"
	" vand %v0, %v0, %v13 \n"
	" vsububm %v1, %v1, %v0 \n"
	" vavgub %v13, %v4, %v1 \n"
	" stvewx %v13, 0, %r3 \n"
	" stvewx %v13, %r9, %r3 \n"
	);
}
1066
/* Presumably expands to the mpeg2_mc_t dispatch table referencing the
 * sixteen MC_{put,avg}_{o,x,y,xy}_{16,8}_altivec functions above --
 * macro declared elsewhere (likely mpeg2_internal.h); verify. */
MPEG2_MC_EXTERN (altivec)
1068
1069 #endif /* ARCH_PPC */
1070
1071 #else /* __ALTIVEC__ */
1072
/* Shorthand macros for the AltiVec keyword types ('vector ...'); used by
   the intrinsic implementations below. */
#define vector_s16_t vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int
1079
1080 void MC_put_o_16_altivec (unsigned char * dest, const unsigned char * ref,
1081 const int stride, int height)
1082 {
1083 vector_u8_t perm, ref0, ref1, tmp;
1084
1085 perm = vec_lvsl (0, ref);
1086
1087 height = (height >> 1) - 1;
1088
1089 ref0 = vec_ld (0, ref);
1090 ref1 = vec_ld (15, ref);
1091 ref += stride;
1092 tmp = vec_perm (ref0, ref1, perm);
1093
1094 do {
1095 ref0 = vec_ld (0, ref);
1096 ref1 = vec_ld (15, ref);
1097 ref += stride;
1098 vec_st (tmp, 0, dest);
1099 tmp = vec_perm (ref0, ref1, perm);
1100
1101 ref0 = vec_ld (0, ref);
1102 ref1 = vec_ld (15, ref);
1103 ref += stride;
1104 vec_st (tmp, stride, dest);
1105 dest += 2*stride;
1106 tmp = vec_perm (ref0, ref1, perm);
1107 } while (--height);
1108
1109 ref0 = vec_ld (0, ref);
1110 ref1 = vec_ld (15, ref);
1111 vec_st (tmp, 0, dest);
1112 tmp = vec_perm (ref0, ref1, perm);
1113 vec_st (tmp, stride, dest);
1114 }
1115
/* Copy an 8-pixel-wide, height-row block from ref to dest.  Because
 * stride may change the 16-byte alignment between consecutive rows, two
 * permute vectors are prepared: perm0 for even rows (alignment of ref)
 * and perm1 for odd rows (alignment of ref+stride).  The mergeh/pack
 * sequence turns the 16-entry lvsl pattern into one that selects the
 * wanted 8 bytes into both vector halves.  Each 8-byte row is written
 * as two 4-byte vec_ste element stores.  Two rows per loop iteration,
 * software-pipelined as in MC_put_o_16_altivec. */
void MC_put_o_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0, perm1, tmp0, tmp1, ref0, ref1;

    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);

    height = (height >> 1) - 1;

    /* Prologue: realign the first row. */
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_perm (ref0, ref1, perm1);

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_perm (ref0, ref1, perm0);
    } while (--height);

    /* Epilogue: last two rows. */
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
    dest += stride;
    tmp1 = vec_perm (ref0, ref1, perm1);
    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
}
1162
/* Horizontal half-pel "put": each output pixel is the rounded average of
 * a source pixel and its right neighbour.  permB = permA + 1 selects the
 * same row shifted one byte right; vec_avg of the two permuted vectors
 * gives the interpolated row.  17 source bytes are needed per row, hence
 * vec_ld (16, ref) for the second block (vs. 15 in the copy-only case).
 * Same two-rows-per-iteration software pipeline as MC_put_o_16_altivec. */
void MC_put_x_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t permA, permB, ref0, ref1, tmp;

    permA = vec_lvsl (0, ref);
    permB = vec_add (permA, vec_splat_u8 (1));

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    ref += stride;
    tmp = vec_avg (vec_perm (ref0, ref1, permA),
                   vec_perm (ref0, ref1, permB));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        vec_st (tmp, 0, dest);
        tmp = vec_avg (vec_perm (ref0, ref1, permA),
                       vec_perm (ref0, ref1, permB));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp = vec_avg (vec_perm (ref0, ref1, permA),
                       vec_perm (ref0, ref1, permB));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    vec_st (tmp, 0, dest);
    tmp = vec_avg (vec_perm (ref0, ref1, permA),
                   vec_perm (ref0, ref1, permB));
    vec_st (tmp, stride, dest);
}
1203
/* Horizontal half-pel "put", 8-pixel-wide variant.  Combines the
 * per-row-alignment dual permutes of MC_put_o_8_altivec (perm0A/perm1A
 * for even/odd rows) with the +1-shifted B permutes of
 * MC_put_x_16_altivec; each row is vec_avg of the A- and B-permuted
 * bytes, written as two 4-byte vec_ste stores.  Software-pipelined,
 * two rows per loop iteration. */
void MC_put_x_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0A, perm0B, perm1A, perm1B, ones, tmp0, tmp1, ref0, ref1;

    ones = vec_splat_u8 (1);
    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0A = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    perm0B = vec_add (perm0A, ones);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1A = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
    perm1B = vec_add (perm1A, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    ref += stride;
    tmp0 = vec_avg (vec_perm (ref0, ref1, perm0A),
                    vec_perm (ref0, ref1, perm0B));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_avg (vec_perm (ref0, ref1, perm1A),
                        vec_perm (ref0, ref1, perm1B));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_avg (vec_perm (ref0, ref1, perm0A),
                        vec_perm (ref0, ref1, perm0B));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
    dest += stride;
    tmp1 = vec_avg (vec_perm (ref0, ref1, perm1A),
                    vec_perm (ref0, ref1, perm1B));
    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
}
1257
/* Vertical half-pel "put": each output row is the rounded average of two
 * consecutive source rows.  tmp0/tmp1 alternate as the "previous row"
 * across iterations so every realigned row is loaded exactly once.
 * Software-pipelined, two output rows per loop iteration. */
void MC_put_y_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm, ref0, ref1, tmp0, tmp1, tmp;

    perm = vec_lvsl (0, ref);

    height = (height >> 1) - 1;

    /* Prologue: first two source rows feed the first output row. */
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm);
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    tmp1 = vec_perm (ref0, ref1, perm);
    tmp = vec_avg (tmp0, tmp1);

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        vec_st (tmp, 0, dest);
        tmp0 = vec_perm (ref0, ref1, perm);
        tmp = vec_avg (tmp0, tmp1);

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp1 = vec_perm (ref0, ref1, perm);
        tmp = vec_avg (tmp0, tmp1);
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    vec_st (tmp, 0, dest);
    tmp0 = vec_perm (ref0, ref1, perm);
    tmp = vec_avg (tmp0, tmp1);
    vec_st (tmp, stride, dest);
}
1301
/* Vertical half-pel "put", 8-pixel-wide variant.  Averages consecutive
 * rows (tmp0/tmp1 alternate as in MC_put_y_16_altivec) using the dual
 * per-row-alignment permutes of the 8-wide kernels; rows are written as
 * two 4-byte vec_ste stores. */
void MC_put_y_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0, perm1, tmp0, tmp1, tmp, ref0, ref1;

    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    tmp1 = vec_perm (ref0, ref1, perm1);
    tmp = vec_avg (tmp0, tmp1);

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_perm (ref0, ref1, perm0);
        tmp = vec_avg (tmp0, tmp1);

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_perm (ref0, ref1, perm1);
        tmp = vec_avg (tmp0, tmp1);
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
    dest += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);
    tmp = vec_avg (tmp0, tmp1);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
}
1356
/* Diagonal (horizontal + vertical) half-pel "put": each output pixel
 * averages four source pixels (two adjacent columns in two adjacent
 * rows).  Per row, avgN = vec_avg of the pixel and its right neighbour
 * and xorN = their xor; the output row is
 *     avg (avg0, avg1) - (ones & (xor0 | xor1) & (avg0 ^ avg1))
 * where the subtracted term corrects the upward bias of cascading two
 * rounded pairwise averages — presumably so the result matches the
 * (a+b+c+d+2)>>2 rounding MPEG-2 requires (standard avg/xor trick; TODO
 * confirm against the spec).  Two rows per iteration, software-pipelined. */
void MC_put_xy_16_altivec (unsigned char * dest, const unsigned char * ref,
                           const int stride, int height)
{
    vector_u8_t permA, permB, ref0, ref1, A, B, avg0, avg1, xor0, xor1, tmp;
    vector_u8_t ones;

    ones = vec_splat_u8 (1);
    permA = vec_lvsl (0, ref);
    permB = vec_add (permA, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    ref += stride;
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    ref += stride;
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg1 = vec_avg (A, B);
    xor1 = vec_xor (A, B);
    tmp = vec_sub (vec_avg (avg0, avg1),
                   vec_and (vec_and (ones, vec_or (xor0, xor1)),
                            vec_xor (avg0, avg1)));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        vec_st (tmp, 0, dest);
        A = vec_perm (ref0, ref1, permA);
        B = vec_perm (ref0, ref1, permB);
        avg0 = vec_avg (A, B);
        xor0 = vec_xor (A, B);
        tmp = vec_sub (vec_avg (avg0, avg1),
                       vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                vec_xor (avg0, avg1)));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        A = vec_perm (ref0, ref1, permA);
        B = vec_perm (ref0, ref1, permB);
        avg1 = vec_avg (A, B);
        xor1 = vec_xor (A, B);
        tmp = vec_sub (vec_avg (avg0, avg1),
                       vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                vec_xor (avg0, avg1)));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    vec_st (tmp, 0, dest);
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);
    tmp = vec_sub (vec_avg (avg0, avg1),
                   vec_and (vec_and (ones, vec_or (xor0, xor1)),
                            vec_xor (avg0, avg1)));
    vec_st (tmp, stride, dest);
}
1427
/* Diagonal half-pel "put", 8-pixel-wide variant.  Same four-pixel
 * avg/xor rounding-correction scheme as MC_put_xy_16_altivec, combined
 * with the dual per-row-alignment permutes (perm0A/B for even rows,
 * perm1A/B for odd rows) of the 8-wide kernels; 8-byte rows are written
 * as two 4-byte vec_ste stores. */
void MC_put_xy_8_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm0A, perm0B, perm1A, perm1B, ref0, ref1, A, B;
    vector_u8_t avg0, avg1, xor0, xor1, tmp, ones;

    ones = vec_splat_u8 (1);
    perm0A = vec_lvsl (0, ref);
    perm0A = vec_mergeh (perm0A, perm0A);
    perm0A = vec_pack ((vector_u16_t)perm0A, (vector_u16_t)perm0A);
    perm0B = vec_add (perm0A, ones);
    perm1A = vec_lvsl (stride, ref);
    perm1A = vec_mergeh (perm1A, perm1A);
    perm1A = vec_pack ((vector_u16_t)perm1A, (vector_u16_t)perm1A);
    perm1B = vec_add (perm1A, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    ref += stride;
    A = vec_perm (ref0, ref1, perm0A);
    B = vec_perm (ref0, ref1, perm0B);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    ref += stride;
    A = vec_perm (ref0, ref1, perm1A);
    B = vec_perm (ref0, ref1, perm1B);
    avg1 = vec_avg (A, B);
    xor1 = vec_xor (A, B);
    tmp = vec_sub (vec_avg (avg0, avg1),
                   vec_and (vec_and (ones, vec_or (xor0, xor1)),
                            vec_xor (avg0, avg1)));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        A = vec_perm (ref0, ref1, perm0A);
        B = vec_perm (ref0, ref1, perm0B);
        avg0 = vec_avg (A, B);
        xor0 = vec_xor (A, B);
        tmp = vec_sub (vec_avg (avg0, avg1),
                       vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                vec_xor (avg0, avg1)));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        A = vec_perm (ref0, ref1, perm1A);
        B = vec_perm (ref0, ref1, perm1B);
        avg1 = vec_avg (A, B);
        xor1 = vec_xor (A, B);
        tmp = vec_sub (vec_avg (avg0, avg1),
                       vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                vec_xor (avg0, avg1)));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
    dest += stride;
    A = vec_perm (ref0, ref1, perm0A);
    B = vec_perm (ref0, ref1, perm0B);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);
    tmp = vec_sub (vec_avg (avg0, avg1),
                   vec_and (vec_and (ones, vec_or (xor0, xor1)),
                            vec_xor (avg0, avg1)));
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
}
1510
#if 0
/* Disabled alternate implementation of MC_put_xy_8_altivec kept for
 * reference: computes the four-pixel average exactly in widened 16-bit
 * arithmetic ((a+b+c+d+2)>>2) instead of the avg/xor trick.
 * NOTE(review): as written it looks incomplete — only the high halves
 * (vec_mergeh) are averaged, the final vec_st writes a full 16 bytes in
 * an 8-wide kernel, and the trailing vec_avg result is overwritten on
 * the next iteration.  Dead code; do not enable without fixing. */
void MC_put_xy_8_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t permA, permB, ref0, ref1, A, B, C, D, tmp, zero, ones;
    vector_u16_t splat2, temp;

    ones = vec_splat_u8 (1);
    permA = vec_lvsl (0, ref);
    permB = vec_add (permA, ones);

    zero = vec_splat_u8 (0);
    splat2 = vec_splat_u16 (2);

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        A = vec_perm (ref0, ref1, permA);
        B = vec_perm (ref0, ref1, permB);
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        C = vec_perm (ref0, ref1, permA);
        D = vec_perm (ref0, ref1, permB);

        temp = vec_add (vec_add ((vector_u16_t)vec_mergeh (zero, A),
                                 (vector_u16_t)vec_mergeh (zero, B)),
                        vec_add ((vector_u16_t)vec_mergeh (zero, C),
                                 (vector_u16_t)vec_mergeh (zero, D)));
        temp = vec_sr (vec_add (temp, splat2), splat2);
        tmp = vec_pack (temp, temp);

        vec_st (tmp, 0, dest);
        dest += stride;
        tmp = vec_avg (vec_perm (ref0, ref1, permA),
                       vec_perm (ref0, ref1, permB));
    } while (--height);
}
#endif
1550
/* "Average" variant of the 16-wide copy: each output row is the rounded
 * average of the realigned ref row and the pixels already in dest
 * (prev), used for B-frame bidirectional prediction.  Note the careful
 * interleave: prev for the NEXT output row is loaded from dest before
 * the current row's vec_st, so load/store order here is part of the
 * contract — do not reorder. */
void MC_avg_o_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm, ref0, ref1, tmp, prev;

    perm = vec_lvsl (0, ref);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_st (tmp, 0, dest);
        tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        prev = vec_ld (2*stride, dest);
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    prev = vec_ld (stride, dest);
    vec_st (tmp, 0, dest);
    tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
    vec_st (tmp, stride, dest);
}
1590
/* 8-wide "average" copy: averages the realigned ref row with the
 * existing dest pixels (prev).  Uses the dual per-row-alignment
 * permutes of the 8-wide kernels; dest is read (vec_ld of prev) before
 * the previous row's vec_ste stores, so statement order matters. */
void MC_avg_o_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0, perm1, tmp0, tmp1, ref0, ref1, prev;

    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    tmp0 = vec_avg (prev, vec_perm (ref0, ref1, perm0));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_avg (prev, vec_perm (ref0, ref1, perm1));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_avg (prev, vec_perm (ref0, ref1, perm0));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    prev = vec_ld (stride, dest);
    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
    dest += stride;
    tmp1 = vec_avg (prev, vec_perm (ref0, ref1, perm1));
    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
}
1641
/* Horizontal half-pel "average": interpolates each row horizontally
 * (avg of pixel and right neighbour via permA/permB = permA+1) and then
 * averages the result with the existing dest pixels (prev).  prev for
 * the next row is loaded from dest before the current row's store —
 * keep the order. */
void MC_avg_x_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t permA, permB, ref0, ref1, tmp, prev;

    permA = vec_lvsl (0, ref);
    permB = vec_add (permA, vec_splat_u8 (1));

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    prev = vec_ld (0, dest);
    ref += stride;
    tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
                                  vec_perm (ref0, ref1, permB)));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_st (tmp, 0, dest);
        tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
                                      vec_perm (ref0, ref1, permB)));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        prev = vec_ld (2*stride, dest);
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
                                      vec_perm (ref0, ref1, permB)));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    prev = vec_ld (stride, dest);
    vec_st (tmp, 0, dest);
    tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
                                  vec_perm (ref0, ref1, permB)));
    vec_st (tmp, stride, dest);
}
1686
/* Horizontal half-pel "average", 8-wide: horizontal interpolation via
 * the dual per-row-alignment A/B permutes, then averaged with existing
 * dest pixels (prev, loaded before the pending stores).  8-byte rows
 * written as two 4-byte vec_ste stores. */
void MC_avg_x_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0A, perm0B, perm1A, perm1B, ones, tmp0, tmp1, ref0, ref1;
    vector_u8_t prev;

    ones = vec_splat_u8 (1);
    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0A = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    perm0B = vec_add (perm0A, ones);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1A = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
    perm1B = vec_add (perm1A, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    prev = vec_ld (0, dest);
    ref += stride;
    tmp0 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm0A),
                                   vec_perm (ref0, ref1, perm0B)));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm1A),
                                       vec_perm (ref0, ref1, perm1B)));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm0A),
                                       vec_perm (ref0, ref1, perm0B)));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    prev = vec_ld (stride, dest);
    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
    dest += stride;
    tmp1 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm1A),
                                   vec_perm (ref0, ref1, perm1B)));
    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
}
1745
/* Vertical half-pel "average": averages two consecutive source rows
 * (tmp0/tmp1 alternate across iterations) and then averages with the
 * existing dest pixels (prev).  prev loads from dest precede the
 * pending vec_st — keep the order. */
void MC_avg_y_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm, ref0, ref1, tmp0, tmp1, tmp, prev;

    perm = vec_lvsl (0, ref);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm);
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    tmp1 = vec_perm (ref0, ref1, perm);
    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_st (tmp, 0, dest);
        tmp0 = vec_perm (ref0, ref1, perm);
        tmp = vec_avg (prev, vec_avg (tmp0, tmp1));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        prev = vec_ld (2*stride, dest);
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp1 = vec_perm (ref0, ref1, perm);
        tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    prev = vec_ld (stride, dest);
    vec_st (tmp, 0, dest);
    tmp0 = vec_perm (ref0, ref1, perm);
    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
    vec_st (tmp, stride, dest);
}
1793
/* Vertical half-pel "average", 8-wide: consecutive rows averaged
 * (tmp0/tmp1 alternate), then averaged with existing dest pixels
 * (prev).  Dual per-row-alignment permutes; two vec_ste element stores
 * per 8-byte row.  Keep the prev-load / store interleave order. */
void MC_avg_y_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0, perm1, tmp0, tmp1, tmp, ref0, ref1, prev;

    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    tmp1 = vec_perm (ref0, ref1, perm1);
    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_perm (ref0, ref1, perm0);
        tmp = vec_avg (prev, vec_avg (tmp0, tmp1));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_perm (ref0, ref1, perm1);
        tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    prev = vec_ld (stride, dest);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
    dest += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);
    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
}
1852
/* Diagonal half-pel "average": four-pixel interpolation with the same
 * avg/xor rounding correction as MC_put_xy_16_altivec, then averaged
 * with the existing dest pixels (prev).  prev loads from dest precede
 * the pending stores — the interleave is load-before-store by design. */
void MC_avg_xy_16_altivec (unsigned char * dest, const unsigned char * ref,
                           const int stride, int height)
{
    vector_u8_t permA, permB, ref0, ref1, A, B, avg0, avg1, xor0, xor1, tmp;
    vector_u8_t ones, prev;

    ones = vec_splat_u8 (1);
    permA = vec_lvsl (0, ref);
    permB = vec_add (permA, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    ref += stride;
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg1 = vec_avg (A, B);
    xor1 = vec_xor (A, B);
    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
                                  vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                           vec_xor (avg0, avg1))));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_st (tmp, 0, dest);
        A = vec_perm (ref0, ref1, permA);
        B = vec_perm (ref0, ref1, permB);
        avg0 = vec_avg (A, B);
        xor0 = vec_xor (A, B);
        tmp = vec_avg (prev,
                       vec_sub (vec_avg (avg0, avg1),
                                vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                         vec_xor (avg0, avg1))));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        prev = vec_ld (2*stride, dest);
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        A = vec_perm (ref0, ref1, permA);
        B = vec_perm (ref0, ref1, permB);
        avg1 = vec_avg (A, B);
        xor1 = vec_xor (A, B);
        tmp = vec_avg (prev,
                       vec_sub (vec_avg (avg0, avg1),
                                vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                         vec_xor (avg0, avg1))));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    prev = vec_ld (stride, dest);
    vec_st (tmp, 0, dest);
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);
    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
                                  vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                           vec_xor (avg0, avg1))));
    vec_st (tmp, stride, dest);
}
1929
/* Diagonal half-pel "average", 8-wide: four-pixel avg/xor interpolation
 * (as in MC_put_xy_8_altivec) averaged with existing dest pixels (prev).
 * Dual per-row-alignment A/B permutes; rows written as two 4-byte
 * vec_ste stores.  prev loads from dest precede the pending stores. */
void MC_avg_xy_8_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm0A, perm0B, perm1A, perm1B, ref0, ref1, A, B;
    vector_u8_t avg0, avg1, xor0, xor1, tmp, ones, prev;

    ones = vec_splat_u8 (1);
    perm0A = vec_lvsl (0, ref);
    perm0A = vec_mergeh (perm0A, perm0A);
    perm0A = vec_pack ((vector_u16_t)perm0A, (vector_u16_t)perm0A);
    perm0B = vec_add (perm0A, ones);
    perm1A = vec_lvsl (stride, ref);
    perm1A = vec_mergeh (perm1A, perm1A);
    perm1A = vec_pack ((vector_u16_t)perm1A, (vector_u16_t)perm1A);
    perm1B = vec_add (perm1A, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    ref += stride;
    A = vec_perm (ref0, ref1, perm0A);
    B = vec_perm (ref0, ref1, perm0B);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    A = vec_perm (ref0, ref1, perm1A);
    B = vec_perm (ref0, ref1, perm1B);
    avg1 = vec_avg (A, B);
    xor1 = vec_xor (A, B);
    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
                                  vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                           vec_xor (avg0, avg1))));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        A = vec_perm (ref0, ref1, perm0A);
        B = vec_perm (ref0, ref1, perm0B);
        avg0 = vec_avg (A, B);
        xor0 = vec_xor (A, B);
        tmp = vec_avg (prev,
                       vec_sub (vec_avg (avg0, avg1),
                                vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                         vec_xor (avg0, avg1))));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        A = vec_perm (ref0, ref1, perm1A);
        B = vec_perm (ref0, ref1, perm1B);
        avg1 = vec_avg (A, B);
        xor1 = vec_xor (A, B);
        tmp = vec_avg (prev,
                       vec_sub (vec_avg (avg0, avg1),
                                vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                         vec_xor (avg0, avg1))));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    prev = vec_ld (stride, dest);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
    dest += stride;
    A = vec_perm (ref0, ref1, perm0A);
    B = vec_perm (ref0, ref1, perm0B);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);
    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
                                  vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                           vec_xor (avg0, avg1))));
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
}
2018
2019 #endif /* __ALTIVEC__ */