/*
 * motion_comp_altivec.c
 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#ifndef __ALTIVEC__

#include "config.h"

#ifdef ARCH_PPC

#include <inttypes.h>

#include "mpeg2.h"
#include "mpeg2_internal.h"

/*
 * The asm code is generated with:
 *
 * gcc-2.95 -fvec -D__ALTIVEC__ -O9 -fomit-frame-pointer -mregnames -S
 *      motion_comp_altivec.c
 *
 * sed 's/.L/._L/g' motion_comp_altivec.s |
 * awk '{args=""; len=split ($2, arg, ",");
 *      for (i=1; i<=len; i++) { a=arg[i]; if (i<len) a=a",";
 *              args = args sprintf ("%-6s", a) }
 *      printf ("\t\"\t%-16s%-24s\\n\"\n", $1, args) }' |
 * unexpand -a
 */
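
/*
 * The C source compiled to produce the asm blocks below is the set of
 * vec_* intrinsics functions in the #else (__ALTIVEC__) branch at the
 * bottom of this file.
 */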

static void MC_put_o_16_altivec (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    asm ("\n"
	"	srawi	%r6, %r6, 1\n"
	"	li	%r9, 15\n"
	"	addi	%r6, %r6, -1\n"
	"	lvsl	%v12, 0, %r4\n"
	"	mtctr	%r6\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v0, %r9, %r4\n"
	"	add	%r0, %r5, %r5\n"
	"	vperm	%v13, %v1, %v0, %v12\n"
	"	add	%r4, %r4, %r5\n"
	"._L6:\n"
	"	li	%r9, 15\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v0, %r9, %r4\n"
	"	stvx	%v13, 0, %r3\n"
	"	vperm	%v13, %v1, %v0, %v12\n"
	"	add	%r4, %r4, %r5\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v0, %r9, %r4\n"
	"	stvx	%v13, %r5, %r3\n"
	"	vperm	%v13, %v1, %v0, %v12\n"
	"	add	%r4, %r4, %r5\n"
	"	add	%r3, %r3, %r0\n"
	"	bdnz	._L6\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v1, 0, %r4\n"
	"	stvx	%v13, 0, %r3\n"
	"	vperm	%v13, %v1, %v0, %v12\n"
	"	stvx	%v13, %r5, %r3\n"
	);
}

static void MC_put_o_8_altivec (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    asm ("\n"
	"	lvsl	%v12, 0, %r4\n"
	"	lvsl	%v1, %r5, %r4\n"
	"	vmrghb	%v12, %v12, %v12\n"
	"	srawi	%r6, %r6, 1\n"
	"	li	%r9, 7\n"
	"	vmrghb	%v1, %v1, %v1\n"
	"	addi	%r6, %r6, -1\n"
	"	vpkuhum	%v10, %v12, %v12\n"
	"	lvx	%v13, 0, %r4\n"
	"	mtctr	%r6\n"
	"	vpkuhum	%v11, %v1, %v1\n"
	"	lvx	%v0, %r9, %r4\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v12, %v13, %v0, %v10\n"
	"._L11:\n"
	"	li	%r9, 7\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	stvewx	%v12, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vperm	%v1, %v13, %v0, %v11\n"
	"	stvewx	%v12, %r9, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	li	%r9, 7\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	add	%r3, %r3, %r5\n"
	"	stvewx	%v1, 0, %r3\n"
	"	vperm	%v12, %v13, %v0, %v10\n"
	"	li	%r9, 4\n"
	"	stvewx	%v1, %r9, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	add	%r3, %r3, %r5\n"
	"	bdnz	._L11\n"
	"	li	%r9, 7\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	stvewx	%v12, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vperm	%v1, %v13, %v0, %v11\n"
	"	stvewx	%v12, %r9, %r3\n"
	"	add	%r3, %r3, %r5\n"
	"	stvewx	%v1, 0, %r3\n"
	"	stvewx	%v1, %r9, %r3\n"
	);
}

static void MC_put_x_16_altivec (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    asm ("\n"
	"	lvsl	%v11, 0, %r4\n"
	"	vspltisb	%v0, 1\n"
	"	li	%r9, 16\n"
	"	lvx	%v12, 0, %r4\n"
	"	vaddubm	%v10, %v11, %v0\n"
	"	lvx	%v13, %r9, %r4\n"
	"	srawi	%r6, %r6, 1\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v1, %v12, %v13, %v10\n"
	"	vperm	%v0, %v12, %v13, %v11\n"
	"	mtctr	%r6\n"
	"	add	%r0, %r5, %r5\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v0, %v1\n"
	"._L16:\n"
	"	li	%r9, 16\n"
	"	lvx	%v12, 0, %r4\n"
	"	lvx	%v13, %r9, %r4\n"
	"	stvx	%v0, 0, %r3\n"
	"	vperm	%v1, %v12, %v13, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v0, %v12, %v13, %v11\n"
	"	lvx	%v12, 0, %r4\n"
	"	lvx	%v13, %r9, %r4\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	stvx	%v0, %r5, %r3\n"
	"	vperm	%v1, %v12, %v13, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v0, %v12, %v13, %v11\n"
	"	add	%r3, %r3, %r0\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	bdnz	._L16\n"
	"	lvx	%v13, %r9, %r4\n"
	"	lvx	%v12, 0, %r4\n"
	"	stvx	%v0, 0, %r3\n"
	"	vperm	%v1, %v12, %v13, %v10\n"
	"	vperm	%v0, %v12, %v13, %v11\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	stvx	%v0, %r5, %r3\n"
	);
}

static void MC_put_x_8_altivec (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    asm ("\n"
	"	lvsl	%v0, 0, %r4\n"
	"	vspltisb	%v13, 1\n"
	"	lvsl	%v10, %r5, %r4\n"
	"	vmrghb	%v0, %v0, %v0\n"
	"	li	%r9, 8\n"
	"	lvx	%v11, 0, %r4\n"
	"	vmrghb	%v10, %v10, %v10\n"
	"	vpkuhum	%v8, %v0, %v0\n"
	"	lvx	%v12, %r9, %r4\n"
	"	srawi	%r6, %r6, 1\n"
	"	vpkuhum	%v9, %v10, %v10\n"
	"	vaddubm	%v7, %v8, %v13\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v1, %v11, %v12, %v8\n"
	"	mtctr	%r6\n"
	"	vaddubm	%v13, %v9, %v13\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v0, %v11, %v12, %v7\n"
	"	vavgub	%v0, %v1, %v0\n"
	"._L21:\n"
	"	li	%r9, 8\n"
	"	lvx	%v12, %r9, %r4\n"
	"	lvx	%v11, 0, %r4\n"
	"	stvewx	%v0, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vperm	%v1, %v11, %v12, %v13\n"
	"	stvewx	%v0, %r9, %r3\n"
	"	vperm	%v0, %v11, %v12, %v9\n"
	"	add	%r4, %r4, %r5\n"
	"	li	%r9, 8\n"
	"	lvx	%v12, %r9, %r4\n"
	"	vavgub	%v10, %v0, %v1\n"
	"	lvx	%v11, 0, %r4\n"
	"	add	%r3, %r3, %r5\n"
	"	stvewx	%v10, 0, %r3\n"
	"	vperm	%v1, %v11, %v12, %v7\n"
	"	vperm	%v0, %v11, %v12, %v8\n"
	"	li	%r9, 4\n"
	"	stvewx	%v10, %r9, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	add	%r3, %r3, %r5\n"
	"	bdnz	._L21\n"
	"	li	%r9, 8\n"
	"	lvx	%v12, %r9, %r4\n"
	"	lvx	%v11, 0, %r4\n"
	"	stvewx	%v0, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vperm	%v1, %v11, %v12, %v13\n"
	"	stvewx	%v0, %r9, %r3\n"
	"	vperm	%v0, %v11, %v12, %v9\n"
	"	add	%r3, %r3, %r5\n"
	"	vavgub	%v10, %v0, %v1\n"
	"	stvewx	%v10, 0, %r3\n"
	"	stvewx	%v10, %r9, %r3\n"
	);
}

static void MC_put_y_16_altivec (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    asm ("\n"
	"	li	%r9, 15\n"
	"	lvsl	%v10, 0, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r9, %r4\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v12, %v13, %v1, %v10\n"
	"	srawi	%r6, %r6, 1\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r9, %r4\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v11, %v13, %v1, %v10\n"
	"	mtctr	%r6\n"
	"	add	%r0, %r5, %r5\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v12, %v11\n"
	"._L26:\n"
	"	li	%r9, 15\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r9, %r4\n"
	"	stvx	%v0, 0, %r3\n"
	"	vperm	%v12, %v13, %v1, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r9, %r4\n"
	"	vavgub	%v0, %v12, %v11\n"
	"	stvx	%v0, %r5, %r3\n"
	"	vperm	%v11, %v13, %v1, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	add	%r3, %r3, %r0\n"
	"	vavgub	%v0, %v12, %v11\n"
	"	bdnz	._L26\n"
	"	lvx	%v1, %r9, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	stvx	%v0, 0, %r3\n"
	"	vperm	%v12, %v13, %v1, %v10\n"
	"	vavgub	%v0, %v12, %v11\n"
	"	stvx	%v0, %r5, %r3\n"
	);
}

static void MC_put_y_8_altivec (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    asm ("\n"
	"	lvsl	%v13, 0, %r4\n"
	"	lvsl	%v11, %r5, %r4\n"
	"	vmrghb	%v13, %v13, %v13\n"
	"	li	%r9, 7\n"
	"	lvx	%v12, 0, %r4\n"
	"	vmrghb	%v11, %v11, %v11\n"
	"	lvx	%v1, %r9, %r4\n"
	"	vpkuhum	%v9, %v13, %v13\n"
	"	add	%r4, %r4, %r5\n"
	"	vpkuhum	%v10, %v11, %v11\n"
	"	vperm	%v13, %v12, %v1, %v9\n"
	"	srawi	%r6, %r6, 1\n"
	"	lvx	%v12, 0, %r4\n"
	"	lvx	%v1, %r9, %r4\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v11, %v12, %v1, %v10\n"
	"	mtctr	%r6\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v13, %v11\n"
	"._L31:\n"
	"	li	%r9, 7\n"
	"	lvx	%v1, %r9, %r4\n"
	"	lvx	%v12, 0, %r4\n"
	"	stvewx	%v0, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vperm	%v13, %v12, %v1, %v9\n"
	"	stvewx	%v0, %r9, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v13, %v11\n"
	"	li	%r9, 7\n"
	"	lvx	%v1, %r9, %r4\n"
	"	lvx	%v12, 0, %r4\n"
	"	add	%r3, %r3, %r5\n"
	"	stvewx	%v0, 0, %r3\n"
	"	vperm	%v11, %v12, %v1, %v10\n"
	"	li	%r9, 4\n"
	"	stvewx	%v0, %r9, %r3\n"
	"	vavgub	%v0, %v13, %v11\n"
	"	add	%r4, %r4, %r5\n"
	"	add	%r3, %r3, %r5\n"
	"	bdnz	._L31\n"
	"	li	%r9, 7\n"
	"	lvx	%v1, %r9, %r4\n"
	"	lvx	%v12, 0, %r4\n"
	"	stvewx	%v0, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vperm	%v13, %v12, %v1, %v9\n"
	"	stvewx	%v0, %r9, %r3\n"
	"	add	%r3, %r3, %r5\n"
	"	vavgub	%v0, %v13, %v11\n"
	"	stvewx	%v0, 0, %r3\n"
	"	stvewx	%v0, %r9, %r3\n"
	);
}

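/*
 * The _xy kernels below interpolate four neighbouring pixels with the
 * MPEG-2 rounding rule, (a + b + c + d + 2) >> 2, without widening to
 * 16 bits.  vavgub rounds up, so a plain average of the two per-row
 * averages can come out one too high; the vxor/vor/vand sequence
 * computes exactly that correction.  A scalar sketch of the identity
 * (illustrative only, not part of the library):
 */
#if 0
static inline int avg4_round (int a, int b, int c, int d)
{
    int ab = (a + b + 1) >> 1;				/* vavgub */
    int cd = (c + d + 1) >> 1;				/* vavgub */
    int fix = ((a ^ b) | (c ^ d)) & (ab ^ cd) & 1;	/* vxor/vor/vand */
    return ((ab + cd + 1) >> 1) - fix;	/* == (a + b + c + d + 2) >> 2 */
}
#endif
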
static void MC_put_xy_16_altivec (uint8_t * dest, const uint8_t * ref,
                                  int stride, int height)
{
    asm ("\n"
	"	lvsl	%v5, 0, %r4\n"
	"	vspltisb	%v3, 1\n"
	"	li	%r9, 16\n"
	"	lvx	%v1, 0, %r4\n"
	"	vaddubm	%v4, %v5, %v3\n"
	"	lvx	%v0, %r9, %r4\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v10, %v1, %v0, %v4\n"
	"	srawi	%r6, %r6, 1\n"
	"	vperm	%v11, %v1, %v0, %v5\n"
	"	addi	%r6, %r6, -1\n"
	"	lvx	%v1, 0, %r4\n"
	"	mtctr	%r6\n"
	"	lvx	%v0, %r9, %r4\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	vxor	%v8, %v11, %v10\n"
	"	add	%r0, %r5, %r5\n"
	"	vperm	%v10, %v1, %v0, %v4\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v11, %v1, %v0, %v5\n"
	"	vxor	%v6, %v11, %v10\n"
	"	vavgub	%v7, %v11, %v10\n"
	"	vor	%v0, %v8, %v6\n"
	"	vxor	%v13, %v9, %v7\n"
	"	vand	%v0, %v3, %v0\n"
	"	vavgub	%v1, %v9, %v7\n"
	"	vand	%v0, %v0, %v13\n"
	"	vsububm	%v13, %v1, %v0\n"
	"._L36:\n"
	"	li	%r9, 16\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v0, %r9, %r4\n"
	"	stvx	%v13, 0, %r3\n"
	"	vperm	%v10, %v1, %v0, %v4\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v11, %v1, %v0, %v5\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v0, %r9, %r4\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	vxor	%v8, %v11, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v10, %v1, %v0, %v4\n"
	"	vavgub	%v12, %v9, %v7\n"
	"	vperm	%v11, %v1, %v0, %v5\n"
	"	vor	%v13, %v8, %v6\n"
	"	vxor	%v0, %v9, %v7\n"
	"	vxor	%v6, %v11, %v10\n"
	"	vand	%v13, %v3, %v13\n"
	"	vavgub	%v7, %v11, %v10\n"
	"	vor	%v1, %v8, %v6\n"
	"	vand	%v13, %v13, %v0\n"
	"	vxor	%v0, %v9, %v7\n"
	"	vand	%v1, %v3, %v1\n"
	"	vsububm	%v13, %v12, %v13\n"
	"	vand	%v1, %v1, %v0\n"
	"	stvx	%v13, %r5, %r3\n"
	"	vavgub	%v0, %v9, %v7\n"
	"	add	%r3, %r3, %r0\n"
	"	vsububm	%v13, %v0, %v1\n"
	"	bdnz	._L36\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v1, 0, %r4\n"
	"	stvx	%v13, 0, %r3\n"
	"	vperm	%v10, %v1, %v0, %v4\n"
	"	vperm	%v11, %v1, %v0, %v5\n"
	"	vxor	%v8, %v11, %v10\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	vor	%v0, %v8, %v6\n"
	"	vxor	%v13, %v9, %v7\n"
	"	vand	%v0, %v3, %v0\n"
	"	vavgub	%v1, %v9, %v7\n"
	"	vand	%v0, %v0, %v13\n"
	"	vsububm	%v13, %v1, %v0\n"
	"	stvx	%v13, %r5, %r3\n"
	);
}

static void MC_put_xy_8_altivec (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    asm ("\n"
	"	lvsl	%v4, 0, %r4\n"
	"	vspltisb	%v3, 1\n"
	"	lvsl	%v5, %r5, %r4\n"
	"	vmrghb	%v4, %v4, %v4\n"
	"	li	%r9, 8\n"
	"	vmrghb	%v5, %v5, %v5\n"
	"	lvx	%v1, 0, %r4\n"
	"	vpkuhum	%v4, %v4, %v4\n"
	"	lvx	%v0, %r9, %r4\n"
	"	vpkuhum	%v5, %v5, %v5\n"
	"	add	%r4, %r4, %r5\n"
	"	vaddubm	%v2, %v4, %v3\n"
	"	vperm	%v11, %v1, %v0, %v4\n"
	"	srawi	%r6, %r6, 1\n"
	"	vaddubm	%v19, %v5, %v3\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v10, %v1, %v0, %v2\n"
	"	mtctr	%r6\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v0, %r9, %r4\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	vxor	%v8, %v11, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v10, %v1, %v0, %v19\n"
	"	vperm	%v11, %v1, %v0, %v5\n"
	"	vxor	%v6, %v11, %v10\n"
	"	vavgub	%v7, %v11, %v10\n"
	"	vor	%v0, %v8, %v6\n"
	"	vxor	%v13, %v9, %v7\n"
	"	vand	%v0, %v3, %v0\n"
	"	vavgub	%v1, %v9, %v7\n"
	"	vand	%v0, %v0, %v13\n"
	"	vsububm	%v13, %v1, %v0\n"
	"._L41:\n"
	"	li	%r9, 8\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v1, 0, %r4\n"
	"	stvewx	%v13, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vperm	%v10, %v1, %v0, %v2\n"
	"	stvewx	%v13, %r9, %r3\n"
	"	vperm	%v11, %v1, %v0, %v4\n"
	"	add	%r4, %r4, %r5\n"
	"	li	%r9, 8\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	lvx	%v0, %r9, %r4\n"
	"	vxor	%v8, %v11, %v10\n"
	"	lvx	%v1, 0, %r4\n"
	"	vavgub	%v12, %v9, %v7\n"
	"	vor	%v13, %v8, %v6\n"
	"	add	%r3, %r3, %r5\n"
	"	vperm	%v10, %v1, %v0, %v19\n"
	"	li	%r9, 4\n"
	"	vperm	%v11, %v1, %v0, %v5\n"
	"	vand	%v13, %v3, %v13\n"
	"	add	%r4, %r4, %r5\n"
	"	vxor	%v0, %v9, %v7\n"
	"	vxor	%v6, %v11, %v10\n"
	"	vavgub	%v7, %v11, %v10\n"
	"	vor	%v1, %v8, %v6\n"
	"	vand	%v13, %v13, %v0\n"
	"	vxor	%v0, %v9, %v7\n"
	"	vand	%v1, %v3, %v1\n"
	"	vsububm	%v13, %v12, %v13\n"
	"	vand	%v1, %v1, %v0\n"
	"	stvewx	%v13, 0, %r3\n"
	"	vavgub	%v0, %v9, %v7\n"
	"	stvewx	%v13, %r9, %r3\n"
	"	add	%r3, %r3, %r5\n"
	"	vsububm	%v13, %v0, %v1\n"
	"	bdnz	._L41\n"
	"	li	%r9, 8\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v1, 0, %r4\n"
	"	stvewx	%v13, 0, %r3\n"
	"	vperm	%v10, %v1, %v0, %v2\n"
	"	li	%r9, 4\n"
	"	vperm	%v11, %v1, %v0, %v4\n"
	"	stvewx	%v13, %r9, %r3\n"
	"	add	%r3, %r3, %r5\n"
	"	vxor	%v8, %v11, %v10\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	vor	%v0, %v8, %v6\n"
	"	vxor	%v13, %v9, %v7\n"
	"	vand	%v0, %v3, %v0\n"
	"	vavgub	%v1, %v9, %v7\n"
	"	vand	%v0, %v0, %v13\n"
	"	vsububm	%v13, %v1, %v0\n"
	"	stvewx	%v13, 0, %r3\n"
	"	stvewx	%v13, %r9, %r3\n"
	);
}

static void MC_avg_o_16_altivec (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    asm ("\n"
	"	li	%r9, 15\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvsl	%v11, 0, %r4\n"
	"	lvx	%v1, 0, %r4\n"
	"	srawi	%r6, %r6, 1\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v0, %v1, %v0, %v11\n"
	"	lvx	%v13, 0, %r3\n"
	"	mtctr	%r6\n"
	"	add	%r9, %r5, %r5\n"
	"	vavgub	%v12, %v13, %v0\n"
	"	add	%r4, %r4, %r5\n"
	"._L46:\n"
	"	li	%r11, 15\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v0, %r11, %r4\n"
	"	lvx	%v13, %r5, %r3\n"
	"	vperm	%v0, %v1, %v0, %v11\n"
	"	stvx	%v12, 0, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v12, %v13, %v0\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v0, %r11, %r4\n"
	"	lvx	%v13, %r9, %r3\n"
	"	vperm	%v0, %v1, %v0, %v11\n"
	"	stvx	%v12, %r5, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v12, %v13, %v0\n"
	"	add	%r3, %r3, %r9\n"
	"	bdnz	._L46\n"
	"	lvx	%v0, %r11, %r4\n"
	"	lvx	%v1, 0, %r4\n"
	"	lvx	%v13, %r5, %r3\n"
	"	vperm	%v0, %v1, %v0, %v11\n"
	"	stvx	%v12, 0, %r3\n"
	"	vavgub	%v12, %v13, %v0\n"
	"	stvx	%v12, %r5, %r3\n"
	);
}

static void MC_avg_o_8_altivec (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    asm ("\n"
	"	lvsl	%v12, 0, %r4\n"
	"	li	%r9, 7\n"
	"	vmrghb	%v12, %v12, %v12\n"
	"	lvsl	%v1, %r5, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	vpkuhum	%v9, %v12, %v12\n"
	"	lvx	%v0, %r9, %r4\n"
	"	srawi	%r6, %r6, 1\n"
	"	vmrghb	%v1, %v1, %v1\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v0, %v13, %v0, %v9\n"
	"	lvx	%v11, 0, %r3\n"
	"	mtctr	%r6\n"
	"	vpkuhum	%v10, %v1, %v1\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v12, %v11, %v0\n"
	"._L51:\n"
	"	li	%r9, 7\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v11, %r5, %r3\n"
	"	stvewx	%v12, 0, %r3\n"
	"	vperm	%v0, %v13, %v0, %v10\n"
	"	li	%r9, 4\n"
	"	stvewx	%v12, %r9, %r3\n"
	"	vavgub	%v1, %v11, %v0\n"
	"	add	%r4, %r4, %r5\n"
	"	li	%r9, 7\n"
	"	lvx	%v0, %r9, %r4\n"
	"	add	%r3, %r3, %r5\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v11, %r5, %r3\n"
	"	stvewx	%v1, 0, %r3\n"
	"	vperm	%v0, %v13, %v0, %v9\n"
	"	li	%r9, 4\n"
	"	stvewx	%v1, %r9, %r3\n"
	"	vavgub	%v12, %v11, %v0\n"
	"	add	%r4, %r4, %r5\n"
	"	add	%r3, %r3, %r5\n"
	"	bdnz	._L51\n"
	"	li	%r9, 7\n"
	"	lvx	%v0, %r9, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v11, %r5, %r3\n"
	"	stvewx	%v12, 0, %r3\n"
	"	vperm	%v0, %v13, %v0, %v10\n"
	"	li	%r9, 4\n"
	"	stvewx	%v12, %r9, %r3\n"
	"	vavgub	%v1, %v11, %v0\n"
	"	add	%r3, %r3, %r5\n"
	"	stvewx	%v1, 0, %r3\n"
	"	stvewx	%v1, %r9, %r3\n"
	);
}

static void MC_avg_x_16_altivec (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    asm ("\n"
	"	lvsl	%v8, 0, %r4\n"
	"	vspltisb	%v0, 1\n"
	"	li	%r9, 16\n"
	"	lvx	%v12, %r9, %r4\n"
	"	vaddubm	%v7, %v8, %v0\n"
	"	lvx	%v11, 0, %r4\n"
	"	srawi	%r6, %r6, 1\n"
	"	vperm	%v1, %v11, %v12, %v7\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v0, %v11, %v12, %v8\n"
	"	lvx	%v9, 0, %r3\n"
	"	mtctr	%r6\n"
	"	add	%r9, %r5, %r5\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v10, %v9, %v0\n"
	"._L56:\n"
	"	li	%r11, 16\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v12, %r11, %r4\n"
	"	lvx	%v9, %r5, %r3\n"
	"	stvx	%v10, 0, %r3\n"
	"	vperm	%v0, %v11, %v12, %v7\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v1, %v11, %v12, %v8\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v12, %r11, %r4\n"
	"	vavgub	%v1, %v1, %v0\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v13, %v11, %v12, %v7\n"
	"	vavgub	%v10, %v9, %v1\n"
	"	vperm	%v0, %v11, %v12, %v8\n"
	"	lvx	%v9, %r9, %r3\n"
	"	stvx	%v10, %r5, %r3\n"
	"	vavgub	%v0, %v0, %v13\n"
	"	add	%r3, %r3, %r9\n"
	"	vavgub	%v10, %v9, %v0\n"
	"	bdnz	._L56\n"
	"	lvx	%v12, %r11, %r4\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v9, %r5, %r3\n"
	"	vperm	%v1, %v11, %v12, %v7\n"
	"	stvx	%v10, 0, %r3\n"
	"	vperm	%v0, %v11, %v12, %v8\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	vavgub	%v10, %v9, %v0\n"
	"	stvx	%v10, %r5, %r3\n"
	);
}

static void MC_avg_x_8_altivec (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    asm ("\n"
	"	lvsl	%v10, 0, %r4\n"
	"	vspltisb	%v13, 1\n"
	"	li	%r9, 8\n"
	"	vmrghb	%v10, %v10, %v10\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v12, %r9, %r4\n"
	"	vpkuhum	%v7, %v10, %v10\n"
	"	srawi	%r6, %r6, 1\n"
	"	lvsl	%v10, %r5, %r4\n"
	"	vaddubm	%v6, %v7, %v13\n"
	"	vperm	%v0, %v11, %v12, %v7\n"
	"	addi	%r6, %r6, -1\n"
	"	vmrghb	%v10, %v10, %v10\n"
	"	lvx	%v9, 0, %r3\n"
	"	mtctr	%r6\n"
	"	vperm	%v1, %v11, %v12, %v6\n"
	"	add	%r4, %r4, %r5\n"
	"	vpkuhum	%v8, %v10, %v10\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	vaddubm	%v13, %v8, %v13\n"
	"	vavgub	%v10, %v9, %v0\n"
	"._L61:\n"
	"	li	%r9, 8\n"
	"	lvx	%v12, %r9, %r4\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v9, %r5, %r3\n"
	"	stvewx	%v10, 0, %r3\n"
	"	vperm	%v1, %v11, %v12, %v13\n"
	"	vperm	%v0, %v11, %v12, %v8\n"
	"	li	%r9, 4\n"
	"	stvewx	%v10, %r9, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	li	%r9, 8\n"
	"	lvx	%v12, %r9, %r4\n"
	"	vavgub	%v10, %v9, %v0\n"
	"	lvx	%v11, 0, %r4\n"
	"	add	%r3, %r3, %r5\n"
	"	vperm	%v1, %v11, %v12, %v6\n"
	"	lvx	%v9, %r5, %r3\n"
	"	vperm	%v0, %v11, %v12, %v7\n"
	"	stvewx	%v10, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	stvewx	%v10, %r9, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	add	%r3, %r3, %r5\n"
	"	vavgub	%v10, %v9, %v0\n"
	"	bdnz	._L61\n"
	"	li	%r9, 8\n"
	"	lvx	%v12, %r9, %r4\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v9, %r5, %r3\n"
	"	vperm	%v1, %v11, %v12, %v13\n"
	"	stvewx	%v10, 0, %r3\n"
	"	vperm	%v0, %v11, %v12, %v8\n"
	"	li	%r9, 4\n"
	"	stvewx	%v10, %r9, %r3\n"
	"	vavgub	%v0, %v0, %v1\n"
	"	add	%r3, %r3, %r5\n"
	"	vavgub	%v10, %v9, %v0\n"
	"	stvewx	%v10, 0, %r3\n"
	"	stvewx	%v10, %r9, %r3\n"
	);
}

static void MC_avg_y_16_altivec (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    asm ("\n"
	"	li	%r9, 15\n"
	"	lvx	%v1, %r9, %r4\n"
	"	lvsl	%v9, 0, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v11, %v13, %v1, %v9\n"
	"	li	%r11, 15\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r11, %r4\n"
	"	srawi	%r6, %r6, 1\n"
	"	vperm	%v10, %v13, %v1, %v9\n"
	"	addi	%r6, %r6, -1\n"
	"	lvx	%v12, 0, %r3\n"
	"	mtctr	%r6\n"
	"	vavgub	%v0, %v11, %v10\n"
	"	add	%r9, %r5, %r5\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v12, %v0\n"
	"._L66:\n"
	"	li	%r11, 15\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r11, %r4\n"
	"	lvx	%v12, %r5, %r3\n"
	"	vperm	%v11, %v13, %v1, %v9\n"
	"	stvx	%v0, 0, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v11, %v10\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r11, %r4\n"
	"	vavgub	%v0, %v12, %v0\n"
	"	add	%r4, %r4, %r5\n"
	"	lvx	%v12, %r9, %r3\n"
	"	vperm	%v10, %v13, %v1, %v9\n"
	"	stvx	%v0, %r5, %r3\n"
	"	vavgub	%v0, %v11, %v10\n"
	"	add	%r3, %r3, %r9\n"
	"	vavgub	%v0, %v12, %v0\n"
	"	bdnz	._L66\n"
	"	lvx	%v1, %r11, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v12, %r5, %r3\n"
	"	vperm	%v11, %v13, %v1, %v9\n"
	"	stvx	%v0, 0, %r3\n"
	"	vavgub	%v0, %v11, %v10\n"
	"	vavgub	%v0, %v12, %v0\n"
	"	stvx	%v0, %r5, %r3\n"
	);
}

static void MC_avg_y_8_altivec (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    asm ("\n"
	"	lvsl	%v12, 0, %r4\n"
	"	lvsl	%v9, %r5, %r4\n"
	"	vmrghb	%v12, %v12, %v12\n"
	"	li	%r9, 7\n"
	"	lvx	%v11, 0, %r4\n"
	"	vmrghb	%v9, %v9, %v9\n"
	"	lvx	%v13, %r9, %r4\n"
	"	vpkuhum	%v7, %v12, %v12\n"
	"	add	%r4, %r4, %r5\n"
	"	vpkuhum	%v8, %v9, %v9\n"
	"	vperm	%v12, %v11, %v13, %v7\n"
	"	srawi	%r6, %r6, 1\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v13, %r9, %r4\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v9, %v11, %v13, %v8\n"
	"	lvx	%v10, 0, %r3\n"
	"	mtctr	%r6\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v0, %v12, %v9\n"
	"	vavgub	%v1, %v10, %v0\n"
	"._L71:\n"
	"	li	%r9, 7\n"
	"	lvx	%v13, %r9, %r4\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v10, %r5, %r3\n"
	"	stvewx	%v1, 0, %r3\n"
	"	vperm	%v12, %v11, %v13, %v7\n"
	"	li	%r9, 4\n"
	"	stvewx	%v1, %r9, %r3\n"
	"	vavgub	%v0, %v12, %v9\n"
	"	add	%r4, %r4, %r5\n"
	"	li	%r9, 7\n"
	"	vavgub	%v1, %v10, %v0\n"
	"	lvx	%v13, %r9, %r4\n"
	"	lvx	%v11, 0, %r4\n"
	"	add	%r3, %r3, %r5\n"
	"	vperm	%v9, %v11, %v13, %v8\n"
	"	lvx	%v10, %r5, %r3\n"
	"	stvewx	%v1, 0, %r3\n"
	"	vavgub	%v0, %v12, %v9\n"
	"	li	%r9, 4\n"
	"	stvewx	%v1, %r9, %r3\n"
	"	add	%r4, %r4, %r5\n"
	"	vavgub	%v1, %v10, %v0\n"
	"	add	%r3, %r3, %r5\n"
	"	bdnz	._L71\n"
	"	li	%r9, 7\n"
	"	lvx	%v13, %r9, %r4\n"
	"	lvx	%v11, 0, %r4\n"
	"	lvx	%v10, %r5, %r3\n"
	"	vperm	%v12, %v11, %v13, %v7\n"
	"	stvewx	%v1, 0, %r3\n"
	"	li	%r9, 4\n"
	"	vavgub	%v0, %v12, %v9\n"
	"	stvewx	%v1, %r9, %r3\n"
	"	add	%r3, %r3, %r5\n"
	"	vavgub	%v1, %v10, %v0\n"
	"	stvewx	%v1, 0, %r3\n"
	"	stvewx	%v1, %r9, %r3\n"
	);
}

static void MC_avg_xy_16_altivec (uint8_t * dest, const uint8_t * ref,
                                  int stride, int height)
{
    asm ("\n"
	"	lvsl	%v4, 0, %r4\n"
	"	vspltisb	%v2, 1\n"
	"	li	%r9, 16\n"
	"	lvx	%v1, %r9, %r4\n"
	"	vaddubm	%v3, %v4, %v2\n"
	"	lvx	%v13, 0, %r4\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v10, %v13, %v1, %v3\n"
	"	li	%r11, 16\n"
	"	vperm	%v11, %v13, %v1, %v4\n"
	"	srawi	%r6, %r6, 1\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r11, %r4\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	vxor	%v8, %v11, %v10\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v10, %v13, %v1, %v3\n"
	"	lvx	%v6, 0, %r3\n"
	"	mtctr	%r6\n"
	"	vperm	%v11, %v13, %v1, %v4\n"
	"	add	%r9, %r5, %r5\n"
	"	add	%r4, %r4, %r5\n"
	"	vxor	%v5, %v11, %v10\n"
	"	vavgub	%v7, %v11, %v10\n"
	"	vor	%v1, %v8, %v5\n"
	"	vxor	%v13, %v9, %v7\n"
	"	vand	%v1, %v2, %v1\n"
	"	vavgub	%v0, %v9, %v7\n"
	"	vand	%v1, %v1, %v13\n"
	"	vsububm	%v0, %v0, %v1\n"
	"	vavgub	%v12, %v6, %v0\n"
	"._L76:\n"
	"	li	%r11, 16\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r11, %r4\n"
	"	lvx	%v6, %r5, %r3\n"
	"	stvx	%v12, 0, %r3\n"
	"	vperm	%v10, %v13, %v1, %v3\n"
	"	vperm	%v11, %v13, %v1, %v4\n"
	"	add	%r4, %r4, %r5\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v1, %r11, %r4\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	vxor	%v8, %v11, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v10, %v13, %v1, %v3\n"
	"	vavgub	%v12, %v9, %v7\n"
	"	vperm	%v11, %v13, %v1, %v4\n"
	"	vor	%v0, %v8, %v5\n"
	"	vxor	%v13, %v9, %v7\n"
	"	vxor	%v5, %v11, %v10\n"
	"	vand	%v0, %v2, %v0\n"
	"	vavgub	%v7, %v11, %v10\n"
	"	vor	%v1, %v8, %v5\n"
	"	vand	%v0, %v0, %v13\n"
	"	vand	%v1, %v2, %v1\n"
	"	vxor	%v13, %v9, %v7\n"
	"	vsububm	%v12, %v12, %v0\n"
	"	vand	%v1, %v1, %v13\n"
	"	vavgub	%v0, %v9, %v7\n"
	"	vavgub	%v12, %v6, %v12\n"
	"	lvx	%v6, %r9, %r3\n"
	"	vsububm	%v0, %v0, %v1\n"
	"	stvx	%v12, %r5, %r3\n"
	"	vavgub	%v12, %v6, %v0\n"
	"	add	%r3, %r3, %r9\n"
	"	bdnz	._L76\n"
	"	lvx	%v1, %r11, %r4\n"
	"	lvx	%v13, 0, %r4\n"
	"	lvx	%v6, %r5, %r3\n"
	"	vperm	%v10, %v13, %v1, %v3\n"
	"	stvx	%v12, 0, %r3\n"
	"	vperm	%v11, %v13, %v1, %v4\n"
	"	vxor	%v8, %v11, %v10\n"
	"	vavgub	%v9, %v11, %v10\n"
	"	vor	%v0, %v8, %v5\n"
	"	vxor	%v13, %v9, %v7\n"
	"	vand	%v0, %v2, %v0\n"
	"	vavgub	%v1, %v9, %v7\n"
	"	vand	%v0, %v0, %v13\n"
	"	vsububm	%v1, %v1, %v0\n"
	"	vavgub	%v12, %v6, %v1\n"
	"	stvx	%v12, %r5, %r3\n"
	);
}

static void MC_avg_xy_8_altivec (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    asm ("\n"
	"	lvsl	%v2, 0, %r4\n"
	"	vspltisb	%v19, 1\n"
	"	lvsl	%v3, %r5, %r4\n"
	"	vmrghb	%v2, %v2, %v2\n"
	"	li	%r9, 8\n"
	"	vmrghb	%v3, %v3, %v3\n"
	"	lvx	%v9, 0, %r4\n"
	"	vpkuhum	%v2, %v2, %v2\n"
	"	lvx	%v1, %r9, %r4\n"
	"	vpkuhum	%v3, %v3, %v3\n"
	"	add	%r4, %r4, %r5\n"
	"	vaddubm	%v18, %v2, %v19\n"
	"	vperm	%v11, %v9, %v1, %v2\n"
	"	srawi	%r6, %r6, 1\n"
	"	vaddubm	%v17, %v3, %v19\n"
	"	addi	%r6, %r6, -1\n"
	"	vperm	%v10, %v9, %v1, %v18\n"
	"	lvx	%v4, 0, %r3\n"
	"	mtctr	%r6\n"
	"	lvx	%v1, %r9, %r4\n"
	"	lvx	%v9, 0, %r4\n"
	"	vavgub	%v8, %v11, %v10\n"
	"	vxor	%v7, %v11, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	vperm	%v10, %v9, %v1, %v17\n"
	"	vperm	%v11, %v9, %v1, %v3\n"
	"	vxor	%v5, %v11, %v10\n"
	"	vavgub	%v6, %v11, %v10\n"
	"	vor	%v1, %v7, %v5\n"
	"	vxor	%v13, %v8, %v6\n"
	"	vand	%v1, %v19, %v1\n"
	"	vavgub	%v0, %v8, %v6\n"
	"	vand	%v1, %v1, %v13\n"
	"	vsububm	%v0, %v0, %v1\n"
	"	vavgub	%v13, %v4, %v0\n"
	"._L81:\n"
	"	li	%r9, 8\n"
	"	lvx	%v1, %r9, %r4\n"
	"	lvx	%v9, 0, %r4\n"
	"	lvx	%v4, %r5, %r3\n"
	"	stvewx	%v13, 0, %r3\n"
	"	vperm	%v10, %v9, %v1, %v18\n"
	"	vperm	%v11, %v9, %v1, %v2\n"
	"	li	%r9, 4\n"
	"	stvewx	%v13, %r9, %r3\n"
	"	vxor	%v7, %v11, %v10\n"
	"	add	%r4, %r4, %r5\n"
	"	li	%r9, 8\n"
	"	vavgub	%v8, %v11, %v10\n"
	"	lvx	%v1, %r9, %r4\n"
	"	vor	%v0, %v7, %v5\n"
	"	lvx	%v9, 0, %r4\n"
	"	vxor	%v12, %v8, %v6\n"
	"	vand	%v0, %v19, %v0\n"
	"	add	%r3, %r3, %r5\n"
	"	vperm	%v10, %v9, %v1, %v17\n"
	"	vavgub	%v13, %v8, %v6\n"
	"	li	%r9, 4\n"
	"	vperm	%v11, %v9, %v1, %v3\n"
	"	vand	%v0, %v0, %v12\n"
	"	add	%r4, %r4, %r5\n"
	"	vxor	%v5, %v11, %v10\n"
	"	vavgub	%v6, %v11, %v10\n"
	"	vor	%v1, %v7, %v5\n"
	"	vsububm	%v13, %v13, %v0\n"
	"	vxor	%v0, %v8, %v6\n"
	"	vand	%v1, %v19, %v1\n"
	"	vavgub	%v13, %v4, %v13\n"
	"	vand	%v1, %v1, %v0\n"
	"	lvx	%v4, %r5, %r3\n"
	"	vavgub	%v0, %v8, %v6\n"
	"	stvewx	%v13, 0, %r3\n"
	"	stvewx	%v13, %r9, %r3\n"
	"	vsububm	%v0, %v0, %v1\n"
	"	add	%r3, %r3, %r5\n"
	"	vavgub	%v13, %v4, %v0\n"
	"	bdnz	._L81\n"
	"	li	%r9, 8\n"
	"	lvx	%v1, %r9, %r4\n"
	"	lvx	%v9, 0, %r4\n"
	"	lvx	%v4, %r5, %r3\n"
	"	vperm	%v10, %v9, %v1, %v18\n"
	"	stvewx	%v13, 0, %r3\n"
	"	vperm	%v11, %v9, %v1, %v2\n"
	"	li	%r9, 4\n"
	"	stvewx	%v13, %r9, %r3\n"
	"	vxor	%v7, %v11, %v10\n"
	"	add	%r3, %r3, %r5\n"
	"	vavgub	%v8, %v11, %v10\n"
	"	vor	%v0, %v7, %v5\n"
	"	vxor	%v13, %v8, %v6\n"
	"	vand	%v0, %v19, %v0\n"
	"	vavgub	%v1, %v8, %v6\n"
	"	vand	%v0, %v0, %v13\n"
	"	vsububm	%v1, %v1, %v0\n"
	"	vavgub	%v13, %v4, %v1\n"
	"	stvewx	%v13, 0, %r3\n"
	"	stvewx	%v13, %r9, %r3\n"
	);
}

MPEG2_MC_EXTERN (altivec)

#endif /* ARCH_PPC */

#else /* __ALTIVEC__ */

#define vector_s16_t vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int

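/*
 * These intrinsics versions all use the same unaligned-load idiom:
 * vec_lvsl () builds a permute vector from the low address bits, two
 * vec_ld () fetch the aligned quadwords covering the wanted bytes, and
 * vec_perm () extracts them.  The *_8 variants additionally duplicate
 * the permute (vec_mergeh + vec_pack) so the 8 result bytes appear in
 * both halves of the register, which lets vec_ste () store them as two
 * 4-byte words at offsets 0 and 4 for an 8-byte-aligned dest.
 */
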
void MC_put_o_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm, ref0, ref1, tmp;

    perm = vec_lvsl (0, ref);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    tmp = vec_perm (ref0, ref1, perm);

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        vec_st (tmp, 0, dest);
        tmp = vec_perm (ref0, ref1, perm);

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp = vec_perm (ref0, ref1, perm);
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    vec_st (tmp, 0, dest);
    tmp = vec_perm (ref0, ref1, perm);
    vec_st (tmp, stride, dest);
}

void MC_put_o_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0, perm1, tmp0, tmp1, ref0, ref1;

    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_perm (ref0, ref1, perm1);

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_perm (ref0, ref1, perm0);
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
    dest += stride;
    tmp1 = vec_perm (ref0, ref1, perm1);
    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
}

void MC_put_x_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t permA, permB, ref0, ref1, tmp;

    permA = vec_lvsl (0, ref);
    permB = vec_add (permA, vec_splat_u8 (1));

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    ref += stride;
    tmp = vec_avg (vec_perm (ref0, ref1, permA),
                   vec_perm (ref0, ref1, permB));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        vec_st (tmp, 0, dest);
        tmp = vec_avg (vec_perm (ref0, ref1, permA),
                       vec_perm (ref0, ref1, permB));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp = vec_avg (vec_perm (ref0, ref1, permA),
                       vec_perm (ref0, ref1, permB));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    vec_st (tmp, 0, dest);
    tmp = vec_avg (vec_perm (ref0, ref1, permA),
                   vec_perm (ref0, ref1, permB));
    vec_st (tmp, stride, dest);
}

void MC_put_x_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0A, perm0B, perm1A, perm1B, ones, tmp0, tmp1, ref0, ref1;

    ones = vec_splat_u8 (1);
    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0A = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    perm0B = vec_add (perm0A, ones);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1A = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
    perm1B = vec_add (perm1A, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    ref += stride;
    tmp0 = vec_avg (vec_perm (ref0, ref1, perm0A),
                    vec_perm (ref0, ref1, perm0B));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_avg (vec_perm (ref0, ref1, perm1A),
                        vec_perm (ref0, ref1, perm1B));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_avg (vec_perm (ref0, ref1, perm0A),
                        vec_perm (ref0, ref1, perm0B));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
    dest += stride;
    tmp1 = vec_avg (vec_perm (ref0, ref1, perm1A),
                    vec_perm (ref0, ref1, perm1B));
    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
}

void MC_put_y_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm, ref0, ref1, tmp0, tmp1, tmp;

    perm = vec_lvsl (0, ref);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm);
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    tmp1 = vec_perm (ref0, ref1, perm);
    tmp = vec_avg (tmp0, tmp1);

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        vec_st (tmp, 0, dest);
        tmp0 = vec_perm (ref0, ref1, perm);
        tmp = vec_avg (tmp0, tmp1);

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp1 = vec_perm (ref0, ref1, perm);
        tmp = vec_avg (tmp0, tmp1);
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    vec_st (tmp, 0, dest);
    tmp0 = vec_perm (ref0, ref1, perm);
    tmp = vec_avg (tmp0, tmp1);
    vec_st (tmp, stride, dest);
}

void MC_put_y_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0, perm1, tmp0, tmp1, tmp, ref0, ref1;

    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    tmp1 = vec_perm (ref0, ref1, perm1);
    tmp = vec_avg (tmp0, tmp1);

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_perm (ref0, ref1, perm0);
        tmp = vec_avg (tmp0, tmp1);

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_perm (ref0, ref1, perm1);
        tmp = vec_avg (tmp0, tmp1);
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
    dest += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);
    tmp = vec_avg (tmp0, tmp1);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
}

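/*
 * Same rounding trick as the asm branch: per byte,
 * vec_avg (avg0, avg1) - (ones & (xor0 | xor1) & (avg0 ^ avg1))
 * equals (A + B + C + D + 2) >> 2, where avg0/xor0 and avg1/xor1 are the
 * vec_avg/vec_xor of the two horizontally adjacent samples of each row.
 */
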
1357 void MC_put_xy_16_altivec (unsigned char * dest, const unsigned char * ref,
|
|
1358 const int stride, int height)
|
|
1359 {
|
|
1360 vector_u8_t permA, permB, ref0, ref1, A, B, avg0, avg1, xor0, xor1, tmp;
|
|
1361 vector_u8_t ones;
|
|
1362
|
|
1363 ones = vec_splat_u8 (1);
|
|
1364 permA = vec_lvsl (0, ref);
|
|
1365 permB = vec_add (permA, ones);
|
|
1366
|
|
1367 height = (height >> 1) - 1;
|
|
1368
|
|
1369 ref0 = vec_ld (0, ref);
|
|
1370 ref1 = vec_ld (16, ref);
|
|
1371 ref += stride;
|
|
1372 A = vec_perm (ref0, ref1, permA);
|
|
1373 B = vec_perm (ref0, ref1, permB);
|
|
1374 avg0 = vec_avg (A, B);
|
|
1375 xor0 = vec_xor (A, B);
|
|
1376
|
|
1377 ref0 = vec_ld (0, ref);
|
|
1378 ref1 = vec_ld (16, ref);
|
|
1379 ref += stride;
|
|
1380 A = vec_perm (ref0, ref1, permA);
|
|
1381 B = vec_perm (ref0, ref1, permB);
|
|
1382 avg1 = vec_avg (A, B);
|
|
1383 xor1 = vec_xor (A, B);
|
|
1384 tmp = vec_sub (vec_avg (avg0, avg1),
|
|
1385 vec_and (vec_and (ones, vec_or (xor0, xor1)),
|
|
1386 vec_xor (avg0, avg1)));
|
|
1387
|
|
1388 do {
|
|
1389 ref0 = vec_ld (0, ref);
|
|
1390 ref1 = vec_ld (16, ref);
|
|
1391 ref += stride;
|
|
1392 vec_st (tmp, 0, dest);
|
|
1393 A = vec_perm (ref0, ref1, permA);
|
|
1394 B = vec_perm (ref0, ref1, permB);
|
|
1395 avg0 = vec_avg (A, B);
|
|
1396 xor0 = vec_xor (A, B);
|
|
1397 tmp = vec_sub (vec_avg (avg0, avg1),
|
|
1398 vec_and (vec_and (ones, vec_or (xor0, xor1)),
|
|
1399 vec_xor (avg0, avg1)));
|
|
1400
|
|
1401 ref0 = vec_ld (0, ref);
|
|
1402 ref1 = vec_ld (16, ref);
|
|
1403 ref += stride;
|
|
1404 vec_st (tmp, stride, dest);
|
|
1405 dest += 2*stride;
|
|
1406 A = vec_perm (ref0, ref1, permA);
|
|
1407 B = vec_perm (ref0, ref1, permB);
|
|
1408 avg1 = vec_avg (A, B);
|
|
1409 xor1 = vec_xor (A, B);
|
|
1410 tmp = vec_sub (vec_avg (avg0, avg1),
|
|
1411 vec_and (vec_and (ones, vec_or (xor0, xor1)),
|
|
1412 vec_xor (avg0, avg1)));
|
|
1413 } while (--height);
|
|
1414
|
|
1415 ref0 = vec_ld (0, ref);
|
|
1416 ref1 = vec_ld (16, ref);
|
|
1417 vec_st (tmp, 0, dest);
|
|
1418 A = vec_perm (ref0, ref1, permA);
|
|
1419 B = vec_perm (ref0, ref1, permB);
|
|
1420 avg0 = vec_avg (A, B);
|
|
1421 xor0 = vec_xor (A, B);
|
|
1422 tmp = vec_sub (vec_avg (avg0, avg1),
|
|
1423 vec_and (vec_and (ones, vec_or (xor0, xor1)),
|
|
1424 vec_xor (avg0, avg1)));
|
|
1425 vec_st (tmp, stride, dest);
|
|
1426 }
|
|
1427
|
|
1428 void MC_put_xy_8_altivec (unsigned char * dest, const unsigned char * ref,
|
|
1429 const int stride, int height)
|
|
1430 {
|
|
1431 vector_u8_t perm0A, perm0B, perm1A, perm1B, ref0, ref1, A, B;
|
|
1432 vector_u8_t avg0, avg1, xor0, xor1, tmp, ones;
|
|
1433
|
|
1434 ones = vec_splat_u8 (1);
|
|
1435 perm0A = vec_lvsl (0, ref);
|
|
1436 perm0A = vec_mergeh (perm0A, perm0A);
|
|
1437 perm0A = vec_pack ((vector_u16_t)perm0A, (vector_u16_t)perm0A);
|
|
1438 perm0B = vec_add (perm0A, ones);
|
|
1439 perm1A = vec_lvsl (stride, ref);
|
|
1440 perm1A = vec_mergeh (perm1A, perm1A);
|
|
1441 perm1A = vec_pack ((vector_u16_t)perm1A, (vector_u16_t)perm1A);
|
|
1442 perm1B = vec_add (perm1A, ones);
|
|
1443
|
|
1444 height = (height >> 1) - 1;
|
|
1445
|
|
1446 ref0 = vec_ld (0, ref);
|
|
1447 ref1 = vec_ld (8, ref);
|
|
1448 ref += stride;
|
|
1449 A = vec_perm (ref0, ref1, perm0A);
|
|
1450 B = vec_perm (ref0, ref1, perm0B);
|
|
1451 avg0 = vec_avg (A, B);
|
|
1452 xor0 = vec_xor (A, B);
|
|
1453
|
|
1454 ref0 = vec_ld (0, ref);
|
|
1455 ref1 = vec_ld (8, ref);
|
|
1456 ref += stride;
|
|
1457 A = vec_perm (ref0, ref1, perm1A);
|
|
1458 B = vec_perm (ref0, ref1, perm1B);
|
|
1459 avg1 = vec_avg (A, B);
|
|
1460 xor1 = vec_xor (A, B);
|
|
1461 tmp = vec_sub (vec_avg (avg0, avg1),
|
|
1462 vec_and (vec_and (ones, vec_or (xor0, xor1)),
|
|
1463 vec_xor (avg0, avg1)));
|
|
1464
|
|
1465 do {
|
|
1466 ref0 = vec_ld (0, ref);
|
|
1467 ref1 = vec_ld (8, ref);
|
|
1468 ref += stride;
|
|
1469 vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
|
|
1470 vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
|
|
1471 dest += stride;
|
|
1472 A = vec_perm (ref0, ref1, perm0A);
|
|
1473 B = vec_perm (ref0, ref1, perm0B);
|
|
1474 avg0 = vec_avg (A, B);
|
|
1475 xor0 = vec_xor (A, B);
|
|
1476 tmp = vec_sub (vec_avg (avg0, avg1),
|
|
1477 vec_and (vec_and (ones, vec_or (xor0, xor1)),
|
|
1478 vec_xor (avg0, avg1)));
|
|
1479
|
|
1480 ref0 = vec_ld (0, ref);
|
|
1481 ref1 = vec_ld (8, ref);
|
|
1482 ref += stride;
|
|
1483 vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
|
|
1484 vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
|
|
1485 dest += stride;
|
|
1486 A = vec_perm (ref0, ref1, perm1A);
|
|
1487 B = vec_perm (ref0, ref1, perm1B);
|
|
1488 avg1 = vec_avg (A, B);
|
|
1489 xor1 = vec_xor (A, B);
|
|
1490 tmp = vec_sub (vec_avg (avg0, avg1),
|
|
1491 vec_and (vec_and (ones, vec_or (xor0, xor1)),
|
|
1492 vec_xor (avg0, avg1)));
|
|
1493 } while (--height);
|
|
1494
|
|
1495 ref0 = vec_ld (0, ref);
|
|
1496 ref1 = vec_ld (8, ref);
|
|
1497 vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
|
|
1498 vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
|
|
1499 dest += stride;
|
|
1500 A = vec_perm (ref0, ref1, perm0A);
|
|
1501 B = vec_perm (ref0, ref1, perm0B);
|
|
1502 avg0 = vec_avg (A, B);
|
|
1503 xor0 = vec_xor (A, B);
|
|
1504 tmp = vec_sub (vec_avg (avg0, avg1),
|
|
1505 vec_and (vec_and (ones, vec_or (xor0, xor1)),
|
|
1506 vec_xor (avg0, avg1)));
|
|
1507 vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
|
|
1508 vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
|
|
1509 }
|
|
1510
|
|
1511 #if 0
|
|
1512 void MC_put_xy_8_altivec (unsigned char * dest, const unsigned char * ref,
|
|
1513 const int stride, int height)
|
|
1514 {
|
|
1515 vector_u8_t permA, permB, ref0, ref1, A, B, C, D, tmp, zero, ones;
|
|
1516 vector_u16_t splat2, temp;
|
|
1517
|
|
1518 ones = vec_splat_u8 (1);
|
|
1519 permA = vec_lvsl (0, ref);
|
|
1520 permB = vec_add (permA, ones);
|
|
1521
|
|
1522 zero = vec_splat_u8 (0);
|
|
1523 splat2 = vec_splat_u16 (2);
|
|
1524
|
|
1525 do {
|
|
1526 ref0 = vec_ld (0, ref);
|
|
1527 ref1 = vec_ld (8, ref);
|
|
1528 ref += stride;
|
|
1529 A = vec_perm (ref0, ref1, permA);
|
|
1530 B = vec_perm (ref0, ref1, permB);
|
|
1531 ref0 = vec_ld (0, ref);
|
|
1532 ref1 = vec_ld (8, ref);
|
|
1533 C = vec_perm (ref0, ref1, permA);
|
|
1534 D = vec_perm (ref0, ref1, permB);
|
|
1535
|
|
1536 temp = vec_add (vec_add ((vector_u16_t)vec_mergeh (zero, A),
|
|
1537 (vector_u16_t)vec_mergeh (zero, B)),
|
|
1538 vec_add ((vector_u16_t)vec_mergeh (zero, C),
|
|
1539 (vector_u16_t)vec_mergeh (zero, D)));
|
|
1540 temp = vec_sr (vec_add (temp, splat2), splat2);
|
|
1541 tmp = vec_pack (temp, temp);
|
|
1542
|
|
1543 vec_st (tmp, 0, dest);
|
|
1544 dest += stride;
|
|
1545 tmp = vec_avg (vec_perm (ref0, ref1, permA),
|
|
1546 vec_perm (ref0, ref1, permB));
|
|
1547 } while (--height);
|
|
1548 }
|
|
1549 #endif
|
|
1550
|
|
1551 void MC_avg_o_16_altivec (unsigned char * dest, const unsigned char * ref,
|
|
1552 const int stride, int height)
|
|
1553 {
|
|
1554 vector_u8_t perm, ref0, ref1, tmp, prev;
|
|
1555
|
|
1556 perm = vec_lvsl (0, ref);
|
|
1557
|
|
1558 height = (height >> 1) - 1;
|
|
1559
|
|
1560 ref0 = vec_ld (0, ref);
|
|
1561 ref1 = vec_ld (15, ref);
|
|
1562 ref += stride;
|
|
1563 prev = vec_ld (0, dest);
|
|
1564 tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
|
|
1565
|
|
1566 do {
|
|
1567 ref0 = vec_ld (0, ref);
|
|
1568 ref1 = vec_ld (15, ref);
|
|
1569 ref += stride;
|
|
1570 prev = vec_ld (stride, dest);
|
|
1571 vec_st (tmp, 0, dest);
|
|
1572 tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
|
|
1573
|
|
1574 ref0 = vec_ld (0, ref);
|
|
1575 ref1 = vec_ld (15, ref);
|
|
1576 ref += stride;
|
|
1577 prev = vec_ld (2*stride, dest);
|
|
1578 vec_st (tmp, stride, dest);
|
|
1579 dest += 2*stride;
|
|
1580 tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
|
|
1581 } while (--height);
|
|
1582
|
|
1583 ref0 = vec_ld (0, ref);
|
|
1584 ref1 = vec_ld (15, ref);
|
|
1585 prev = vec_ld (stride, dest);
|
|
1586 vec_st (tmp, 0, dest);
|
|
1587 tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
|
|
1588 vec_st (tmp, stride, dest);
|
|
1589 }
|
|
1590
|
|
1591 void MC_avg_o_8_altivec (unsigned char * dest, const unsigned char * ref,
|
|
1592 const int stride, int height)
|
|
1593 {
|
|
1594 vector_u8_t perm0, perm1, tmp0, tmp1, ref0, ref1, prev;
|
|
1595
|
|
1596 tmp0 = vec_lvsl (0, ref);
|
|
1597 tmp0 = vec_mergeh (tmp0, tmp0);
|
|
1598 perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
|
|
1599 tmp1 = vec_lvsl (stride, ref);
|
|
1600 tmp1 = vec_mergeh (tmp1, tmp1);
|
|
1601 perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
|
|
1602
|
|
1603 height = (height >> 1) - 1;
|
|
1604
|
|
1605 ref0 = vec_ld (0, ref);
|
|
1606 ref1 = vec_ld (7, ref);
|
|
1607 ref += stride;
|
|
1608 prev = vec_ld (0, dest);
|
|
1609 tmp0 = vec_avg (prev, vec_perm (ref0, ref1, perm0));
|
|
1610
|
|
1611 do {
|
|
1612 ref0 = vec_ld (0, ref);
|
|
1613 ref1 = vec_ld (7, ref);
|
|
1614 ref += stride;
|
|
1615 prev = vec_ld (stride, dest);
|
|
1616 vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
|
|
1617 vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
|
|
1618 dest += stride;
|
|
1619 tmp1 = vec_avg (prev, vec_perm (ref0, ref1, perm1));
|
|
1620
|
|
1621 ref0 = vec_ld (0, ref);
|
|
1622 ref1 = vec_ld (7, ref);
|
|
1623 ref += stride;
|
|
1624 prev = vec_ld (stride, dest);
|
|
1625 vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
|
|
1626 vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
|
|
1627 dest += stride;
|
|
1628 tmp0 = vec_avg (prev, vec_perm (ref0, ref1, perm0));
|
|
1629 } while (--height);
|
|
1630
|
|
1631 ref0 = vec_ld (0, ref);
|
|
1632 ref1 = vec_ld (7, ref);
|
|
1633 prev = vec_ld (stride, dest);
|
|
1634 vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
|
|
1635 vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
|
|
1636 dest += stride;
|
|
1637 tmp1 = vec_avg (prev, vec_perm (ref0, ref1, perm1));
|
|
1638 vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
|
|
1639 vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
|
|
1640 }
|
|
1641
|
|
1642 void MC_avg_x_16_altivec (unsigned char * dest, const unsigned char * ref,
|
|
1643 const int stride, int height)
|
|
1644 {
|
|
1645 vector_u8_t permA, permB, ref0, ref1, tmp, prev;
|
|
1646
|
|
1647 permA = vec_lvsl (0, ref);
|
|
1648 permB = vec_add (permA, vec_splat_u8 (1));
|
|
1649
|
|
1650 height = (height >> 1) - 1;
|
|
1651
|
|
1652 ref0 = vec_ld (0, ref);
|
|
1653 ref1 = vec_ld (16, ref);
|
|
1654 prev = vec_ld (0, dest);
|
|
1655 ref += stride;
|
|
1656 tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
|
|
1657 vec_perm (ref0, ref1, permB)));
|
|
1658
|
|
1659 do {
|
|
1660 ref0 = vec_ld (0, ref);
|
|
1661 ref1 = vec_ld (16, ref);
|
|
1662 ref += stride;
|
|
1663 prev = vec_ld (stride, dest);
|
|
1664 vec_st (tmp, 0, dest);
|
|
1665 tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
|
|
1666 vec_perm (ref0, ref1, permB)));
|
|
1667
|
|
1668 ref0 = vec_ld (0, ref);
|
|
1669 ref1 = vec_ld (16, ref);
|
|
1670 ref += stride;
|
|
1671 prev = vec_ld (2*stride, dest);
|
|
1672 vec_st (tmp, stride, dest);
|
|
1673 dest += 2*stride;
|
|
1674 tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
|
|
1675 vec_perm (ref0, ref1, permB)));
|
|
1676 } while (--height);
|
|
1677
|
|
1678 ref0 = vec_ld (0, ref);
|
|
1679 ref1 = vec_ld (16, ref);
|
|
1680 prev = vec_ld (stride, dest);
|
|
1681 vec_st (tmp, 0, dest);
|
|
1682 tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
|
|
1683 vec_perm (ref0, ref1, permB)));
|
|
1684 vec_st (tmp, stride, dest);
|
|
1685 }
|
|
1686
|
|
void MC_avg_x_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0A, perm0B, perm1A, perm1B, ones, tmp0, tmp1, ref0, ref1;
    vector_u8_t prev;

    ones = vec_splat_u8 (1);
    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0A = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    perm0B = vec_add (perm0A, ones);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1A = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
    perm1B = vec_add (perm1A, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    prev = vec_ld (0, dest);
    ref += stride;
    tmp0 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm0A),
                                   vec_perm (ref0, ref1, perm0B)));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm1A),
                                       vec_perm (ref0, ref1, perm1B)));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm0A),
                                       vec_perm (ref0, ref1, perm0B)));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    prev = vec_ld (stride, dest);
    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
    dest += stride;
    tmp1 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm1A),
                                   vec_perm (ref0, ref1, perm1B)));
    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
}

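/*
 * MC_avg_y_16: 16-wide block, vertical half-pel prediction averaged into
 * dest.  tmp0 and tmp1 hold two consecutive reference rows, so
 * vec_avg (tmp0, tmp1) presumably gives (ref[i] + ref[i+stride] + 1) >> 1,
 * which is then averaged with the destination row.  An assumed scalar
 * equivalent per pixel (not code from this file):
 *
 *     dest[i] = (dest[i] + ((ref[i] + ref[i+stride] + 1) >> 1) + 1) >> 1;
 */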
void MC_avg_y_16_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm, ref0, ref1, tmp0, tmp1, tmp, prev;

    perm = vec_lvsl (0, ref);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm);
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    tmp1 = vec_perm (ref0, ref1, perm);
    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_st (tmp, 0, dest);
        tmp0 = vec_perm (ref0, ref1, perm);
        tmp = vec_avg (prev, vec_avg (tmp0, tmp1));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (15, ref);
        ref += stride;
        prev = vec_ld (2*stride, dest);
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        tmp1 = vec_perm (ref0, ref1, perm);
        tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (15, ref);
    prev = vec_ld (stride, dest);
    vec_st (tmp, 0, dest);
    tmp0 = vec_perm (ref0, ref1, perm);
    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
    vec_st (tmp, stride, dest);
}

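/*
 * MC_avg_y_8: vertical half-pel averaging for an 8-wide block, combining the
 * row-pair averaging of MC_avg_y_16 with the duplicated permute vectors and
 * vec_ste word stores used by the other 8-wide routines.
 */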
void MC_avg_y_8_altivec (unsigned char * dest, const unsigned char * ref,
                         const int stride, int height)
{
    vector_u8_t perm0, perm1, tmp0, tmp1, tmp, ref0, ref1, prev;

    tmp0 = vec_lvsl (0, ref);
    tmp0 = vec_mergeh (tmp0, tmp0);
    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
    tmp1 = vec_lvsl (stride, ref);
    tmp1 = vec_mergeh (tmp1, tmp1);
    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);
    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    tmp1 = vec_perm (ref0, ref1, perm1);
    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        tmp0 = vec_perm (ref0, ref1, perm0);
        tmp = vec_avg (prev, vec_avg (tmp0, tmp1));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (7, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        tmp1 = vec_perm (ref0, ref1, perm1);
        tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (7, ref);
    prev = vec_ld (stride, dest);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
    dest += stride;
    tmp0 = vec_perm (ref0, ref1, perm0);
    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
}

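/*
 * MC_avg_xy_16: 16-wide block, half-pel interpolation in both directions,
 * averaged into dest.  avg0/avg1 are the horizontally averaged values of two
 * consecutive rows and xor0/xor1 their per-lane XORs; the vec_sub/vec_and/
 * vec_xor term appears to subtract 1 exactly in the lanes where averaging the
 * two already-rounded row averages would overshoot, so that the result matches
 * the MPEG-2 rounding (a + b + c + d + 2) >> 2.  An illustrative scalar sketch
 * of the value assumed to be computed per pixel (not code from this file):
 *
 *     int p = (ref[i] + ref[i+1] + ref[i+stride] + ref[i+stride+1] + 2) >> 2;
 *     dest[i] = (dest[i] + p + 1) >> 1;
 */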
void MC_avg_xy_16_altivec (unsigned char * dest, const unsigned char * ref,
                           const int stride, int height)
{
    vector_u8_t permA, permB, ref0, ref1, A, B, avg0, avg1, xor0, xor1, tmp;
    vector_u8_t ones, prev;

    ones = vec_splat_u8 (1);
    permA = vec_lvsl (0, ref);
    permB = vec_add (permA, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    ref += stride;
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg1 = vec_avg (A, B);
    xor1 = vec_xor (A, B);
    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
                                  vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                           vec_xor (avg0, avg1))));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_st (tmp, 0, dest);
        A = vec_perm (ref0, ref1, permA);
        B = vec_perm (ref0, ref1, permB);
        avg0 = vec_avg (A, B);
        xor0 = vec_xor (A, B);
        tmp = vec_avg (prev,
                       vec_sub (vec_avg (avg0, avg1),
                                vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                         vec_xor (avg0, avg1))));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (16, ref);
        ref += stride;
        prev = vec_ld (2*stride, dest);
        vec_st (tmp, stride, dest);
        dest += 2*stride;
        A = vec_perm (ref0, ref1, permA);
        B = vec_perm (ref0, ref1, permB);
        avg1 = vec_avg (A, B);
        xor1 = vec_xor (A, B);
        tmp = vec_avg (prev,
                       vec_sub (vec_avg (avg0, avg1),
                                vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                         vec_xor (avg0, avg1))));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (16, ref);
    prev = vec_ld (stride, dest);
    vec_st (tmp, 0, dest);
    A = vec_perm (ref0, ref1, permA);
    B = vec_perm (ref0, ref1, permB);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);
    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
                                  vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                           vec_xor (avg0, avg1))));
    vec_st (tmp, stride, dest);
}

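/*
 * MC_avg_xy_8: the 8-wide variant of MC_avg_xy_16, using the duplicated
 * permute vectors and vec_ste word stores of the other 8-wide routines
 * together with the same rounding-corrected two-dimensional average.
 */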
void MC_avg_xy_8_altivec (unsigned char * dest, const unsigned char * ref,
                          const int stride, int height)
{
    vector_u8_t perm0A, perm0B, perm1A, perm1B, ref0, ref1, A, B;
    vector_u8_t avg0, avg1, xor0, xor1, tmp, ones, prev;

    ones = vec_splat_u8 (1);
    perm0A = vec_lvsl (0, ref);
    perm0A = vec_mergeh (perm0A, perm0A);
    perm0A = vec_pack ((vector_u16_t)perm0A, (vector_u16_t)perm0A);
    perm0B = vec_add (perm0A, ones);
    perm1A = vec_lvsl (stride, ref);
    perm1A = vec_mergeh (perm1A, perm1A);
    perm1A = vec_pack ((vector_u16_t)perm1A, (vector_u16_t)perm1A);
    perm1B = vec_add (perm1A, ones);

    height = (height >> 1) - 1;

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    ref += stride;
    A = vec_perm (ref0, ref1, perm0A);
    B = vec_perm (ref0, ref1, perm0B);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    ref += stride;
    prev = vec_ld (0, dest);
    A = vec_perm (ref0, ref1, perm1A);
    B = vec_perm (ref0, ref1, perm1B);
    avg1 = vec_avg (A, B);
    xor1 = vec_xor (A, B);
    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
                                  vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                           vec_xor (avg0, avg1))));

    do {
        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        A = vec_perm (ref0, ref1, perm0A);
        B = vec_perm (ref0, ref1, perm0B);
        avg0 = vec_avg (A, B);
        xor0 = vec_xor (A, B);
        tmp = vec_avg (prev,
                       vec_sub (vec_avg (avg0, avg1),
                                vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                         vec_xor (avg0, avg1))));

        ref0 = vec_ld (0, ref);
        ref1 = vec_ld (8, ref);
        ref += stride;
        prev = vec_ld (stride, dest);
        vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
        vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
        dest += stride;
        A = vec_perm (ref0, ref1, perm1A);
        B = vec_perm (ref0, ref1, perm1B);
        avg1 = vec_avg (A, B);
        xor1 = vec_xor (A, B);
        tmp = vec_avg (prev,
                       vec_sub (vec_avg (avg0, avg1),
                                vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                         vec_xor (avg0, avg1))));
    } while (--height);

    ref0 = vec_ld (0, ref);
    ref1 = vec_ld (8, ref);
    prev = vec_ld (stride, dest);
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
    dest += stride;
    A = vec_perm (ref0, ref1, perm0A);
    B = vec_perm (ref0, ref1, perm0B);
    avg0 = vec_avg (A, B);
    xor0 = vec_xor (A, B);
    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
                                  vec_and (vec_and (ones, vec_or (xor0, xor1)),
                                           vec_xor (avg0, avg1))));
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
}

#endif /* __ALTIVEC__ */