ppc/h264_template_altivec.c @ 3577:5be5a936c8a9 (libavcodec)

Clean up: make dsputil subfile names consistent
author lu_zero
date Sun, 13 Aug 2006 08:26:43 +0000
parents ppc/dsputil_h264_template_altivec.c@eff63ac2b545
children c8c591fe26f8
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

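/*
 * 8x8 chroma motion compensation: each output pixel is the bilinear blend
 * of a 2x2 neighbourhood,
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y, so the four
 * weights always sum to 64. The weights are broadcast into 16-bit lanes and
 * a whole row of 8 pixels is blended per iteration.
 */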
/* this code assumes that stride % 16 == 0 */
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    signed int ABCD[4] __attribute__((aligned(16))) =
        {((8 - x) * (8 - y)),
         ((x) * (8 - y)),
         ((8 - x) * (y)),
         ((x) * (y))};
    register int i;
    vector unsigned char fperm;
    const vector signed int vABCD = vec_ld(0, ABCD);
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
    const vector signed int vzero = vec_splat_s32(0);
    const vector signed short v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5));
    const vector unsigned short v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vector unsigned char vsrc0uc, vsrc1uc;
    vector signed short vsrc0ssH, vsrc1ssH;
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
    vector signed short vsrc2ssH, vsrc3ssH, psum;
    vector unsigned char vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

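    /* The code effectively assumes dst % 16 is 0 or 8, i.e. the 8-pixel row
       occupies one half of an aligned 16-byte vector; fperm later merges the
       8 computed pixels into that half while preserving the other half. */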
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17,
                                          0x08, 0x09, 0x0A, 0x0B,
                                          0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                          0x04, 0x05, 0x06, 0x07,
                                          0x18, 0x19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F);
    }

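    /* AltiVec vec_ld ignores the low four address bits, so a misaligned row
       is read as one or two aligned vectors and realigned with vec_perm and
       a vec_lvsl-generated pattern. loadSecond is set when the 9-byte window
       crosses a 16-byte boundary; reallyBadAlign (src % 16 == 15) marks the
       case where vec_lvsl(1, src) wraps to zero, making vec_perm select the
       wrong input, so the second vector is used directly instead. */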
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc0uc);
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc1uc);

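    /* Two loop variants: when the window stays inside one aligned vector a
       single load per row suffices, otherwise two loads are needed. Either
       way, the bottom row of each 2x2 blend (vsrc2/vsrc3) is reused as the
       top row (vsrc0/vsrc1) of the next iteration, so each source row is
       loaded only once. The second variant's vec_sr/vec_pack are equivalent
       to vec_sra/vec_packsu here, since the weighted sum is non-negative
       (at most 64*255 + 32) and fits in 15 bits. */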
    if (!loadSecond) { /* !loadSecond implies !reallyBadAlign */
        for (i = 0 ; i < h ; i++) {

            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v32ss, psum);
            psum = vec_sra(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_packsu(psum, psum);
            vfdst = vec_perm(vdst, ppsum, fperm);

            OP_U8_ALTIVEC(fsum, vfdst, vdst);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    } else {
        vector unsigned char vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v32ss, psum);
            psum = vec_sr(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_pack(psum, psum);
            vfdst = vec_perm(vdst, ppsum, fperm);

            OP_U8_ALTIVEC(fsum, vfdst, vdst);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

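/*
 * 16x16 luma half-pel horizontal filter: the H.264 six-tap kernel
 * (1, -5, 20, 20, -5, 1) applied across each row, i.e.
 *     dst[i] = clip((src[i-2] - 5*src[i-1] + 20*src[i] + 20*src[i+1]
 *                    - 5*src[i+2] + src[i+3] + 16) >> 5)
 * Sixteen pixels are processed per row as two vectors of eight 16-bit lanes.
 */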
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector unsigned short v5us = vec_splat_u16(5);
    const vector signed short v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vector signed short v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 =
        (const vector unsigned char) vec_splat_s8(-1);

    const vector unsigned char dstmask =
        vec_perm((const vector unsigned char)vzero,
                 neg1, dstperm);

    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                        srcP2A, srcP2B, srcP3A, srcP3B,
                        srcM1A, srcM1B, srcM2A, srcM2B,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                        pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                        psumA, psumB, sumA, sumB;

    vector unsigned char sum, dst1, dst2, vdst, fsum,
                         rsum, fdst1, fdst2;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);

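        /* The 21-byte window src-2 .. src+18 fits in two aligned vectors for
           align <= 11 (with the vec_lvsl wrap at align == 11 special-cased)
           and needs a third load for align >= 12; the remaining cases also
           catch the offsets where the realigned value is simply one of the
           raw loads. */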
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

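        /* zero-extend each realigned row into two vectors of eight 16-bit
           lanes: the "A" half holds pixels 0-7, the "B" half pixels 8-15 */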
        srcP0A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcP0);
        srcP0B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcP0);
        srcP1A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcP1);
        srcP1B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcP1);

        srcP2A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcP2);
        srcP2B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcP2);
        srcP3A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcP3);

        srcM1A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcM1);
        srcM1B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcM1);
        srcM2A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcM2);
        srcM2B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcM2);

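        /* pair the taps with equal weight, then evaluate
           (16 + 20*(P0+P1) - 5*(M1+P2) + (M2+P3)) >> 5 with saturation */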
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

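        /* unaligned store: rotate the result with the vec_lvsr pattern, then
           use dstmask (0x00 bytes before the split point, 0xFF after) to
           select which bytes of the two aligned destination vectors are
           replaced; the rest are read back unchanged */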
        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

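/*
 * 16x16 luma half-pel vertical filter: the same (1, -5, 20, 20, -5, 1)
 * kernel applied down each column. Six consecutive rows are kept
 * zero-extended in registers and shifted down by one row per iteration,
 * so each source row is loaded and unpacked exactly once.
 */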
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char perm = vec_lvsl(0, src);
    const vector signed short v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vector unsigned short v5us = vec_splat_u16(5);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

    uint8_t *srcbis = src - (srcStride * 2);

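    /* prime the filter window: load and realign the five rows starting two
       above the first output row (the sixth arrives inside the loop) */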
    const vector unsigned char srcM2a = vec_ld(0, srcbis);
    const vector unsigned char srcM2b = vec_ld(16, srcbis);
    const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcM1b = vec_ld(16, srcbis);
    const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP0b = vec_ld(16, srcbis);
    const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP1b = vec_ld(16, srcbis);
    const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP2b = vec_ld(16, srcbis);
    const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
    // srcbis += srcStride;

    vector signed short srcM2ssA = (vector signed short)
                                   vec_mergeh((vector unsigned char)vzero, srcM2);
    vector signed short srcM2ssB = (vector signed short)
                                   vec_mergel((vector unsigned char)vzero, srcM2);
    vector signed short srcM1ssA = (vector signed short)
                                   vec_mergeh((vector unsigned char)vzero, srcM1);
    vector signed short srcM1ssB = (vector signed short)
                                   vec_mergel((vector unsigned char)vzero, srcM1);
    vector signed short srcP0ssA = (vector signed short)
                                   vec_mergeh((vector unsigned char)vzero, srcP0);
    vector signed short srcP0ssB = (vector signed short)
                                   vec_mergel((vector unsigned char)vzero, srcP0);
    vector signed short srcP1ssA = (vector signed short)
                                   vec_mergeh((vector unsigned char)vzero, srcP1);
    vector signed short srcP1ssB = (vector signed short)
                                   vec_mergel((vector unsigned char)vzero, srcP1);
    vector signed short srcP2ssA = (vector signed short)
                                   vec_mergeh((vector unsigned char)vzero, srcP2);
    vector signed short srcP2ssB = (vector signed short)
                                   vec_mergel((vector unsigned char)vzero, srcP2);

    vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                        psumA, psumB, sumA, sumB,
                        srcP3ssA, srcP3ssB,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
                         srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vector signed short)
                   vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3ssB = (vector signed short)
                   vec_mergel((vector unsigned char)vzero, srcP3);
        // srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

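        /* slide the six-row window down one row for the next iteration */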
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

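/*
 * 16x16 luma half-pel horizontal + vertical filter, done as two separable
 * passes: the horizontal six-tap filter is run over h + 5 = 21 rows and its
 * unrounded 16-bit results are stored in tmp; the vertical six-tap filter
 * is then run over tmp in 32-bit precision, with the combined rounding
 * term (+512) and shift (>> 10) applied at the end.
 */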
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vector unsigned int v10ui = vec_splat_u32(10);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v1ss = vec_splat_s16(1);
    const vector signed int v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vector unsigned int v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    const vector unsigned char neg1 = (const vector unsigned char)
                                      vec_splat_s8(-1);

    vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                        srcP2A, srcP2B, srcP3A, srcP3B,
                        srcM1A, srcM1B, srcM2A, srcM2B,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                        pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vector unsigned char dstperm = vec_lvsr(0, dst);

    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

    const vector unsigned char mperm = (const vector unsigned char)
        AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
            0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
    int16_t *tmpbis = tmp;

    vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
                        tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
                        tmpP2ssA, tmpP2ssB;

    vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
                      pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
                      pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
                      ssumAe, ssumAo, ssumBe, ssumBo;
    vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
                         rsum, fdst1, fdst2;
    vector signed short ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
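    /* first pass: horizontal filter over h + 5 = 21 rows; results are kept
       as unrounded 16-bit sums (no +16, no shift) so no precision is lost
       before the vertical pass */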
    for (i = 0 ; i < 21 ; i ++) {
        vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcP0);
        srcP0B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcP0);
        srcP1A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcP1);
        srcP1B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcP1);

        srcP2A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcP2);
        srcP2B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcP2);
        srcP3A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcP3);

        srcM1A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcM1);
        srcM1B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcM1);
        srcM2A = (vector signed short)
                 vec_mergeh((vector unsigned char)vzero, srcM2);
        srcM2B = (vector signed short)
                 vec_mergel((vector unsigned char)vzero, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

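    /* second pass: prime the six-tap window with the first five rows of the
       intermediate buffer (each row is 16 int16_t, i.e. two vectors A/B) */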
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
        const vector signed short tmpP3ssB = vec_ld(16, tmpbis);

        const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

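        /* the 16-bit intermediates can exceed 20000 before weighting, so
           20*sum no longer fits in 16 bits; vec_mule/vec_mulo produce full
           32-bit products for the even and odd lanes separately */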
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

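        /* sign-extend sum3 to 32 bits: an arithmetic shift right by 16
           extracts the even (high) 16-bit lane of each word, while
           multiplying the odd lanes by 1 via vec_mulo extends them */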
        pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vector signed int)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

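        /* pack the even- and odd-lane results back to 16 bits, saturate to
           unsigned bytes, then mperm re-interleaves them into pixel order */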
        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}