comparison ppc/dsputil_h264_template_altivec.c @ 3346:052765f11f1c libavcodec

Cosmetics: should not hurt performance; scream if it does
author lu_zero
date Sat, 17 Jun 2006 18:46:06 +0000
parents 7aa01243b4d3
children eff63ac2b545
comparison
equal deleted inserted replaced
3345:76620d530d9a 3346:052765f11f1c
17 */ 17 */
18 18
19 /* this code assume that stride % 16 == 0 */ 19 /* this code assume that stride % 16 == 0 */
20 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { 20 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
21 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); 21 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
22 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); 22 signed int ABCD[4] __attribute__((aligned(16))) =
23 signed int ABCD[4] __attribute__((aligned(16))); 23 {((8 - x) * (8 - y)),
24 ((x) * (8 - y)),
25 ((8 - x) * (y)),
26 ((x) * (y))};
24 register int i; 27 register int i;
25 ABCD[0] = ((8 - x) * (8 - y)); 28 vector unsigned char fperm;
26 ABCD[1] = ((x) * (8 - y));
27 ABCD[2] = ((8 - x) * (y));
28 ABCD[3] = ((x) * (y));
29 const vector signed int vABCD = vec_ld(0, ABCD); 29 const vector signed int vABCD = vec_ld(0, ABCD);
30 const vector signed short vA = vec_splat((vector signed short)vABCD, 1); 30 const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
31 const vector signed short vB = vec_splat((vector signed short)vABCD, 3); 31 const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
32 const vector signed short vC = vec_splat((vector signed short)vABCD, 5); 32 const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
33 const vector signed short vD = vec_splat((vector signed short)vABCD, 7); 33 const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
34 const vector signed int vzero = vec_splat_s32(0); 34 const vector signed int vzero = vec_splat_s32(0);
35 const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); 35 const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
36 const vector unsigned short v6us = vec_splat_u16(6); 36 const vector unsigned short v6us = vec_splat_u16(6);
37
38 vector unsigned char fperm;
39
40 if (((unsigned long)dst) % 16 == 0) {
41 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
42 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
43 } else {
44 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
45 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
46 }
47
48 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 37 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
49 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 38 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
50 39
51 vector unsigned char vsrcAuc; 40 vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
52 vector unsigned char vsrcBuc; 41 vector unsigned char vsrc0uc, vsrc1uc;
53 vector unsigned char vsrcperm0; 42 vector signed short vsrc0ssH, vsrc1ssH;
54 vector unsigned char vsrcperm1; 43 vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
44 vector signed short vsrc2ssH, vsrc3ssH, psum;
45 vector unsigned char vdst, ppsum, vfdst, fsum;
46
47 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
48
49 if (((unsigned long)dst) % 16 == 0) {
50 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
51 0x14, 0x15, 0x16, 0x17,
52 0x08, 0x09, 0x0A, 0x0B,
53 0x0C, 0x0D, 0x0E, 0x0F);
54 } else {
55 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
56 0x04, 0x05, 0x06, 0x07,
57 0x18, 0x19, 0x1A, 0x1B,
58 0x1C, 0x1D, 0x1E, 0x1F);
59 }
60
55 vsrcAuc = vec_ld(0, src); 61 vsrcAuc = vec_ld(0, src);
62
56 if (loadSecond) 63 if (loadSecond)
57 vsrcBuc = vec_ld(16, src); 64 vsrcBuc = vec_ld(16, src);
58 vsrcperm0 = vec_lvsl(0, src); 65 vsrcperm0 = vec_lvsl(0, src);
59 vsrcperm1 = vec_lvsl(1, src); 66 vsrcperm1 = vec_lvsl(1, src);
60 67
61 vector unsigned char vsrc0uc;
62 vector unsigned char vsrc1uc;
63 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); 68 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
64 if (reallyBadAlign) 69 if (reallyBadAlign)
65 vsrc1uc = vsrcBuc; 70 vsrc1uc = vsrcBuc;
66 else 71 else
67 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); 72 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
68 73
69 vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc); 74 vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
70 vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc); 75 (vector unsigned char)vsrc0uc);
76 vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
77 (vector unsigned char)vsrc1uc);
71 78
72 if (!loadSecond) {// -> !reallyBadAlign 79 if (!loadSecond) {// -> !reallyBadAlign
73 for (i = 0 ; i < h ; i++) { 80 for (i = 0 ; i < h ; i++) {
74 vector unsigned char vsrcCuc; 81
82
75 vsrcCuc = vec_ld(stride + 0, src); 83 vsrcCuc = vec_ld(stride + 0, src);
76 84
77 vector unsigned char vsrc2uc;
78 vector unsigned char vsrc3uc;
79 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 85 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
80 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 86 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
81 87
82 vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); 88 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
83 vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); 89 (vector unsigned char)vsrc2uc);
84 90 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
85 vector signed short psum; 91 (vector unsigned char)vsrc3uc);
86 92
87 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 93 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
88 psum = vec_mladd(vB, vsrc1ssH, psum); 94 psum = vec_mladd(vB, vsrc1ssH, psum);
89 psum = vec_mladd(vC, vsrc2ssH, psum); 95 psum = vec_mladd(vC, vsrc2ssH, psum);
90 psum = vec_mladd(vD, vsrc3ssH, psum); 96 psum = vec_mladd(vD, vsrc3ssH, psum);
91 psum = vec_add(v32ss, psum); 97 psum = vec_add(v32ss, psum);
92 psum = vec_sra(psum, v6us); 98 psum = vec_sra(psum, v6us);
93 99
94 vector unsigned char vdst = vec_ld(0, dst); 100 vdst = vec_ld(0, dst);
95 vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum); 101 ppsum = (vector unsigned char)vec_packsu(psum, psum);
96 102 vfdst = vec_perm(vdst, ppsum, fperm);
97 vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
98 vector unsigned char fsum;
99 103
100 OP_U8_ALTIVEC(fsum, vfdst, vdst); 104 OP_U8_ALTIVEC(fsum, vfdst, vdst);
101 105
102 vec_st(fsum, 0, dst); 106 vec_st(fsum, 0, dst);
103 107
106 110
107 dst += stride; 111 dst += stride;
108 src += stride; 112 src += stride;
109 } 113 }
110 } else { 114 } else {
115 vector unsigned char vsrcDuc;
111 for (i = 0 ; i < h ; i++) { 116 for (i = 0 ; i < h ; i++) {
112 vector unsigned char vsrcCuc;
113 vector unsigned char vsrcDuc;
114 vsrcCuc = vec_ld(stride + 0, src); 117 vsrcCuc = vec_ld(stride + 0, src);
115 vsrcDuc = vec_ld(stride + 16, src); 118 vsrcDuc = vec_ld(stride + 16, src);
116 119
117 vector unsigned char vsrc2uc;
118 vector unsigned char vsrc3uc;
119 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 120 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
120 if (reallyBadAlign) 121 if (reallyBadAlign)
121 vsrc3uc = vsrcDuc; 122 vsrc3uc = vsrcDuc;
122 else 123 else
123 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 124 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
124 125
125 vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); 126 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
126 vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); 127 (vector unsigned char)vsrc2uc);
127 128 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
128 vector signed short psum; 129 (vector unsigned char)vsrc3uc);
129 130
130 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 131 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
131 psum = vec_mladd(vB, vsrc1ssH, psum); 132 psum = vec_mladd(vB, vsrc1ssH, psum);
132 psum = vec_mladd(vC, vsrc2ssH, psum); 133 psum = vec_mladd(vC, vsrc2ssH, psum);
133 psum = vec_mladd(vD, vsrc3ssH, psum); 134 psum = vec_mladd(vD, vsrc3ssH, psum);
134 psum = vec_add(v32ss, psum); 135 psum = vec_add(v32ss, psum);
135 psum = vec_sr(psum, v6us); 136 psum = vec_sr(psum, v6us);
136 137
137 vector unsigned char vdst = vec_ld(0, dst); 138 vdst = vec_ld(0, dst);
138 vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum); 139 ppsum = (vector unsigned char)vec_pack(psum, psum);
139 140 vfdst = vec_perm(vdst, ppsum, fperm);
140 vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
141 vector unsigned char fsum;
142 141
143 OP_U8_ALTIVEC(fsum, vfdst, vdst); 142 OP_U8_ALTIVEC(fsum, vfdst, vdst);
144 143
145 vec_st(fsum, 0, dst); 144 vec_st(fsum, 0, dst);
146 145
155 } 154 }
156 155
157 /* this code assume stride % 16 == 0 */ 156 /* this code assume stride % 16 == 0 */
158 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 157 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
159 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); 158 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
160 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
161 register int i; 159 register int i;
162 160
163 const vector signed int vzero = vec_splat_s32(0); 161 const vector signed int vzero = vec_splat_s32(0);
164 const vector unsigned char permM2 = vec_lvsl(-2, src); 162 const vector unsigned char permM2 = vec_lvsl(-2, src);
165 const vector unsigned char permM1 = vec_lvsl(-1, src); 163 const vector unsigned char permM1 = vec_lvsl(-1, src);
170 const vector signed short v5ss = vec_splat_s16(5); 168 const vector signed short v5ss = vec_splat_s16(5);
171 const vector unsigned short v5us = vec_splat_u16(5); 169 const vector unsigned short v5us = vec_splat_u16(5);
172 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 170 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
173 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 171 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
174 const vector unsigned char dstperm = vec_lvsr(0, dst); 172 const vector unsigned char dstperm = vec_lvsr(0, dst);
175 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); 173 const vector unsigned char neg1 =
176 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); 174 (const vector unsigned char) vec_splat_s8(-1);
175
176 const vector unsigned char dstmask =
177 vec_perm((const vector unsigned char)vzero,
178 neg1, dstperm);
179
180 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
177 181
178 register int align = ((((unsigned long)src) - 2) % 16); 182 register int align = ((((unsigned long)src) - 2) % 16);
179 183
184 vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
185 srcP2A, srcP2B, srcP3A, srcP3B,
186 srcM1A, srcM1B, srcM2A, srcM2B,
187 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
188 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
189 psumA, psumB, sumA, sumB;
190
191 vector unsigned char sum, dst1, dst2, vdst, fsum,
192 rsum, fdst1, fdst2;
193
194 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
195
180 for (i = 0 ; i < 16 ; i ++) { 196 for (i = 0 ; i < 16 ; i ++) {
181 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
182 vector unsigned char srcR1 = vec_ld(-2, src); 197 vector unsigned char srcR1 = vec_ld(-2, src);
183 vector unsigned char srcR2 = vec_ld(14, src); 198 vector unsigned char srcR2 = vec_ld(14, src);
184 199
185 switch (align) { 200 switch (align) {
186 default: { 201 default: {
235 srcP2 = vec_perm(srcR2, srcR3, permP2); 250 srcP2 = vec_perm(srcR2, srcR3, permP2);
236 srcP3 = vec_perm(srcR2, srcR3, permP3); 251 srcP3 = vec_perm(srcR2, srcR3, permP3);
237 } break; 252 } break;
238 } 253 }
239 254
240 const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); 255 srcP0A = vec_mergeh((vector unsigned char)vzero, srcP0);
241 const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); 256 srcP0B = vec_mergel((vector unsigned char)vzero, srcP0);
242 const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); 257 srcP1A = vec_mergeh((vector unsigned char)vzero, srcP1);
243 const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); 258 srcP1B = vec_mergel((vector unsigned char)vzero, srcP1);
244 259
245 const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); 260 srcP2A = vec_mergeh((vector unsigned char)vzero, srcP2);
246 const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); 261 srcP2B = vec_mergel((vector unsigned char)vzero, srcP2);
247 const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); 262 srcP3A = vec_mergeh((vector unsigned char)vzero, srcP3);
248 const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); 263 srcP3B = vec_mergel((vector unsigned char)vzero, srcP3);
249 264
250 const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); 265 srcM1A = vec_mergeh((vector unsigned char)vzero, srcM1);
251 const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); 266 srcM1B = vec_mergel((vector unsigned char)vzero, srcM1);
252 const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); 267 srcM2A = vec_mergeh((vector unsigned char)vzero, srcM2);
253 const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); 268 srcM2B = vec_mergel((vector unsigned char)vzero, srcM2);
254 269
255 const vector signed short sum1A = vec_adds(srcP0A, srcP1A); 270 sum1A = vec_adds(srcP0A, srcP1A);
256 const vector signed short sum1B = vec_adds(srcP0B, srcP1B); 271 sum1B = vec_adds(srcP0B, srcP1B);
257 const vector signed short sum2A = vec_adds(srcM1A, srcP2A); 272 sum2A = vec_adds(srcM1A, srcP2A);
258 const vector signed short sum2B = vec_adds(srcM1B, srcP2B); 273 sum2B = vec_adds(srcM1B, srcP2B);
259 const vector signed short sum3A = vec_adds(srcM2A, srcP3A); 274 sum3A = vec_adds(srcM2A, srcP3A);
260 const vector signed short sum3B = vec_adds(srcM2B, srcP3B); 275 sum3B = vec_adds(srcM2B, srcP3B);
261 276
262 const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss); 277 pp1A = vec_mladd(sum1A, v20ss, v16ss);
263 const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss); 278 pp1B = vec_mladd(sum1B, v20ss, v16ss);
264 279
265 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); 280 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
266 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); 281 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
267 282
268 const vector signed short pp3A = vec_add(sum3A, pp1A); 283 pp3A = vec_add(sum3A, pp1A);
269 const vector signed short pp3B = vec_add(sum3B, pp1B); 284 pp3B = vec_add(sum3B, pp1B);
270 285
271 const vector signed short psumA = vec_sub(pp3A, pp2A); 286 psumA = vec_sub(pp3A, pp2A);
272 const vector signed short psumB = vec_sub(pp3B, pp2B); 287 psumB = vec_sub(pp3B, pp2B);
273 288
274 const vector signed short sumA = vec_sra(psumA, v5us); 289 sumA = vec_sra(psumA, v5us);
275 const vector signed short sumB = vec_sra(psumB, v5us); 290 sumB = vec_sra(psumB, v5us);
276 291
277 const vector unsigned char sum = vec_packsu(sumA, sumB); 292 sum = vec_packsu(sumA, sumB);
278 293
279 const vector unsigned char dst1 = vec_ld(0, dst); 294 dst1 = vec_ld(0, dst);
280 const vector unsigned char dst2 = vec_ld(16, dst); 295 dst2 = vec_ld(16, dst);
281 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); 296 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
282 297
283 vector unsigned char fsum;
284 OP_U8_ALTIVEC(fsum, sum, vdst); 298 OP_U8_ALTIVEC(fsum, sum, vdst);
285 299
286 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); 300 rsum = vec_perm(fsum, fsum, dstperm);
287 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); 301 fdst1 = vec_sel(dst1, rsum, dstmask);
288 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); 302 fdst2 = vec_sel(rsum, dst2, dstmask);
289 303
290 vec_st(fdst1, 0, dst); 304 vec_st(fdst1, 0, dst);
291 vec_st(fdst2, 16, dst); 305 vec_st(fdst2, 16, dst);
292 306
293 src += srcStride; 307 src += srcStride;
297 } 311 }
298 312
299 /* this code assume stride % 16 == 0 */ 313 /* this code assume stride % 16 == 0 */
300 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 314 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
301 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); 315 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
302 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
303 316
304 register int i; 317 register int i;
305 318
306 const vector signed int vzero = vec_splat_s32(0); 319 const vector signed int vzero = vec_splat_s32(0);
307 const vector unsigned char perm = vec_lvsl(0, src); 320 const vector unsigned char perm = vec_lvsl(0, src);
316 uint8_t *srcbis = src - (srcStride * 2); 329 uint8_t *srcbis = src - (srcStride * 2);
317 330
318 const vector unsigned char srcM2a = vec_ld(0, srcbis); 331 const vector unsigned char srcM2a = vec_ld(0, srcbis);
319 const vector unsigned char srcM2b = vec_ld(16, srcbis); 332 const vector unsigned char srcM2b = vec_ld(16, srcbis);
320 const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); 333 const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
321 srcbis += srcStride; 334 // srcbis += srcStride;
322 const vector unsigned char srcM1a = vec_ld(0, srcbis); 335 const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
323 const vector unsigned char srcM1b = vec_ld(16, srcbis); 336 const vector unsigned char srcM1b = vec_ld(16, srcbis);
324 const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); 337 const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
325 srcbis += srcStride; 338 // srcbis += srcStride;
326 const vector unsigned char srcP0a = vec_ld(0, srcbis); 339 const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
327 const vector unsigned char srcP0b = vec_ld(16, srcbis); 340 const vector unsigned char srcP0b = vec_ld(16, srcbis);
328 const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); 341 const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
329 srcbis += srcStride; 342 // srcbis += srcStride;
330 const vector unsigned char srcP1a = vec_ld(0, srcbis); 343 const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
331 const vector unsigned char srcP1b = vec_ld(16, srcbis); 344 const vector unsigned char srcP1b = vec_ld(16, srcbis);
332 const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); 345 const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
333 srcbis += srcStride; 346 // srcbis += srcStride;
334 const vector unsigned char srcP2a = vec_ld(0, srcbis); 347 const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
335 const vector unsigned char srcP2b = vec_ld(16, srcbis); 348 const vector unsigned char srcP2b = vec_ld(16, srcbis);
336 const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); 349 const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
337 srcbis += srcStride; 350 // srcbis += srcStride;
338 351
339 vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); 352 vector signed short srcM2ssA = (vector signed short)
340 vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); 353 vec_mergeh((vector unsigned char)vzero, srcM2);
341 vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); 354 vector signed short srcM2ssB = (vector signed short)
342 vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); 355 vec_mergel((vector unsigned char)vzero, srcM2);
343 vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); 356 vector signed short srcM1ssA = (vector signed short)
344 vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); 357 vec_mergeh((vector unsigned char)vzero, srcM1);
345 vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); 358 vector signed short srcM1ssB = (vector signed short)
346 vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); 359 vec_mergel((vector unsigned char)vzero, srcM1);
347 vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); 360 vector signed short srcP0ssA = (vector signed short)
348 vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); 361 vec_mergeh((vector unsigned char)vzero, srcP0);
362 vector signed short srcP0ssB = (vector signed short)
363 vec_mergel((vector unsigned char)vzero, srcP0);
364 vector signed short srcP1ssA = (vector signed short)
365 vec_mergeh((vector unsigned char)vzero, srcP1);
366 vector signed short srcP1ssB = (vector signed short)
367 vec_mergel((vector unsigned char)vzero, srcP1);
368 vector signed short srcP2ssA = (vector signed short)
369 vec_mergeh((vector unsigned char)vzero, srcP2);
370 vector signed short srcP2ssB = (vector signed short)
371 vec_mergel((vector unsigned char)vzero, srcP2);
372
373 vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
374 psumA, psumB, sumA, sumB,
375 srcP3ssA, srcP3ssB,
376 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
377
378 vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
379 srcP3a, srcP3b, srcP3;
380
381 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
349 382
350 for (i = 0 ; i < 16 ; i++) { 383 for (i = 0 ; i < 16 ; i++) {
351 const vector unsigned char srcP3a = vec_ld(0, srcbis); 384 srcP3a = vec_ld(0, srcbis += srcStride);
352 const vector unsigned char srcP3b = vec_ld(16, srcbis); 385 srcP3b = vec_ld(16, srcbis);
353 const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm); 386 srcP3 = vec_perm(srcP3a, srcP3b, perm);
354 const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); 387 srcP3ssA = (vector signed short)
355 const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); 388 vec_mergeh((vector unsigned char)vzero, srcP3);
356 srcbis += srcStride; 389 srcP3ssB = (vector signed short)
357 390 vec_mergel((vector unsigned char)vzero, srcP3);
358 const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA); 391 // srcbis += srcStride;
359 const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB); 392
360 const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA); 393 sum1A = vec_adds(srcP0ssA, srcP1ssA);
361 const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB); 394 sum1B = vec_adds(srcP0ssB, srcP1ssB);
362 const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA); 395 sum2A = vec_adds(srcM1ssA, srcP2ssA);
363 const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB); 396 sum2B = vec_adds(srcM1ssB, srcP2ssB);
397 sum3A = vec_adds(srcM2ssA, srcP3ssA);
398 sum3B = vec_adds(srcM2ssB, srcP3ssB);
364 399
365 srcM2ssA = srcM1ssA; 400 srcM2ssA = srcM1ssA;
366 srcM2ssB = srcM1ssB; 401 srcM2ssB = srcM1ssB;
367 srcM1ssA = srcP0ssA; 402 srcM1ssA = srcP0ssA;
368 srcM1ssB = srcP0ssB; 403 srcM1ssB = srcP0ssB;
371 srcP1ssA = srcP2ssA; 406 srcP1ssA = srcP2ssA;
372 srcP1ssB = srcP2ssB; 407 srcP1ssB = srcP2ssB;
373 srcP2ssA = srcP3ssA; 408 srcP2ssA = srcP3ssA;
374 srcP2ssB = srcP3ssB; 409 srcP2ssB = srcP3ssB;
375 410
376 const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss); 411 pp1A = vec_mladd(sum1A, v20ss, v16ss);
377 const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss); 412 pp1B = vec_mladd(sum1B, v20ss, v16ss);
378 413
379 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); 414 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
380 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); 415 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
381 416
382 const vector signed short pp3A = vec_add(sum3A, pp1A); 417 pp3A = vec_add(sum3A, pp1A);
383 const vector signed short pp3B = vec_add(sum3B, pp1B); 418 pp3B = vec_add(sum3B, pp1B);
384 419
385 const vector signed short psumA = vec_sub(pp3A, pp2A); 420 psumA = vec_sub(pp3A, pp2A);
386 const vector signed short psumB = vec_sub(pp3B, pp2B); 421 psumB = vec_sub(pp3B, pp2B);
387 422
388 const vector signed short sumA = vec_sra(psumA, v5us); 423 sumA = vec_sra(psumA, v5us);
389 const vector signed short sumB = vec_sra(psumB, v5us); 424 sumB = vec_sra(psumB, v5us);
390 425
391 const vector unsigned char sum = vec_packsu(sumA, sumB); 426 sum = vec_packsu(sumA, sumB);
392 427
393 const vector unsigned char dst1 = vec_ld(0, dst); 428 dst1 = vec_ld(0, dst);
394 const vector unsigned char dst2 = vec_ld(16, dst); 429 dst2 = vec_ld(16, dst);
395 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); 430 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
396 431
397 vector unsigned char fsum;
398 OP_U8_ALTIVEC(fsum, sum, vdst); 432 OP_U8_ALTIVEC(fsum, sum, vdst);
399 433
400 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); 434 rsum = vec_perm(fsum, fsum, dstperm);
401 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); 435 fdst1 = vec_sel(dst1, rsum, dstmask);
402 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); 436 fdst2 = vec_sel(rsum, dst2, dstmask);
403 437
404 vec_st(fdst1, 0, dst); 438 vec_st(fdst1, 0, dst);
405 vec_st(fdst2, 16, dst); 439 vec_st(fdst2, 16, dst);
406 440
407 dst += dstStride; 441 dst += dstStride;
410 } 444 }
411 445
412 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */ 446 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
413 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { 447 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
414 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); 448 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
415 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
416 register int i; 449 register int i;
417 const vector signed int vzero = vec_splat_s32(0); 450 const vector signed int vzero = vec_splat_s32(0);
418 const vector unsigned char permM2 = vec_lvsl(-2, src); 451 const vector unsigned char permM2 = vec_lvsl(-2, src);
419 const vector unsigned char permM1 = vec_lvsl(-1, src); 452 const vector unsigned char permM1 = vec_lvsl(-1, src);
420 const vector unsigned char permP0 = vec_lvsl(+0, src); 453 const vector unsigned char permP0 = vec_lvsl(+0, src);
428 const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); 461 const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
429 const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); 462 const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
430 463
431 register int align = ((((unsigned long)src) - 2) % 16); 464 register int align = ((((unsigned long)src) - 2) % 16);
432 465
466 const vector unsigned char neg1 = (const vector unsigned char)
467 vec_splat_s8(-1);
468
469 vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
470 srcP2A, srcP2B, srcP3A, srcP3B,
471 srcM1A, srcM1B, srcM2A, srcM2B,
472 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
473 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
474
475 const vector unsigned char dstperm = vec_lvsr(0, dst);
476
477 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
478
479 const vector unsigned char mperm = (const vector unsigned char)
480 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
481 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
482 int16_t *tmpbis = tmp;
483
484 vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
485 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
486 tmpP2ssA, tmpP2ssB;
487
488 vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
489 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
490 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
491 ssumAe, ssumAo, ssumBe, ssumBo;
492 vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
493 rsum, fdst1, fdst2;
494 vector signed short ssume, ssumo;
495
496 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
433 src -= (2 * srcStride); 497 src -= (2 * srcStride);
434
435 for (i = 0 ; i < 21 ; i ++) { 498 for (i = 0 ; i < 21 ; i ++) {
436 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 499 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
437 vector unsigned char srcR1 = vec_ld(-2, src); 500 vector unsigned char srcR1 = vec_ld(-2, src);
438 vector unsigned char srcR2 = vec_ld(14, src); 501 vector unsigned char srcR2 = vec_ld(14, src);
439 502
490 srcP2 = vec_perm(srcR2, srcR3, permP2); 553 srcP2 = vec_perm(srcR2, srcR3, permP2);
491 srcP3 = vec_perm(srcR2, srcR3, permP3); 554 srcP3 = vec_perm(srcR2, srcR3, permP3);
492 } break; 555 } break;
493 } 556 }
494 557
495 const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); 558 srcP0A = (vector signed short)
496 const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); 559 vec_mergeh((vector unsigned char)vzero, srcP0);
497 const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); 560 srcP0B = (vector signed short)
498 const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); 561 vec_mergel((vector unsigned char)vzero, srcP0);
499 562 srcP1A = (vector signed short)
500 const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); 563 vec_mergeh((vector unsigned char)vzero, srcP1);
501 const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); 564 srcP1B = (vector signed short)
502 const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); 565 vec_mergel((vector unsigned char)vzero, srcP1);
503 const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); 566
504 567 srcP2A = (vector signed short)
505 const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); 568 vec_mergeh((vector unsigned char)vzero, srcP2);
506 const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); 569 srcP2B = (vector signed short)
507 const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); 570 vec_mergel((vector unsigned char)vzero, srcP2);
508 const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); 571 srcP3A = (vector signed short)
509 572 vec_mergeh((vector unsigned char)vzero, srcP3);
510 const vector signed short sum1A = vec_adds(srcP0A, srcP1A); 573 srcP3B = (vector signed short)
511 const vector signed short sum1B = vec_adds(srcP0B, srcP1B); 574 vec_mergel((vector unsigned char)vzero, srcP3);
512 const vector signed short sum2A = vec_adds(srcM1A, srcP2A); 575
513 const vector signed short sum2B = vec_adds(srcM1B, srcP2B); 576 srcM1A = (vector signed short)
514 const vector signed short sum3A = vec_adds(srcM2A, srcP3A); 577 vec_mergeh((vector unsigned char)vzero, srcM1);
515 const vector signed short sum3B = vec_adds(srcM2B, srcP3B); 578 srcM1B = (vector signed short)
516 579 vec_mergel((vector unsigned char)vzero, srcM1);
517 const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A); 580 srcM2A = (vector signed short)
518 const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B); 581 vec_mergeh((vector unsigned char)vzero, srcM2);
519 582 srcM2B = (vector signed short)
520 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); 583 vec_mergel((vector unsigned char)vzero, srcM2);
521 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); 584
522 585 sum1A = vec_adds(srcP0A, srcP1A);
523 const vector signed short psumA = vec_sub(pp1A, pp2A); 586 sum1B = vec_adds(srcP0B, srcP1B);
524 const vector signed short psumB = vec_sub(pp1B, pp2B); 587 sum2A = vec_adds(srcM1A, srcP2A);
588 sum2B = vec_adds(srcM1B, srcP2B);
589 sum3A = vec_adds(srcM2A, srcP3A);
590 sum3B = vec_adds(srcM2B, srcP3B);
591
592 pp1A = vec_mladd(sum1A, v20ss, sum3A);
593 pp1B = vec_mladd(sum1B, v20ss, sum3B);
594
595 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
596 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
597
598 psumA = vec_sub(pp1A, pp2A);
599 psumB = vec_sub(pp1B, pp2B);
525 600
526 vec_st(psumA, 0, tmp); 601 vec_st(psumA, 0, tmp);
527 vec_st(psumB, 16, tmp); 602 vec_st(psumB, 16, tmp);
528 603
529 src += srcStride; 604 src += srcStride;
530 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ 605 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
531 } 606 }
532 607
533 const vector unsigned char dstperm = vec_lvsr(0, dst); 608 tmpM2ssA = vec_ld(0, tmpbis);
534 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); 609 tmpM2ssB = vec_ld(16, tmpbis);
535 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
536 const vector unsigned char mperm = (const vector unsigned char)
537 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
538 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
539
540 int16_t *tmpbis = tmp - (tmpStride * 21);
541
542 vector signed short tmpM2ssA = vec_ld(0, tmpbis);
543 vector signed short tmpM2ssB = vec_ld(16, tmpbis);
544 tmpbis += tmpStride; 610 tmpbis += tmpStride;
545 vector signed short tmpM1ssA = vec_ld(0, tmpbis); 611 tmpM1ssA = vec_ld(0, tmpbis);
546 vector signed short tmpM1ssB = vec_ld(16, tmpbis); 612 tmpM1ssB = vec_ld(16, tmpbis);
547 tmpbis += tmpStride; 613 tmpbis += tmpStride;
548 vector signed short tmpP0ssA = vec_ld(0, tmpbis); 614 tmpP0ssA = vec_ld(0, tmpbis);
549 vector signed short tmpP0ssB = vec_ld(16, tmpbis); 615 tmpP0ssB = vec_ld(16, tmpbis);
550 tmpbis += tmpStride; 616 tmpbis += tmpStride;
551 vector signed short tmpP1ssA = vec_ld(0, tmpbis); 617 tmpP1ssA = vec_ld(0, tmpbis);
552 vector signed short tmpP1ssB = vec_ld(16, tmpbis); 618 tmpP1ssB = vec_ld(16, tmpbis);
553 tmpbis += tmpStride; 619 tmpbis += tmpStride;
554 vector signed short tmpP2ssA = vec_ld(0, tmpbis); 620 tmpP2ssA = vec_ld(0, tmpbis);
555 vector signed short tmpP2ssB = vec_ld(16, tmpbis); 621 tmpP2ssB = vec_ld(16, tmpbis);
556 tmpbis += tmpStride; 622 tmpbis += tmpStride;
557 623
558 for (i = 0 ; i < 16 ; i++) { 624 for (i = 0 ; i < 16 ; i++) {
559 const vector signed short tmpP3ssA = vec_ld(0, tmpbis); 625 const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
560 const vector signed short tmpP3ssB = vec_ld(16, tmpbis); 626 const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
561 tmpbis += tmpStride;
562 627
563 const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); 628 const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
564 const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); 629 const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
565 const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); 630 const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
566 const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); 631 const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
567 const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); 632 const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
568 const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); 633 const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
634
635 tmpbis += tmpStride;
569 636
570 tmpM2ssA = tmpM1ssA; 637 tmpM2ssA = tmpM1ssA;
571 tmpM2ssB = tmpM1ssB; 638 tmpM2ssB = tmpM1ssB;
572 tmpM1ssA = tmpP0ssA; 639 tmpM1ssA = tmpP0ssA;
573 tmpM1ssB = tmpP0ssB; 640 tmpM1ssB = tmpP0ssB;
576 tmpP1ssA = tmpP2ssA; 643 tmpP1ssA = tmpP2ssA;
577 tmpP1ssB = tmpP2ssB; 644 tmpP1ssB = tmpP2ssB;
578 tmpP2ssA = tmpP3ssA; 645 tmpP2ssA = tmpP3ssA;
579 tmpP2ssB = tmpP3ssB; 646 tmpP2ssB = tmpP3ssB;
580 647
581 const vector signed int pp1Ae = vec_mule(sum1A, v20ss); 648 pp1Ae = vec_mule(sum1A, v20ss);
582 const vector signed int pp1Ao = vec_mulo(sum1A, v20ss); 649 pp1Ao = vec_mulo(sum1A, v20ss);
583 const vector signed int pp1Be = vec_mule(sum1B, v20ss); 650 pp1Be = vec_mule(sum1B, v20ss);
584 const vector signed int pp1Bo = vec_mulo(sum1B, v20ss); 651 pp1Bo = vec_mulo(sum1B, v20ss);
585 652
586 const vector signed int pp2Ae = vec_mule(sum2A, v5ss); 653 pp2Ae = vec_mule(sum2A, v5ss);
587 const vector signed int pp2Ao = vec_mulo(sum2A, v5ss); 654 pp2Ao = vec_mulo(sum2A, v5ss);
588 const vector signed int pp2Be = vec_mule(sum2B, v5ss); 655 pp2Be = vec_mule(sum2B, v5ss);
589 const vector signed int pp2Bo = vec_mulo(sum2B, v5ss); 656 pp2Bo = vec_mulo(sum2B, v5ss);
590 657
591 const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui); 658 pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
592 const vector signed int pp3Ao = vec_mulo(sum3A, v1ss); 659 pp3Ao = vec_mulo(sum3A, v1ss);
593 const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui); 660 pp3Be = vec_sra((vector signed int)sum3B, v16ui);
594 const vector signed int pp3Bo = vec_mulo(sum3B, v1ss); 661 pp3Bo = vec_mulo(sum3B, v1ss);
595 662
596 const vector signed int pp1cAe = vec_add(pp1Ae, v512si); 663 pp1cAe = vec_add(pp1Ae, v512si);
597 const vector signed int pp1cAo = vec_add(pp1Ao, v512si); 664 pp1cAo = vec_add(pp1Ao, v512si);
598 const vector signed int pp1cBe = vec_add(pp1Be, v512si); 665 pp1cBe = vec_add(pp1Be, v512si);
599 const vector signed int pp1cBo = vec_add(pp1Bo, v512si); 666 pp1cBo = vec_add(pp1Bo, v512si);
600 667
601 const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae); 668 pp32Ae = vec_sub(pp3Ae, pp2Ae);
602 const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao); 669 pp32Ao = vec_sub(pp3Ao, pp2Ao);
603 const vector signed int pp32Be = vec_sub(pp3Be, pp2Be); 670 pp32Be = vec_sub(pp3Be, pp2Be);
604 const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo); 671 pp32Bo = vec_sub(pp3Bo, pp2Bo);
605 672
606 const vector signed int sumAe = vec_add(pp1cAe, pp32Ae); 673 sumAe = vec_add(pp1cAe, pp32Ae);
607 const vector signed int sumAo = vec_add(pp1cAo, pp32Ao); 674 sumAo = vec_add(pp1cAo, pp32Ao);
608 const vector signed int sumBe = vec_add(pp1cBe, pp32Be); 675 sumBe = vec_add(pp1cBe, pp32Be);
609 const vector signed int sumBo = vec_add(pp1cBo, pp32Bo); 676 sumBo = vec_add(pp1cBo, pp32Bo);
610 677
611 const vector signed int ssumAe = vec_sra(sumAe, v10ui); 678 ssumAe = vec_sra(sumAe, v10ui);
612 const vector signed int ssumAo = vec_sra(sumAo, v10ui); 679 ssumAo = vec_sra(sumAo, v10ui);
613 const vector signed int ssumBe = vec_sra(sumBe, v10ui); 680 ssumBe = vec_sra(sumBe, v10ui);
614 const vector signed int ssumBo = vec_sra(sumBo, v10ui); 681 ssumBo = vec_sra(sumBo, v10ui);
615 682
616 const vector signed short ssume = vec_packs(ssumAe, ssumBe); 683 ssume = vec_packs(ssumAe, ssumBe);
617 const vector signed short ssumo = vec_packs(ssumAo, ssumBo); 684 ssumo = vec_packs(ssumAo, ssumBo);
618 685
619 const vector unsigned char sumv = vec_packsu(ssume, ssumo); 686 sumv = vec_packsu(ssume, ssumo);
620 const vector unsigned char sum = vec_perm(sumv, sumv, mperm); 687 sum = vec_perm(sumv, sumv, mperm);
621 688
622 const vector unsigned char dst1 = vec_ld(0, dst); 689 dst1 = vec_ld(0, dst);
623 const vector unsigned char dst2 = vec_ld(16, dst); 690 dst2 = vec_ld(16, dst);
624 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); 691 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
625 692
626 vector unsigned char fsum;
627 OP_U8_ALTIVEC(fsum, sum, vdst); 693 OP_U8_ALTIVEC(fsum, sum, vdst);
628 694
629 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); 695 rsum = vec_perm(fsum, fsum, dstperm);
630 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); 696 fdst1 = vec_sel(dst1, rsum, dstmask);
631 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); 697 fdst2 = vec_sel(rsum, dst2, dstmask);
632 698
633 vec_st(fdst1, 0, dst); 699 vec_st(fdst1, 0, dst);
634 vec_st(fdst2, 16, dst); 700 vec_st(fdst2, 16, dst);
635 701
636 dst += dstStride; 702 dst += dstStride;