libavcodec/ppc/h264_template_altivec.c @ changeset 8494:1615d6b75ada

Cleanup _t types in libavcodec/ppc

author:   lu_zero
date:     Sat, 27 Dec 2008 11:21:28 +0000
parents:  5f3b62eaf6e5
children: 40738baaafc2
comparison: 8493:469f3e5bcf13 (old) → 8494:1615d6b75ada (new)
(unchanged lines are shown once with their file line number; changed lines appear as "-" old / "+" new pairs)
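The change itself is purely mechanical: every use of the old AltiVec shorthand vector types (vec_u8_t, vec_s16_t, vec_u16_t, vec_s32_t, vec_u32_t, ...) is replaced by the same name without the _t suffix. For orientation only, the shorthands are presumably defined along the following lines in libavcodec/ppc's type header; the exact definitions are an assumption here, not part of this changeset:

/* Hypothetical sketch of the AltiVec type shorthands this rename relies on;
 * the real definitions live in the ppc type header and may differ. */
#define vec_u8  vector unsigned char   /* previously spelled vec_u8_t  */
#define vec_s8  vector signed char     /* previously spelled vec_s8_t  */
#define vec_u16 vector unsigned short  /* previously spelled vec_u16_t */
#define vec_s16 vector signed short    /* previously spelled vec_s16_t */
#define vec_u32 vector unsigned int    /* previously spelled vec_u32_t */
#define vec_s32 vector signed int      /* previously spelled vec_s32_t */

In the hunks shown below only the type spellings change; no logic is touched.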
  26  #endif
  27
  28  /* this code assume that stride % 16 == 0 */
  29
  30  #define CHROMA_MC8_ALTIVEC_CORE \
- 31  vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
+ 31  vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
- 32  vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\
+ 32  vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
  33  \
  34  psum = vec_mladd(vA, vsrc0ssH, v32ss);\
  35  psum = vec_mladd(vB, vsrc1ssH, psum);\
  36  psum = vec_mladd(vC, vsrc2ssH, psum);\
  37  psum = vec_mladd(vD, vsrc3ssH, psum);\
  38  psum = vec_sr(psum, v6us);\
  39  \
  40  vdst = vec_ld(0, dst);\
- 41  ppsum = (vec_u8_t)vec_pack(psum, psum);\
+ 41  ppsum = (vec_u8)vec_pack(psum, psum);\
  42  vfdst = vec_perm(vdst, ppsum, fperm);\
  43  \
  44  OP_U8_ALTIVEC(fsum, vfdst, vdst);\
  45  \
  46  vec_st(fsum, 0, dst);\
  51  dst += stride;\
  52  src += stride;
  53
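For reference, the CHROMA_MC8_ALTIVEC_CORE macro above vectorizes the usual H.264 chroma bilinear interpolation: psum = A*src0 + B*src1 + C*src2 + D*src3 + 32, shifted right by 6, with the A..D weights built from x and y (the ABCD array loaded into vABCD further down). A minimal scalar sketch of one row of the "put" variant, for orientation only; chroma_mc8_row_ref is a hypothetical name, not part of the file:

#include <stdint.h>

/* Scalar equivalent of one CHROMA_MC8_ALTIVEC_CORE step for eight chroma
 * pixels; OP_U8_ALTIVEC is either a plain store ("put") or an average
 * ("avg"), and this sketch covers the "put" case only. */
static void chroma_mc8_row_ref(uint8_t *dst, const uint8_t *src, int stride,
                               int A, int B, int C, int D)
{
    for (int i = 0; i < 8; i++)
        dst[i] = (A * src[i]          + B * src[i + 1] +
                  C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6;
}

With A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y the four weights sum to 64, so the +32 and >>6 give a correctly rounded weighted average.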
  54  #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
  55  \
- 56  vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
+ 56  vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
- 57  vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
+ 57  vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
  58  \
  59  psum = vec_mladd(vA, vsrc0ssH, v32ss);\
  60  psum = vec_mladd(vE, vsrc1ssH, psum);\
  61  psum = vec_sr(psum, v6us);\
  62  \
  63  vdst = vec_ld(0, dst);\
- 64  ppsum = (vec_u8_t)vec_pack(psum, psum);\
+ 64  ppsum = (vec_u8)vec_pack(psum, psum);\
  65  vfdst = vec_perm(vdst, ppsum, fperm);\
  66  \
  67  OP_U8_ALTIVEC(fsum, vfdst, vdst);\
  68  \
  69  vec_st(fsum, 0, dst);\
  78  {((8 - x) * (8 - y)),
  79  (( x) * (8 - y)),
  80  ((8 - x) * ( y)),
  81  (( x) * ( y))};
  82  register int i;
- 83  vec_u8_t fperm;
+ 83  vec_u8 fperm;
- 84  const vec_s32_t vABCD = vec_ld(0, ABCD);
+ 84  const vec_s32 vABCD = vec_ld(0, ABCD);
- 85  const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
+ 85  const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
- 86  const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
+ 86  const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
- 87  const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
+ 87  const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
- 88  const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
+ 88  const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
  89  LOAD_ZERO;
- 90  const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
+ 90  const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
- 91  const vec_u16_t v6us = vec_splat_u16(6);
+ 91  const vec_u16 v6us = vec_splat_u16(6);
  92  register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
  93  register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
  94
- 95  vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+ 95  vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
- 96  vec_u8_t vsrc0uc, vsrc1uc;
+ 96  vec_u8 vsrc0uc, vsrc1uc;
- 97  vec_s16_t vsrc0ssH, vsrc1ssH;
+ 97  vec_s16 vsrc0ssH, vsrc1ssH;
- 98  vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
+ 98  vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
- 99  vec_s16_t vsrc2ssH, vsrc3ssH, psum;
+ 99  vec_s16 vsrc2ssH, vsrc3ssH, psum;
- 100  vec_u8_t vdst, ppsum, vfdst, fsum;
+ 100  vec_u8 vdst, ppsum, vfdst, fsum;
  101
  102  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
  103
  104  if (((unsigned long)dst) % 16 == 0) {
- 105  fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13,
+ 105  fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
  106  0x14, 0x15, 0x16, 0x17,
  107  0x08, 0x09, 0x0A, 0x0B,
  108  0x0C, 0x0D, 0x0E, 0x0F};
  109  } else {
- 110  fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03,
+ 110  fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
  111  0x04, 0x05, 0x06, 0x07,
  112  0x18, 0x19, 0x1A, 0x1B,
  113  0x1C, 0x1D, 0x1E, 0x1F};
  114  }
  115
  124  if (reallyBadAlign)
  125  vsrc1uc = vsrcBuc;
  126  else
  127  vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
  128
- 129  vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
+ 129  vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
- 130  vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
+ 130  vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
  131
  132  if (ABCD[3]) {
  133  if (!loadSecond) {// -> !reallyBadAlign
  134  for (i = 0 ; i < h ; i++) {
  135  vsrcCuc = vec_ld(stride + 0, src);
  137  vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
  138
  139  CHROMA_MC8_ALTIVEC_CORE
  140  }
  141  } else {
- 142  vec_u8_t vsrcDuc;
+ 142  vec_u8 vsrcDuc;
  143  for (i = 0 ; i < h ; i++) {
  144  vsrcCuc = vec_ld(stride + 0, src);
  145  vsrcDuc = vec_ld(stride + 16, src);
  146  vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
  147  if (reallyBadAlign)
  151
  152  CHROMA_MC8_ALTIVEC_CORE
  153  }
  154  }
  155  } else {
- 156  const vec_s16_t vE = vec_add(vB, vC);
+ 156  const vec_s16 vE = vec_add(vB, vC);
  157  if (ABCD[2]) { // x == 0 B == 0
  158  if (!loadSecond) {// -> !reallyBadAlign
  159  for (i = 0 ; i < h ; i++) {
  160  vsrcCuc = vec_ld(stride + 0, src);
  161  vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
  162  CHROMA_MC8_ALTIVEC_CORE_SIMPLE
  163
  164  vsrc0uc = vsrc1uc;
  165  }
  166  } else {
- 167  vec_u8_t vsrcDuc;
+ 167  vec_u8 vsrcDuc;
  168  for (i = 0 ; i < h ; i++) {
  169  vsrcCuc = vec_ld(stride + 0, src);
  170  vsrcDuc = vec_ld(stride + 15, src);
  171  vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
  172  CHROMA_MC8_ALTIVEC_CORE_SIMPLE
  182  vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
  183
  184  CHROMA_MC8_ALTIVEC_CORE_SIMPLE
  185  }
  186  } else {
- 187  vec_u8_t vsrcDuc;
+ 187  vec_u8 vsrcDuc;
  188  for (i = 0 ; i < h ; i++) {
  189  vsrcCuc = vec_ld(0, src);
  190  vsrcDuc = vec_ld(15, src);
  191  vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
  192  if (reallyBadAlign)
  208  static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  209  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
  210  register int i;
  211
  212  LOAD_ZERO;
- 213  const vec_u8_t permM2 = vec_lvsl(-2, src);
+ 213  const vec_u8 permM2 = vec_lvsl(-2, src);
- 214  const vec_u8_t permM1 = vec_lvsl(-1, src);
+ 214  const vec_u8 permM1 = vec_lvsl(-1, src);
- 215  const vec_u8_t permP0 = vec_lvsl(+0, src);
+ 215  const vec_u8 permP0 = vec_lvsl(+0, src);
- 216  const vec_u8_t permP1 = vec_lvsl(+1, src);
+ 216  const vec_u8 permP1 = vec_lvsl(+1, src);
- 217  const vec_u8_t permP2 = vec_lvsl(+2, src);
+ 217  const vec_u8 permP2 = vec_lvsl(+2, src);
- 218  const vec_u8_t permP3 = vec_lvsl(+3, src);
+ 218  const vec_u8 permP3 = vec_lvsl(+3, src);
- 219  const vec_s16_t v5ss = vec_splat_s16(5);
+ 219  const vec_s16 v5ss = vec_splat_s16(5);
- 220  const vec_u16_t v5us = vec_splat_u16(5);
+ 220  const vec_u16 v5us = vec_splat_u16(5);
- 221  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ 221  const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- 222  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
+ 222  const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
  223
- 224  vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+ 224  vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
  225
  226  register int align = ((((unsigned long)src) - 2) % 16);
  227
- 228  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
+ 228  vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
  229  srcP2A, srcP2B, srcP3A, srcP3B,
  230  srcM1A, srcM1B, srcM2A, srcM2B,
  231  sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
  232  pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
  233  psumA, psumB, sumA, sumB;
  234
- 235  vec_u8_t sum, vdst, fsum;
+ 235  vec_u8 sum, vdst, fsum;
  236
  237  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
  238
  239  for (i = 0 ; i < 16 ; i ++) {
- 240  vec_u8_t srcR1 = vec_ld(-2, src);
+ 240  vec_u8 srcR1 = vec_ld(-2, src);
- 241  vec_u8_t srcR2 = vec_ld(14, src);
+ 241  vec_u8 srcR2 = vec_ld(14, src);
  242
  243  switch (align) {
  244  default: {
  245  srcM2 = vec_perm(srcR1, srcR2, permM2);
  246  srcM1 = vec_perm(srcR1, srcR2, permM1);
  256  srcP1 = vec_perm(srcR1, srcR2, permP1);
  257  srcP2 = vec_perm(srcR1, srcR2, permP2);
  258  srcP3 = srcR2;
  259  } break;
  260  case 12: {
- 261  vec_u8_t srcR3 = vec_ld(30, src);
+ 261  vec_u8 srcR3 = vec_ld(30, src);
  262  srcM2 = vec_perm(srcR1, srcR2, permM2);
  263  srcM1 = vec_perm(srcR1, srcR2, permM1);
  264  srcP0 = vec_perm(srcR1, srcR2, permP0);
  265  srcP1 = vec_perm(srcR1, srcR2, permP1);
  266  srcP2 = srcR2;
  267  srcP3 = vec_perm(srcR2, srcR3, permP3);
  268  } break;
  269  case 13: {
- 270  vec_u8_t srcR3 = vec_ld(30, src);
+ 270  vec_u8 srcR3 = vec_ld(30, src);
  271  srcM2 = vec_perm(srcR1, srcR2, permM2);
  272  srcM1 = vec_perm(srcR1, srcR2, permM1);
  273  srcP0 = vec_perm(srcR1, srcR2, permP0);
  274  srcP1 = srcR2;
  275  srcP2 = vec_perm(srcR2, srcR3, permP2);
  276  srcP3 = vec_perm(srcR2, srcR3, permP3);
  277  } break;
  278  case 14: {
- 279  vec_u8_t srcR3 = vec_ld(30, src);
+ 279  vec_u8 srcR3 = vec_ld(30, src);
  280  srcM2 = vec_perm(srcR1, srcR2, permM2);
  281  srcM1 = vec_perm(srcR1, srcR2, permM1);
  282  srcP0 = srcR2;
  283  srcP1 = vec_perm(srcR2, srcR3, permP1);
  284  srcP2 = vec_perm(srcR2, srcR3, permP2);
  285  srcP3 = vec_perm(srcR2, srcR3, permP3);
  286  } break;
  287  case 15: {
- 288  vec_u8_t srcR3 = vec_ld(30, src);
+ 288  vec_u8 srcR3 = vec_ld(30, src);
  289  srcM2 = vec_perm(srcR1, srcR2, permM2);
  290  srcM1 = srcR2;
  291  srcP0 = vec_perm(srcR2, srcR3, permP0);
  292  srcP1 = vec_perm(srcR2, srcR3, permP1);
  293  srcP2 = vec_perm(srcR2, srcR3, permP2);
  294  srcP3 = vec_perm(srcR2, srcR3, permP3);
  295  } break;
  296  }
  297
- 298  srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+ 298  srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
- 299  srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+ 299  srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
- 300  srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+ 300  srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
- 301  srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+ 301  srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
  302
- 303  srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+ 303  srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
- 304  srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+ 304  srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
- 305  srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+ 305  srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
- 306  srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+ 306  srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
  307
- 308  srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+ 308  srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
- 309  srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+ 309  srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
- 310  srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+ 310  srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
- 311  srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
+ 311  srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
  312
  313  sum1A = vec_adds(srcP0A, srcP1A);
  314  sum1B = vec_adds(srcP0B, srcP1B);
  315  sum2A = vec_adds(srcM1A, srcP2A);
  316  sum2B = vec_adds(srcM1B, srcP2B);
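The horizontal low-pass above is the standard H.264 6-tap half-pel luma filter with taps (1, -5, 20, 20, -5, 1): sum1 = P0+P1 gets weight 20 (v20ss), sum2 = M1+P2 gets weight -5 (v5ss), sum3 = M2+P3 gets weight 1, and the result is rounded with 16 (v16ss) and shifted right by 5 (v5us), processed in two 8-lane halves (the A/B variable pairs). A scalar sketch of one output sample for the "put" case; lowpass6_h_ref is a hypothetical helper, not in the file:

#include <stdint.h>

/* Scalar equivalent of the horizontal 6-tap filter the AltiVec loop applies:
 * 20*(P0+P1) - 5*(M1+P2) + (M2+P3), rounded by 16, shifted by 5, clipped. */
static uint8_t lowpass6_h_ref(const uint8_t *src)
{
    int v = 20 * (src[0]  + src[1])    /* sum1: P0 + P1 */
          -  5 * (src[-1] + src[2])    /* sum2: M1 + P2 */
          +      (src[-2] + src[3])    /* sum3: M2 + P3 */
          + 16;
    v >>= 5;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

The vertical variant that follows applies the same kernel down a column instead of along a row; only the addressing differs.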
  352  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
  353
  354  register int i;
  355
  356  LOAD_ZERO;
- 357  const vec_u8_t perm = vec_lvsl(0, src);
+ 357  const vec_u8 perm = vec_lvsl(0, src);
- 358  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ 358  const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- 359  const vec_u16_t v5us = vec_splat_u16(5);
+ 359  const vec_u16 v5us = vec_splat_u16(5);
- 360  const vec_s16_t v5ss = vec_splat_s16(5);
+ 360  const vec_s16 v5ss = vec_splat_s16(5);
- 361  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
+ 361  const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
  362
  363  uint8_t *srcbis = src - (srcStride * 2);
  364
- 365  const vec_u8_t srcM2a = vec_ld(0, srcbis);
+ 365  const vec_u8 srcM2a = vec_ld(0, srcbis);
- 366  const vec_u8_t srcM2b = vec_ld(16, srcbis);
+ 366  const vec_u8 srcM2b = vec_ld(16, srcbis);
- 367  const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
+ 367  const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
  368  //srcbis += srcStride;
- 369  const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
+ 369  const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
- 370  const vec_u8_t srcM1b = vec_ld(16, srcbis);
+ 370  const vec_u8 srcM1b = vec_ld(16, srcbis);
- 371  const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
+ 371  const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
  372  //srcbis += srcStride;
- 373  const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
+ 373  const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
- 374  const vec_u8_t srcP0b = vec_ld(16, srcbis);
+ 374  const vec_u8 srcP0b = vec_ld(16, srcbis);
- 375  const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
+ 375  const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
  376  //srcbis += srcStride;
- 377  const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
+ 377  const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
- 378  const vec_u8_t srcP1b = vec_ld(16, srcbis);
+ 378  const vec_u8 srcP1b = vec_ld(16, srcbis);
- 379  const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
+ 379  const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
  380  //srcbis += srcStride;
- 381  const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
+ 381  const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
- 382  const vec_u8_t srcP2b = vec_ld(16, srcbis);
+ 382  const vec_u8 srcP2b = vec_ld(16, srcbis);
- 383  const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
+ 383  const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
  384  //srcbis += srcStride;
  385
- 386  vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+ 386  vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
- 387  vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
+ 387  vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
- 388  vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+ 388  vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
- 389  vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+ 389  vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
- 390  vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+ 390  vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
- 391  vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+ 391  vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
- 392  vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+ 392  vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
- 393  vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+ 393  vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
- 394  vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+ 394  vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
- 395  vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+ 395  vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
  396
- 397  vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
+ 397  vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
  398  psumA, psumB, sumA, sumB,
  399  srcP3ssA, srcP3ssB,
  400  sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
  401
- 402  vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
+ 402  vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
  403
  404  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
  405
  406  for (i = 0 ; i < 16 ; i++) {
  407  srcP3a = vec_ld(0, srcbis += srcStride);
  408  srcP3b = vec_ld(16, srcbis);
  409  srcP3 = vec_perm(srcP3a, srcP3b, perm);
- 410  srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+ 410  srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
- 411  srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+ 411  srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
  412  //srcbis += srcStride;
  413
  414  sum1A = vec_adds(srcP0ssA, srcP1ssA);
  415  sum1B = vec_adds(srcP0ssB, srcP1ssB);
  416  sum2A = vec_adds(srcM1ssA, srcP2ssA);
  461  /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
  462  static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  463  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  464  register int i;
  465  LOAD_ZERO;
- 466  const vec_u8_t permM2 = vec_lvsl(-2, src);
+ 466  const vec_u8 permM2 = vec_lvsl(-2, src);
- 467  const vec_u8_t permM1 = vec_lvsl(-1, src);
+ 467  const vec_u8 permM1 = vec_lvsl(-1, src);
- 468  const vec_u8_t permP0 = vec_lvsl(+0, src);
+ 468  const vec_u8 permP0 = vec_lvsl(+0, src);
- 469  const vec_u8_t permP1 = vec_lvsl(+1, src);
+ 469  const vec_u8 permP1 = vec_lvsl(+1, src);
- 470  const vec_u8_t permP2 = vec_lvsl(+2, src);
+ 470  const vec_u8 permP2 = vec_lvsl(+2, src);
- 471  const vec_u8_t permP3 = vec_lvsl(+3, src);
+ 471  const vec_u8 permP3 = vec_lvsl(+3, src);
- 472  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ 472  const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- 473  const vec_u32_t v10ui = vec_splat_u32(10);
+ 473  const vec_u32 v10ui = vec_splat_u32(10);
- 474  const vec_s16_t v5ss = vec_splat_s16(5);
+ 474  const vec_s16 v5ss = vec_splat_s16(5);
- 475  const vec_s16_t v1ss = vec_splat_s16(1);
+ 475  const vec_s16 v1ss = vec_splat_s16(1);
- 476  const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
+ 476  const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
- 477  const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
+ 477  const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
  478
  479  register int align = ((((unsigned long)src) - 2) % 16);
  480
- 481  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
+ 481  vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
  482  srcP2A, srcP2B, srcP3A, srcP3B,
  483  srcM1A, srcM1B, srcM2A, srcM2B,
  484  sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
  485  pp1A, pp1B, pp2A, pp2B, psumA, psumB;
  486
- 487  const vec_u8_t mperm = (const vec_u8_t)
+ 487  const vec_u8 mperm = (const vec_u8)
  488  {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
  489  0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
  490  int16_t *tmpbis = tmp;
  491
- 492  vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
+ 492  vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
  493  tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
  494  tmpP2ssA, tmpP2ssB;
  495
- 496  vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
+ 496  vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
  497  pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
  498  pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
  499  ssumAe, ssumAo, ssumBe, ssumBo;
- 500  vec_u8_t fsum, sumv, sum, vdst;
+ 500  vec_u8 fsum, sumv, sum, vdst;
- 501  vec_s16_t ssume, ssumo;
+ 501  vec_s16 ssume, ssumo;
  502
  503  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  504  src -= (2 * srcStride);
  505  for (i = 0 ; i < 21 ; i ++) {
- 506  vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+ 506  vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
- 507  vec_u8_t srcR1 = vec_ld(-2, src);
+ 507  vec_u8 srcR1 = vec_ld(-2, src);
- 508  vec_u8_t srcR2 = vec_ld(14, src);
+ 508  vec_u8 srcR2 = vec_ld(14, src);
  509
  510  switch (align) {
  511  default: {
  512  srcM2 = vec_perm(srcR1, srcR2, permM2);
  513  srcM1 = vec_perm(srcR1, srcR2, permM1);
  523  srcP1 = vec_perm(srcR1, srcR2, permP1);
  524  srcP2 = vec_perm(srcR1, srcR2, permP2);
  525  srcP3 = srcR2;
  526  } break;
  527  case 12: {
- 528  vec_u8_t srcR3 = vec_ld(30, src);
+ 528  vec_u8 srcR3 = vec_ld(30, src);
  529  srcM2 = vec_perm(srcR1, srcR2, permM2);
  530  srcM1 = vec_perm(srcR1, srcR2, permM1);
  531  srcP0 = vec_perm(srcR1, srcR2, permP0);
  532  srcP1 = vec_perm(srcR1, srcR2, permP1);
  533  srcP2 = srcR2;
  534  srcP3 = vec_perm(srcR2, srcR3, permP3);
  535  } break;
  536  case 13: {
- 537  vec_u8_t srcR3 = vec_ld(30, src);
+ 537  vec_u8 srcR3 = vec_ld(30, src);
  538  srcM2 = vec_perm(srcR1, srcR2, permM2);
  539  srcM1 = vec_perm(srcR1, srcR2, permM1);
  540  srcP0 = vec_perm(srcR1, srcR2, permP0);
  541  srcP1 = srcR2;
  542  srcP2 = vec_perm(srcR2, srcR3, permP2);
  543  srcP3 = vec_perm(srcR2, srcR3, permP3);
  544  } break;
  545  case 14: {
- 546  vec_u8_t srcR3 = vec_ld(30, src);
+ 546  vec_u8 srcR3 = vec_ld(30, src);
  547  srcM2 = vec_perm(srcR1, srcR2, permM2);
  548  srcM1 = vec_perm(srcR1, srcR2, permM1);
  549  srcP0 = srcR2;
  550  srcP1 = vec_perm(srcR2, srcR3, permP1);
  551  srcP2 = vec_perm(srcR2, srcR3, permP2);
  552  srcP3 = vec_perm(srcR2, srcR3, permP3);
  553  } break;
  554  case 15: {
- 555  vec_u8_t srcR3 = vec_ld(30, src);
+ 555  vec_u8 srcR3 = vec_ld(30, src);
  556  srcM2 = vec_perm(srcR1, srcR2, permM2);
  557  srcM1 = srcR2;
  558  srcP0 = vec_perm(srcR2, srcR3, permP0);
  559  srcP1 = vec_perm(srcR2, srcR3, permP1);
  560  srcP2 = vec_perm(srcR2, srcR3, permP2);
  561  srcP3 = vec_perm(srcR2, srcR3, permP3);
  562  } break;
  563  }
  564
- 565  srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+ 565  srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
- 566  srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+ 566  srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
- 567  srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+ 567  srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
- 568  srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+ 568  srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
  569
- 570  srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+ 570  srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
- 571  srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+ 571  srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
- 572  srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+ 572  srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
- 573  srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+ 573  srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
  574
- 575  srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+ 575  srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
- 576  srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+ 576  srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
- 577  srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+ 577  srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
- 578  srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
+ 578  srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
  579
  580  sum1A = vec_adds(srcP0A, srcP1A);
  581  sum1B = vec_adds(srcP0B, srcP1B);
  582  sum2A = vec_adds(srcM1A, srcP2A);
  583  sum2B = vec_adds(srcM1B, srcP2B);
  615  tmpP2ssA = vec_ld(0, tmpbis);
  616  tmpP2ssB = vec_ld(16, tmpbis);
  617  tmpbis += tmpStride;
  618
  619  for (i = 0 ; i < 16 ; i++) {
- 620  const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
+ 620  const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
- 621  const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
+ 621  const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
  622
- 623  const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
+ 623  const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
- 624  const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
+ 624  const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
- 625  const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
+ 625  const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
- 626  const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
+ 626  const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
- 627  const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
+ 627  const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
- 628  const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
+ 628  const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
  629
  630  tmpbis += tmpStride;
  631
  632  tmpM2ssA = tmpM1ssA;
  633  tmpM2ssB = tmpM1ssB;
  648  pp2Ae = vec_mule(sum2A, v5ss);
  649  pp2Ao = vec_mulo(sum2A, v5ss);
  650  pp2Be = vec_mule(sum2B, v5ss);
  651  pp2Bo = vec_mulo(sum2B, v5ss);
  652
- 653  pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
+ 653  pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
  654  pp3Ao = vec_mulo(sum3A, v1ss);
- 655  pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
+ 655  pp3Be = vec_sra((vec_s32)sum3B, v16ui);
  656  pp3Bo = vec_mulo(sum3B, v1ss);
  657
  658  pp1cAe = vec_add(pp1Ae, v512si);
  659  pp1cAo = vec_add(pp1Ao, v512si);
  660  pp1cBe = vec_add(pp1Be, v512si);
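The excerpt ends mid-way through the hv (2D) filter, but the visible setup already shows its structure: a first horizontal 6-tap pass over 21 input rows is kept as 16-bit intermediates in tmp (no rounding or shift yet), and the second loop filters those intermediates vertically with the same taps, adds the rounding constant 512 (v512si) and, presumably in the code past the end of this excerpt, shifts right by 10 (v10ui) before packing with saturation. A scalar sketch of one final output sample for the "put" case; lowpass6_hv_ref is a hypothetical helper, not in the file:

#include <stdint.h>

/* Scalar equivalent of the vertical pass over the 16-bit intermediates in
 * tmp[]: same (1,-5,20,20,-5,1) kernel, rounding 512, shift by 10, clip. */
static uint8_t lowpass6_hv_ref(const int16_t *tmp, int tmpStride)
{
    int v = 20 * (tmp[0]              + tmp[tmpStride])
          -  5 * (tmp[-tmpStride]     + tmp[2 * tmpStride])
          +      (tmp[-2 * tmpStride] + tmp[3 * tmpStride])
          + 512;
    v >>= 10;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}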