comparison ppc/h264_template_altivec.c @ 5530:cd266411b11a libavcodec

Use shorter types vec_"type" instead of the too-long vector "type". Part 1 of h264 luma interpolation 8x8 for AltiVec, contributed by Mauricio Alvarez % lokifo A gmail P com %. Original thread: Date: Jun 26, 2007 8:15 PM, Subject: Re: [FFmpeg-devel] [PATCH] h264 luma interpolation 8x8 for altivec
author gpoirier
date Sun, 12 Aug 2007 13:50:06 +0000
parents 41cabe79ba25
children 861eb234e6ba
comparison
equal deleted inserted replaced
5529:af68496af656 5530:cd266411b11a
25 {((8 - x) * (8 - y)), 25 {((8 - x) * (8 - y)),
26 ((x) * (8 - y)), 26 ((x) * (8 - y)),
27 ((8 - x) * (y)), 27 ((8 - x) * (y)),
28 ((x) * (y))}; 28 ((x) * (y))};
29 register int i; 29 register int i;
30 vector unsigned char fperm; 30 vec_u8_t fperm;
31 const vector signed int vABCD = vec_ld(0, ABCD); 31 const vec_s32_t vABCD = vec_ld(0, ABCD);
32 const vector signed short vA = vec_splat((vector signed short)vABCD, 1); 32 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
33 const vector signed short vB = vec_splat((vector signed short)vABCD, 3); 33 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
34 const vector signed short vC = vec_splat((vector signed short)vABCD, 5); 34 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
35 const vector signed short vD = vec_splat((vector signed short)vABCD, 7); 35 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
36 const vector signed int vzero = vec_splat_s32(0); 36 LOAD_ZERO;
37 const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); 37 const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
38 const vector unsigned short v6us = vec_splat_u16(6); 38 const vec_u16_t v6us = vec_splat_u16(6);
39 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 39 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
40 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 40 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
41 41
42 vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; 42 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
43 vector unsigned char vsrc0uc, vsrc1uc; 43 vec_u8_t vsrc0uc, vsrc1uc;
44 vector signed short vsrc0ssH, vsrc1ssH; 44 vec_s16_t vsrc0ssH, vsrc1ssH;
45 vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; 45 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
46 vector signed short vsrc2ssH, vsrc3ssH, psum; 46 vec_s16_t vsrc2ssH, vsrc3ssH, psum;
47 vector unsigned char vdst, ppsum, vfdst, fsum; 47 vec_u8_t vdst, ppsum, vfdst, fsum;
48 48
49 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); 49 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
50 50
51 if (((unsigned long)dst) % 16 == 0) { 51 if (((unsigned long)dst) % 16 == 0) {
52 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 52 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
53 0x14, 0x15, 0x16, 0x17, 53 0x14, 0x15, 0x16, 0x17,
54 0x08, 0x09, 0x0A, 0x0B, 54 0x08, 0x09, 0x0A, 0x0B,
55 0x0C, 0x0D, 0x0E, 0x0F); 55 0x0C, 0x0D, 0x0E, 0x0F);
56 } else { 56 } else {
57 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 57 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
58 0x04, 0x05, 0x06, 0x07, 58 0x04, 0x05, 0x06, 0x07,
59 0x18, 0x19, 0x1A, 0x1B, 59 0x18, 0x19, 0x1A, 0x1B,
60 0x1C, 0x1D, 0x1E, 0x1F); 60 0x1C, 0x1D, 0x1E, 0x1F);
61 } 61 }
62 62
63 vsrcAuc = vec_ld(0, src); 63 vsrcAuc = vec_ld(0, src);
64 64
65 if (loadSecond) 65 if (loadSecond)
71 if (reallyBadAlign) 71 if (reallyBadAlign)
72 vsrc1uc = vsrcBuc; 72 vsrc1uc = vsrcBuc;
73 else 73 else
74 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); 74 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
75 75
76 vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 76 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
77 (vector unsigned char)vsrc0uc); 77 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
78 vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
79 (vector unsigned char)vsrc1uc);
80 78
81 if (!loadSecond) {// -> !reallyBadAlign 79 if (!loadSecond) {// -> !reallyBadAlign
82 for (i = 0 ; i < h ; i++) { 80 for (i = 0 ; i < h ; i++) {
83 81
84 82
85 vsrcCuc = vec_ld(stride + 0, src); 83 vsrcCuc = vec_ld(stride + 0, src);
86 84
87 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 85 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
88 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 86 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
89 87
90 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 88 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
91 (vector unsigned char)vsrc2uc); 89 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);
92 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
93 (vector unsigned char)vsrc3uc);
94 90
95 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 91 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
96 psum = vec_mladd(vB, vsrc1ssH, psum); 92 psum = vec_mladd(vB, vsrc1ssH, psum);
97 psum = vec_mladd(vC, vsrc2ssH, psum); 93 psum = vec_mladd(vC, vsrc2ssH, psum);
98 psum = vec_mladd(vD, vsrc3ssH, psum); 94 psum = vec_mladd(vD, vsrc3ssH, psum);
99 psum = vec_add(v32ss, psum); 95 psum = vec_add(v32ss, psum);
100 psum = vec_sra(psum, v6us); 96 psum = vec_sra(psum, v6us);
101 97
102 vdst = vec_ld(0, dst); 98 vdst = vec_ld(0, dst);
103 ppsum = (vector unsigned char)vec_packsu(psum, psum); 99 ppsum = (vec_u8_t)vec_packsu(psum, psum);
104 vfdst = vec_perm(vdst, ppsum, fperm); 100 vfdst = vec_perm(vdst, ppsum, fperm);
105 101
106 OP_U8_ALTIVEC(fsum, vfdst, vdst); 102 OP_U8_ALTIVEC(fsum, vfdst, vdst);
107 103
108 vec_st(fsum, 0, dst); 104 vec_st(fsum, 0, dst);
112 108
113 dst += stride; 109 dst += stride;
114 src += stride; 110 src += stride;
115 } 111 }
116 } else { 112 } else {
117 vector unsigned char vsrcDuc; 113 vec_u8_t vsrcDuc;
118 for (i = 0 ; i < h ; i++) { 114 for (i = 0 ; i < h ; i++) {
119 vsrcCuc = vec_ld(stride + 0, src); 115 vsrcCuc = vec_ld(stride + 0, src);
120 vsrcDuc = vec_ld(stride + 16, src); 116 vsrcDuc = vec_ld(stride + 16, src);
121 117
122 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 118 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
123 if (reallyBadAlign) 119 if (reallyBadAlign)
124 vsrc3uc = vsrcDuc; 120 vsrc3uc = vsrcDuc;
125 else 121 else
126 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 122 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
127 123
128 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 124 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
129 (vector unsigned char)vsrc2uc); 125 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);
130 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
131 (vector unsigned char)vsrc3uc);
132 126
133 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 127 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
134 psum = vec_mladd(vB, vsrc1ssH, psum); 128 psum = vec_mladd(vB, vsrc1ssH, psum);
135 psum = vec_mladd(vC, vsrc2ssH, psum); 129 psum = vec_mladd(vC, vsrc2ssH, psum);
136 psum = vec_mladd(vD, vsrc3ssH, psum); 130 psum = vec_mladd(vD, vsrc3ssH, psum);
137 psum = vec_add(v32ss, psum); 131 psum = vec_add(v32ss, psum);
138 psum = vec_sr(psum, v6us); 132 psum = vec_sr(psum, v6us);
139 133
140 vdst = vec_ld(0, dst); 134 vdst = vec_ld(0, dst);
141 ppsum = (vector unsigned char)vec_pack(psum, psum); 135 ppsum = (vec_u8_t)vec_pack(psum, psum);
142 vfdst = vec_perm(vdst, ppsum, fperm); 136 vfdst = vec_perm(vdst, ppsum, fperm);
143 137
144 OP_U8_ALTIVEC(fsum, vfdst, vdst); 138 OP_U8_ALTIVEC(fsum, vfdst, vdst);
145 139
146 vec_st(fsum, 0, dst); 140 vec_st(fsum, 0, dst);
158 /* this code assume stride % 16 == 0 */ 152 /* this code assume stride % 16 == 0 */
159 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 153 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
160 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); 154 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
161 register int i; 155 register int i;
162 156
163 const vector signed int vzero = vec_splat_s32(0); 157 LOAD_ZERO;
164 const vector unsigned char permM2 = vec_lvsl(-2, src); 158 const vec_u8_t permM2 = vec_lvsl(-2, src);
165 const vector unsigned char permM1 = vec_lvsl(-1, src); 159 const vec_u8_t permM1 = vec_lvsl(-1, src);
166 const vector unsigned char permP0 = vec_lvsl(+0, src); 160 const vec_u8_t permP0 = vec_lvsl(+0, src);
167 const vector unsigned char permP1 = vec_lvsl(+1, src); 161 const vec_u8_t permP1 = vec_lvsl(+1, src);
168 const vector unsigned char permP2 = vec_lvsl(+2, src); 162 const vec_u8_t permP2 = vec_lvsl(+2, src);
169 const vector unsigned char permP3 = vec_lvsl(+3, src); 163 const vec_u8_t permP3 = vec_lvsl(+3, src);
170 const vector signed short v5ss = vec_splat_s16(5); 164 const vec_s16_t v5ss = vec_splat_s16(5);
171 const vector unsigned short v5us = vec_splat_u16(5); 165 const vec_u16_t v5us = vec_splat_u16(5);
172 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 166 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
173 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 167 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
174 const vector unsigned char dstperm = vec_lvsr(0, dst); 168 const vec_u8_t dstperm = vec_lvsr(0, dst);
175 const vector unsigned char neg1 = 169 const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
176 (const vector unsigned char) vec_splat_s8(-1); 170 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
177 171
178 const vector unsigned char dstmask = 172 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
179 vec_perm((const vector unsigned char)vzero,
180 neg1, dstperm);
181
182 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
183 173
184 register int align = ((((unsigned long)src) - 2) % 16); 174 register int align = ((((unsigned long)src) - 2) % 16);
185 175
186 vector signed short srcP0A, srcP0B, srcP1A, srcP1B, 176 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
187 srcP2A, srcP2B, srcP3A, srcP3B, 177 srcP2A, srcP2B, srcP3A, srcP3B,
188 srcM1A, srcM1B, srcM2A, srcM2B, 178 srcM1A, srcM1B, srcM2A, srcM2B,
189 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 179 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
190 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 180 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
191 psumA, psumB, sumA, sumB; 181 psumA, psumB, sumA, sumB;
192 182
193 vector unsigned char sum, dst1, dst2, vdst, fsum, 183 vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2;
194 rsum, fdst1, fdst2;
195 184
196 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 185 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
197 186
198 for (i = 0 ; i < 16 ; i ++) { 187 for (i = 0 ; i < 16 ; i ++) {
199 vector unsigned char srcR1 = vec_ld(-2, src); 188 vec_u8_t srcR1 = vec_ld(-2, src);
200 vector unsigned char srcR2 = vec_ld(14, src); 189 vec_u8_t srcR2 = vec_ld(14, src);
201 190
202 switch (align) { 191 switch (align) {
203 default: { 192 default: {
204 srcM2 = vec_perm(srcR1, srcR2, permM2); 193 srcM2 = vec_perm(srcR1, srcR2, permM2);
205 srcM1 = vec_perm(srcR1, srcR2, permM1); 194 srcM1 = vec_perm(srcR1, srcR2, permM1);
215 srcP1 = vec_perm(srcR1, srcR2, permP1); 204 srcP1 = vec_perm(srcR1, srcR2, permP1);
216 srcP2 = vec_perm(srcR1, srcR2, permP2); 205 srcP2 = vec_perm(srcR1, srcR2, permP2);
217 srcP3 = srcR2; 206 srcP3 = srcR2;
218 } break; 207 } break;
219 case 12: { 208 case 12: {
220 vector unsigned char srcR3 = vec_ld(30, src); 209 vec_u8_t srcR3 = vec_ld(30, src);
221 srcM2 = vec_perm(srcR1, srcR2, permM2); 210 srcM2 = vec_perm(srcR1, srcR2, permM2);
222 srcM1 = vec_perm(srcR1, srcR2, permM1); 211 srcM1 = vec_perm(srcR1, srcR2, permM1);
223 srcP0 = vec_perm(srcR1, srcR2, permP0); 212 srcP0 = vec_perm(srcR1, srcR2, permP0);
224 srcP1 = vec_perm(srcR1, srcR2, permP1); 213 srcP1 = vec_perm(srcR1, srcR2, permP1);
225 srcP2 = srcR2; 214 srcP2 = srcR2;
226 srcP3 = vec_perm(srcR2, srcR3, permP3); 215 srcP3 = vec_perm(srcR2, srcR3, permP3);
227 } break; 216 } break;
228 case 13: { 217 case 13: {
229 vector unsigned char srcR3 = vec_ld(30, src); 218 vec_u8_t srcR3 = vec_ld(30, src);
230 srcM2 = vec_perm(srcR1, srcR2, permM2); 219 srcM2 = vec_perm(srcR1, srcR2, permM2);
231 srcM1 = vec_perm(srcR1, srcR2, permM1); 220 srcM1 = vec_perm(srcR1, srcR2, permM1);
232 srcP0 = vec_perm(srcR1, srcR2, permP0); 221 srcP0 = vec_perm(srcR1, srcR2, permP0);
233 srcP1 = srcR2; 222 srcP1 = srcR2;
234 srcP2 = vec_perm(srcR2, srcR3, permP2); 223 srcP2 = vec_perm(srcR2, srcR3, permP2);
235 srcP3 = vec_perm(srcR2, srcR3, permP3); 224 srcP3 = vec_perm(srcR2, srcR3, permP3);
236 } break; 225 } break;
237 case 14: { 226 case 14: {
238 vector unsigned char srcR3 = vec_ld(30, src); 227 vec_u8_t srcR3 = vec_ld(30, src);
239 srcM2 = vec_perm(srcR1, srcR2, permM2); 228 srcM2 = vec_perm(srcR1, srcR2, permM2);
240 srcM1 = vec_perm(srcR1, srcR2, permM1); 229 srcM1 = vec_perm(srcR1, srcR2, permM1);
241 srcP0 = srcR2; 230 srcP0 = srcR2;
242 srcP1 = vec_perm(srcR2, srcR3, permP1); 231 srcP1 = vec_perm(srcR2, srcR3, permP1);
243 srcP2 = vec_perm(srcR2, srcR3, permP2); 232 srcP2 = vec_perm(srcR2, srcR3, permP2);
244 srcP3 = vec_perm(srcR2, srcR3, permP3); 233 srcP3 = vec_perm(srcR2, srcR3, permP3);
245 } break; 234 } break;
246 case 15: { 235 case 15: {
247 vector unsigned char srcR3 = vec_ld(30, src); 236 vec_u8_t srcR3 = vec_ld(30, src);
248 srcM2 = vec_perm(srcR1, srcR2, permM2); 237 srcM2 = vec_perm(srcR1, srcR2, permM2);
249 srcM1 = srcR2; 238 srcM1 = srcR2;
250 srcP0 = vec_perm(srcR2, srcR3, permP0); 239 srcP0 = vec_perm(srcR2, srcR3, permP0);
251 srcP1 = vec_perm(srcR2, srcR3, permP1); 240 srcP1 = vec_perm(srcR2, srcR3, permP1);
252 srcP2 = vec_perm(srcR2, srcR3, permP2); 241 srcP2 = vec_perm(srcR2, srcR3, permP2);
253 srcP3 = vec_perm(srcR2, srcR3, permP3); 242 srcP3 = vec_perm(srcR2, srcR3, permP3);
254 } break; 243 } break;
255 } 244 }
256 245
257 srcP0A = (vector signed short) 246 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
258 vec_mergeh((vector unsigned char)vzero, srcP0); 247 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
259 srcP0B = (vector signed short) 248 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
260 vec_mergel((vector unsigned char)vzero, srcP0); 249 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
261 srcP1A = (vector signed short) 250
262 vec_mergeh((vector unsigned char)vzero, srcP1); 251 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
263 srcP1B = (vector signed short) 252 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
264 vec_mergel((vector unsigned char)vzero, srcP1); 253 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
265 254 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
266 srcP2A = (vector signed short) 255
267 vec_mergeh((vector unsigned char)vzero, srcP2); 256 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
268 srcP2B = (vector signed short) 257 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
269 vec_mergel((vector unsigned char)vzero, srcP2); 258 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
270 srcP3A = (vector signed short) 259 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
271 vec_mergeh((vector unsigned char)vzero, srcP3);
272 srcP3B = (vector signed short)
273 vec_mergel((vector unsigned char)vzero, srcP3);
274
275 srcM1A = (vector signed short)
276 vec_mergeh((vector unsigned char)vzero, srcM1);
277 srcM1B = (vector signed short)
278 vec_mergel((vector unsigned char)vzero, srcM1);
279 srcM2A = (vector signed short)
280 vec_mergeh((vector unsigned char)vzero, srcM2);
281 srcM2B = (vector signed short)
282 vec_mergel((vector unsigned char)vzero, srcM2);
283 260
284 sum1A = vec_adds(srcP0A, srcP1A); 261 sum1A = vec_adds(srcP0A, srcP1A);
285 sum1B = vec_adds(srcP0B, srcP1B); 262 sum1B = vec_adds(srcP0B, srcP1B);
286 sum2A = vec_adds(srcM1A, srcP2A); 263 sum2A = vec_adds(srcM1A, srcP2A);
287 sum2B = vec_adds(srcM1B, srcP2B); 264 sum2B = vec_adds(srcM1B, srcP2B);
289 sum3B = vec_adds(srcM2B, srcP3B); 266 sum3B = vec_adds(srcM2B, srcP3B);
290 267
291 pp1A = vec_mladd(sum1A, v20ss, v16ss); 268 pp1A = vec_mladd(sum1A, v20ss, v16ss);
292 pp1B = vec_mladd(sum1B, v20ss, v16ss); 269 pp1B = vec_mladd(sum1B, v20ss, v16ss);
293 270
294 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); 271 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
295 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); 272 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
296 273
297 pp3A = vec_add(sum3A, pp1A); 274 pp3A = vec_add(sum3A, pp1A);
298 pp3B = vec_add(sum3B, pp1B); 275 pp3B = vec_add(sum3B, pp1B);
299 276
300 psumA = vec_sub(pp3A, pp2A); 277 psumA = vec_sub(pp3A, pp2A);
328 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 305 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
329 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); 306 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
330 307
331 register int i; 308 register int i;
332 309
333 const vector signed int vzero = vec_splat_s32(0); 310 LOAD_ZERO;
334 const vector unsigned char perm = vec_lvsl(0, src); 311 const vec_u8_t perm = vec_lvsl(0, src);
335 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 312 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
336 const vector unsigned short v5us = vec_splat_u16(5); 313 const vec_u16_t v5us = vec_splat_u16(5);
337 const vector signed short v5ss = vec_splat_s16(5); 314 const vec_s16_t v5ss = vec_splat_s16(5);
338 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 315 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
339 const vector unsigned char dstperm = vec_lvsr(0, dst); 316 const vec_u8_t dstperm = vec_lvsr(0, dst);
340 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); 317 const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1);
341 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); 318 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
342 319
343 uint8_t *srcbis = src - (srcStride * 2); 320 uint8_t *srcbis = src - (srcStride * 2);
344 321
345 const vector unsigned char srcM2a = vec_ld(0, srcbis); 322 const vec_u8_t srcM2a = vec_ld(0, srcbis);
346 const vector unsigned char srcM2b = vec_ld(16, srcbis); 323 const vec_u8_t srcM2b = vec_ld(16, srcbis);
347 const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); 324 const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
348 // srcbis += srcStride; 325 // srcbis += srcStride;
349 const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride); 326 const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
350 const vector unsigned char srcM1b = vec_ld(16, srcbis); 327 const vec_u8_t srcM1b = vec_ld(16, srcbis);
351 const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); 328 const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
352 // srcbis += srcStride; 329 // srcbis += srcStride;
353 const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride); 330 const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
354 const vector unsigned char srcP0b = vec_ld(16, srcbis); 331 const vec_u8_t srcP0b = vec_ld(16, srcbis);
355 const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); 332 const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
356 // srcbis += srcStride; 333 // srcbis += srcStride;
357 const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride); 334 const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
358 const vector unsigned char srcP1b = vec_ld(16, srcbis); 335 const vec_u8_t srcP1b = vec_ld(16, srcbis);
359 const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); 336 const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
360 // srcbis += srcStride; 337 // srcbis += srcStride;
361 const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride); 338 const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
362 const vector unsigned char srcP2b = vec_ld(16, srcbis); 339 const vec_u8_t srcP2b = vec_ld(16, srcbis);
363 const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); 340 const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
364 // srcbis += srcStride; 341 // srcbis += srcStride;
365 342
366 vector signed short srcM2ssA = (vector signed short) 343 vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
367 vec_mergeh((vector unsigned char)vzero, srcM2); 344 vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
368 vector signed short srcM2ssB = (vector signed short) 345 vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
369 vec_mergel((vector unsigned char)vzero, srcM2); 346 vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
370 vector signed short srcM1ssA = (vector signed short) 347 vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
371 vec_mergeh((vector unsigned char)vzero, srcM1); 348 vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
372 vector signed short srcM1ssB = (vector signed short) 349 vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
373 vec_mergel((vector unsigned char)vzero, srcM1); 350 vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
374 vector signed short srcP0ssA = (vector signed short) 351 vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
375 vec_mergeh((vector unsigned char)vzero, srcP0); 352 vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
376 vector signed short srcP0ssB = (vector signed short) 353
377 vec_mergel((vector unsigned char)vzero, srcP0); 354 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
378 vector signed short srcP1ssA = (vector signed short)
379 vec_mergeh((vector unsigned char)vzero, srcP1);
380 vector signed short srcP1ssB = (vector signed short)
381 vec_mergel((vector unsigned char)vzero, srcP1);
382 vector signed short srcP2ssA = (vector signed short)
383 vec_mergeh((vector unsigned char)vzero, srcP2);
384 vector signed short srcP2ssB = (vector signed short)
385 vec_mergel((vector unsigned char)vzero, srcP2);
386
387 vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
388 psumA, psumB, sumA, sumB, 355 psumA, psumB, sumA, sumB,
389 srcP3ssA, srcP3ssB, 356 srcP3ssA, srcP3ssB,
390 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; 357 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
391 358
392 vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, 359 vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3;
393 srcP3a, srcP3b, srcP3;
394 360
395 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 361 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
396 362
397 for (i = 0 ; i < 16 ; i++) { 363 for (i = 0 ; i < 16 ; i++) {
398 srcP3a = vec_ld(0, srcbis += srcStride); 364 srcP3a = vec_ld(0, srcbis += srcStride);
399 srcP3b = vec_ld(16, srcbis); 365 srcP3b = vec_ld(16, srcbis);
400 srcP3 = vec_perm(srcP3a, srcP3b, perm); 366 srcP3 = vec_perm(srcP3a, srcP3b, perm);
401 srcP3ssA = (vector signed short) 367 srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
402 vec_mergeh((vector unsigned char)vzero, srcP3); 368 srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
403 srcP3ssB = (vector signed short)
404 vec_mergel((vector unsigned char)vzero, srcP3);
405 // srcbis += srcStride; 369 // srcbis += srcStride;
406 370
407 sum1A = vec_adds(srcP0ssA, srcP1ssA); 371 sum1A = vec_adds(srcP0ssA, srcP1ssA);
408 sum1B = vec_adds(srcP0ssB, srcP1ssB); 372 sum1B = vec_adds(srcP0ssB, srcP1ssB);
409 sum2A = vec_adds(srcM1ssA, srcP2ssA); 373 sum2A = vec_adds(srcM1ssA, srcP2ssA);
423 srcP2ssB = srcP3ssB; 387 srcP2ssB = srcP3ssB;
424 388
425 pp1A = vec_mladd(sum1A, v20ss, v16ss); 389 pp1A = vec_mladd(sum1A, v20ss, v16ss);
426 pp1B = vec_mladd(sum1B, v20ss, v16ss); 390 pp1B = vec_mladd(sum1B, v20ss, v16ss);
427 391
428 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); 392 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
429 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); 393 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
430 394
431 pp3A = vec_add(sum3A, pp1A); 395 pp3A = vec_add(sum3A, pp1A);
432 pp3B = vec_add(sum3B, pp1B); 396 pp3B = vec_add(sum3B, pp1B);
433 397
434 psumA = vec_sub(pp3A, pp2A); 398 psumA = vec_sub(pp3A, pp2A);
459 423
460 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */ 424 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
461 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { 425 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
462 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); 426 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
463 register int i; 427 register int i;
464 const vector signed int vzero = vec_splat_s32(0); 428 LOAD_ZERO;
465 const vector unsigned char permM2 = vec_lvsl(-2, src); 429 const vec_u8_t permM2 = vec_lvsl(-2, src);
466 const vector unsigned char permM1 = vec_lvsl(-1, src); 430 const vec_u8_t permM1 = vec_lvsl(-1, src);
467 const vector unsigned char permP0 = vec_lvsl(+0, src); 431 const vec_u8_t permP0 = vec_lvsl(+0, src);
468 const vector unsigned char permP1 = vec_lvsl(+1, src); 432 const vec_u8_t permP1 = vec_lvsl(+1, src);
469 const vector unsigned char permP2 = vec_lvsl(+2, src); 433 const vec_u8_t permP2 = vec_lvsl(+2, src);
470 const vector unsigned char permP3 = vec_lvsl(+3, src); 434 const vec_u8_t permP3 = vec_lvsl(+3, src);
471 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 435 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
472 const vector unsigned int v10ui = vec_splat_u32(10); 436 const vec_u32_t v10ui = vec_splat_u32(10);
473 const vector signed short v5ss = vec_splat_s16(5); 437 const vec_s16_t v5ss = vec_splat_s16(5);
474 const vector signed short v1ss = vec_splat_s16(1); 438 const vec_s16_t v1ss = vec_splat_s16(1);
475 const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); 439 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
476 const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); 440 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
477 441
478 register int align = ((((unsigned long)src) - 2) % 16); 442 register int align = ((((unsigned long)src) - 2) % 16);
479 443
480 const vector unsigned char neg1 = (const vector unsigned char) 444 const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
481 vec_splat_s8(-1); 445
482 446 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
483 vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
484 srcP2A, srcP2B, srcP3A, srcP3B, 447 srcP2A, srcP2B, srcP3A, srcP3B,
485 srcM1A, srcM1B, srcM2A, srcM2B, 448 srcM1A, srcM1B, srcM2A, srcM2B,
486 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 449 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
487 pp1A, pp1B, pp2A, pp2B, psumA, psumB; 450 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
488 451
489 const vector unsigned char dstperm = vec_lvsr(0, dst); 452 const vec_u8_t dstperm = vec_lvsr(0, dst);
490 453
491 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); 454 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);
492 455
493 const vector unsigned char mperm = (const vector unsigned char) 456 const vec_u8_t mperm = (const vec_u8_t)
494 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 457 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
495 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); 458 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
496 int16_t *tmpbis = tmp; 459 int16_t *tmpbis = tmp;
497 460
498 vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, 461 vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
499 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, 462 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
500 tmpP2ssA, tmpP2ssB; 463 tmpP2ssA, tmpP2ssB;
501 464
502 vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, 465 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
503 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, 466 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
504 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, 467 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
505 ssumAe, ssumAo, ssumBe, ssumBo; 468 ssumAe, ssumAo, ssumBe, ssumBo;
506 vector unsigned char fsum, sumv, sum, dst1, dst2, vdst, 469 vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2;
507 rsum, fdst1, fdst2; 470 vec_s16_t ssume, ssumo;
508 vector signed short ssume, ssumo;
509 471
510 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 472 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
511 src -= (2 * srcStride); 473 src -= (2 * srcStride);
512 for (i = 0 ; i < 21 ; i ++) { 474 for (i = 0 ; i < 21 ; i ++) {
513 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 475 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
514 vector unsigned char srcR1 = vec_ld(-2, src); 476 vec_u8_t srcR1 = vec_ld(-2, src);
515 vector unsigned char srcR2 = vec_ld(14, src); 477 vec_u8_t srcR2 = vec_ld(14, src);
516 478
517 switch (align) { 479 switch (align) {
518 default: { 480 default: {
519 srcM2 = vec_perm(srcR1, srcR2, permM2); 481 srcM2 = vec_perm(srcR1, srcR2, permM2);
520 srcM1 = vec_perm(srcR1, srcR2, permM1); 482 srcM1 = vec_perm(srcR1, srcR2, permM1);
530 srcP1 = vec_perm(srcR1, srcR2, permP1); 492 srcP1 = vec_perm(srcR1, srcR2, permP1);
531 srcP2 = vec_perm(srcR1, srcR2, permP2); 493 srcP2 = vec_perm(srcR1, srcR2, permP2);
532 srcP3 = srcR2; 494 srcP3 = srcR2;
533 } break; 495 } break;
534 case 12: { 496 case 12: {
535 vector unsigned char srcR3 = vec_ld(30, src); 497 vec_u8_t srcR3 = vec_ld(30, src);
536 srcM2 = vec_perm(srcR1, srcR2, permM2); 498 srcM2 = vec_perm(srcR1, srcR2, permM2);
537 srcM1 = vec_perm(srcR1, srcR2, permM1); 499 srcM1 = vec_perm(srcR1, srcR2, permM1);
538 srcP0 = vec_perm(srcR1, srcR2, permP0); 500 srcP0 = vec_perm(srcR1, srcR2, permP0);
539 srcP1 = vec_perm(srcR1, srcR2, permP1); 501 srcP1 = vec_perm(srcR1, srcR2, permP1);
540 srcP2 = srcR2; 502 srcP2 = srcR2;
541 srcP3 = vec_perm(srcR2, srcR3, permP3); 503 srcP3 = vec_perm(srcR2, srcR3, permP3);
542 } break; 504 } break;
543 case 13: { 505 case 13: {
544 vector unsigned char srcR3 = vec_ld(30, src); 506 vec_u8_t srcR3 = vec_ld(30, src);
545 srcM2 = vec_perm(srcR1, srcR2, permM2); 507 srcM2 = vec_perm(srcR1, srcR2, permM2);
546 srcM1 = vec_perm(srcR1, srcR2, permM1); 508 srcM1 = vec_perm(srcR1, srcR2, permM1);
547 srcP0 = vec_perm(srcR1, srcR2, permP0); 509 srcP0 = vec_perm(srcR1, srcR2, permP0);
548 srcP1 = srcR2; 510 srcP1 = srcR2;
549 srcP2 = vec_perm(srcR2, srcR3, permP2); 511 srcP2 = vec_perm(srcR2, srcR3, permP2);
550 srcP3 = vec_perm(srcR2, srcR3, permP3); 512 srcP3 = vec_perm(srcR2, srcR3, permP3);
551 } break; 513 } break;
552 case 14: { 514 case 14: {
553 vector unsigned char srcR3 = vec_ld(30, src); 515 vec_u8_t srcR3 = vec_ld(30, src);
554 srcM2 = vec_perm(srcR1, srcR2, permM2); 516 srcM2 = vec_perm(srcR1, srcR2, permM2);
555 srcM1 = vec_perm(srcR1, srcR2, permM1); 517 srcM1 = vec_perm(srcR1, srcR2, permM1);
556 srcP0 = srcR2; 518 srcP0 = srcR2;
557 srcP1 = vec_perm(srcR2, srcR3, permP1); 519 srcP1 = vec_perm(srcR2, srcR3, permP1);
558 srcP2 = vec_perm(srcR2, srcR3, permP2); 520 srcP2 = vec_perm(srcR2, srcR3, permP2);
559 srcP3 = vec_perm(srcR2, srcR3, permP3); 521 srcP3 = vec_perm(srcR2, srcR3, permP3);
560 } break; 522 } break;
561 case 15: { 523 case 15: {
562 vector unsigned char srcR3 = vec_ld(30, src); 524 vec_u8_t srcR3 = vec_ld(30, src);
563 srcM2 = vec_perm(srcR1, srcR2, permM2); 525 srcM2 = vec_perm(srcR1, srcR2, permM2);
564 srcM1 = srcR2; 526 srcM1 = srcR2;
565 srcP0 = vec_perm(srcR2, srcR3, permP0); 527 srcP0 = vec_perm(srcR2, srcR3, permP0);
566 srcP1 = vec_perm(srcR2, srcR3, permP1); 528 srcP1 = vec_perm(srcR2, srcR3, permP1);
567 srcP2 = vec_perm(srcR2, srcR3, permP2); 529 srcP2 = vec_perm(srcR2, srcR3, permP2);
568 srcP3 = vec_perm(srcR2, srcR3, permP3); 530 srcP3 = vec_perm(srcR2, srcR3, permP3);
569 } break; 531 } break;
570 } 532 }
571 533
572 srcP0A = (vector signed short) 534 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
573 vec_mergeh((vector unsigned char)vzero, srcP0); 535 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
574 srcP0B = (vector signed short) 536 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
575 vec_mergel((vector unsigned char)vzero, srcP0); 537 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
576 srcP1A = (vector signed short) 538
577 vec_mergeh((vector unsigned char)vzero, srcP1); 539 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
578 srcP1B = (vector signed short) 540 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
579 vec_mergel((vector unsigned char)vzero, srcP1); 541 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
580 542 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
581 srcP2A = (vector signed short) 543
582 vec_mergeh((vector unsigned char)vzero, srcP2); 544 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
583 srcP2B = (vector signed short) 545 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
584 vec_mergel((vector unsigned char)vzero, srcP2); 546 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
585 srcP3A = (vector signed short) 547 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
586 vec_mergeh((vector unsigned char)vzero, srcP3);
587 srcP3B = (vector signed short)
588 vec_mergel((vector unsigned char)vzero, srcP3);
589
590 srcM1A = (vector signed short)
591 vec_mergeh((vector unsigned char)vzero, srcM1);
592 srcM1B = (vector signed short)
593 vec_mergel((vector unsigned char)vzero, srcM1);
594 srcM2A = (vector signed short)
595 vec_mergeh((vector unsigned char)vzero, srcM2);
596 srcM2B = (vector signed short)
597 vec_mergel((vector unsigned char)vzero, srcM2);
598 548
599 sum1A = vec_adds(srcP0A, srcP1A); 549 sum1A = vec_adds(srcP0A, srcP1A);
600 sum1B = vec_adds(srcP0B, srcP1B); 550 sum1B = vec_adds(srcP0B, srcP1B);
601 sum2A = vec_adds(srcM1A, srcP2A); 551 sum2A = vec_adds(srcM1A, srcP2A);
602 sum2B = vec_adds(srcM1B, srcP2B); 552 sum2B = vec_adds(srcM1B, srcP2B);
604 sum3B = vec_adds(srcM2B, srcP3B); 554 sum3B = vec_adds(srcM2B, srcP3B);
605 555
606 pp1A = vec_mladd(sum1A, v20ss, sum3A); 556 pp1A = vec_mladd(sum1A, v20ss, sum3A);
607 pp1B = vec_mladd(sum1B, v20ss, sum3B); 557 pp1B = vec_mladd(sum1B, v20ss, sum3B);
608 558
609 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); 559 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
610 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); 560 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
611 561
612 psumA = vec_sub(pp1A, pp2A); 562 psumA = vec_sub(pp1A, pp2A);
613 psumB = vec_sub(pp1B, pp2B); 563 psumB = vec_sub(pp1B, pp2B);
614 564
615 vec_st(psumA, 0, tmp); 565 vec_st(psumA, 0, tmp);
634 tmpP2ssA = vec_ld(0, tmpbis); 584 tmpP2ssA = vec_ld(0, tmpbis);
635 tmpP2ssB = vec_ld(16, tmpbis); 585 tmpP2ssB = vec_ld(16, tmpbis);
636 tmpbis += tmpStride; 586 tmpbis += tmpStride;
637 587
638 for (i = 0 ; i < 16 ; i++) { 588 for (i = 0 ; i < 16 ; i++) {
639 const vector signed short tmpP3ssA = vec_ld(0, tmpbis); 589 const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
640 const vector signed short tmpP3ssB = vec_ld(16, tmpbis); 590 const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
641 591
642 const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); 592 const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
643 const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); 593 const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
644 const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); 594 const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
645 const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); 595 const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
646 const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); 596 const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
647 const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); 597 const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
648 598
649 tmpbis += tmpStride; 599 tmpbis += tmpStride;
650 600
651 tmpM2ssA = tmpM1ssA; 601 tmpM2ssA = tmpM1ssA;
652 tmpM2ssB = tmpM1ssB; 602 tmpM2ssB = tmpM1ssB;
667 pp2Ae = vec_mule(sum2A, v5ss); 617 pp2Ae = vec_mule(sum2A, v5ss);
668 pp2Ao = vec_mulo(sum2A, v5ss); 618 pp2Ao = vec_mulo(sum2A, v5ss);
669 pp2Be = vec_mule(sum2B, v5ss); 619 pp2Be = vec_mule(sum2B, v5ss);
670 pp2Bo = vec_mulo(sum2B, v5ss); 620 pp2Bo = vec_mulo(sum2B, v5ss);
671 621
672 pp3Ae = vec_sra((vector signed int)sum3A, v16ui); 622 pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
673 pp3Ao = vec_mulo(sum3A, v1ss); 623 pp3Ao = vec_mulo(sum3A, v1ss);
674 pp3Be = vec_sra((vector signed int)sum3B, v16ui); 624 pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
675 pp3Bo = vec_mulo(sum3B, v1ss); 625 pp3Bo = vec_mulo(sum3B, v1ss);
676 626
677 pp1cAe = vec_add(pp1Ae, v512si); 627 pp1cAe = vec_add(pp1Ae, v512si);
678 pp1cAo = vec_add(pp1Ao, v512si); 628 pp1cAo = vec_add(pp1Ao, v512si);
679 pp1cBe = vec_add(pp1Be, v512si); 629 pp1cBe = vec_add(pp1Be, v512si);