libavcodec.hg: comparison of ppc/h264_template_altivec.c @ 8494:1615d6b75ada
Cleanup _t types in libavcodec/ppc
author   | lu_zero
date     | Sat, 27 Dec 2008 11:21:28 +0000
parents  | 5f3b62eaf6e5
children | 40738baaafc2
8493:469f3e5bcf13 | 8494:1615d6b75ada |
---|---|
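The change is a mechanical rename of the AltiVec vector-type aliases used across libavcodec/ppc, dropping the `_t` suffix (a namespace POSIX reserves for type names). A sketch of the rename as it appears in this diff; the authoritative aliases live in the ppc utility header (types_altivec.h at the time), whose exact form may differ:

    /* aliases assumed from the renames visible in this diff */
    #define vec_u8  vector unsigned char   /* was vec_u8_t  */
    #define vec_s16 vector signed short    /* was vec_s16_t */
    #define vec_u16 vector unsigned short  /* was vec_u16_t */
    #define vec_s32 vector signed int      /* was vec_s32_t */
    #define vec_u32 vector unsigned int    /* was vec_u32_t */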
26 #endif | 26 #endif |
27 | 27 |
28 /* this code assumes that stride % 16 == 0 */ | 28 /* this code assumes that stride % 16 == 0 */ |
29 | 29 |
30 #define CHROMA_MC8_ALTIVEC_CORE \ | 30 #define CHROMA_MC8_ALTIVEC_CORE \ |
31 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\ | 31 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\ |
32 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\ | 32 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\ |
33 \ | 33 \ |
34 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ | 34 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ |
35 psum = vec_mladd(vB, vsrc1ssH, psum);\ | 35 psum = vec_mladd(vB, vsrc1ssH, psum);\ |
36 psum = vec_mladd(vC, vsrc2ssH, psum);\ | 36 psum = vec_mladd(vC, vsrc2ssH, psum);\ |
37 psum = vec_mladd(vD, vsrc3ssH, psum);\ | 37 psum = vec_mladd(vD, vsrc3ssH, psum);\ |
38 psum = vec_sr(psum, v6us);\ | 38 psum = vec_sr(psum, v6us);\ |
39 \ | 39 \ |
40 vdst = vec_ld(0, dst);\ | 40 vdst = vec_ld(0, dst);\ |
41 ppsum = (vec_u8_t)vec_pack(psum, psum);\ | 41 ppsum = (vec_u8)vec_pack(psum, psum);\ |
42 vfdst = vec_perm(vdst, ppsum, fperm);\ | 42 vfdst = vec_perm(vdst, ppsum, fperm);\ |
43 \ | 43 \ |
44 OP_U8_ALTIVEC(fsum, vfdst, vdst);\ | 44 OP_U8_ALTIVEC(fsum, vfdst, vdst);\ |
45 \ | 45 \ |
46 vec_st(fsum, 0, dst);\ | 46 vec_st(fsum, 0, dst);\ |
51 dst += stride;\ | 51 dst += stride;\ |
52 src += stride; | 52 src += stride; |
53 | 53 |
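CHROMA_MC8_ALTIVEC_CORE is the vector body of H.264's eighth-pel chroma interpolation: four vec_mladd steps accumulate the bilinear taps, +32 (v32ss) rounds, vec_sr(psum, 6) normalizes, and the packed result is merged into dst. A scalar sketch of one 8-pixel row, where OP() stands for the put (plain store) or rounding-average operation selected by OP_U8_ALTIVEC:

    /* scalar equivalent of one CHROMA_MC8_ALTIVEC_CORE invocation;
       A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y (see ABCD[] below) */
    for (int i = 0; i < 8; i++)
        dst[i] = OP(dst[i],
                    (A * src[i]          + B * src[i + 1] +
                     C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6);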
54 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ | 54 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ |
55 \ | 55 \ |
56 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\ | 56 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\ |
57 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\ | 57 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\ |
58 \ | 58 \ |
59 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ | 59 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ |
60 psum = vec_mladd(vE, vsrc1ssH, psum);\ | 60 psum = vec_mladd(vE, vsrc1ssH, psum);\ |
61 psum = vec_sr(psum, v6us);\ | 61 psum = vec_sr(psum, v6us);\ |
62 \ | 62 \ |
63 vdst = vec_ld(0, dst);\ | 63 vdst = vec_ld(0, dst);\ |
64 ppsum = (vec_u8_t)vec_pack(psum, psum);\ | 64 ppsum = (vec_u8)vec_pack(psum, psum);\ |
65 vfdst = vec_perm(vdst, ppsum, fperm);\ | 65 vfdst = vec_perm(vdst, ppsum, fperm);\ |
66 \ | 66 \ |
67 OP_U8_ALTIVEC(fsum, vfdst, vdst);\ | 67 OP_U8_ALTIVEC(fsum, vfdst, vdst);\ |
68 \ | 68 \ |
69 vec_st(fsum, 0, dst);\ | 69 vec_st(fsum, 0, dst);\ |
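CHROMA_MC8_ALTIVEC_CORE_SIMPLE handles the degenerate case x == 0 or y == 0: two of the four weights vanish, and since one of B, C is then zero, both cases fold into a single second tap E = B + C (computed below as vE = vec_add(vB, vC)). Roughly, in scalar terms:

    /* two-tap sketch of the SIMPLE variant: step is 1 when y == 0
       (horizontal case) or stride when x == 0 (vertical case) */
    for (int i = 0; i < 8; i++)
        dst[i] = OP(dst[i], (A * src[i] + E * src[i + step] + 32) >> 6);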
78 {((8 - x) * (8 - y)), | 78 {((8 - x) * (8 - y)), |
79 (( x) * (8 - y)), | 79 (( x) * (8 - y)), |
80 ((8 - x) * ( y)), | 80 ((8 - x) * ( y)), |
81 (( x) * ( y))}; | 81 (( x) * ( y))}; |
82 register int i; | 82 register int i; |
83 vec_u8_t fperm; | 83 vec_u8 fperm; |
84 const vec_s32_t vABCD = vec_ld(0, ABCD); | 84 const vec_s32 vABCD = vec_ld(0, ABCD); |
85 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); | 85 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); |
86 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); | 86 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); |
87 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); | 87 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); |
88 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); | 88 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); |
89 LOAD_ZERO; | 89 LOAD_ZERO; |
90 const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); | 90 const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); |
91 const vec_u16_t v6us = vec_splat_u16(6); | 91 const vec_u16 v6us = vec_splat_u16(6); |
92 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; | 92 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; |
93 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; | 93 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; |
94 | 94 |
95 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; | 95 vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; |
96 vec_u8_t vsrc0uc, vsrc1uc; | 96 vec_u8 vsrc0uc, vsrc1uc; |
97 vec_s16_t vsrc0ssH, vsrc1ssH; | 97 vec_s16 vsrc0ssH, vsrc1ssH; |
98 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; | 98 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; |
99 vec_s16_t vsrc2ssH, vsrc3ssH, psum; | 99 vec_s16 vsrc2ssH, vsrc3ssH, psum; |
100 vec_u8_t vdst, ppsum, vfdst, fsum; | 100 vec_u8 vdst, ppsum, vfdst, fsum; |
101 | 101 |
102 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); | 102 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); |
103 | 103 |
104 if (((unsigned long)dst) % 16 == 0) { | 104 if (((unsigned long)dst) % 16 == 0) { |
105 fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13, | 105 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, |
106 0x14, 0x15, 0x16, 0x17, | 106 0x14, 0x15, 0x16, 0x17, |
107 0x08, 0x09, 0x0A, 0x0B, | 107 0x08, 0x09, 0x0A, 0x0B, |
108 0x0C, 0x0D, 0x0E, 0x0F}; | 108 0x0C, 0x0D, 0x0E, 0x0F}; |
109 } else { | 109 } else { |
110 fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03, | 110 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, |
111 0x04, 0x05, 0x06, 0x07, | 111 0x04, 0x05, 0x06, 0x07, |
112 0x18, 0x19, 0x1A, 0x1B, | 112 0x18, 0x19, 0x1A, 0x1B, |
113 0x1C, 0x1D, 0x1E, 0x1F}; | 113 0x1C, 0x1D, 0x1E, 0x1F}; |
114 } | 114 } |
115 | 115 |
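vec_st can only write whole, 16-byte-aligned vectors, while each row here produces just 8 output bytes, so the store is read-modify-write: vdst = vec_ld(0, dst) fetches the destination block, vec_perm with the fperm mask above drops the 8 result bytes into whichever half of the block dst occupies, and vec_st writes the merged vector back. A scalar model of the idea, assuming dst is 8-byte aligned as the two fperm masks require:

    #include <stdint.h>
    #include <string.h>

    /* illustrative scalar model of the merged store, not FFmpeg code */
    static void store8_rmw(uint8_t *dst, const uint8_t result8[8])
    {
        uint8_t *block = dst - ((uintptr_t)dst % 16);       /* aligned base   */
        uint8_t merged[16];
        memcpy(merged, block, 16);                          /* vdst = vec_ld  */
        memcpy(merged + ((uintptr_t)dst % 16), result8, 8); /* vec_perm/fperm */
        memcpy(block, merged, 16);                          /* vec_st(fsum)   */
    }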
124 if (reallyBadAlign) | 124 if (reallyBadAlign) |
125 vsrc1uc = vsrcBuc; | 125 vsrc1uc = vsrcBuc; |
126 else | 126 else |
127 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); | 127 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); |
128 | 128 |
129 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); | 129 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); |
130 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); | 130 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); |
131 | 131 |
132 if (ABCD[3]) { | 132 if (ABCD[3]) { |
133 if (!loadSecond) {// -> !reallyBadAlign | 133 if (!loadSecond) {// -> !reallyBadAlign |
134 for (i = 0 ; i < h ; i++) { | 134 for (i = 0 ; i < h ; i++) { |
135 vsrcCuc = vec_ld(stride + 0, src); | 135 vsrcCuc = vec_ld(stride + 0, src); |
137 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); | 137 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); |
138 | 138 |
139 CHROMA_MC8_ALTIVEC_CORE | 139 CHROMA_MC8_ALTIVEC_CORE |
140 } | 140 } |
141 } else { | 141 } else { |
142 vec_u8_t vsrcDuc; | 142 vec_u8 vsrcDuc; |
143 for (i = 0 ; i < h ; i++) { | 143 for (i = 0 ; i < h ; i++) { |
144 vsrcCuc = vec_ld(stride + 0, src); | 144 vsrcCuc = vec_ld(stride + 0, src); |
145 vsrcDuc = vec_ld(stride + 16, src); | 145 vsrcDuc = vec_ld(stride + 16, src); |
146 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | 146 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); |
147 if (reallyBadAlign) | 147 if (reallyBadAlign) |
151 | 151 |
152 CHROMA_MC8_ALTIVEC_CORE | 152 CHROMA_MC8_ALTIVEC_CORE |
153 } | 153 } |
154 } | 154 } |
155 } else { | 155 } else { |
156 const vec_s16_t vE = vec_add(vB, vC); | 156 const vec_s16 vE = vec_add(vB, vC); |
157 if (ABCD[2]) { // x == 0 B == 0 | 157 if (ABCD[2]) { // x == 0 B == 0 |
158 if (!loadSecond) {// -> !reallyBadAlign | 158 if (!loadSecond) {// -> !reallyBadAlign |
159 for (i = 0 ; i < h ; i++) { | 159 for (i = 0 ; i < h ; i++) { |
160 vsrcCuc = vec_ld(stride + 0, src); | 160 vsrcCuc = vec_ld(stride + 0, src); |
161 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); | 161 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); |
162 CHROMA_MC8_ALTIVEC_CORE_SIMPLE | 162 CHROMA_MC8_ALTIVEC_CORE_SIMPLE |
163 | 163 |
164 vsrc0uc = vsrc1uc; | 164 vsrc0uc = vsrc1uc; |
165 } | 165 } |
166 } else { | 166 } else { |
167 vec_u8_t vsrcDuc; | 167 vec_u8 vsrcDuc; |
168 for (i = 0 ; i < h ; i++) { | 168 for (i = 0 ; i < h ; i++) { |
169 vsrcCuc = vec_ld(stride + 0, src); | 169 vsrcCuc = vec_ld(stride + 0, src); |
170 vsrcDuc = vec_ld(stride + 15, src); | 170 vsrcDuc = vec_ld(stride + 15, src); |
171 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | 171 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); |
172 CHROMA_MC8_ALTIVEC_CORE_SIMPLE | 172 CHROMA_MC8_ALTIVEC_CORE_SIMPLE |
182 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); | 182 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); |
183 | 183 |
184 CHROMA_MC8_ALTIVEC_CORE_SIMPLE | 184 CHROMA_MC8_ALTIVEC_CORE_SIMPLE |
185 } | 185 } |
186 } else { | 186 } else { |
187 vec_u8_t vsrcDuc; | 187 vec_u8 vsrcDuc; |
188 for (i = 0 ; i < h ; i++) { | 188 for (i = 0 ; i < h ; i++) { |
189 vsrcCuc = vec_ld(0, src); | 189 vsrcCuc = vec_ld(0, src); |
190 vsrcDuc = vec_ld(15, src); | 190 vsrcDuc = vec_ld(15, src); |
191 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | 191 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); |
192 if (reallyBadAlign) | 192 if (reallyBadAlign) |
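The qpel16 lowpass functions below implement H.264's six-tap half-pel luma filter with taps (1, -5, 20, 20, -5, 1). For the single-pass horizontal and vertical variants the scalar reference is the usual one; av_clip_uint8() is FFmpeg's saturate-to-byte helper, matching the final saturating pack in the vector code:

    /* scalar reference for one horizontal half-pel luma sample */
    dst[i] = av_clip_uint8((src[i-2] - 5*src[i-1] + 20*src[i] +
                            20*src[i+1] - 5*src[i+2] + src[i+3] + 16) >> 5);

The vertical variant is identical with the tap offsets scaled by srcStride.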
208 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | 208 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
209 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); | 209 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); |
210 register int i; | 210 register int i; |
211 | 211 |
212 LOAD_ZERO; | 212 LOAD_ZERO; |
213 const vec_u8_t permM2 = vec_lvsl(-2, src); | 213 const vec_u8 permM2 = vec_lvsl(-2, src); |
214 const vec_u8_t permM1 = vec_lvsl(-1, src); | 214 const vec_u8 permM1 = vec_lvsl(-1, src); |
215 const vec_u8_t permP0 = vec_lvsl(+0, src); | 215 const vec_u8 permP0 = vec_lvsl(+0, src); |
216 const vec_u8_t permP1 = vec_lvsl(+1, src); | 216 const vec_u8 permP1 = vec_lvsl(+1, src); |
217 const vec_u8_t permP2 = vec_lvsl(+2, src); | 217 const vec_u8 permP2 = vec_lvsl(+2, src); |
218 const vec_u8_t permP3 = vec_lvsl(+3, src); | 218 const vec_u8 permP3 = vec_lvsl(+3, src); |
219 const vec_s16_t v5ss = vec_splat_s16(5); | 219 const vec_s16 v5ss = vec_splat_s16(5); |
220 const vec_u16_t v5us = vec_splat_u16(5); | 220 const vec_u16 v5us = vec_splat_u16(5); |
221 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 221 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
222 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | 222 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
223 | 223 |
224 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | 224 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
225 | 225 |
226 register int align = ((((unsigned long)src) - 2) % 16); | 226 register int align = ((((unsigned long)src) - 2) % 16); |
227 | 227 |
228 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, | 228 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, |
229 srcP2A, srcP2B, srcP3A, srcP3B, | 229 srcP2A, srcP2B, srcP3A, srcP3B, |
230 srcM1A, srcM1B, srcM2A, srcM2B, | 230 srcM1A, srcM1B, srcM2A, srcM2B, |
231 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | 231 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
232 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | 232 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
233 psumA, psumB, sumA, sumB; | 233 psumA, psumB, sumA, sumB; |
234 | 234 |
235 vec_u8_t sum, vdst, fsum; | 235 vec_u8 sum, vdst, fsum; |
236 | 236 |
237 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | 237 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); |
238 | 238 |
239 for (i = 0 ; i < 16 ; i ++) { | 239 for (i = 0 ; i < 16 ; i ++) { |
240 vec_u8_t srcR1 = vec_ld(-2, src); | 240 vec_u8 srcR1 = vec_ld(-2, src); |
241 vec_u8_t srcR2 = vec_ld(14, src); | 241 vec_u8 srcR2 = vec_ld(14, src); |
242 | 242 |
243 switch (align) { | 243 switch (align) { |
244 default: { | 244 default: { |
245 srcM2 = vec_perm(srcR1, srcR2, permM2); | 245 srcM2 = vec_perm(srcR1, srcR2, permM2); |
246 srcM1 = vec_perm(srcR1, srcR2, permM1); | 246 srcM1 = vec_perm(srcR1, srcR2, permM1); |
256 srcP1 = vec_perm(srcR1, srcR2, permP1); | 256 srcP1 = vec_perm(srcR1, srcR2, permP1); |
257 srcP2 = vec_perm(srcR1, srcR2, permP2); | 257 srcP2 = vec_perm(srcR1, srcR2, permP2); |
258 srcP3 = srcR2; | 258 srcP3 = srcR2; |
259 } break; | 259 } break; |
260 case 12: { | 260 case 12: { |
261 vec_u8_t srcR3 = vec_ld(30, src); | 261 vec_u8 srcR3 = vec_ld(30, src); |
262 srcM2 = vec_perm(srcR1, srcR2, permM2); | 262 srcM2 = vec_perm(srcR1, srcR2, permM2); |
263 srcM1 = vec_perm(srcR1, srcR2, permM1); | 263 srcM1 = vec_perm(srcR1, srcR2, permM1); |
264 srcP0 = vec_perm(srcR1, srcR2, permP0); | 264 srcP0 = vec_perm(srcR1, srcR2, permP0); |
265 srcP1 = vec_perm(srcR1, srcR2, permP1); | 265 srcP1 = vec_perm(srcR1, srcR2, permP1); |
266 srcP2 = srcR2; | 266 srcP2 = srcR2; |
267 srcP3 = vec_perm(srcR2, srcR3, permP3); | 267 srcP3 = vec_perm(srcR2, srcR3, permP3); |
268 } break; | 268 } break; |
269 case 13: { | 269 case 13: { |
270 vec_u8_t srcR3 = vec_ld(30, src); | 270 vec_u8 srcR3 = vec_ld(30, src); |
271 srcM2 = vec_perm(srcR1, srcR2, permM2); | 271 srcM2 = vec_perm(srcR1, srcR2, permM2); |
272 srcM1 = vec_perm(srcR1, srcR2, permM1); | 272 srcM1 = vec_perm(srcR1, srcR2, permM1); |
273 srcP0 = vec_perm(srcR1, srcR2, permP0); | 273 srcP0 = vec_perm(srcR1, srcR2, permP0); |
274 srcP1 = srcR2; | 274 srcP1 = srcR2; |
275 srcP2 = vec_perm(srcR2, srcR3, permP2); | 275 srcP2 = vec_perm(srcR2, srcR3, permP2); |
276 srcP3 = vec_perm(srcR2, srcR3, permP3); | 276 srcP3 = vec_perm(srcR2, srcR3, permP3); |
277 } break; | 277 } break; |
278 case 14: { | 278 case 14: { |
279 vec_u8_t srcR3 = vec_ld(30, src); | 279 vec_u8 srcR3 = vec_ld(30, src); |
280 srcM2 = vec_perm(srcR1, srcR2, permM2); | 280 srcM2 = vec_perm(srcR1, srcR2, permM2); |
281 srcM1 = vec_perm(srcR1, srcR2, permM1); | 281 srcM1 = vec_perm(srcR1, srcR2, permM1); |
282 srcP0 = srcR2; | 282 srcP0 = srcR2; |
283 srcP1 = vec_perm(srcR2, srcR3, permP1); | 283 srcP1 = vec_perm(srcR2, srcR3, permP1); |
284 srcP2 = vec_perm(srcR2, srcR3, permP2); | 284 srcP2 = vec_perm(srcR2, srcR3, permP2); |
285 srcP3 = vec_perm(srcR2, srcR3, permP3); | 285 srcP3 = vec_perm(srcR2, srcR3, permP3); |
286 } break; | 286 } break; |
287 case 15: { | 287 case 15: { |
288 vec_u8_t srcR3 = vec_ld(30, src); | 288 vec_u8 srcR3 = vec_ld(30, src); |
289 srcM2 = vec_perm(srcR1, srcR2, permM2); | 289 srcM2 = vec_perm(srcR1, srcR2, permM2); |
290 srcM1 = srcR2; | 290 srcM1 = srcR2; |
291 srcP0 = vec_perm(srcR2, srcR3, permP0); | 291 srcP0 = vec_perm(srcR2, srcR3, permP0); |
292 srcP1 = vec_perm(srcR2, srcR3, permP1); | 292 srcP1 = vec_perm(srcR2, srcR3, permP1); |
293 srcP2 = vec_perm(srcR2, srcR3, permP2); | 293 srcP2 = vec_perm(srcR2, srcR3, permP2); |
294 srcP3 = vec_perm(srcR2, srcR3, permP3); | 294 srcP3 = vec_perm(srcR2, srcR3, permP3); |
295 } break; | 295 } break; |
296 } | 296 } |
297 | 297 |
298 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | 298 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); |
299 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | 299 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); |
300 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | 300 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); |
301 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | 301 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); |
302 | 302 |
303 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | 303 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); |
304 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | 304 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); |
305 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | 305 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); |
306 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | 306 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); |
307 | 307 |
308 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | 308 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); |
309 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | 309 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); |
310 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | 310 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); |
311 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | 311 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); |
312 | 312 |
313 sum1A = vec_adds(srcP0A, srcP1A); | 313 sum1A = vec_adds(srcP0A, srcP1A); |
314 sum1B = vec_adds(srcP0B, srcP1B); | 314 sum1B = vec_adds(srcP0B, srcP1B); |
315 sum2A = vec_adds(srcM1A, srcP2A); | 315 sum2A = vec_adds(srcM1A, srcP2A); |
316 sum2B = vec_adds(srcM1B, srcP2B); | 316 sum2B = vec_adds(srcM1B, srcP2B); |
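Two idioms are at work above. First, vec_ld ignores the low four address bits, so each of the six shifted source windows (src-2 through src+3) is assembled from aligned loads plus a vec_lvsl-derived permute mask; the switch merely specializes the generic pattern, pulling in a third load (srcR3) only when align is 12..15 and the rightmost windows spill past srcR2. A sketch of the generic form, assuming <altivec.h> and the vec_u8 alias from this commit:

    /* generic AltiVec unaligned 16-byte load that the switch specializes */
    static inline vec_u8 load_unaligned(const uint8_t *p)
    {
        return vec_perm(vec_ld(0, p), vec_ld(15, p), vec_lvsl(0, p));
    }

Second, the sum1..sum3 pairs exploit the kernel's symmetry so each lane needs only two multiplies; the net per-lane arithmetic (the multiply and subtract steps fall in the elided part of the hunk) is, schematically:

    psum = 20*(P0 + P1) - 5*(M1 + P2) + (M2 + P3) + 16;  /* v20ss, v5ss, v16ss */
    dst  = clip_uint8(psum >> 5);                         /* v5us shift + pack  */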
352 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); | 352 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); |
353 | 353 |
354 register int i; | 354 register int i; |
355 | 355 |
356 LOAD_ZERO; | 356 LOAD_ZERO; |
357 const vec_u8_t perm = vec_lvsl(0, src); | 357 const vec_u8 perm = vec_lvsl(0, src); |
358 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 358 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
359 const vec_u16_t v5us = vec_splat_u16(5); | 359 const vec_u16 v5us = vec_splat_u16(5); |
360 const vec_s16_t v5ss = vec_splat_s16(5); | 360 const vec_s16 v5ss = vec_splat_s16(5); |
361 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | 361 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
362 | 362 |
363 uint8_t *srcbis = src - (srcStride * 2); | 363 uint8_t *srcbis = src - (srcStride * 2); |
364 | 364 |
365 const vec_u8_t srcM2a = vec_ld(0, srcbis); | 365 const vec_u8 srcM2a = vec_ld(0, srcbis); |
366 const vec_u8_t srcM2b = vec_ld(16, srcbis); | 366 const vec_u8 srcM2b = vec_ld(16, srcbis); |
367 const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); | 367 const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm); |
368 //srcbis += srcStride; | 368 //srcbis += srcStride; |
369 const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); | 369 const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride); |
370 const vec_u8_t srcM1b = vec_ld(16, srcbis); | 370 const vec_u8 srcM1b = vec_ld(16, srcbis); |
371 const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); | 371 const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm); |
372 //srcbis += srcStride; | 372 //srcbis += srcStride; |
373 const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); | 373 const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride); |
374 const vec_u8_t srcP0b = vec_ld(16, srcbis); | 374 const vec_u8 srcP0b = vec_ld(16, srcbis); |
375 const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); | 375 const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm); |
376 //srcbis += srcStride; | 376 //srcbis += srcStride; |
377 const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); | 377 const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride); |
378 const vec_u8_t srcP1b = vec_ld(16, srcbis); | 378 const vec_u8 srcP1b = vec_ld(16, srcbis); |
379 const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); | 379 const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm); |
380 //srcbis += srcStride; | 380 //srcbis += srcStride; |
381 const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); | 381 const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride); |
382 const vec_u8_t srcP2b = vec_ld(16, srcbis); | 382 const vec_u8 srcP2b = vec_ld(16, srcbis); |
383 const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); | 383 const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm); |
384 //srcbis += srcStride; | 384 //srcbis += srcStride; |
385 | 385 |
386 vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | 386 vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); |
387 vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | 387 vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2); |
388 vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | 388 vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1); |
389 vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | 389 vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1); |
390 vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | 390 vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0); |
391 vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | 391 vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0); |
392 vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | 392 vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1); |
393 vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | 393 vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1); |
394 vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | 394 vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2); |
395 vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | 395 vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2); |
396 | 396 |
397 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | 397 vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
398 psumA, psumB, sumA, sumB, | 398 psumA, psumB, sumA, sumB, |
399 srcP3ssA, srcP3ssB, | 399 srcP3ssA, srcP3ssB, |
400 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; | 400 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; |
401 | 401 |
402 vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; | 402 vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3; |
403 | 403 |
404 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | 404 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); |
405 | 405 |
406 for (i = 0 ; i < 16 ; i++) { | 406 for (i = 0 ; i < 16 ; i++) { |
407 srcP3a = vec_ld(0, srcbis += srcStride); | 407 srcP3a = vec_ld(0, srcbis += srcStride); |
408 srcP3b = vec_ld(16, srcbis); | 408 srcP3b = vec_ld(16, srcbis); |
409 srcP3 = vec_perm(srcP3a, srcP3b, perm); | 409 srcP3 = vec_perm(srcP3a, srcP3b, perm); |
410 srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | 410 srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3); |
411 srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | 411 srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3); |
412 //srcbis += srcStride; | 412 //srcbis += srcStride; |
413 | 413 |
414 sum1A = vec_adds(srcP0ssA, srcP1ssA); | 414 sum1A = vec_adds(srcP0ssA, srcP1ssA); |
415 sum1B = vec_adds(srcP0ssB, srcP1ssB); | 415 sum1B = vec_adds(srcP0ssB, srcP1ssB); |
416 sum2A = vec_adds(srcM1ssA, srcP2ssA); | 416 sum2A = vec_adds(srcM1ssA, srcP2ssA); |
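The vertical filter avoids re-reading rows: the six-row window srcM2..srcP2 is loaded and widened once before the loop, each iteration loads only the new srcP3 row, and the window rotates down one row in the elided tail of the loop body. A skeleton of the pattern; row_t, load_row(), filter6() and store_row() are hypothetical stand-ins, not FFmpeg APIs:

    /* sliding six-row window, one new row loaded per output row */
    row_t m2 = load_row(src - 2*srcStride), m1 = load_row(src - srcStride),
          p0 = load_row(src),               p1 = load_row(src + srcStride),
          p2 = load_row(src + 2*srcStride);
    for (int i = 0; i < 16; i++) {
        row_t p3 = load_row(src + (i + 3) * srcStride);
        store_row(dst + i * dstStride, filter6(m2, m1, p0, p1, p2, p3));
        m2 = m1; m1 = p0; p0 = p1; p1 = p2; p2 = p3;   /* rotate the window */
    }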
461 /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ | 461 /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ |
462 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { | 462 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { |
463 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); | 463 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
464 register int i; | 464 register int i; |
465 LOAD_ZERO; | 465 LOAD_ZERO; |
466 const vec_u8_t permM2 = vec_lvsl(-2, src); | 466 const vec_u8 permM2 = vec_lvsl(-2, src); |
467 const vec_u8_t permM1 = vec_lvsl(-1, src); | 467 const vec_u8 permM1 = vec_lvsl(-1, src); |
468 const vec_u8_t permP0 = vec_lvsl(+0, src); | 468 const vec_u8 permP0 = vec_lvsl(+0, src); |
469 const vec_u8_t permP1 = vec_lvsl(+1, src); | 469 const vec_u8 permP1 = vec_lvsl(+1, src); |
470 const vec_u8_t permP2 = vec_lvsl(+2, src); | 470 const vec_u8 permP2 = vec_lvsl(+2, src); |
471 const vec_u8_t permP3 = vec_lvsl(+3, src); | 471 const vec_u8 permP3 = vec_lvsl(+3, src); |
472 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 472 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
473 const vec_u32_t v10ui = vec_splat_u32(10); | 473 const vec_u32 v10ui = vec_splat_u32(10); |
474 const vec_s16_t v5ss = vec_splat_s16(5); | 474 const vec_s16 v5ss = vec_splat_s16(5); |
475 const vec_s16_t v1ss = vec_splat_s16(1); | 475 const vec_s16 v1ss = vec_splat_s16(1); |
476 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); | 476 const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); |
477 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); | 477 const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); |
478 | 478 |
479 register int align = ((((unsigned long)src) - 2) % 16); | 479 register int align = ((((unsigned long)src) - 2) % 16); |
480 | 480 |
481 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, | 481 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, |
482 srcP2A, srcP2B, srcP3A, srcP3B, | 482 srcP2A, srcP2B, srcP3A, srcP3B, |
483 srcM1A, srcM1B, srcM2A, srcM2B, | 483 srcM1A, srcM1B, srcM2A, srcM2B, |
484 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | 484 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
485 pp1A, pp1B, pp2A, pp2B, psumA, psumB; | 485 pp1A, pp1B, pp2A, pp2B, psumA, psumB; |
486 | 486 |
487 const vec_u8_t mperm = (const vec_u8_t) | 487 const vec_u8 mperm = (const vec_u8) |
488 {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, | 488 {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, |
489 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; | 489 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; |
490 int16_t *tmpbis = tmp; | 490 int16_t *tmpbis = tmp; |
491 | 491 |
492 vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, | 492 vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, |
493 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, | 493 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, |
494 tmpP2ssA, tmpP2ssB; | 494 tmpP2ssA, tmpP2ssB; |
495 | 495 |
496 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, | 496 vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, |
497 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, | 497 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, |
498 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, | 498 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, |
499 ssumAe, ssumAo, ssumBe, ssumBo; | 499 ssumAe, ssumAo, ssumBe, ssumBo; |
500 vec_u8_t fsum, sumv, sum, vdst; | 500 vec_u8 fsum, sumv, sum, vdst; |
501 vec_s16_t ssume, ssumo; | 501 vec_s16 ssume, ssumo; |
502 | 502 |
503 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | 503 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
504 src -= (2 * srcStride); | 504 src -= (2 * srcStride); |
505 for (i = 0 ; i < 21 ; i ++) { | 505 for (i = 0 ; i < 21 ; i ++) { |
506 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | 506 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
507 vec_u8_t srcR1 = vec_ld(-2, src); | 507 vec_u8 srcR1 = vec_ld(-2, src); |
508 vec_u8_t srcR2 = vec_ld(14, src); | 508 vec_u8 srcR2 = vec_ld(14, src); |
509 | 509 |
510 switch (align) { | 510 switch (align) { |
511 default: { | 511 default: { |
512 srcM2 = vec_perm(srcR1, srcR2, permM2); | 512 srcM2 = vec_perm(srcR1, srcR2, permM2); |
513 srcM1 = vec_perm(srcR1, srcR2, permM1); | 513 srcM1 = vec_perm(srcR1, srcR2, permM1); |
523 srcP1 = vec_perm(srcR1, srcR2, permP1); | 523 srcP1 = vec_perm(srcR1, srcR2, permP1); |
524 srcP2 = vec_perm(srcR1, srcR2, permP2); | 524 srcP2 = vec_perm(srcR1, srcR2, permP2); |
525 srcP3 = srcR2; | 525 srcP3 = srcR2; |
526 } break; | 526 } break; |
527 case 12: { | 527 case 12: { |
528 vec_u8_t srcR3 = vec_ld(30, src); | 528 vec_u8 srcR3 = vec_ld(30, src); |
529 srcM2 = vec_perm(srcR1, srcR2, permM2); | 529 srcM2 = vec_perm(srcR1, srcR2, permM2); |
530 srcM1 = vec_perm(srcR1, srcR2, permM1); | 530 srcM1 = vec_perm(srcR1, srcR2, permM1); |
531 srcP0 = vec_perm(srcR1, srcR2, permP0); | 531 srcP0 = vec_perm(srcR1, srcR2, permP0); |
532 srcP1 = vec_perm(srcR1, srcR2, permP1); | 532 srcP1 = vec_perm(srcR1, srcR2, permP1); |
533 srcP2 = srcR2; | 533 srcP2 = srcR2; |
534 srcP3 = vec_perm(srcR2, srcR3, permP3); | 534 srcP3 = vec_perm(srcR2, srcR3, permP3); |
535 } break; | 535 } break; |
536 case 13: { | 536 case 13: { |
537 vec_u8_t srcR3 = vec_ld(30, src); | 537 vec_u8 srcR3 = vec_ld(30, src); |
538 srcM2 = vec_perm(srcR1, srcR2, permM2); | 538 srcM2 = vec_perm(srcR1, srcR2, permM2); |
539 srcM1 = vec_perm(srcR1, srcR2, permM1); | 539 srcM1 = vec_perm(srcR1, srcR2, permM1); |
540 srcP0 = vec_perm(srcR1, srcR2, permP0); | 540 srcP0 = vec_perm(srcR1, srcR2, permP0); |
541 srcP1 = srcR2; | 541 srcP1 = srcR2; |
542 srcP2 = vec_perm(srcR2, srcR3, permP2); | 542 srcP2 = vec_perm(srcR2, srcR3, permP2); |
543 srcP3 = vec_perm(srcR2, srcR3, permP3); | 543 srcP3 = vec_perm(srcR2, srcR3, permP3); |
544 } break; | 544 } break; |
545 case 14: { | 545 case 14: { |
546 vec_u8_t srcR3 = vec_ld(30, src); | 546 vec_u8 srcR3 = vec_ld(30, src); |
547 srcM2 = vec_perm(srcR1, srcR2, permM2); | 547 srcM2 = vec_perm(srcR1, srcR2, permM2); |
548 srcM1 = vec_perm(srcR1, srcR2, permM1); | 548 srcM1 = vec_perm(srcR1, srcR2, permM1); |
549 srcP0 = srcR2; | 549 srcP0 = srcR2; |
550 srcP1 = vec_perm(srcR2, srcR3, permP1); | 550 srcP1 = vec_perm(srcR2, srcR3, permP1); |
551 srcP2 = vec_perm(srcR2, srcR3, permP2); | 551 srcP2 = vec_perm(srcR2, srcR3, permP2); |
552 srcP3 = vec_perm(srcR2, srcR3, permP3); | 552 srcP3 = vec_perm(srcR2, srcR3, permP3); |
553 } break; | 553 } break; |
554 case 15: { | 554 case 15: { |
555 vec_u8_t srcR3 = vec_ld(30, src); | 555 vec_u8 srcR3 = vec_ld(30, src); |
556 srcM2 = vec_perm(srcR1, srcR2, permM2); | 556 srcM2 = vec_perm(srcR1, srcR2, permM2); |
557 srcM1 = srcR2; | 557 srcM1 = srcR2; |
558 srcP0 = vec_perm(srcR2, srcR3, permP0); | 558 srcP0 = vec_perm(srcR2, srcR3, permP0); |
559 srcP1 = vec_perm(srcR2, srcR3, permP1); | 559 srcP1 = vec_perm(srcR2, srcR3, permP1); |
560 srcP2 = vec_perm(srcR2, srcR3, permP2); | 560 srcP2 = vec_perm(srcR2, srcR3, permP2); |
561 srcP3 = vec_perm(srcR2, srcR3, permP3); | 561 srcP3 = vec_perm(srcR2, srcR3, permP3); |
562 } break; | 562 } break; |
563 } | 563 } |
564 | 564 |
565 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); | 565 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); |
566 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); | 566 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); |
567 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); | 567 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); |
568 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); | 568 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); |
569 | 569 |
570 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); | 570 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); |
571 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); | 571 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); |
572 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); | 572 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); |
573 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); | 573 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); |
574 | 574 |
575 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); | 575 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); |
576 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); | 576 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); |
577 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); | 577 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); |
578 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); | 578 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); |
579 | 579 |
580 sum1A = vec_adds(srcP0A, srcP1A); | 580 sum1A = vec_adds(srcP0A, srcP1A); |
581 sum1B = vec_adds(srcP0B, srcP1B); | 581 sum1B = vec_adds(srcP0B, srcP1B); |
582 sum2A = vec_adds(srcM1A, srcP2A); | 582 sum2A = vec_adds(srcM1A, srcP2A); |
583 sum2B = vec_adds(srcM1B, srcP2B); | 583 sum2B = vec_adds(srcM1B, srcP2B); |
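This hv variant is two-pass: the loop above runs the horizontal six-tap filter over h + 5 = 21 rows and stores the raw 16-bit sums into tmp (rounding is deferred), and the loop below applies the same kernel vertically over tmp in 32-bit precision, rounding once with +512 and a shift by 10. That is also why v512si and v16ui are built with vec_sl: AltiVec splat immediates only cover -16..15, so 512 is synthesized as 1 << 9 and 16 as 1 << 4. Scalar reference for one second-pass sample, where t points into tmp at the output position and rows of tmp are tmpStride elements apart:

    /* second pass: vertical six-tap over the 16-bit intermediates */
    int v = t[-2*tmpStride] - 5*t[-tmpStride] + 20*t[0]
          + 20*t[tmpStride] - 5*t[2*tmpStride] + t[3*tmpStride];
    out  = av_clip_uint8((v + 512) >> 10);   /* the corresponding dst byte */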
615 tmpP2ssA = vec_ld(0, tmpbis); | 615 tmpP2ssA = vec_ld(0, tmpbis); |
616 tmpP2ssB = vec_ld(16, tmpbis); | 616 tmpP2ssB = vec_ld(16, tmpbis); |
617 tmpbis += tmpStride; | 617 tmpbis += tmpStride; |
618 | 618 |
619 for (i = 0 ; i < 16 ; i++) { | 619 for (i = 0 ; i < 16 ; i++) { |
620 const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); | 620 const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); |
621 const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); | 621 const vec_s16 tmpP3ssB = vec_ld(16, tmpbis); |
622 | 622 |
623 const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); | 623 const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); |
624 const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); | 624 const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB); |
625 const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); | 625 const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA); |
626 const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); | 626 const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB); |
627 const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); | 627 const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA); |
628 const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); | 628 const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB); |
629 | 629 |
630 tmpbis += tmpStride; | 630 tmpbis += tmpStride; |
631 | 631 |
632 tmpM2ssA = tmpM1ssA; | 632 tmpM2ssA = tmpM1ssA; |
633 tmpM2ssB = tmpM1ssB; | 633 tmpM2ssB = tmpM1ssB; |
648 pp2Ae = vec_mule(sum2A, v5ss); | 648 pp2Ae = vec_mule(sum2A, v5ss); |
649 pp2Ao = vec_mulo(sum2A, v5ss); | 649 pp2Ao = vec_mulo(sum2A, v5ss); |
650 pp2Be = vec_mule(sum2B, v5ss); | 650 pp2Be = vec_mule(sum2B, v5ss); |
651 pp2Bo = vec_mulo(sum2B, v5ss); | 651 pp2Bo = vec_mulo(sum2B, v5ss); |
652 | 652 |
653 pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); | 653 pp3Ae = vec_sra((vec_s32)sum3A, v16ui); |
654 pp3Ao = vec_mulo(sum3A, v1ss); | 654 pp3Ao = vec_mulo(sum3A, v1ss); |
655 pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); | 655 pp3Be = vec_sra((vec_s32)sum3B, v16ui); |
656 pp3Bo = vec_mulo(sum3B, v1ss); | 656 pp3Bo = vec_mulo(sum3B, v1ss); |
657 | 657 |
658 pp1cAe = vec_add(pp1Ae, v512si); | 658 pp1cAe = vec_add(pp1Ae, v512si); |
659 pp1cAo = vec_add(pp1Ao, v512si); | 659 pp1cAo = vec_add(pp1Ao, v512si); |
660 pp1cBe = vec_add(pp1Be, v512si); | 660 pp1cBe = vec_add(pp1Be, v512si); |
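The second pass widens through the even/odd multiply instructions: vec_mule/vec_mulo produce 32-bit products from the even/odd 16-bit lanes, so the 20x and 5x terms arrive already widened; the even and odd streams are then accumulated separately and, in the remainder of the function beyond this excerpt, presumably re-interleaved via the mperm mask defined above. The unit tap needs no multiply at all: viewing a vec_s16 as vec_s32 on this big-endian target puts each even lane in the top half of a 32-bit word, so vec_sra by 16 (v16ui) yields the sign-extended even lanes while vec_mulo(sum3, v1ss) yields the odd ones. In scalar terms:

    /* one 32-bit word of a vec_s16 {e, o, ...} viewed as vec_s32 (big endian) */
    int32_t pair   = ((int32_t)e << 16) | (uint16_t)o;
    int32_t even32 = pair >> 16;      /* sign-extended even lane, no multiply */
    int32_t odd32  = (int32_t)o * 1;  /* what vec_mulo(sum3, v1ss) produces   */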