Mercurial > libavcodec.hg
comparison ppc/dsputil_h264_template_altivec.c @ 3346:052765f11f1c libavcodec
Cosmetics: should not hurt performance; scream if it does
author | lu_zero |
---|---|
date | Sat, 17 Jun 2006 18:46:06 +0000 |
parents | 7aa01243b4d3 |
children | eff63ac2b545 |
comparison
equal
deleted
inserted
replaced
3345:76620d530d9a | 3346:052765f11f1c |
---|---|
17 */ | 17 */ |
18 | 18 |
19 /* this code assumes that stride % 16 == 0 */ | 19 /* this code assumes that stride % 16 == 0 */ |
20 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { | 20 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { |
21 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); | 21 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); |
22 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); | 22 signed int ABCD[4] __attribute__((aligned(16))) = |
23 signed int ABCD[4] __attribute__((aligned(16))); | 23 {((8 - x) * (8 - y)), |
24 ((x) * (8 - y)), | |
25 ((8 - x) * (y)), | |
26 ((x) * (y))}; | |
24 register int i; | 27 register int i; |
25 ABCD[0] = ((8 - x) * (8 - y)); | 28 vector unsigned char fperm; |
26 ABCD[1] = ((x) * (8 - y)); | |
27 ABCD[2] = ((8 - x) * (y)); | |
28 ABCD[3] = ((x) * (y)); | |
29 const vector signed int vABCD = vec_ld(0, ABCD); | 29 const vector signed int vABCD = vec_ld(0, ABCD); |
30 const vector signed short vA = vec_splat((vector signed short)vABCD, 1); | 30 const vector signed short vA = vec_splat((vector signed short)vABCD, 1); |
31 const vector signed short vB = vec_splat((vector signed short)vABCD, 3); | 31 const vector signed short vB = vec_splat((vector signed short)vABCD, 3); |
32 const vector signed short vC = vec_splat((vector signed short)vABCD, 5); | 32 const vector signed short vC = vec_splat((vector signed short)vABCD, 5); |
33 const vector signed short vD = vec_splat((vector signed short)vABCD, 7); | 33 const vector signed short vD = vec_splat((vector signed short)vABCD, 7); |
34 const vector signed int vzero = vec_splat_s32(0); | 34 const vector signed int vzero = vec_splat_s32(0); |
35 const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); | 35 const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); |
36 const vector unsigned short v6us = vec_splat_u16(6); | 36 const vector unsigned short v6us = vec_splat_u16(6); |
37 | |
38 vector unsigned char fperm; | |
39 | |
40 if (((unsigned long)dst) % 16 == 0) { | |
41 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, | |
42 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); | |
43 } else { | |
44 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
45 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | |
46 } | |
47 | |
48 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; | 37 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; |
49 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; | 38 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; |
50 | 39 |
51 vector unsigned char vsrcAuc; | 40 vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; |
52 vector unsigned char vsrcBuc; | 41 vector unsigned char vsrc0uc, vsrc1uc; |
53 vector unsigned char vsrcperm0; | 42 vector signed short vsrc0ssH, vsrc1ssH; |
54 vector unsigned char vsrcperm1; | 43 vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; |
44 vector signed short vsrc2ssH, vsrc3ssH, psum; | |
45 vector unsigned char vdst, ppsum, vfdst, fsum; | |
46 | |
47 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); | |
48 | |
49 if (((unsigned long)dst) % 16 == 0) { | |
50 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, | |
51 0x14, 0x15, 0x16, 0x17, | |
52 0x08, 0x09, 0x0A, 0x0B, | |
53 0x0C, 0x0D, 0x0E, 0x0F); | |
54 } else { | |
55 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, | |
56 0x04, 0x05, 0x06, 0x07, | |
57 0x18, 0x19, 0x1A, 0x1B, | |
58 0x1C, 0x1D, 0x1E, 0x1F); | |
59 } | |
60 | |
55 vsrcAuc = vec_ld(0, src); | 61 vsrcAuc = vec_ld(0, src); |
62 | |
56 if (loadSecond) | 63 if (loadSecond) |
57 vsrcBuc = vec_ld(16, src); | 64 vsrcBuc = vec_ld(16, src); |
58 vsrcperm0 = vec_lvsl(0, src); | 65 vsrcperm0 = vec_lvsl(0, src); |
59 vsrcperm1 = vec_lvsl(1, src); | 66 vsrcperm1 = vec_lvsl(1, src); |
60 | 67 |
61 vector unsigned char vsrc0uc; | |
62 vector unsigned char vsrc1uc; | |
63 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); | 68 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); |
64 if (reallyBadAlign) | 69 if (reallyBadAlign) |
65 vsrc1uc = vsrcBuc; | 70 vsrc1uc = vsrcBuc; |
66 else | 71 else |
67 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); | 72 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); |
68 | 73 |
69 vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc); | 74 vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
70 vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc); | 75 (vector unsigned char)vsrc0uc); |
76 vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | |
77 (vector unsigned char)vsrc1uc); | |
71 | 78 |
72 if (!loadSecond) {// -> !reallyBadAlign | 79 if (!loadSecond) {// -> !reallyBadAlign |
73 for (i = 0 ; i < h ; i++) { | 80 for (i = 0 ; i < h ; i++) { |
74 vector unsigned char vsrcCuc; | 81 |
82 | |
75 vsrcCuc = vec_ld(stride + 0, src); | 83 vsrcCuc = vec_ld(stride + 0, src); |
76 | 84 |
77 vector unsigned char vsrc2uc; | |
78 vector unsigned char vsrc3uc; | |
79 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); | 85 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); |
80 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); | 86 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); |
81 | 87 |
82 vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); | 88 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
83 vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); | 89 (vector unsigned char)vsrc2uc); |
84 | 90 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
85 vector signed short psum; | 91 (vector unsigned char)vsrc3uc); |
86 | 92 |
87 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); | 93 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
88 psum = vec_mladd(vB, vsrc1ssH, psum); | 94 psum = vec_mladd(vB, vsrc1ssH, psum); |
89 psum = vec_mladd(vC, vsrc2ssH, psum); | 95 psum = vec_mladd(vC, vsrc2ssH, psum); |
90 psum = vec_mladd(vD, vsrc3ssH, psum); | 96 psum = vec_mladd(vD, vsrc3ssH, psum); |
91 psum = vec_add(v32ss, psum); | 97 psum = vec_add(v32ss, psum); |
92 psum = vec_sra(psum, v6us); | 98 psum = vec_sra(psum, v6us); |
93 | 99 |
94 vector unsigned char vdst = vec_ld(0, dst); | 100 vdst = vec_ld(0, dst); |
95 vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum); | 101 ppsum = (vector unsigned char)vec_packsu(psum, psum); |
96 | 102 vfdst = vec_perm(vdst, ppsum, fperm); |
97 vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm); | |
98 vector unsigned char fsum; | |
99 | 103 |
100 OP_U8_ALTIVEC(fsum, vfdst, vdst); | 104 OP_U8_ALTIVEC(fsum, vfdst, vdst); |
101 | 105 |
102 vec_st(fsum, 0, dst); | 106 vec_st(fsum, 0, dst); |
103 | 107 |
106 | 110 |
107 dst += stride; | 111 dst += stride; |
108 src += stride; | 112 src += stride; |
109 } | 113 } |
110 } else { | 114 } else { |
115 vector unsigned char vsrcDuc; | |
111 for (i = 0 ; i < h ; i++) { | 116 for (i = 0 ; i < h ; i++) { |
112 vector unsigned char vsrcCuc; | |
113 vector unsigned char vsrcDuc; | |
114 vsrcCuc = vec_ld(stride + 0, src); | 117 vsrcCuc = vec_ld(stride + 0, src); |
115 vsrcDuc = vec_ld(stride + 16, src); | 118 vsrcDuc = vec_ld(stride + 16, src); |
116 | 119 |
117 vector unsigned char vsrc2uc; | |
118 vector unsigned char vsrc3uc; | |
119 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | 120 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); |
120 if (reallyBadAlign) | 121 if (reallyBadAlign) |
121 vsrc3uc = vsrcDuc; | 122 vsrc3uc = vsrcDuc; |
122 else | 123 else |
123 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); | 124 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); |
124 | 125 |
125 vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); | 126 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
126 vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); | 127 (vector unsigned char)vsrc2uc); |
127 | 128 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
128 vector signed short psum; | 129 (vector unsigned char)vsrc3uc); |
129 | 130 |
130 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); | 131 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
131 psum = vec_mladd(vB, vsrc1ssH, psum); | 132 psum = vec_mladd(vB, vsrc1ssH, psum); |
132 psum = vec_mladd(vC, vsrc2ssH, psum); | 133 psum = vec_mladd(vC, vsrc2ssH, psum); |
133 psum = vec_mladd(vD, vsrc3ssH, psum); | 134 psum = vec_mladd(vD, vsrc3ssH, psum); |
134 psum = vec_add(v32ss, psum); | 135 psum = vec_add(v32ss, psum); |
135 psum = vec_sr(psum, v6us); | 136 psum = vec_sr(psum, v6us); |
136 | 137 |
137 vector unsigned char vdst = vec_ld(0, dst); | 138 vdst = vec_ld(0, dst); |
138 vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum); | 139 ppsum = (vector unsigned char)vec_pack(psum, psum); |
139 | 140 vfdst = vec_perm(vdst, ppsum, fperm); |
140 vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm); | |
141 vector unsigned char fsum; | |
142 | 141 |
143 OP_U8_ALTIVEC(fsum, vfdst, vdst); | 142 OP_U8_ALTIVEC(fsum, vfdst, vdst); |
144 | 143 |
145 vec_st(fsum, 0, dst); | 144 vec_st(fsum, 0, dst); |
146 | 145 |
155 } | 154 } |
156 | 155 |
157 /* this code assumes stride % 16 == 0 */ | 156 /* this code assumes stride % 16 == 0 */ |
158 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | 157 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
159 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); | 158 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); |
160 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | |
161 register int i; | 159 register int i; |
162 | 160 |
163 const vector signed int vzero = vec_splat_s32(0); | 161 const vector signed int vzero = vec_splat_s32(0); |
164 const vector unsigned char permM2 = vec_lvsl(-2, src); | 162 const vector unsigned char permM2 = vec_lvsl(-2, src); |
165 const vector unsigned char permM1 = vec_lvsl(-1, src); | 163 const vector unsigned char permM1 = vec_lvsl(-1, src); |
170 const vector signed short v5ss = vec_splat_s16(5); | 168 const vector signed short v5ss = vec_splat_s16(5); |
171 const vector unsigned short v5us = vec_splat_u16(5); | 169 const vector unsigned short v5us = vec_splat_u16(5); |
172 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 170 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
173 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | 171 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
174 const vector unsigned char dstperm = vec_lvsr(0, dst); | 172 const vector unsigned char dstperm = vec_lvsr(0, dst); |
175 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); | 173 const vector unsigned char neg1 = |
176 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); | 174 (const vector unsigned char) vec_splat_s8(-1); |
175 | |
176 const vector unsigned char dstmask = | |
177 vec_perm((const vector unsigned char)vzero, | |
178 neg1, dstperm); | |
179 | |
180 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | |
177 | 181 |
178 register int align = ((((unsigned long)src) - 2) % 16); | 182 register int align = ((((unsigned long)src) - 2) % 16); |
179 | 183 |
184 vector signed short srcP0A, srcP0B, srcP1A, srcP1B, | |
185 srcP2A, srcP2B, srcP3A, srcP3B, | |
186 srcM1A, srcM1B, srcM2A, srcM2B, | |
187 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | |
188 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | |
189 psumA, psumB, sumA, sumB; | |
190 | |
191 vector unsigned char sum, dst1, dst2, vdst, fsum, | |
192 rsum, fdst1, fdst2; | |
193 | |
194 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | |
195 | |
180 for (i = 0 ; i < 16 ; i ++) { | 196 for (i = 0 ; i < 16 ; i ++) { |
181 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | |
182 vector unsigned char srcR1 = vec_ld(-2, src); | 197 vector unsigned char srcR1 = vec_ld(-2, src); |
183 vector unsigned char srcR2 = vec_ld(14, src); | 198 vector unsigned char srcR2 = vec_ld(14, src); |
184 | 199 |
185 switch (align) { | 200 switch (align) { |
186 default: { | 201 default: { |
235 srcP2 = vec_perm(srcR2, srcR3, permP2); | 250 srcP2 = vec_perm(srcR2, srcR3, permP2); |
236 srcP3 = vec_perm(srcR2, srcR3, permP3); | 251 srcP3 = vec_perm(srcR2, srcR3, permP3); |
237 } break; | 252 } break; |
238 } | 253 } |
239 | 254 |
240 const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); | 255 srcP0A = vec_mergeh((vector unsigned char)vzero, srcP0); |
241 const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); | 256 srcP0B = vec_mergel((vector unsigned char)vzero, srcP0); |
242 const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); | 257 srcP1A = vec_mergeh((vector unsigned char)vzero, srcP1); |
243 const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); | 258 srcP1B = vec_mergel((vector unsigned char)vzero, srcP1); |
244 | 259 |
245 const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); | 260 srcP2A = vec_mergeh((vector unsigned char)vzero, srcP2); |
246 const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); | 261 srcP2B = vec_mergel((vector unsigned char)vzero, srcP2); |
247 const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); | 262 srcP3A = vec_mergeh((vector unsigned char)vzero, srcP3); |
248 const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); | 263 srcP3B = vec_mergel((vector unsigned char)vzero, srcP3); |
249 | 264 |
250 const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); | 265 srcM1A = vec_mergeh((vector unsigned char)vzero, srcM1); |
251 const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); | 266 srcM1B = vec_mergel((vector unsigned char)vzero, srcM1); |
252 const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); | 267 srcM2A = vec_mergeh((vector unsigned char)vzero, srcM2); |
253 const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); | 268 srcM2B = vec_mergel((vector unsigned char)vzero, srcM2); |
254 | 269 |
255 const vector signed short sum1A = vec_adds(srcP0A, srcP1A); | 270 sum1A = vec_adds(srcP0A, srcP1A); |
256 const vector signed short sum1B = vec_adds(srcP0B, srcP1B); | 271 sum1B = vec_adds(srcP0B, srcP1B); |
257 const vector signed short sum2A = vec_adds(srcM1A, srcP2A); | 272 sum2A = vec_adds(srcM1A, srcP2A); |
258 const vector signed short sum2B = vec_adds(srcM1B, srcP2B); | 273 sum2B = vec_adds(srcM1B, srcP2B); |
259 const vector signed short sum3A = vec_adds(srcM2A, srcP3A); | 274 sum3A = vec_adds(srcM2A, srcP3A); |
260 const vector signed short sum3B = vec_adds(srcM2B, srcP3B); | 275 sum3B = vec_adds(srcM2B, srcP3B); |
261 | 276 |
262 const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss); | 277 pp1A = vec_mladd(sum1A, v20ss, v16ss); |
263 const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss); | 278 pp1B = vec_mladd(sum1B, v20ss, v16ss); |
264 | 279 |
265 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); | 280 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); |
266 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); | 281 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); |
267 | 282 |
268 const vector signed short pp3A = vec_add(sum3A, pp1A); | 283 pp3A = vec_add(sum3A, pp1A); |
269 const vector signed short pp3B = vec_add(sum3B, pp1B); | 284 pp3B = vec_add(sum3B, pp1B); |
270 | 285 |
271 const vector signed short psumA = vec_sub(pp3A, pp2A); | 286 psumA = vec_sub(pp3A, pp2A); |
272 const vector signed short psumB = vec_sub(pp3B, pp2B); | 287 psumB = vec_sub(pp3B, pp2B); |
273 | 288 |
274 const vector signed short sumA = vec_sra(psumA, v5us); | 289 sumA = vec_sra(psumA, v5us); |
275 const vector signed short sumB = vec_sra(psumB, v5us); | 290 sumB = vec_sra(psumB, v5us); |
276 | 291 |
277 const vector unsigned char sum = vec_packsu(sumA, sumB); | 292 sum = vec_packsu(sumA, sumB); |
278 | 293 |
279 const vector unsigned char dst1 = vec_ld(0, dst); | 294 dst1 = vec_ld(0, dst); |
280 const vector unsigned char dst2 = vec_ld(16, dst); | 295 dst2 = vec_ld(16, dst); |
281 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); | 296 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); |
282 | 297 |
283 vector unsigned char fsum; | |
284 OP_U8_ALTIVEC(fsum, sum, vdst); | 298 OP_U8_ALTIVEC(fsum, sum, vdst); |
285 | 299 |
286 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); | 300 rsum = vec_perm(fsum, fsum, dstperm); |
287 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); | 301 fdst1 = vec_sel(dst1, rsum, dstmask); |
288 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); | 302 fdst2 = vec_sel(rsum, dst2, dstmask); |
289 | 303 |
290 vec_st(fdst1, 0, dst); | 304 vec_st(fdst1, 0, dst); |
291 vec_st(fdst2, 16, dst); | 305 vec_st(fdst2, 16, dst); |
292 | 306 |
293 src += srcStride; | 307 src += srcStride; |
297 } | 311 } |
298 | 312 |
299 /* this code assumes stride % 16 == 0 */ | 313 /* this code assumes stride % 16 == 0 */ |
300 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | 314 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
301 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); | 315 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); |
302 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | |
303 | 316 |
304 register int i; | 317 register int i; |
305 | 318 |
306 const vector signed int vzero = vec_splat_s32(0); | 319 const vector signed int vzero = vec_splat_s32(0); |
307 const vector unsigned char perm = vec_lvsl(0, src); | 320 const vector unsigned char perm = vec_lvsl(0, src); |
316 uint8_t *srcbis = src - (srcStride * 2); | 329 uint8_t *srcbis = src - (srcStride * 2); |
317 | 330 |
318 const vector unsigned char srcM2a = vec_ld(0, srcbis); | 331 const vector unsigned char srcM2a = vec_ld(0, srcbis); |
319 const vector unsigned char srcM2b = vec_ld(16, srcbis); | 332 const vector unsigned char srcM2b = vec_ld(16, srcbis); |
320 const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); | 333 const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); |
321 srcbis += srcStride; | 334 // srcbis += srcStride; |
322 const vector unsigned char srcM1a = vec_ld(0, srcbis); | 335 const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride); |
323 const vector unsigned char srcM1b = vec_ld(16, srcbis); | 336 const vector unsigned char srcM1b = vec_ld(16, srcbis); |
324 const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); | 337 const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); |
325 srcbis += srcStride; | 338 // srcbis += srcStride; |
326 const vector unsigned char srcP0a = vec_ld(0, srcbis); | 339 const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride); |
327 const vector unsigned char srcP0b = vec_ld(16, srcbis); | 340 const vector unsigned char srcP0b = vec_ld(16, srcbis); |
328 const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); | 341 const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); |
329 srcbis += srcStride; | 342 // srcbis += srcStride; |
330 const vector unsigned char srcP1a = vec_ld(0, srcbis); | 343 const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride); |
331 const vector unsigned char srcP1b = vec_ld(16, srcbis); | 344 const vector unsigned char srcP1b = vec_ld(16, srcbis); |
332 const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); | 345 const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); |
333 srcbis += srcStride; | 346 // srcbis += srcStride; |
334 const vector unsigned char srcP2a = vec_ld(0, srcbis); | 347 const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride); |
335 const vector unsigned char srcP2b = vec_ld(16, srcbis); | 348 const vector unsigned char srcP2b = vec_ld(16, srcbis); |
336 const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); | 349 const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); |
337 srcbis += srcStride; | 350 // srcbis += srcStride; |
338 | 351 |
339 vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); | 352 vector signed short srcM2ssA = (vector signed short) |
340 vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); | 353 vec_mergeh((vector unsigned char)vzero, srcM2); |
341 vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); | 354 vector signed short srcM2ssB = (vector signed short) |
342 vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); | 355 vec_mergel((vector unsigned char)vzero, srcM2); |
343 vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); | 356 vector signed short srcM1ssA = (vector signed short) |
344 vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); | 357 vec_mergeh((vector unsigned char)vzero, srcM1); |
345 vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); | 358 vector signed short srcM1ssB = (vector signed short) |
346 vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); | 359 vec_mergel((vector unsigned char)vzero, srcM1); |
347 vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); | 360 vector signed short srcP0ssA = (vector signed short) |
348 vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); | 361 vec_mergeh((vector unsigned char)vzero, srcP0); |
362 vector signed short srcP0ssB = (vector signed short) | |
363 vec_mergel((vector unsigned char)vzero, srcP0); | |
364 vector signed short srcP1ssA = (vector signed short) | |
365 vec_mergeh((vector unsigned char)vzero, srcP1); | |
366 vector signed short srcP1ssB = (vector signed short) | |
367 vec_mergel((vector unsigned char)vzero, srcP1); | |
368 vector signed short srcP2ssA = (vector signed short) | |
369 vec_mergeh((vector unsigned char)vzero, srcP2); | |
370 vector signed short srcP2ssB = (vector signed short) | |
371 vec_mergel((vector unsigned char)vzero, srcP2); | |
372 | |
373 vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | |
374 psumA, psumB, sumA, sumB, | |
375 srcP3ssA, srcP3ssB, | |
376 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; | |
377 | |
378 vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, | |
379 srcP3a, srcP3b, srcP3; | |
380 | |
381 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | |
349 | 382 |
350 for (i = 0 ; i < 16 ; i++) { | 383 for (i = 0 ; i < 16 ; i++) { |
351 const vector unsigned char srcP3a = vec_ld(0, srcbis); | 384 srcP3a = vec_ld(0, srcbis += srcStride); |
352 const vector unsigned char srcP3b = vec_ld(16, srcbis); | 385 srcP3b = vec_ld(16, srcbis); |
353 const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm); | 386 srcP3 = vec_perm(srcP3a, srcP3b, perm); |
354 const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); | 387 srcP3ssA = (vector signed short) |
355 const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); | 388 vec_mergeh((vector unsigned char)vzero, srcP3); |
356 srcbis += srcStride; | 389 srcP3ssB = (vector signed short) |
357 | 390 vec_mergel((vector unsigned char)vzero, srcP3); |
358 const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA); | 391 // srcbis += srcStride; |
359 const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB); | 392 |
360 const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA); | 393 sum1A = vec_adds(srcP0ssA, srcP1ssA); |
361 const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB); | 394 sum1B = vec_adds(srcP0ssB, srcP1ssB); |
362 const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA); | 395 sum2A = vec_adds(srcM1ssA, srcP2ssA); |
363 const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB); | 396 sum2B = vec_adds(srcM1ssB, srcP2ssB); |
397 sum3A = vec_adds(srcM2ssA, srcP3ssA); | |
398 sum3B = vec_adds(srcM2ssB, srcP3ssB); | |
364 | 399 |
365 srcM2ssA = srcM1ssA; | 400 srcM2ssA = srcM1ssA; |
366 srcM2ssB = srcM1ssB; | 401 srcM2ssB = srcM1ssB; |
367 srcM1ssA = srcP0ssA; | 402 srcM1ssA = srcP0ssA; |
368 srcM1ssB = srcP0ssB; | 403 srcM1ssB = srcP0ssB; |
371 srcP1ssA = srcP2ssA; | 406 srcP1ssA = srcP2ssA; |
372 srcP1ssB = srcP2ssB; | 407 srcP1ssB = srcP2ssB; |
373 srcP2ssA = srcP3ssA; | 408 srcP2ssA = srcP3ssA; |
374 srcP2ssB = srcP3ssB; | 409 srcP2ssB = srcP3ssB; |
375 | 410 |
376 const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss); | 411 pp1A = vec_mladd(sum1A, v20ss, v16ss); |
377 const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss); | 412 pp1B = vec_mladd(sum1B, v20ss, v16ss); |
378 | 413 |
379 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); | 414 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); |
380 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); | 415 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); |
381 | 416 |
382 const vector signed short pp3A = vec_add(sum3A, pp1A); | 417 pp3A = vec_add(sum3A, pp1A); |
383 const vector signed short pp3B = vec_add(sum3B, pp1B); | 418 pp3B = vec_add(sum3B, pp1B); |
384 | 419 |
385 const vector signed short psumA = vec_sub(pp3A, pp2A); | 420 psumA = vec_sub(pp3A, pp2A); |
386 const vector signed short psumB = vec_sub(pp3B, pp2B); | 421 psumB = vec_sub(pp3B, pp2B); |
387 | 422 |
388 const vector signed short sumA = vec_sra(psumA, v5us); | 423 sumA = vec_sra(psumA, v5us); |
389 const vector signed short sumB = vec_sra(psumB, v5us); | 424 sumB = vec_sra(psumB, v5us); |
390 | 425 |
391 const vector unsigned char sum = vec_packsu(sumA, sumB); | 426 sum = vec_packsu(sumA, sumB); |
392 | 427 |
393 const vector unsigned char dst1 = vec_ld(0, dst); | 428 dst1 = vec_ld(0, dst); |
394 const vector unsigned char dst2 = vec_ld(16, dst); | 429 dst2 = vec_ld(16, dst); |
395 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); | 430 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); |
396 | 431 |
397 vector unsigned char fsum; | |
398 OP_U8_ALTIVEC(fsum, sum, vdst); | 432 OP_U8_ALTIVEC(fsum, sum, vdst); |
399 | 433 |
400 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); | 434 rsum = vec_perm(fsum, fsum, dstperm); |
401 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); | 435 fdst1 = vec_sel(dst1, rsum, dstmask); |
402 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); | 436 fdst2 = vec_sel(rsum, dst2, dstmask); |
403 | 437 |
404 vec_st(fdst1, 0, dst); | 438 vec_st(fdst1, 0, dst); |
405 vec_st(fdst2, 16, dst); | 439 vec_st(fdst2, 16, dst); |
406 | 440 |
407 dst += dstStride; | 441 dst += dstStride; |
410 } | 444 } |
411 | 445 |
412 /* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */ | 446 /* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */ |
413 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { | 447 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { |
414 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); | 448 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
415 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |
416 register int i; | 449 register int i; |
417 const vector signed int vzero = vec_splat_s32(0); | 450 const vector signed int vzero = vec_splat_s32(0); |
418 const vector unsigned char permM2 = vec_lvsl(-2, src); | 451 const vector unsigned char permM2 = vec_lvsl(-2, src); |
419 const vector unsigned char permM1 = vec_lvsl(-1, src); | 452 const vector unsigned char permM1 = vec_lvsl(-1, src); |
420 const vector unsigned char permP0 = vec_lvsl(+0, src); | 453 const vector unsigned char permP0 = vec_lvsl(+0, src); |
428 const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); | 461 const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); |
429 const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); | 462 const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); |
430 | 463 |
431 register int align = ((((unsigned long)src) - 2) % 16); | 464 register int align = ((((unsigned long)src) - 2) % 16); |
432 | 465 |
466 const vector unsigned char neg1 = (const vector unsigned char) | |
467 vec_splat_s8(-1); | |
468 | |
469 vector signed short srcP0A, srcP0B, srcP1A, srcP1B, | |
470 srcP2A, srcP2B, srcP3A, srcP3B, | |
471 srcM1A, srcM1B, srcM2A, srcM2B, | |
472 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | |
473 pp1A, pp1B, pp2A, pp2B, psumA, psumB; | |
474 | |
475 const vector unsigned char dstperm = vec_lvsr(0, dst); | |
476 | |
477 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); | |
478 | |
479 const vector unsigned char mperm = (const vector unsigned char) | |
480 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, | |
481 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); | |
482 int16_t *tmpbis = tmp; | |
483 | |
484 vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, | |
485 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, | |
486 tmpP2ssA, tmpP2ssB; | |
487 | |
488 vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, | |
489 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, | |
490 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, | |
491 ssumAe, ssumAo, ssumBe, ssumBo; | |
492 vector unsigned char fsum, sumv, sum, dst1, dst2, vdst, | |
493 rsum, fdst1, fdst2; | |
494 vector signed short ssume, ssumo; | |
495 | |
496 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | |
433 src -= (2 * srcStride); | 497 src -= (2 * srcStride); |
434 | |
435 for (i = 0 ; i < 21 ; i ++) { | 498 for (i = 0 ; i < 21 ; i ++) { |
436 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | 499 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
437 vector unsigned char srcR1 = vec_ld(-2, src); | 500 vector unsigned char srcR1 = vec_ld(-2, src); |
438 vector unsigned char srcR2 = vec_ld(14, src); | 501 vector unsigned char srcR2 = vec_ld(14, src); |
439 | 502 |
490 srcP2 = vec_perm(srcR2, srcR3, permP2); | 553 srcP2 = vec_perm(srcR2, srcR3, permP2); |
491 srcP3 = vec_perm(srcR2, srcR3, permP3); | 554 srcP3 = vec_perm(srcR2, srcR3, permP3); |
492 } break; | 555 } break; |
493 } | 556 } |
494 | 557 |
495 const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); | 558 srcP0A = (vector signed short) |
496 const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); | 559 vec_mergeh((vector unsigned char)vzero, srcP0); |
497 const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); | 560 srcP0B = (vector signed short) |
498 const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); | 561 vec_mergel((vector unsigned char)vzero, srcP0); |
499 | 562 srcP1A = (vector signed short) |
500 const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); | 563 vec_mergeh((vector unsigned char)vzero, srcP1); |
501 const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); | 564 srcP1B = (vector signed short) |
502 const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); | 565 vec_mergel((vector unsigned char)vzero, srcP1); |
503 const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); | 566 |
504 | 567 srcP2A = (vector signed short) |
505 const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); | 568 vec_mergeh((vector unsigned char)vzero, srcP2); |
506 const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); | 569 srcP2B = (vector signed short) |
507 const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); | 570 vec_mergel((vector unsigned char)vzero, srcP2); |
508 const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); | 571 srcP3A = (vector signed short) |
509 | 572 vec_mergeh((vector unsigned char)vzero, srcP3); |
510 const vector signed short sum1A = vec_adds(srcP0A, srcP1A); | 573 srcP3B = (vector signed short) |
511 const vector signed short sum1B = vec_adds(srcP0B, srcP1B); | 574 vec_mergel((vector unsigned char)vzero, srcP3); |
512 const vector signed short sum2A = vec_adds(srcM1A, srcP2A); | 575 |
513 const vector signed short sum2B = vec_adds(srcM1B, srcP2B); | 576 srcM1A = (vector signed short) |
514 const vector signed short sum3A = vec_adds(srcM2A, srcP3A); | 577 vec_mergeh((vector unsigned char)vzero, srcM1); |
515 const vector signed short sum3B = vec_adds(srcM2B, srcP3B); | 578 srcM1B = (vector signed short) |
516 | 579 vec_mergel((vector unsigned char)vzero, srcM1); |
517 const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A); | 580 srcM2A = (vector signed short) |
518 const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B); | 581 vec_mergeh((vector unsigned char)vzero, srcM2); |
519 | 582 srcM2B = (vector signed short) |
520 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); | 583 vec_mergel((vector unsigned char)vzero, srcM2); |
521 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); | 584 |
522 | 585 sum1A = vec_adds(srcP0A, srcP1A); |
523 const vector signed short psumA = vec_sub(pp1A, pp2A); | 586 sum1B = vec_adds(srcP0B, srcP1B); |
524 const vector signed short psumB = vec_sub(pp1B, pp2B); | 587 sum2A = vec_adds(srcM1A, srcP2A); |
588 sum2B = vec_adds(srcM1B, srcP2B); | |
589 sum3A = vec_adds(srcM2A, srcP3A); | |
590 sum3B = vec_adds(srcM2B, srcP3B); | |
591 | |
592 pp1A = vec_mladd(sum1A, v20ss, sum3A); | |
593 pp1B = vec_mladd(sum1B, v20ss, sum3B); | |
594 | |
595 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); | |
596 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); | |
597 | |
598 psumA = vec_sub(pp1A, pp2A); | |
599 psumB = vec_sub(pp1B, pp2B); | |
525 | 600 |
526 vec_st(psumA, 0, tmp); | 601 vec_st(psumA, 0, tmp); |
527 vec_st(psumB, 16, tmp); | 602 vec_st(psumB, 16, tmp); |
528 | 603 |
529 src += srcStride; | 604 src += srcStride; |
530 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ | 605 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ |
531 } | 606 } |
532 | 607 |
533 const vector unsigned char dstperm = vec_lvsr(0, dst); | 608 tmpM2ssA = vec_ld(0, tmpbis); |
534 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); | 609 tmpM2ssB = vec_ld(16, tmpbis); |
535 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); | |
536 const vector unsigned char mperm = (const vector unsigned char) | |
537 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, | |
538 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); | |
539 | |
540 int16_t *tmpbis = tmp - (tmpStride * 21); | |
541 | |
542 vector signed short tmpM2ssA = vec_ld(0, tmpbis); | |
543 vector signed short tmpM2ssB = vec_ld(16, tmpbis); | |
544 tmpbis += tmpStride; | 610 tmpbis += tmpStride; |
545 vector signed short tmpM1ssA = vec_ld(0, tmpbis); | 611 tmpM1ssA = vec_ld(0, tmpbis); |
546 vector signed short tmpM1ssB = vec_ld(16, tmpbis); | 612 tmpM1ssB = vec_ld(16, tmpbis); |
547 tmpbis += tmpStride; | 613 tmpbis += tmpStride; |
548 vector signed short tmpP0ssA = vec_ld(0, tmpbis); | 614 tmpP0ssA = vec_ld(0, tmpbis); |
549 vector signed short tmpP0ssB = vec_ld(16, tmpbis); | 615 tmpP0ssB = vec_ld(16, tmpbis); |
550 tmpbis += tmpStride; | 616 tmpbis += tmpStride; |
551 vector signed short tmpP1ssA = vec_ld(0, tmpbis); | 617 tmpP1ssA = vec_ld(0, tmpbis); |
552 vector signed short tmpP1ssB = vec_ld(16, tmpbis); | 618 tmpP1ssB = vec_ld(16, tmpbis); |
553 tmpbis += tmpStride; | 619 tmpbis += tmpStride; |
554 vector signed short tmpP2ssA = vec_ld(0, tmpbis); | 620 tmpP2ssA = vec_ld(0, tmpbis); |
555 vector signed short tmpP2ssB = vec_ld(16, tmpbis); | 621 tmpP2ssB = vec_ld(16, tmpbis); |
556 tmpbis += tmpStride; | 622 tmpbis += tmpStride; |
557 | 623 |
558 for (i = 0 ; i < 16 ; i++) { | 624 for (i = 0 ; i < 16 ; i++) { |
559 const vector signed short tmpP3ssA = vec_ld(0, tmpbis); | 625 const vector signed short tmpP3ssA = vec_ld(0, tmpbis); |
560 const vector signed short tmpP3ssB = vec_ld(16, tmpbis); | 626 const vector signed short tmpP3ssB = vec_ld(16, tmpbis); |
561 tmpbis += tmpStride; | |
562 | 627 |
563 const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); | 628 const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); |
564 const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); | 629 const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); |
565 const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); | 630 const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); |
566 const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); | 631 const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); |
567 const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); | 632 const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); |
568 const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); | 633 const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); |
634 | |
635 tmpbis += tmpStride; | |
569 | 636 |
570 tmpM2ssA = tmpM1ssA; | 637 tmpM2ssA = tmpM1ssA; |
571 tmpM2ssB = tmpM1ssB; | 638 tmpM2ssB = tmpM1ssB; |
572 tmpM1ssA = tmpP0ssA; | 639 tmpM1ssA = tmpP0ssA; |
573 tmpM1ssB = tmpP0ssB; | 640 tmpM1ssB = tmpP0ssB; |
576 tmpP1ssA = tmpP2ssA; | 643 tmpP1ssA = tmpP2ssA; |
577 tmpP1ssB = tmpP2ssB; | 644 tmpP1ssB = tmpP2ssB; |
578 tmpP2ssA = tmpP3ssA; | 645 tmpP2ssA = tmpP3ssA; |
579 tmpP2ssB = tmpP3ssB; | 646 tmpP2ssB = tmpP3ssB; |
580 | 647 |
581 const vector signed int pp1Ae = vec_mule(sum1A, v20ss); | 648 pp1Ae = vec_mule(sum1A, v20ss); |
582 const vector signed int pp1Ao = vec_mulo(sum1A, v20ss); | 649 pp1Ao = vec_mulo(sum1A, v20ss); |
583 const vector signed int pp1Be = vec_mule(sum1B, v20ss); | 650 pp1Be = vec_mule(sum1B, v20ss); |
584 const vector signed int pp1Bo = vec_mulo(sum1B, v20ss); | 651 pp1Bo = vec_mulo(sum1B, v20ss); |
585 | 652 |
586 const vector signed int pp2Ae = vec_mule(sum2A, v5ss); | 653 pp2Ae = vec_mule(sum2A, v5ss); |
587 const vector signed int pp2Ao = vec_mulo(sum2A, v5ss); | 654 pp2Ao = vec_mulo(sum2A, v5ss); |
588 const vector signed int pp2Be = vec_mule(sum2B, v5ss); | 655 pp2Be = vec_mule(sum2B, v5ss); |
589 const vector signed int pp2Bo = vec_mulo(sum2B, v5ss); | 656 pp2Bo = vec_mulo(sum2B, v5ss); |
590 | 657 |
591 const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui); | 658 pp3Ae = vec_sra((vector signed int)sum3A, v16ui); |
592 const vector signed int pp3Ao = vec_mulo(sum3A, v1ss); | 659 pp3Ao = vec_mulo(sum3A, v1ss); |
593 const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui); | 660 pp3Be = vec_sra((vector signed int)sum3B, v16ui); |
594 const vector signed int pp3Bo = vec_mulo(sum3B, v1ss); | 661 pp3Bo = vec_mulo(sum3B, v1ss); |
595 | 662 |
596 const vector signed int pp1cAe = vec_add(pp1Ae, v512si); | 663 pp1cAe = vec_add(pp1Ae, v512si); |
597 const vector signed int pp1cAo = vec_add(pp1Ao, v512si); | 664 pp1cAo = vec_add(pp1Ao, v512si); |
598 const vector signed int pp1cBe = vec_add(pp1Be, v512si); | 665 pp1cBe = vec_add(pp1Be, v512si); |
599 const vector signed int pp1cBo = vec_add(pp1Bo, v512si); | 666 pp1cBo = vec_add(pp1Bo, v512si); |
600 | 667 |
601 const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae); | 668 pp32Ae = vec_sub(pp3Ae, pp2Ae); |
602 const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao); | 669 pp32Ao = vec_sub(pp3Ao, pp2Ao); |
603 const vector signed int pp32Be = vec_sub(pp3Be, pp2Be); | 670 pp32Be = vec_sub(pp3Be, pp2Be); |
604 const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo); | 671 pp32Bo = vec_sub(pp3Bo, pp2Bo); |
605 | 672 |
606 const vector signed int sumAe = vec_add(pp1cAe, pp32Ae); | 673 sumAe = vec_add(pp1cAe, pp32Ae); |
607 const vector signed int sumAo = vec_add(pp1cAo, pp32Ao); | 674 sumAo = vec_add(pp1cAo, pp32Ao); |
608 const vector signed int sumBe = vec_add(pp1cBe, pp32Be); | 675 sumBe = vec_add(pp1cBe, pp32Be); |
609 const vector signed int sumBo = vec_add(pp1cBo, pp32Bo); | 676 sumBo = vec_add(pp1cBo, pp32Bo); |
610 | 677 |
611 const vector signed int ssumAe = vec_sra(sumAe, v10ui); | 678 ssumAe = vec_sra(sumAe, v10ui); |
612 const vector signed int ssumAo = vec_sra(sumAo, v10ui); | 679 ssumAo = vec_sra(sumAo, v10ui); |
613 const vector signed int ssumBe = vec_sra(sumBe, v10ui); | 680 ssumBe = vec_sra(sumBe, v10ui); |
614 const vector signed int ssumBo = vec_sra(sumBo, v10ui); | 681 ssumBo = vec_sra(sumBo, v10ui); |
615 | 682 |
616 const vector signed short ssume = vec_packs(ssumAe, ssumBe); | 683 ssume = vec_packs(ssumAe, ssumBe); |
617 const vector signed short ssumo = vec_packs(ssumAo, ssumBo); | 684 ssumo = vec_packs(ssumAo, ssumBo); |
618 | 685 |
619 const vector unsigned char sumv = vec_packsu(ssume, ssumo); | 686 sumv = vec_packsu(ssume, ssumo); |
620 const vector unsigned char sum = vec_perm(sumv, sumv, mperm); | 687 sum = vec_perm(sumv, sumv, mperm); |
621 | 688 |
622 const vector unsigned char dst1 = vec_ld(0, dst); | 689 dst1 = vec_ld(0, dst); |
623 const vector unsigned char dst2 = vec_ld(16, dst); | 690 dst2 = vec_ld(16, dst); |
624 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); | 691 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); |
625 | 692 |
626 vector unsigned char fsum; | |
627 OP_U8_ALTIVEC(fsum, sum, vdst); | 693 OP_U8_ALTIVEC(fsum, sum, vdst); |
628 | 694 |
629 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); | 695 rsum = vec_perm(fsum, fsum, dstperm); |
630 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); | 696 fdst1 = vec_sel(dst1, rsum, dstmask); |
631 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); | 697 fdst2 = vec_sel(rsum, dst2, dstmask); |
632 | 698 |
633 vec_st(fdst1, 0, dst); | 699 vec_st(fdst1, 0, dst); |
634 vec_st(fdst2, 16, dst); | 700 vec_st(fdst2, 16, dst); |
635 | 701 |
636 dst += dstStride; | 702 dst += dstStride; |