libavcodec.hg: comparison of ppc/h264_template_altivec.c @ 9444:fe17033a79ed (branch libavcodec)

Altivec version of avg_no_rnd_vc1_chroma_mc8

author:   conrad
date:     Wed, 15 Apr 2009 06:23:40 +0000
parents:  fd8b4aa6e493
children: 34a65026fa06

--- ppc/h264_template_altivec.c	9443:3970fe47fea3
+++ ppc/h264_template_altivec.c	9444:fe17033a79ed
@@ -25,18 +25,19 @@
 #define ASSERT_ALIGNED(ptr) ;
 #endif
 
 /* this code assume that stride % 16 == 0 */
 
-#define CHROMA_MC8_ALTIVEC_CORE \
+#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
     vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
     vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
 \
-    psum = vec_mladd(vA, vsrc0ssH, v32ss);\
+    psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
     psum = vec_mladd(vB, vsrc1ssH, psum);\
     psum = vec_mladd(vC, vsrc2ssH, psum);\
     psum = vec_mladd(vD, vsrc3ssH, psum);\
+    psum = BIAS2(psum);\
     psum = vec_sr(psum, v6us);\
 \
     vdst = vec_ld(0, dst);\
     ppsum = (vec_u8)vec_pack(psum, psum);\
     vfdst = vec_perm(vdst, ppsum, fperm);\
@@ -68,10 +69,13 @@
 \
     vec_st(fsum, 0, dst);\
 \
     dst += stride;\
     src += stride;
+
+#define noop(a) a
+#define add28(a) vec_add(v28ss, a)
 
 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                     int stride, int h, int x, int y) {
     POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
     DECLARE_ALIGNED_16(signed int, ABCD[4]) =
@@ -134,11 +138,11 @@
             for (i = 0 ; i < h ; i++) {
                 vsrcCuc = vec_ld(stride + 0, src);
                 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
 
-                CHROMA_MC8_ALTIVEC_CORE
+                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
             }
         } else {
             vec_u8 vsrcDuc;
             for (i = 0 ; i < h ; i++) {
                 vsrcCuc = vec_ld(stride + 0, src);
@@ -147,11 +151,11 @@
                 if (reallyBadAlign)
                     vsrc3uc = vsrcDuc;
                 else
                     vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
 
-                CHROMA_MC8_ALTIVEC_CORE
+                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
             }
         }
     } else {
         const vec_s16 vE = vec_add(vB, vC);
         if (ABCD[2]) { // x == 0 B == 0
@@ -200,10 +204,95 @@
         }
     }
     POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
 }
 
+/* this code assume that stride % 16 == 0 */
+void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
+    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
+                        {((8 - x) * (8 - y)),
+                         ((    x) * (8 - y)),
+                         ((8 - x) * (    y)),
+                         ((    x) * (    y))};
+    register int i;
+    vec_u8 fperm;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
+    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
+    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
+    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
+    LOAD_ZERO;
+    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
+    const vec_u16 v6us = vec_splat_u16(6);
+    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+
+    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrc0uc, vsrc1uc;
+    vec_s16 vsrc0ssH, vsrc1ssH;
+    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_s16 vsrc2ssH, vsrc3ssH, psum;
+    vec_u8 vdst, ppsum, vfdst, fsum;
+
+    if (((unsigned long)dst) % 16 == 0) {
+        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
+                         0x14, 0x15, 0x16, 0x17,
+                         0x08, 0x09, 0x0A, 0x0B,
+                         0x0C, 0x0D, 0x0E, 0x0F};
+    } else {
+        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
+                         0x04, 0x05, 0x06, 0x07,
+                         0x18, 0x19, 0x1A, 0x1B,
+                         0x1C, 0x1D, 0x1E, 0x1F};
+    }
+
+    vsrcAuc = vec_ld(0, src);
+
+    if (loadSecond)
+        vsrcBuc = vec_ld(16, src);
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+
+    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
+    if (reallyBadAlign)
+        vsrc1uc = vsrcBuc;
+    else
+        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+
+    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
+
+    if (!loadSecond) {// -> !reallyBadAlign
+        for (i = 0 ; i < h ; i++) {
+
+
+            vsrcCuc = vec_ld(stride + 0, src);
+
+            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+
+            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
+        }
+    } else {
+        vec_u8 vsrcDuc;
+        for (i = 0 ; i < h ; i++) {
+            vsrcCuc = vec_ld(stride + 0, src);
+            vsrcDuc = vec_ld(stride + 16, src);
+
+            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+            if (reallyBadAlign)
+                vsrc3uc = vsrcDuc;
+            else
+                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+
+            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
+        }
+    }
+}
+
+#undef noop
+#undef add28
 #undef CHROMA_MC8_ALTIVEC_CORE
 
 /* this code assume stride % 16 == 0 */
 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
     POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);