comparison ppc/h264_template_altivec.c @ 9444:fe17033a79ed libavcodec

Altivec version of avg_no_rnd_vc1_chroma_mc8
author conrad
date Wed, 15 Apr 2009 06:23:40 +0000
parents fd8b4aa6e493
children 34a65026fa06
comparison
equal deleted inserted replaced
9443:3970fe47fea3 9444:fe17033a79ed
25 #define ASSERT_ALIGNED(ptr) ; 25 #define ASSERT_ALIGNED(ptr) ;
26 #endif 26 #endif
27 27
28 /* this code assumes that stride % 16 == 0 */ 28 /* this code assumes that stride % 16 == 0 */
29 29
30 #define CHROMA_MC8_ALTIVEC_CORE \ 30 #define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
31 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\ 31 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
32 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\ 32 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
33 \ 33 \
34 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ 34 psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
35 psum = vec_mladd(vB, vsrc1ssH, psum);\ 35 psum = vec_mladd(vB, vsrc1ssH, psum);\
36 psum = vec_mladd(vC, vsrc2ssH, psum);\ 36 psum = vec_mladd(vC, vsrc2ssH, psum);\
37 psum = vec_mladd(vD, vsrc3ssH, psum);\ 37 psum = vec_mladd(vD, vsrc3ssH, psum);\
38 psum = BIAS2(psum);\
38 psum = vec_sr(psum, v6us);\ 39 psum = vec_sr(psum, v6us);\
39 \ 40 \
40 vdst = vec_ld(0, dst);\ 41 vdst = vec_ld(0, dst);\
41 ppsum = (vec_u8)vec_pack(psum, psum);\ 42 ppsum = (vec_u8)vec_pack(psum, psum);\
42 vfdst = vec_perm(vdst, ppsum, fperm);\ 43 vfdst = vec_perm(vdst, ppsum, fperm);\
68 \ 69 \
69 vec_st(fsum, 0, dst);\ 70 vec_st(fsum, 0, dst);\
70 \ 71 \
71 dst += stride;\ 72 dst += stride;\
72 src += stride; 73 src += stride;
74
75 #define noop(a) a /* BIAS2 argument that leaves psum unchanged; paired with BIAS1 = v32ss in the h264 loops */
76 #define add28(a) vec_add(v28ss, a) /* BIAS2 argument that adds v28ss to psum before the shift; v28ss must be in scope where the core macro expands */
73 77
74 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, 78 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
75 int stride, int h, int x, int y) { 79 int stride, int h, int x, int y) {
76 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); 80 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
77 DECLARE_ALIGNED_16(signed int, ABCD[4]) = 81 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
134 for (i = 0 ; i < h ; i++) { 138 for (i = 0 ; i < h ; i++) {
135 vsrcCuc = vec_ld(stride + 0, src); 139 vsrcCuc = vec_ld(stride + 0, src);
136 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 140 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
137 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 141 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
138 142
139 CHROMA_MC8_ALTIVEC_CORE 143 CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
140 } 144 }
141 } else { 145 } else {
142 vec_u8 vsrcDuc; 146 vec_u8 vsrcDuc;
143 for (i = 0 ; i < h ; i++) { 147 for (i = 0 ; i < h ; i++) {
144 vsrcCuc = vec_ld(stride + 0, src); 148 vsrcCuc = vec_ld(stride + 0, src);
147 if (reallyBadAlign) 151 if (reallyBadAlign)
148 vsrc3uc = vsrcDuc; 152 vsrc3uc = vsrcDuc;
149 else 153 else
150 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 154 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
151 155
152 CHROMA_MC8_ALTIVEC_CORE 156 CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
153 } 157 }
154 } 158 }
155 } else { 159 } else {
156 const vec_s16 vE = vec_add(vB, vC); 160 const vec_s16 vE = vec_add(vB, vC);
157 if (ABCD[2]) { // x == 0 B == 0 161 if (ABCD[2]) { // x == 0 B == 0
200 } 204 }
201 } 205 }
202 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); 206 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
203 } 207 }
204 208
209 /*
 * AltiVec 8-pixel-wide chroma motion compensation, VC-1 "no_rnd" variant
 * (per the function name and the changeset description).
 *
 * dst    - destination row pointer; the core macro loads it with vec_ld,
 *          merges the new 8 bytes in via fperm and writes it back with vec_st
 * src    - source row pointer; any alignment, handled with vec_lvsl/vec_perm
 * stride - added to both dst and src after every row by the core macro
 * h      - number of rows processed
 * x, y   - position used to derive the four tap weights A..D below
 *          (presumably in 0..7 -- confirm against callers)
 *
 * The visible difference from PREFIX_h264_chroma_mc8_altivec is the bias:
 * the core macro is expanded with a zero initial accumulator and add28,
 * so 28 is added before the >> 6, where the h264 loops pass v32ss and noop.
 *
 * This code assumes that stride % 16 == 0.
 */
210 void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
211 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
212 {((8 - x) * (8 - y)),
213 (( x) * (8 - y)),
214 ((8 - x) * ( y)),
215 (( x) * ( y))}; /* tap weights A, B, C, D; they always sum to 64, matching the >> 6 */
216 register int i;
217 vec_u8 fperm;
218 const vec_s32 vABCD = vec_ld(0, ABCD);
219 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); /* vA..vD: one weight replicated across all 16-bit elements; */
220 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); /* the odd 16-bit elements of the 32-bit lanes are picked */
221 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); /* (presumably the low halves on a big-endian layout -- confirm) */
222 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
223 LOAD_ZERO; /* project macro; presumably provides zero_u8v used below -- confirm */
224 const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); /* (1 << 5) - 4 = 28, consumed by add28() */
225 const vec_u16 v6us = vec_splat_u16(6); /* shift count for the vec_sr in the core macro */
226 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; /* 1 when src % 16 is 8..15; a second 16-byte load is then done per row */
227 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; /* 1 only when src % 16 == 15 */
228
229 vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; /* vsrcBuc is only loaded when loadSecond is set */
230 vec_u8 vsrc0uc, vsrc1uc;
231 vec_s16 vsrc0ssH, vsrc1ssH;
232 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
233 vec_s16 vsrc2ssH, vsrc3ssH, psum;
234 vec_u8 vdst, ppsum, vfdst, fsum;
235
/* fperm feeds vec_perm(vdst, ppsum, fperm) in the core macro: indices
 * 0x00-0x0F select bytes of vdst, 0x10-0x1F select bytes of ppsum. */
236 if (((unsigned long)dst) % 16 == 0) {
237 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, /* first 8 bytes: new pixels from ppsum */
238 0x14, 0x15, 0x16, 0x17,
239 0x08, 0x09, 0x0A, 0x0B, /* last 8 bytes: kept from vdst */
240 0x0C, 0x0D, 0x0E, 0x0F};
241 } else { /* presumably dst % 16 == 8 here -- confirm against callers */
242 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, /* first 8 bytes: kept from vdst */
243 0x04, 0x05, 0x06, 0x07,
244 0x18, 0x19, 0x1A, 0x1B, /* last 8 bytes: new pixels from ppsum (vec_pack(psum, psum) duplicates them in both halves) */
245 0x1C, 0x1D, 0x1E, 0x1F};
246 }
247
/* Prime the pipeline with the first source row; each loop iteration
 * below only loads the row at src + stride. */
248 vsrcAuc = vec_ld(0, src);
249
250 if (loadSecond)
251 vsrcBuc = vec_ld(16, src);
252 vsrcperm0 = vec_lvsl(0, src); /* permute vector for the bytes starting at src */
253 vsrcperm1 = vec_lvsl(1, src); /* permute vector for the bytes starting at src + 1 */
254
255 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); /* NOTE(review): vsrcBuc is unloaded when !loadSecond; presumably never selected in that case -- confirm */
256 if (reallyBadAlign)
257 vsrc1uc = vsrcBuc; /* src + 1 is 16-byte aligned here, so the second load is used as-is */
258 else
259 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
260
261 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); /* interleave zero bytes with the first 8 pixels to widen them to 16 bits */
262 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
263
264 if (!loadSecond) {// -> !reallyBadAlign (src % 16 <= 7, so it cannot be 15)
265 for (i = 0 ; i < h ; i++) {
266
267
268 vsrcCuc = vec_ld(stride + 0, src); /* next source row */
269
270 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); /* a single load is used for both permute operands on this path */
271 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
272
273 CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) /* weighted sum from 0, + 28, >> 6, merge into dst, then dst/src += stride */
274 }
275 } else {
276 vec_u8 vsrcDuc;
277 for (i = 0 ; i < h ; i++) {
278 vsrcCuc = vec_ld(stride + 0, src); /* next source row: two consecutive 16-byte loads */
279 vsrcDuc = vec_ld(stride + 16, src);
280
281 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
282 if (reallyBadAlign)
283 vsrc3uc = vsrcDuc; /* same shortcut as for vsrc1uc above */
284 else
285 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
286
287 CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) /* same core expansion as the single-load path */
288 }
289 }
290 }
291
292 #undef noop
293 #undef add28
205 #undef CHROMA_MC8_ALTIVEC_CORE 294 #undef CHROMA_MC8_ALTIVEC_CORE
206 295
207 /* this code assumes stride % 16 == 0 */ 296 /* this code assumes stride % 16 == 0 */
208 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 297 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
209 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); 298 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);