comparison ppc/h264_altivec.c @ 9444:fe17033a79ed libavcodec

Altivec version of avg_no_rnd_vc1_chroma_mc8
author conrad
date Wed, 15 Apr 2009 06:23:40 +0000
parents ef3a7b711cc0
children 34a65026fa06
comparison
equal deleted inserted replaced
9443:3970fe47fea3 9444:fe17033a79ed
/* Store operations selectable through OP_U8_ALTIVEC: "put" assigns the new
 * value s to d; "avg" assigns vec_avg(dst, s), i.e. the average of s with the
 * existing destination vector dst. */
29 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s 29 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
30 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) 30 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
31 31
/* First inclusion of h264_template_altivec.c: OP_U8_ALTIVEC is the plain
 * store and every PREFIX_* macro expands to a put_* / altivec_put_* name.
 * Presumably the template uses the PREFIX_* macros as the names of the
 * functions it defines -- confirm against the template.  The macros are
 * undefined afterwards so they can be redefined for the next inclusion. */
32 #define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC 32 #define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
33 #define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec 33 #define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
34 #define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec
34 #define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num 35 #define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num
35 #define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec 36 #define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
36 #define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num 37 #define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num
37 #define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec 38 #define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec
38 #define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num 39 #define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num
39 #define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec 40 #define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
40 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num 41 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num
41 #include "h264_template_altivec.c" 42 #include "h264_template_altivec.c"
42 #undef OP_U8_ALTIVEC 43 #undef OP_U8_ALTIVEC
43 #undef PREFIX_h264_chroma_mc8_altivec 44 #undef PREFIX_h264_chroma_mc8_altivec
45 #undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
44 #undef PREFIX_h264_chroma_mc8_num 46 #undef PREFIX_h264_chroma_mc8_num
45 #undef PREFIX_h264_qpel16_h_lowpass_altivec 47 #undef PREFIX_h264_qpel16_h_lowpass_altivec
46 #undef PREFIX_h264_qpel16_h_lowpass_num 48 #undef PREFIX_h264_qpel16_h_lowpass_num
47 #undef PREFIX_h264_qpel16_v_lowpass_altivec 49 #undef PREFIX_h264_qpel16_v_lowpass_altivec
48 #undef PREFIX_h264_qpel16_v_lowpass_num 50 #undef PREFIX_h264_qpel16_v_lowpass_num
49 #undef PREFIX_h264_qpel16_hv_lowpass_altivec 51 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
50 #undef PREFIX_h264_qpel16_hv_lowpass_num 52 #undef PREFIX_h264_qpel16_hv_lowpass_num
51 53
/* Second inclusion of h264_template_altivec.c: same macro set as above, but
 * OP_U8_ALTIVEC is the averaging store and every PREFIX_* macro expands to an
 * avg_* / altivec_avg_* name. */
52 #define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC 54 #define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
53 #define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec 55 #define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec
56 #define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec
54 #define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num 57 #define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num
55 #define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec 58 #define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec
56 #define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num 59 #define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num
57 #define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec 60 #define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec
58 #define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num 61 #define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num
59 #define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec 62 #define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec
60 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num 63 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num
61 #include "h264_template_altivec.c" 64 #include "h264_template_altivec.c"
62 #undef OP_U8_ALTIVEC 65 #undef OP_U8_ALTIVEC
63 #undef PREFIX_h264_chroma_mc8_altivec 66 #undef PREFIX_h264_chroma_mc8_altivec
67 #undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
64 #undef PREFIX_h264_chroma_mc8_num 68 #undef PREFIX_h264_chroma_mc8_num
65 #undef PREFIX_h264_qpel16_h_lowpass_altivec 69 #undef PREFIX_h264_qpel16_h_lowpass_altivec
66 #undef PREFIX_h264_qpel16_h_lowpass_num 70 #undef PREFIX_h264_qpel16_h_lowpass_num
67 #undef PREFIX_h264_qpel16_v_lowpass_altivec 71 #undef PREFIX_h264_qpel16_v_lowpass_altivec
68 #undef PREFIX_h264_qpel16_v_lowpass_num 72 #undef PREFIX_h264_qpel16_v_lowpass_num
176 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ 180 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
177 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ 181 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
178 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ 182 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
179 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ 183 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
180 }\ 184 }\
181
/*
 * Bilinear interpolation of an 8-pixel-wide block, h rows tall, written to dst.
 *
 * Each output pixel is
 *     (A*src[0] + B*src[1] + C*src[stride] + D*src[stride+1] + 28) >> 6
 * with A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy (the weights sum to 64).
 * The bias is 28 rather than 32 (half of 64), which matches the "no_rnd"
 * in the function name.
 *
 * dst    destination; 8 bytes per row are replaced, the other 8 bytes of the
 *        containing 16-byte vector are preserved
 * src    top-left source pixel; h+1 rows of 9 pixels are read
 * stride byte distance between rows, applied to both src and dst
 * h      number of rows to produce
 * x, y   interpolation weights; presumably 0..7 eighth-pel fractions --
 *        confirm against the callers
 *
 * The permute vectors and the alignment flags below are computed once from
 * the initial src/dst and reused for every row, hence the stride requirement.
 */
182 /* this code assumes that stride % 16 == 0 */
183 void put_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
/* The four bilinear weights, stored aligned so they can be fetched with vec_ld. */
184 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
185 {((8 - x) * (8 - y)),
186 (( x) * (8 - y)),
187 ((8 - x) * ( y)),
188 (( x) * ( y))};
189 register int i;
190 vec_u8 fperm;
191 const vec_s32 vABCD = vec_ld(0, ABCD);
/* Broadcast each weight to all eight 16-bit lanes. Halfwords 1/3/5/7 are the
 * second halfword of each 32-bit int (the low 16 bits on big-endian). */
192 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
193 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
194 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
195 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
196 LOAD_ZERO;
/* Bias constant 28, built as (1 << 5) - 4. */
197 const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
198 const vec_u16 v6us = vec_splat_u16(6);
/* loadSecond: the 9 source bytes of a row (src .. src+8) cross into the next
 * 16-byte line only when src % 16 > 7. */
199 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
/* reallyBadAlign: src+1 starts exactly on the next 16-byte line, so the
 * "src+1" row is taken straight from the second load instead of via vec_perm. */
200 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
201
/* vsrcBuc is deliberately left unset when loadSecond is 0 (marked av_uninit). */
202 vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
203 vec_u8 vsrc0uc, vsrc1uc;
204 vec_s16 vsrc0ssH, vsrc1ssH;
205 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
206 vec_s16 vsrc2ssH, vsrc3ssH, psum;
207 vec_u8 vdst, ppsum, fsum;
208
/* fperm merges the 8 result bytes into the existing 16-byte destination
 * vector: indices 0x00-0x0F pick from vdst, 0x10-0x1F from ppsum.
 * 16-byte-aligned dst: results go to bytes 0-7, bytes 8-15 are kept.
 * Otherwise: bytes 0-7 are kept, results go to bytes 8-15
 * (presumably dst % 16 == 8 in that case -- confirm against the callers). */
209 if (((unsigned long)dst) % 16 == 0) {
210 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
211 0x14, 0x15, 0x16, 0x17,
212 0x08, 0x09, 0x0A, 0x0B,
213 0x0C, 0x0D, 0x0E, 0x0F};
214 } else {
215 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
216 0x04, 0x05, 0x06, 0x07,
217 0x18, 0x19, 0x1A, 0x1B,
218 0x1C, 0x1D, 0x1E, 0x1F};
219 }
220
/* Prime the pipeline with the first source row (the "top" row of the
 * first output row). */
221 vsrcAuc = vec_ld(0, src);
222
223 if (loadSecond)
224 vsrcBuc = vec_ld(16, src);
/* Permute vectors that realign data starting at src and at src+1. */
225 vsrcperm0 = vec_lvsl(0, src);
226 vsrcperm1 = vec_lvsl(1, src);
227
228 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
229 if (reallyBadAlign)
230 vsrc1uc = vsrcBuc;
231 else
232 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
233
/* Interleave zero bytes with the first 8 pixels to get eight 16-bit lanes. */
234 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
235 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
236
/* Fast path: every row fits in one 16-byte load, so the same vector is
 * passed as both vec_perm inputs. */
237 if (!loadSecond) {// -> !reallyBadAlign
238 for (i = 0 ; i < h ; i++) {
239
240
/* Load the next ("bottom") row. */
241 vsrcCuc = vec_ld(stride + 0, src);
242
243 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
244 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
245
246 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
247 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
248
/* psum = (A*top + B*top_right + C*bottom + D*bottom_right + 28) >> 6 */
249 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
250 psum = vec_mladd(vB, vsrc1ssH, psum);
251 psum = vec_mladd(vC, vsrc2ssH, psum);
252 psum = vec_mladd(vD, vsrc3ssH, psum);
253 psum = vec_add(v28ss, psum);
254 psum = vec_sra(psum, v6us);
255
/* Read-modify-write: pack to bytes (both halves of ppsum hold the same
 * 8 results) and merge into the existing destination vector. */
256 vdst = vec_ld(0, dst);
257 ppsum = (vec_u8)vec_packsu(psum, psum);
258 fsum = vec_perm(vdst, ppsum, fperm);
259
260 vec_st(fsum, 0, dst);
261
/* The bottom row becomes the top row of the next iteration. */
262 vsrc0ssH = vsrc2ssH;
263 vsrc1ssH = vsrc3ssH;
264
265 dst += stride;
266 src += stride;
267 }
268 } else {
/* Slow path: each row straddles two 16-byte lines, so two loads are needed. */
269 vec_u8 vsrcDuc;
270 for (i = 0 ; i < h ; i++) {
271 vsrcCuc = vec_ld(stride + 0, src);
272 vsrcDuc = vec_ld(stride + 16, src);
273
274 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
275 if (reallyBadAlign)
276 vsrc3uc = vsrcDuc;
277 else
278 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
279
280 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
281 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
282
283 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
284 psum = vec_mladd(vB, vsrc1ssH, psum);
285 psum = vec_mladd(vC, vsrc2ssH, psum);
286 psum = vec_mladd(vD, vsrc3ssH, psum);
287 psum = vec_add(v28ss, psum);
/* NOTE(review): this loop uses vec_sr/vec_pack where the loop above uses
 * vec_sra/vec_packsu. With non-negative weights summing to 64 and 8-bit
 * pixels, psum stays within 0..16348, so both forms should give the same
 * bytes -- confirm, then unify the two loops. */
288 psum = vec_sr(psum, v6us);
289
290 vdst = vec_ld(0, dst);
291 ppsum = (vec_u8)vec_pack(psum, psum);
292 fsum = vec_perm(vdst, ppsum, fperm);
293
294 vec_st(fsum, 0, dst);
295
296 vsrc0ssH = vsrc2ssH;
297 vsrc1ssH = vsrc3ssH;
298
299 dst += stride;
300 src += stride;
301 }
302 }
303 }
304 185
305 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, 186 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
306 const uint8_t * src2, int dst_stride, 187 const uint8_t * src2, int dst_stride,
307 int src_stride1, int h) 188 int src_stride1, int h)
308 { 189 {
1090 971
1091 if (has_altivec()) { 972 if (has_altivec()) {
1092 c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; 973 c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
1093 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; 974 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
1094 c->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec; 975 c->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
976 c->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
1095 c->h264_idct_add = ff_h264_idct_add_altivec; 977 c->h264_idct_add = ff_h264_idct_add_altivec;
1096 c->h264_idct_add8 = ff_h264_idct_add8_altivec; 978 c->h264_idct_add8 = ff_h264_idct_add8_altivec;
1097 c->h264_idct_add16 = ff_h264_idct_add16_altivec; 979 c->h264_idct_add16 = ff_h264_idct_add16_altivec;
1098 c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec; 980 c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec;
1099 c->h264_idct_dc_add= h264_idct_dc_add_altivec; 981 c->h264_idct_dc_add= h264_idct_dc_add_altivec;