/* ppc/vp8dsp_altivec.c — libavcodec (Mercurial rev 12483:0159a19bfff7) */
/**
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavcodec/vp8dsp.h"
#include "dsputil_altivec.h"
#include "types_altivec.h"
#include "util_altivec.h"

/* Replicate one 4-tap filter row into all four 32-bit lanes of a 16-byte
 * vector, so a single vec_msum can apply the same 4 taps to 4 pixel
 * groups at once. */
#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }

// h subpel filter uses msum to multiply+add 4 pixel taps at once
/* Inner 4 taps of the VP8 horizontal subpel filters, indexed by mx-1.
 * Each filter's taps sum to 128 (counting the outer taps where present),
 * matching the >>7 normalization done in FILTER_H. */
static const vec_s8 h_subpel_filters_inner[7] =
{
    REPT4( -6, 123, 12, -1),
    REPT4(-11, 108, 36, -8),
    REPT4( -9, 93, 50, -6),
    REPT4(-16, 77, 77, -16),
    REPT4( -6, 50, 93, -9),
    REPT4( -8, 36, 108, -11),
    REPT4( -1, 12, 123, -6),
};

// for 6tap filters, these are the outer two taps
// The zeros mask off pixels 4-7 when filtering 0-3
// and vice-versa
/* Indexed by (mx-1)>>1 in LOAD_H_SUBPEL_FILTER; only consulted when the
 * caller selected a 6-tap filter (is6tap). */
static const vec_s8 h_subpel_filters_outer[3] =
{
    REPT4(0, 0, 2, 1),
    REPT4(0, 0, 3, 3),
    REPT4(0, 0, 1, 2),
};

/* Declare and load the inner taps for filter i plus both outer-tap
 * vectors: filter_outerh covers result pixels 0-3, filter_outerl (the
 * same vector rotated by 2 bytes) covers pixels 4-7; the zero lanes in
 * each mask off the other half (see h_subpel_filters_outer above). */
#define LOAD_H_SUBPEL_FILTER(i) \
    vec_s8 filter_inner  = h_subpel_filters_inner[i]; \
    vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
    vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)

/* Horizontally filter 8 pixels starting at src+(off).
 * Two aligned loads cover the unaligned span src[off-2 .. off+13];
 * permh##off / perml##off gather the 4 inner-tap inputs for result
 * pixels 0-3 resp. 4-7, so one vec_msum per half applies all 4 inner
 * taps.  c64 (= 1<<6) pre-loads the rounding term into the accumulator.
 * For 6-tap filters the two outer taps are gathered via perm_6tap##off
 * and accumulated with two more msums.  The result lands in dstv as
 * 8 signed 16-bit pixels after a saturating pack and >>7. */
#define FILTER_H(dstv, off) \
    a = vec_ld((off)-2, src); \
    b = vec_ld((off)-2+15, src); \
\
    pixh  = vec_perm(a, b, permh##off); \
    pixl  = vec_perm(a, b, perml##off); \
    filth = vec_msum(filter_inner, pixh, c64); \
    filtl = vec_msum(filter_inner, pixl, c64); \
\
    if (is6tap) { \
        outer = vec_perm(a, b, perm_6tap##off); \
        filth = vec_msum(filter_outerh, outer, filth); \
        filtl = vec_msum(filter_outerl, outer, filtl); \
    } \
    if (w == 4) \
        filtl = filth; /* discard pixels 4-7 */ \
    dstv = vec_packs(filth, filtl); \
    dstv = vec_sra(dstv, c7)

/**
 * Horizontal VP8 subpel interpolation core.
 *
 * Filters h rows of w pixels (w in {4, 8, 16}) from src into dst using
 * the 4- or 6-tap filter selected by mx (1..7).  Always inlined so that
 * w and is6tap become compile-time constants in the EPEL_FUNCS wrappers.
 */
static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, int dst_stride,
                                 uint8_t *src, int src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 a, b, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

    /* Gather patterns relative to an aligned load of src-2:
     * perm_inner picks four overlapping 4-pixel windows (result pixels
     * 0-3); perm_outer interleaves the two outer-tap source pixels for
     * all 8 results. */
    vec_u8 perm_inner = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6)); /* rounding: 1<<6 */
    vec_u16 c7  = vec_splat_u16(7);                           /* normalization: >>7 */

    /* Fold the unaligned-load shift for src-2 (and src+6 for the second
     * half of a 16-wide row) directly into the gather permutes. */
    align_vec0 = vec_lvsl( -2, src);
    align_vec8 = vec_lvsl(8-2, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4)); /* shift windows to pixels 4-7 */
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            /* partial stores: one 32-bit element for w==4, two for w==8 */
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}

// v subpel filter does a simple vertical multiply + add
/* Tap magnitudes of the VP8 vertical subpel filters, indexed by my-1.
 * Signs are applied in FILTER_V (taps 1 and 4 are subtracted, taps 0
 * and 5 added); taps 0/5 are nonzero only for the 6-tap filters. */
static const vec_u8 v_subpel_filters[7] =
{
    { 0, 6, 123, 12, 1, 0 },
    { 2, 11, 108, 36, 8, 1 },
    { 0, 9, 93, 50, 6, 0 },
    { 3, 16, 77, 77, 16, 3 },
    { 0, 6, 50, 93, 9, 0 },
    { 1, 8, 36, 108, 11, 2 },
    { 0, 1, 12, 123, 6, 0 },
};

/* Declare f0..f5, each one tap magnitude of filter i splatted across a
 * full vector, ready for the per-row multiplies in FILTER_V. */
#define LOAD_V_SUBPEL_FILTER(i) \
    vec_u8 subpel_filter = v_subpel_filters[i]; \
    vec_u8 f0 = vec_splat(subpel_filter, 0); \
    vec_u8 f1 = vec_splat(subpel_filter, 1); \
    vec_u8 f2 = vec_splat(subpel_filter, 2); \
    vec_u8 f3 = vec_splat(subpel_filter, 3); \
    vec_u8 f4 = vec_splat(subpel_filter, 4); \
    vec_u8 f5 = vec_splat(subpel_filter, 5)

/* Vertically filter one output row from the six source rows s0..s5
 * (s0/s5 are only touched for 6-tap filters).  vec_mul is vec_mule or
 * vec_mulo, selecting the even or odd byte lanes — i.e. pixels 0-7 or
 * 8-15 after the merge permute set up in the caller.  The unsigned tap
 * magnitudes get their signs here via saturating sub/add; c64 and the
 * >>7 perform the usual VP8 rounding and normalization. */
#define FILTER_V(dstv, vec_mul) \
    s1f = (vec_s16)vec_mul(s1, f1); \
    s2f = (vec_s16)vec_mul(s2, f2); \
    s3f = (vec_s16)vec_mul(s3, f3); \
    s4f = (vec_s16)vec_mul(s4, f4); \
    s2f = vec_subs(s2f, s1f); \
    s3f = vec_subs(s3f, s4f); \
    if (is6tap) { \
        s0f = (vec_s16)vec_mul(s0, f0); \
        s5f = (vec_s16)vec_mul(s5, f5); \
        s2f = vec_adds(s2f, s0f); \
        s3f = vec_adds(s3f, s5f); \
    } \
    dstv = vec_adds(s2f, s3f); \
    dstv = vec_adds(dstv, c64); \
    dstv = vec_sra(dstv, c7)

/**
 * Vertical VP8 subpel interpolation core.
 *
 * Filters h rows of w pixels (w in {4, 8, 16}) from src into dst using
 * the 4- or 6-tap filter selected by my (1..7).  Keeps a sliding window
 * of source rows in s0..s5 so each iteration loads only one new row.
 * Always inlined so w and is6tap are compile-time constants.
 */
static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, int dst_stride,
                                 uint8_t *src, int src_stride,
                                 int h, int my, int w, int is6tap)
{
    LOAD_V_SUBPEL_FILTER(my-1);
    vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
    vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
    vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6)); /* rounding: 1<<6 */
    vec_u16 c7  = vec_splat_u16(7);                           /* normalization: >>7 */

    // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
    // so combine this permute with the alignment permute vector
    align_vech = vec_lvsl(0, src);
    align_vecl = vec_sld(align_vech, align_vech, 8);
    if (w == 16)
        perm_vec = vec_mergeh(align_vech, align_vecl);
    else
        perm_vec = vec_mergeh(align_vech, align_vech);

    /* Prime the row window with the rows above the first output row:
     * two rows of lead-in for 6-tap, one for 4-tap. */
    if (is6tap)
        s0 = load_with_perm_vec(-2*src_stride, src, perm_vec);
    s1 = load_with_perm_vec(-1*src_stride, src, perm_vec);
    s2 = load_with_perm_vec( 0*src_stride, src, perm_vec);
    s3 = load_with_perm_vec( 1*src_stride, src, perm_vec);
    if (is6tap)
        s4 = load_with_perm_vec( 2*src_stride, src, perm_vec);

    src += (2+is6tap)*src_stride;

    while (h --> 0) {
        /* fetch the newest row, filter, store, then slide the window */
        if (is6tap)
            s5 = load_with_perm_vec(0, src, perm_vec);
        else
            s4 = load_with_perm_vec(0, src, perm_vec);

        FILTER_V(f16h, vec_mule);

        if (w == 16) {
            FILTER_V(f16l, vec_mulo);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            if (w == 4)
                filt = (vec_u8)vec_splat((vec_u32)filt, 0);
            else
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
        }

        if (is6tap)
            s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        if (is6tap)
            s4 = s5;

        dst += dst_stride;
        src += src_stride;
    }
}

/* Instantiate width-WIDTH horizontal and vertical put functions with
 * TAPS taps, as av_noinline wrappers around the always-inlined cores so
 * each instantiation specializes on compile-time w and is6tap. */
#define EPEL_FUNCS(WIDTH, TAPS) \
static av_noinline \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
static av_noinline \
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}

/* Two-dimensional (horizontal then vertical) filter: horizontally
 * filter h+5 rows, starting 2 rows above the block so the vertical pass
 * has its lead-in context, into a 16-byte-strided aligned temporary,
 * then filter vertically from row 2 of that temporary into dst. */
#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, int stride, uint8_t *src, int s, int h, int mx, int my) \
{ \
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
    put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*stride, stride, h+5, mx, my); \
    put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+2*16, 16, h, mx, my); \
}

/* Instantiate the 1-D functions needed by the table below ... */
EPEL_FUNCS(16,6)
EPEL_FUNCS(8, 6)
EPEL_FUNCS(8, 4)
EPEL_FUNCS(4, 6)
EPEL_FUNCS(4, 4)

/* ... and every 2-D tap combination built from them. */
EPEL_HV(16, 6,6)
EPEL_HV(8,  6,6)
EPEL_HV(8,  4,6)
EPEL_HV(8,  6,4)
EPEL_HV(8,  4,4)
EPEL_HV(4,  6,6)
EPEL_HV(4,  4,6)
EPEL_HV(4,  6,4)
EPEL_HV(4,  4,4)

/* Unfiltered 16-wide copy (the [0][0][0] table slot); s, mx and my are
 * unused but keep the common put_vp8_epel function signature. */
static void put_vp8_pixels16_altivec(uint8_t *dst, int stride, uint8_t *src, int s, int h, int mx, int my)
{
    put_pixels16_altivec(dst, src, stride, h);
}

267 av_cold void ff_vp8dsp_init_altivec(VP8DSPContext *c) | |
268 { | |
12475
9fef0a8ddd63
Move mm_support() from libavcodec to libavutil, make it a public
stefano
parents:
12473
diff
changeset
|
269 if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) |
12011 | 270 return; |
271 | |
272 c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec; | |
273 c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec; | |
274 c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec; | |
275 c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec; | |
276 | |
277 c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec; | |
278 c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec; | |
279 c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec; | |
280 c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec; | |
281 | |
282 c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec; | |
283 c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec; | |
284 c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec; | |
285 c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec; | |
286 | |
287 c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec; | |
288 c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec; | |
289 c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec; | |
290 c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec; | |
291 | |
292 c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec; | |
293 c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec; | |
294 c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec; | |
295 c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec; | |
296 } |