ppc/vc1dsp_altivec.c @ 12340:2d15f62f4f8a (libavcodec Mercurial repository)

/*
 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
 * Copyright (c) 2006 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"

#include "util_altivec.h"
#include "dsputil_altivec.h"

// main steps of 8x8 transform
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
    t0 = vec_sl(vec_add(s0, s4), vec_2); \
    t0 = vec_add(vec_sl(t0, vec_1), t0); \
    t0 = vec_add(t0, vec_rnd); \
    t1 = vec_sl(vec_sub(s0, s4), vec_2); \
    t1 = vec_add(vec_sl(t1, vec_1), t1); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
    t2 = vec_add(t2, vec_sl(s2, vec_4)); \
    t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
    t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
    t4 = vec_add(t0, t2); \
    t5 = vec_add(t1, t3); \
    t6 = vec_sub(t1, t3); \
    t7 = vec_sub(t0, t2); \
\
    t0 = vec_sl(vec_add(s1, s3), vec_4); \
    t0 = vec_add(t0, vec_sl(s5, vec_3)); \
    t0 = vec_add(t0, vec_sl(s7, vec_2)); \
    t0 = vec_add(t0, vec_sub(s5, s3)); \
\
    t1 = vec_sl(vec_sub(s1, s5), vec_4); \
    t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
    t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
    t1 = vec_sub(t1, vec_add(s1, s7)); \
\
    t2 = vec_sl(vec_sub(s7, s3), vec_4); \
    t2 = vec_add(t2, vec_sl(s1, vec_3)); \
    t2 = vec_add(t2, vec_sl(s5, vec_2)); \
    t2 = vec_add(t2, vec_sub(s1, s7)); \
\
    t3 = vec_sl(vec_sub(s5, s7), vec_4); \
    t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s1, vec_2)); \
    t3 = vec_sub(t3, vec_add(s3, s5)); \
\
    s0 = vec_add(t4, t0); \
    s1 = vec_add(t5, t1); \
    s2 = vec_add(t6, t2); \
    s3 = vec_add(t7, t3); \
    s4 = vec_sub(t7, t3); \
    s5 = vec_sub(t6, t2); \
    s6 = vec_sub(t5, t1); \
    s7 = vec_sub(t4, t0); \
}while(0)
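
/* For reference (obtained by expanding the shifts and adds above): per
 * 32-bit lane, STEP8 is the VC-1 8-point inverse transform with coefficients
 * 12, 16, 15, 9, 6 and 4, with vec_rnd folded into the even half:
 *   even:  t4,t7 = 12*(s0+s4) + rnd +/- (16*s2 + 6*s6)
 *          t5,t6 = 12*(s0-s4) + rnd +/- ( 6*s2 - 16*s6)
 *   odd:   t0 = 16*s1 + 15*s3 +  9*s5 +  4*s7
 *          t1 = 15*s1 -  4*s3 - 16*s5 -  9*s7
 *          t2 =  9*s1 - 16*s3 +  4*s5 + 15*s7
 *          t3 =  4*s1 -  9*s3 + 15*s5 - 16*s7
 *   out:   s0..s3 = (t4,t5,t6,t7) + (t0,t1,t2,t3)
 *          s4..s7 = (t7,t6,t5,t4) - (t3,t2,t1,t0)
 */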

#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
    s4 = vec_sra(s4, vec_3); \
    s5 = vec_sra(s5, vec_3); \
    s6 = vec_sra(s6, vec_3); \
    s7 = vec_sra(s7, vec_3); \
}while(0)

#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
    s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
    s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
    s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
    s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
}while(0)
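
/* SHIFT_HOR8 (>>3) finishes the horizontal pass, SHIFT_VERT8 (>>7) the
 * vertical one.  The last four outputs of the vertical pass get an extra +1
 * on top of the 64 folded in through vec_rnd, i.e. the row-dependent
 * rounding of the VC-1 column transform. */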

/* main steps of 4x4 transform */
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
    t1 = vec_add(vec_sl(s0, vec_4), s0); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s2, vec_4), s2); \
    t0 = vec_add(t1, t2); \
    t1 = vec_sub(t1, t2); \
    t3 = vec_sl(vec_sub(s3, s1), vec_1); \
    t3 = vec_add(t3, vec_sl(t3, vec_2)); \
    t2 = vec_add(t3, vec_sl(s1, vec_5)); \
    t3 = vec_add(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s3, vec_2)); \
    s0 = vec_add(t0, t2); \
    s1 = vec_sub(t1, t3); \
    s2 = vec_add(t1, t3); \
    s3 = vec_sub(t0, t2); \
}while (0)
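
/* For reference: per lane, STEP4 is the VC-1 4-point inverse transform with
 * coefficients 17, 22 and 10 (vec_rnd folded in):
 *   s0' = 17*(s0+s2) + 22*s1 + 10*s3 + rnd
 *   s1' = 17*(s0-s2) + 10*s1 - 22*s3 + rnd
 *   s2' = 17*(s0-s2) - 10*s1 + 22*s3 + rnd
 *   s3' = 17*(s0+s2) - 22*s1 - 10*s3 + rnd
 */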

#define SHIFT_HOR4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3);

#define SHIFT_VERT4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7);

/** Do inverse transform on 8x8 block
 */
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector signed int vec_1s = vec_splat_s32(1);
    const vector unsigned int vec_1 = vec_splat_u32(1);

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    /* horizontal (row) pass: rounding 4, shift >>3 */
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    /* vertical (column) pass: rounding 64 (+1 on the lower rows in SHIFT_VERT8), shift >>7 */
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
    SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
    SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);

    vec_st(src0,  0, block);
    vec_st(src1, 16, block);
    vec_st(src2, 32, block);
    vec_st(src3, 48, block);
    vec_st(src4, 64, block);
    vec_st(src5, 80, block);
    vec_st(src6, 96, block);
    vec_st(src7,112, block);
}
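
/* Note: the 8x8 transform above operates in place on the 16-bit coefficient
 * block; adding the result to the picture is left to the caller.  The 8x4
 * variant below adds its result directly to the destination. */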

/** Do inverse transform on 8x4 part of block
 */
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector unsigned int vec_1 = vec_splat_u32(1);
    vector unsigned char tmp;
    vector signed short tmp2, tmp3;
    vector unsigned char perm0, perm1, p0, p1, p;

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    /* 8-point horizontal (row) pass: rounding 4, shift >>3 */
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    /* 4-point vertical (column) pass on the top four rows only:
     * rounding 64, shift >>7 */
    s0 = vec_unpackh(src0);
    s1 = vec_unpackh(src1);
    s2 = vec_unpackh(src2);
    s3 = vec_unpackh(src3);
    s8 = vec_unpackl(src0);
    s9 = vec_unpackl(src1);
    sA = vec_unpackl(src2);
    sB = vec_unpackl(src3);
    STEP4(s0, s1, s2, s3, vec_64);
    SHIFT_VERT4(s0, s1, s2, s3);
    STEP4(s8, s9, sA, sB, vec_64);
    SHIFT_VERT4(s8, s9, sA, sB);
    src0 = vec_pack(s0, s8);
    src1 = vec_pack(s1, s9);
    src2 = vec_pack(s2, sA);
    src3 = vec_pack(s3, sB);

    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

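/* perm0/perm1 merge 0xFF bytes with the lvsl() alignment indices.  In ADD
 * below, vec_perm(tmp, vec_splat_u8(0), perm) therefore shifts the eight
 * destination pixels into place and zero-extends them to 16 bits in one go:
 * each 0xFF control byte (low five bits = 31) selects a byte from the
 * all-zero second operand.  The saturated sums are packed back to bytes and
 * written out with two 4-byte vec_ste stores. */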
#define ADD(dest,src,perm) \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
    tmp = vec_ld (0, dest); \
    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \
    tmp3 = vec_adds (tmp2, src); \
    tmp = vec_packsu (tmp3, tmp3); \
    vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);

    ADD (dest, src0, perm0) dest += stride;
    ADD (dest, src1, perm1) dest += stride;
    ADD (dest, src2, perm0) dest += stride;
    ADD (dest, src3, perm1)
}


void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) {
    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
}
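
/* vc1dsp_init_altivec() is expected to be called from the PowerPC dsputil
 * initialization code only after AltiVec availability has been verified at
 * runtime. */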