annotate ppc/vc1dsp_altivec.c @ 11032:01bd040f8607 libavcodec
Unroll main loop so the edge == 0 case is separate.
This allows many things to be simplified away.
The h264 decoder is overall 1% faster with an MBAFF sample and
0.1% slower with the cathedral sample, probably because the slow loop
filter code must be loaded into the code cache for the first MB of each
row but isn't used for the following MBs.
author:   michael
date:     Thu, 28 Jan 2010 01:24:25 +0000
parents:  7cee7292d5cc
children: 50415a8f1451
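
The change the message describes is a classic loop peel. As a minimal sketch (the MB type and the filter_mb_edge()/filter_mb_inner() helpers below are hypothetical stand-ins, not the real h264 filter code), handling the edge == 0 iteration once outside the loop lets the hot loop body drop the edge-handling branches:

    typedef struct MB { int x; } MB;   /* hypothetical macroblock type */

    static void filter_mb_edge(MB *mb)  { (void)mb; /* slow path: full edge checks */ }
    static void filter_mb_inner(MB *mb) { (void)mb; /* fast path: edge checks gone */ }

    static void filter_row(MB *mbs, int n)
    {
        if (n <= 0)
            return;
        filter_mb_edge(&mbs[0]);        /* edge == 0 case, peeled out of the loop */
        for (int i = 1; i < n; i++)     /* main loop: many checks simplify away */
            filter_mb_inner(&mbs[i]);
    }

The cost noted above is that the peeled slow path is re-fetched into the code cache once per row even though the rest of the row never uses it.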
/*
 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
 * Copyright (c) 2006 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"

#include "util_altivec.h"

// main steps of 8x8 transform
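// STEP8 performs one 1-D pass of the 8-point VC-1 inverse transform; the
// basis constants {12, 16, 15, 9, 6, 4} are built from shifts and adds,
// e.g. 12*x = ((x<<2)<<1) + (x<<2), and the odd half computes
// 16*s1 + 15*s3 + 9*s5 + 4*s7 as ((s1+s3)<<4) + (s5<<3) + (s7<<2) + (s5-s3).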
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
    t0 = vec_sl(vec_add(s0, s4), vec_2); \
    t0 = vec_add(vec_sl(t0, vec_1), t0); \
    t0 = vec_add(t0, vec_rnd); \
    t1 = vec_sl(vec_sub(s0, s4), vec_2); \
    t1 = vec_add(vec_sl(t1, vec_1), t1); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
    t2 = vec_add(t2, vec_sl(s2, vec_4)); \
    t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
    t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
    t4 = vec_add(t0, t2); \
    t5 = vec_add(t1, t3); \
    t6 = vec_sub(t1, t3); \
    t7 = vec_sub(t0, t2); \
\
    t0 = vec_sl(vec_add(s1, s3), vec_4); \
    t0 = vec_add(t0, vec_sl(s5, vec_3)); \
    t0 = vec_add(t0, vec_sl(s7, vec_2)); \
    t0 = vec_add(t0, vec_sub(s5, s3)); \
\
    t1 = vec_sl(vec_sub(s1, s5), vec_4); \
    t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
    t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
    t1 = vec_sub(t1, vec_add(s1, s7)); \
\
    t2 = vec_sl(vec_sub(s7, s3), vec_4); \
    t2 = vec_add(t2, vec_sl(s1, vec_3)); \
    t2 = vec_add(t2, vec_sl(s5, vec_2)); \
    t2 = vec_add(t2, vec_sub(s1, s7)); \
\
    t3 = vec_sl(vec_sub(s5, s7), vec_4); \
    t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s1, vec_2)); \
    t3 = vec_sub(t3, vec_add(s3, s5)); \
\
    s0 = vec_add(t4, t0); \
    s1 = vec_add(t5, t1); \
    s2 = vec_add(t6, t2); \
    s3 = vec_add(t7, t3); \
    s4 = vec_sub(t7, t3); \
    s5 = vec_sub(t6, t2); \
    s6 = vec_sub(t5, t1); \
    s7 = vec_sub(t4, t0); \
} while (0)

#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
    s4 = vec_sra(s4, vec_3); \
    s5 = vec_sra(s5, vec_3); \
    s6 = vec_sra(s6, vec_3); \
    s7 = vec_sra(s7, vec_3); \
} while (0)

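// In the vertical pass the last four rows get an extra +1 before the shift
// (the vec_add(..., vec_1s) below); combined with the +64 rounding fed into
// STEP8, this gives the row-dependent rounding of 64 / 65 that the VC-1
// second transform stage requires.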
#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
    s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
    s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
    s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
    s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
} while (0)

/* main steps of 4x4 transform */
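/* STEP4 likewise builds the 4-point basis constants {17, 22, 10} from
 * shifts and adds: 17*x = (x<<4) + x, 10*(s3-s1) = t + (t<<2) with
 * t = (s3-s1)<<1, and 22*s1 + 10*s3 = 10*(s3-s1) + (s1<<5). */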
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
    t1 = vec_add(vec_sl(s0, vec_4), s0); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s2, vec_4), s2); \
    t0 = vec_add(t1, t2); \
    t1 = vec_sub(t1, t2); \
    t3 = vec_sl(vec_sub(s3, s1), vec_1); \
    t3 = vec_add(t3, vec_sl(t3, vec_2)); \
    t2 = vec_add(t3, vec_sl(s1, vec_5)); \
    t3 = vec_add(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s3, vec_2)); \
    s0 = vec_add(t0, t2); \
    s1 = vec_sub(t1, t3); \
    s2 = vec_add(t1, t3); \
    s3 = vec_sub(t0, t2); \
} while (0)

#define SHIFT_HOR4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3);

#define SHIFT_VERT4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7);

/** Do inverse transform on 8x8 block
 */
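/* Two 1-D passes: a horizontal pass (rounding 4, >>3) and a vertical pass
 * (rounding 64/65, >>7), with a transpose before each so STEP8 always
 * operates across the vector registers. */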
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector signed int vec_1s = vec_splat_s32(1);
    const vector unsigned int vec_1 = vec_splat_u32(1);

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
    SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
    SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);

    vec_st(src0,  0, block);
    vec_st(src1, 16, block);
    vec_st(src2, 32, block);
    vec_st(src3, 48, block);
    vec_st(src4, 64, block);
    vec_st(src5, 80, block);
    vec_st(src6, 96, block);
    vec_st(src7,112, block);
}

/** Do inverse transform on 8x4 part of block
 */
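/* Same horizontal pass as the 8x8 case, but the vertical pass uses the
 * 4-point transform, and the result is added to the prediction in dest
 * instead of being stored back to the coefficient block. */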
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector unsigned int vec_1 = vec_splat_u32(1);
    vector unsigned char tmp;
    vector signed short tmp2, tmp3;
    vector unsigned char perm0, perm1, p0, p1, p;

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    s0 = vec_unpackh(src0);
    s1 = vec_unpackh(src1);
    s2 = vec_unpackh(src2);
    s3 = vec_unpackh(src3);
    s8 = vec_unpackl(src0);
    s9 = vec_unpackl(src1);
    sA = vec_unpackl(src2);
    sB = vec_unpackl(src3);
    STEP4(s0, s1, s2, s3, vec_64);
    SHIFT_VERT4(s0, s1, s2, s3);
    STEP4(s8, s9, sA, sB, vec_64);
    SHIFT_VERT4(s8, s9, sA, sB);
    src0 = vec_pack(s0, s8);
    src1 = vec_pack(s1, s9);
    src2 = vec_pack(s2, sA);
    src3 = vec_pack(s3, sB);

    p0 = vec_lvsl(0, dest);
    p1 = vec_lvsl(stride, dest);
    p  = vec_splat_u8(-1);
    perm0 = vec_mergeh(p, p0);
    perm1 = vec_mergeh(p, p1);

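/* ADD loads 16 aligned bytes covering the (possibly unaligned) 8 dest
 * pixels, then zero-extends them to signed shorts in a single vec_perm:
 * the 0xFF entries in the merged permute index the zero vector (vec_perm
 * only uses the low 5 bits of each selector byte), while the lvsl entries
 * pick out the pixel bytes. */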
#define ADD(dest,src,perm) \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
    tmp = vec_ld(0, dest); \
    tmp2 = (vector signed short)vec_perm(tmp, vec_splat_u8(0), perm); \
    tmp3 = vec_adds(tmp2, src); \
    tmp  = vec_packsu(tmp3, tmp3); \
    vec_ste((vector unsigned int)tmp, 0, (unsigned int *)dest); \
    vec_ste((vector unsigned int)tmp, 4, (unsigned int *)dest);

    ADD(dest, src0, perm0)  dest += stride;
    ADD(dest, src1, perm1)  dest += stride;
    ADD(dest, src2, perm0)  dest += stride;
    ADD(dest, src3, perm1)
}


void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) {
    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
}
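
For context: vc1dsp_init_altivec() is the hook through which the PPC DSP setup (dsputil_init_ppc() in FFmpeg trees of this era, assuming the usual layout) installs these routines into the DSPContext function-pointer table when AltiVec is detected at runtime, so the VC-1 decoder picks them up in place of the C versions transparently.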