annotate ppc/vp3dsp_altivec.c @ 11032:01bd040f8607 libavcodec

Unroll main loop so the edge==0 case is separate. This allows many things to be simplified away. h264 decoder is overall 1% faster with a mbaff sample and 0.1% slower with the cathedral sample, probably because the slow loop filter code must be loaded into the code cache for each first MB of each row but isn't used for the following MBs.
author michael
date Thu, 28 Jan 2010 01:24:25 +0000
parents d563821462b4
children 50415a8f1451
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9711
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
1 /*
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
2 * Copyright (C) 2009 David Conrad
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
3 *
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
4 * This file is part of FFmpeg.
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
5 *
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
10 *
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
14 * Lesser General Public License for more details.
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
15 *
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
19 */
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
20
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
21 #include "libavcodec/dsputil.h"
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
22 #include "util_altivec.h"
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
23 #include "types_altivec.h"
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
24
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// Fixed-point IDCT multipliers, one per 16-bit lane.
// Lanes 1..7 are approximately 65536 * cos(k * pi / 16) for k = 1..7 and are
// broadcast into C1..C7 by IDCT_START; lane 0 is not read by that macro.
// Lanes 1..5 are larger than 32767 and therefore do not fit a signed 16-bit
// element as written: only their 16-bit pattern is kept, and M16() corrects
// for the resulting signed interpretation. Lanes 6 and 7 fit in 15 bits and
// are used with M15().
25 static const vec_s16 constants =
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
26 {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// Byte-permute mask for vec_perm(), used by M15().
// Indices 0..15 select from the first operand and 16..31 from the second, so
// this takes bytes 0-1 of every 32-bit element, alternating first/second
// operand: {first[0], second[0], first[1], second[1], ...}.
// NOTE(review): with AltiVec big-endian element layout bytes 0-1 are the high
// half of each 32-bit element, which is what the name suggests - confirm for
// any little-endian build.
27 static const vec_u8 interleave_high =
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
28 {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
29
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// IDCT_START: common prologue expanded at the top of each IDCT entry point.
// It expects a pointer named `block` to be in scope and declares, in the
// caller's scope:
//   - the temporaries A..H, Ad..Hd, Add, Bdd used by IDCT_1D;
//   - `eight` (all lanes 8) and `four` (all lanes 4), used by ADD8 / SHIFT4;
//   - C1..C7, each a broadcast of the matching lane of `constants`;
//   - b0..b7, eight vectors loaded from byte offsets 0x00..0x70 of `block`
//     in 16-byte steps (128 bytes in total).
// NOTE(review): vec_ld performs aligned 16-byte loads, so `block` is
// presumably 16-byte aligned - confirm against callers.
30 #define IDCT_START \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
31 vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
32 vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
33 vec_s16 eight = vec_splat_s16(8);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
34 vec_u16 four = vec_splat_u16(4);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
35 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
36 vec_s16 C1 = vec_splat(constants, 1);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
37 vec_s16 C2 = vec_splat(constants, 2);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
38 vec_s16 C3 = vec_splat(constants, 3);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
39 vec_s16 C4 = vec_splat(constants, 4);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
40 vec_s16 C5 = vec_splat(constants, 5);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
41 vec_s16 C6 = vec_splat(constants, 6);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
42 vec_s16 C7 = vec_splat(constants, 7);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
43 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
44 vec_s16 b0 = vec_ld(0x00, block);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
45 vec_s16 b1 = vec_ld(0x10, block);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
46 vec_s16 b2 = vec_ld(0x20, block);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
47 vec_s16 b3 = vec_ld(0x30, block);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
48 vec_s16 b4 = vec_ld(0x40, block);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
49 vec_s16 b5 = vec_ld(0x50, block);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
50 vec_s16 b6 = vec_ld(0x60, block);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
51 vec_s16 b7 = vec_ld(0x70, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
52
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
53 // these functions do (a*C)>>16
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
54 // things are tricky because a is signed, but C unsigned.
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
55 // M15 is used if C fits in 15 bit unsigned (C6,C7)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
56 // M16 is used if C requires 16 bits unsigned
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// M15: per-lane (a * C) >> 16 with both operands treated as signed 16-bit.
// vec_mule / vec_mulo produce the full 32-bit products of the even and odd
// lanes respectively; the interleave_high permute then keeps two bytes of
// each product and re-interleaves even/odd results back into lane order.
// The result is only the intended unsigned-constant product when C fits in
// 15 bits (IDCT_1D uses it with C6 and C7).
57 static inline vec_s16 M15(vec_s16 a, vec_s16 C)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
58 {
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
59 return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
60 }
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// M16: per-lane (a * C) >> 16 for a constant C that needs all 16 bits as an
// unsigned value (IDCT_1D uses it with C1..C5).
// Such a constant is seen by the signed multiply in M15() as (C - 65536), so
// M15 returns ((a * C) >> 16) - a; adding `a` back yields the product with
// the unsigned constant.
61 static inline vec_s16 M16(vec_s16 a, vec_s16 C)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
62 {
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
63 return vec_add(a, M15(a, C));
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
64 }
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
65
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// IDCT_1D(ADD, SHIFT): one 1-D inverse-DCT pass over b0..b7, in place.
// The butterfly combines the eight vectors lane by lane, so each of the
// eight lanes is transformed independently. It uses the temporaries and
// constants declared by IDCT_START.
//   ADD   - macro applied to the two terms built from b0 and b4 (E and F);
//           NOP for no bias, ADD8 to add the rounding bias `eight`.
//   SHIFT - macro applied to every output; NOP or SHIFT4.
// All of b0..b7 are read before the first output is written, so overwriting
// them at the end is safe. Note the output pairing: b0/b7, b1/b2, b3/b4 and
// b5/b6 are each the sum/difference of the same two intermediate terms.
66 #define IDCT_1D(ADD, SHIFT)\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
67 A = vec_add(M16(b1, C1), M15(b7, C7));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
68 B = vec_sub(M15(b1, C7), M16(b7, C1));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
69 C = vec_add(M16(b3, C3), M16(b5, C5));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
70 D = vec_sub(M16(b5, C3), M16(b3, C5));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
71 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
72 Ad = M16(vec_sub(A, C), C4);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
73 Bd = M16(vec_sub(B, D), C4);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
74 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
75 Cd = vec_add(A, C);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
76 Dd = vec_add(B, D);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
77 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
78 E = ADD(M16(vec_add(b0, b4), C4));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
79 F = ADD(M16(vec_sub(b0, b4), C4));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
80 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
81 G = vec_add(M16(b2, C2), M15(b6, C6));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
82 H = vec_sub(M15(b2, C6), M16(b6, C2));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
83 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
84 Ed = vec_sub(E, G);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
85 Gd = vec_add(E, G);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
86 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
87 Add = vec_add(F, Ad);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
88 Bdd = vec_sub(Bd, H);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
89 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
90 Fd = vec_sub(F, Ad);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
91 Hd = vec_add(Bd, H);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
92 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
93 b0 = SHIFT(vec_add(Gd, Cd));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
94 b7 = SHIFT(vec_sub(Gd, Cd));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
95 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
96 b1 = SHIFT(vec_add(Add, Hd));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
97 b2 = SHIFT(vec_sub(Add, Hd));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
98 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
99 b3 = SHIFT(vec_add(Ed, Dd));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
100 b4 = SHIFT(vec_sub(Ed, Dd));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
101 \
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
102 b5 = SHIFT(vec_add(Fd, Bdd));\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
103 b6 = SHIFT(vec_sub(Fd, Bdd));
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
104
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// Plug-in operations for the ADD / SHIFT parameters of IDCT_1D.
//   NOP    - passes the value through unchanged.
//   ADD8   - adds the local vector `eight` declared by IDCT_START (a caller
//            may have modified it before the pass that uses ADD8).
//   SHIFT4 - arithmetic right shift by the local vector `four` (4 bits).
105 #define NOP(a) a
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
106 #define ADD8(a) vec_add(a, eight)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
107 #define SHIFT4(a) vec_sra(a, four)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
108
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// In-place 8x8 inverse DCT of `block`.
// Runs one IDCT_1D pass with no bias and no shift, transposes the 8x8 matrix
// held in b0..b7, then runs a second pass that adds 8 and shifts right by 4.
// The eight result vectors are stored back over `block` at the same offsets
// they were loaded from.
// NOTE(review): TRANSPOSE8 is not defined in this file; presumably it comes
// from util_altivec.h - confirm.
109 void ff_vp3_idct_altivec(DCTELEM block[64])
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
110 {
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
111 IDCT_START
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
112
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
113 IDCT_1D(NOP, NOP)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
114 TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
115 IDCT_1D(ADD8, SHIFT4)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
116
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
117 vec_st(b0, 0x00, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
118 vec_st(b1, 0x10, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
119 vec_st(b2, 0x20, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
120 vec_st(b3, 0x30, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
121 vec_st(b4, 0x40, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
122 vec_st(b5, 0x50, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
123 vec_st(b6, 0x60, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
124 vec_st(b7, 0x70, block);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
125 }
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
126
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// Inverse-DCT `block` and write the result as 8 rows of 8 unsigned bytes to
// `dst`, advancing `dst` by `stride` bytes between rows. `block` is only
// read here; nothing is stored back to it.
// Before the passes run, `eight` is changed from 8 to 8 + 2048 (2048 is built
// as 1 << 11). Only the second pass uses ADD8, so after its >> 4 every output
// carries an extra +128 on top of the usual rounding.
// PUT(a) packs the 16-bit row `a` to unsigned 8-bit with saturation, passing
// `a` as both operands so the same 8 bytes fill both halves of `t`, and then
// issues two 32-bit element stores at dst + 0 and dst + 4.
// NOTE(review): vec_ste picks the vector element from the store address, so
// this presumably relies on dst being 8-byte aligned on every row - confirm
// against callers and the stride they pass.
127 void ff_vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64])
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
128 {
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
129 vec_u8 t;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
130 IDCT_START
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
131
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
132 // pixels are signed; so add 128*16 in addition to the normal 8
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
133 vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
134 eight = vec_add(eight, v2048);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
135
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
136 IDCT_1D(NOP, NOP)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
137 TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
138 IDCT_1D(ADD8, SHIFT4)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
139
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
140 #define PUT(a)\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
141 t = vec_packsu(a, a);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
142 vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
143 vec_ste((vec_u32)t, 4, (unsigned int *)dst);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
144
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
145 PUT(b0) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
146 PUT(b1) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
147 PUT(b2) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
148 PUT(b3) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
149 PUT(b4) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
150 PUT(b5) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
151 PUT(b6) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
152 PUT(b7)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
153 }
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
154
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
// Inverse-DCT `block` and add the result to the 8x8 block of unsigned bytes
// at `dst`, advancing `dst` by `stride` bytes between rows. `block` is only
// read here; nothing is stored back to it.
// vdst_mask interleaves 0xFF bytes with the first eight vec_lvsl(0, dst)
// indices. In ADD(a) it is used to permute the loaded destination vector
// against zero_u8v, widening the 8 destination bytes starting at `dst` into
// eight 16-bit lanes. Each row is then combined with `a` by a saturating
// 16-bit add, packed back to unsigned 8-bit with saturation (duplicated into
// both halves of `t`), and written with two 32-bit element stores at dst + 0
// and dst + 4.
// NOTE(review): zero_u8v is presumably declared by LOAD_ZERO - confirm.
// NOTE(review): vdst_mask is computed once from the initial `dst` while `dst`
// advances by `stride`, so every row is assumed to have the same offset
// within its 16-byte line (e.g. stride a multiple of 16) - confirm against
// callers.
155 void ff_vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
156 {
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
157 LOAD_ZERO;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
158 vec_u8 t, vdst;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
159 vec_s16 vdst_16;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
160 vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
161
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
162 IDCT_START
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
163
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
164 IDCT_1D(NOP, NOP)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
165 TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
166 IDCT_1D(ADD8, SHIFT4)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
167
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
168 #define ADD(a)\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
169 vdst = vec_ld(0, dst);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
170 vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
171 vdst_16 = vec_adds(a, vdst_16);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
172 t = vec_packsu(vdst_16, vdst_16);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
173 vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
174 vec_ste((vec_u32)t, 4, (unsigned int *)dst);
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
175
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
176 ADD(b0) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
177 ADD(b1) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
178 ADD(b2) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
179 ADD(b3) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
180 ADD(b4) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
181 ADD(b5) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
182 ADD(b6) dst += stride;
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
183 ADD(b7)
d563821462b4 Altivec VP3 IDCT
conrad
parents:
diff changeset
184 }