/*
 * idct_altivec.c
 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#ifndef __ALTIVEC__

#include "config.h"

#ifdef ARCH_PPC

#include <inttypes.h>

#include "mpeg2.h"
#include "mpeg2_internal.h"
#include "attributes.h"

static const int16_t constants[5][8] ATTR_ALIGN(16) = {
    {23170, 13573,  6518, 21895, -23170, -21895,    32,    31},
    {16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725},
    {22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521},
    {21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692},
    {19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722}
};

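/*
 * Layout of the table above, as read by the C version at the end of this
 * file (the generated asm depends on the same layout): constants[0] packs
 * the scalar butterfly factors - 23170 is cos(pi/4) in signed 1.15 fixed
 * point, 13573/6518/21895 match tan(pi/8), tan(pi/16) and tan(3pi/16),
 * followed by the negated -23170/-21895 and what appears to be the final
 * rounding bias pair 32/31.  constants[1..4] hold the per-row prescale
 * vectors that are folded into the input coefficients.
 */
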
/*
 * The asm code is generated with:
 *
 * gcc-2.95 -fvec -D__ALTIVEC__ -O9 -fomit-frame-pointer -mregnames -S
 *	idct_altivec.c
 *
 * awk '{args=""; len=split ($2, arg, ",");
 *	for (i=1; i<=len; i++) { a=arg[i]; if (i<len) a=a",";
 *		args = args sprintf ("%-6s", a) }
 *	printf ("\t\"\t%-16s%-24s\\n\"\n", $1, args) }' idct_altivec.s |
 * unexpand -a
 *
 * I then do some simple trimming of the function prologues/epilogues.
 */

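/*
 * Generated-asm version of mpeg2_idct_copy_altivec (see the C
 * implementation in the __ALTIVEC__ branch below): it runs the full 2-D
 * IDCT on the 8x8 coefficient block, saturates the result to 0..255,
 * stores one 8-byte row per iteration into dest (advancing by stride),
 * and finally zeroes the coefficient block.
 */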
void mpeg2_idct_copy_altivec (int16_t * block, uint8_t * dest, int stride)
{
    asm (" \n"
         "# stwu %r1, -128(%r1) \n"
         "# mflr %r0 \n"
         "# stw %r0, 132(%r1) \n"
         "# addi %r0, %r1, 128 \n"
         "# bl _savev25 \n"

         " addi %r9, %r3, 112 \n"
         " vspltish %v25, 4 \n"
         " vxor %v13, %v13, %v13 \n"
         " lis %r10, constants@ha \n"
         " lvx %v1, 0, %r9 \n"
         " la %r10, constants@l(%r10) \n"
         " lvx %v5, 0, %r3 \n"
         " addi %r9, %r3, 16 \n"
         " lvx %v8, 0, %r10 \n"
         " addi %r11, %r10, 32 \n"
         " lvx %v12, 0, %r9 \n"
         " lvx %v6, 0, %r11 \n"
         " addi %r8, %r3, 48 \n"
         " vslh %v1, %v1, %v25 \n"
         " addi %r9, %r3, 80 \n"
         " lvx %v11, 0, %r8 \n"
         " vslh %v5, %v5, %v25 \n"
         " lvx %v0, 0, %r9 \n"
         " addi %r11, %r10, 64 \n"
         " vsplth %v3, %v8, 2 \n"
         " lvx %v7, 0, %r11 \n"
         " addi %r9, %r3, 96 \n"
         " vslh %v12, %v12, %v25 \n"
         " vmhraddshs %v27, %v1, %v6, %v13 \n"
         " addi %r8, %r3, 32 \n"
         " vsplth %v2, %v8, 5 \n"
         " lvx %v1, 0, %r9 \n"
         " vslh %v11, %v11, %v25 \n"
         " addi %r3, %r3, 64 \n"
         " lvx %v9, 0, %r8 \n"
         " addi %r9, %r10, 48 \n"
         " vslh %v0, %v0, %v25 \n"
         " lvx %v4, 0, %r9 \n"
         " vmhraddshs %v31, %v12, %v6, %v13 \n"
         " addi %r10, %r10, 16 \n"
         " vmhraddshs %v30, %v0, %v7, %v13 \n"
         " lvx %v10, 0, %r3 \n"
         " vsplth %v19, %v8, 3 \n"
         " vmhraddshs %v15, %v11, %v7, %v13 \n"
         " lvx %v12, 0, %r10 \n"
         " vsplth %v6, %v8, 4 \n"
         " vslh %v1, %v1, %v25 \n"
         " vsplth %v11, %v8, 1 \n"
         " li %r9, 4 \n"
         " vslh %v9, %v9, %v25 \n"
         " vsplth %v7, %v8, 0 \n"
         " vmhraddshs %v18, %v1, %v4, %v13 \n"
         " vspltw %v8, %v8, 3 \n"
         " vsubshs %v0, %v13, %v27 \n"
         " vmhraddshs %v1, %v9, %v4, %v13 \n"
         " vmhraddshs %v17, %v3, %v31, %v0 \n"
         " vmhraddshs %v4, %v2, %v15, %v30 \n"
         " vslh %v10, %v10, %v25 \n"
         " vmhraddshs %v9, %v5, %v12, %v13 \n"
         " vspltish %v25, 6 \n"
         " vmhraddshs %v5, %v10, %v12, %v13 \n"
         " vmhraddshs %v28, %v19, %v30, %v15 \n"
         " vmhraddshs %v27, %v3, %v27, %v31 \n"
         " vsubshs %v0, %v13, %v18 \n"
         " vmhraddshs %v18, %v11, %v18, %v1 \n"
         " vaddshs %v30, %v17, %v4 \n"
         " vmhraddshs %v12, %v11, %v1, %v0 \n"
         " vsubshs %v4, %v17, %v4 \n"
         " vaddshs %v10, %v9, %v5 \n"
         " vsubshs %v17, %v27, %v28 \n"
         " vaddshs %v27, %v27, %v28 \n"
         " vsubshs %v1, %v9, %v5 \n"
         " vaddshs %v28, %v10, %v18 \n"
         " vsubshs %v18, %v10, %v18 \n"
         " vaddshs %v10, %v1, %v12 \n"
         " vsubshs %v1, %v1, %v12 \n"
         " vsubshs %v12, %v17, %v4 \n"
         " vaddshs %v4, %v17, %v4 \n"
         " vmhraddshs %v5, %v7, %v12, %v1 \n"
         " vmhraddshs %v26, %v6, %v4, %v10 \n"
         " vmhraddshs %v29, %v6, %v12, %v1 \n"
         " vmhraddshs %v14, %v7, %v4, %v10 \n"
         " vsubshs %v12, %v18, %v30 \n"
         " vaddshs %v9, %v28, %v27 \n"
         " vaddshs %v16, %v18, %v30 \n"
         " vsubshs %v10, %v28, %v27 \n"
         " vmrglh %v31, %v9, %v12 \n"
         " vmrglh %v30, %v5, %v26 \n"
         " vmrglh %v15, %v14, %v29 \n"
         " vmrghh %v5, %v5, %v26 \n"
         " vmrglh %v27, %v16, %v10 \n"
         " vmrghh %v9, %v9, %v12 \n"
         " vmrghh %v18, %v16, %v10 \n"
         " vmrghh %v1, %v14, %v29 \n"
         " vmrglh %v14, %v9, %v5 \n"
         " vmrglh %v16, %v31, %v30 \n"
         " vmrglh %v10, %v15, %v27 \n"
         " vmrghh %v9, %v9, %v5 \n"
         " vmrghh %v26, %v15, %v27 \n"
         " vmrglh %v27, %v16, %v10 \n"
         " vmrghh %v12, %v1, %v18 \n"
         " vmrglh %v29, %v1, %v18 \n"
         " vsubshs %v0, %v13, %v27 \n"
         " vmrghh %v5, %v31, %v30 \n"
         " vmrglh %v31, %v9, %v12 \n"
         " vmrglh %v30, %v5, %v26 \n"
         " vmrglh %v15, %v14, %v29 \n"
         " vmhraddshs %v17, %v3, %v31, %v0 \n"
         " vmrghh %v18, %v16, %v10 \n"
         " vmhraddshs %v27, %v3, %v27, %v31 \n"
         " vmhraddshs %v4, %v2, %v15, %v30 \n"
         " vmrghh %v1, %v14, %v29 \n"
         " vmhraddshs %v28, %v19, %v30, %v15 \n"
         " vmrghh %v0, %v9, %v12 \n"
         " vsubshs %v13, %v13, %v18 \n"
         " vmrghh %v5, %v5, %v26 \n"
         " vmhraddshs %v18, %v11, %v18, %v1 \n"
         " vaddshs %v9, %v0, %v8 \n"
         " vaddshs %v30, %v17, %v4 \n"
         " vmhraddshs %v12, %v11, %v1, %v13 \n"
         " vsubshs %v4, %v17, %v4 \n"
         " vaddshs %v10, %v9, %v5 \n"
         " vsubshs %v17, %v27, %v28 \n"
         " vaddshs %v27, %v27, %v28 \n"
         " vsubshs %v1, %v9, %v5 \n"
         " vaddshs %v28, %v10, %v18 \n"
         " vsubshs %v18, %v10, %v18 \n"
         " vaddshs %v10, %v1, %v12 \n"
         " vsubshs %v1, %v1, %v12 \n"
         " vsubshs %v12, %v17, %v4 \n"
         " vaddshs %v4, %v17, %v4 \n"
         " vaddshs %v9, %v28, %v27 \n"
         " vmhraddshs %v14, %v7, %v4, %v10 \n"
         " vsrah %v9, %v9, %v25 \n"
         " vmhraddshs %v5, %v7, %v12, %v1 \n"
         " vpkshus %v0, %v9, %v9 \n"
         " vmhraddshs %v29, %v6, %v12, %v1 \n"
         " stvewx %v0, 0, %r4 \n"
         " vaddshs %v16, %v18, %v30 \n"
         " vsrah %v31, %v14, %v25 \n"
         " stvewx %v0, %r9, %r4 \n"
         " add %r4, %r4, %r5 \n"
         " vsrah %v15, %v16, %v25 \n"
         " vpkshus %v0, %v31, %v31 \n"
         " vsrah %v1, %v5, %v25 \n"
         " stvewx %v0, 0, %r4 \n"
         " vsubshs %v12, %v18, %v30 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vmhraddshs %v26, %v6, %v4, %v10 \n"
         " vpkshus %v0, %v1, %v1 \n"
         " add %r4, %r4, %r5 \n"
         " vsrah %v5, %v12, %v25 \n"
         " stvewx %v0, 0, %r4 \n"
         " vsrah %v30, %v29, %v25 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vsubshs %v10, %v28, %v27 \n"
         " vpkshus %v0, %v15, %v15 \n"
         " add %r4, %r4, %r5 \n"
         " stvewx %v0, 0, %r4 \n"
         " vsrah %v18, %v26, %v25 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vsrah %v27, %v10, %v25 \n"
         " vpkshus %v0, %v5, %v5 \n"
         " add %r4, %r4, %r5 \n"
         " stvewx %v0, 0, %r4 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vpkshus %v0, %v30, %v30 \n"
         " add %r4, %r4, %r5 \n"
         " stvewx %v0, 0, %r4 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vpkshus %v0, %v18, %v18 \n"
         " add %r4, %r4, %r5 \n"
         " stvewx %v0, 0, %r4 \n"
         " stvewx %v0, %r9, %r4 \n"
         " add %r4, %r4, %r5 \n"
         " vpkshus %v0, %v27, %v27 \n"
         " stvewx %v0, 0, %r4 \n"
         " stvewx %v0, %r9, %r4 \n"

         "# addi %r0, %r1, 128 \n"
         "# bl _restv25 \n"
         "# lwz %r0, 132(%r1) \n"
         "# mtlr %r0 \n"
         "# la %r1, 128(%r1) \n"

         " vxor %v1, %v1, %v1 \n"
         " addi %r9, %r3, 16 \n"
         " stvx %v1, 0, %r3 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r3, 32 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r9, %r3, 48 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r3, -64 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r9, %r3, -48 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r3, -32 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r3, %r3, -16 \n"
         " stvx %v1, 0, %r3 \n"
    );
}

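/*
 * Generated-asm version of mpeg2_idct_add_altivec (see the C
 * implementation in the __ALTIVEC__ branch below): same IDCT as above,
 * but the result is added to the existing dest pixels with unsigned
 * saturation instead of overwriting them, and the coefficient block is
 * cleared afterwards.  The 'last' argument appears to be unused.
 */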
void mpeg2_idct_add_altivec (int last, int16_t * block,
                             uint8_t * dest, int stride)
{
    asm (" \n"
         "# stwu %r1, -192(%r1) \n"
         "# mflr %r0 \n"
         "# stw %r0, 196(%r1) \n"
         "# addi %r0, %r1, 192 \n"
         "# bl _savev21 \n"

         " addi %r9, %r4, 112 \n"
         " vspltish %v21, 4 \n"
         " vxor %v1, %v1, %v1 \n"
         " lvx %v13, 0, %r9 \n"
         " lis %r10, constants@ha \n"
         " vspltisw %v3, -1 \n"
         " la %r10, constants@l(%r10) \n"
         " lvx %v5, 0, %r4 \n"
         " addi %r9, %r4, 16 \n"
         " lvx %v8, 0, %r10 \n"
         " lvx %v12, 0, %r9 \n"
         " addi %r11, %r10, 32 \n"
         " lvx %v6, 0, %r11 \n"
         " addi %r8, %r4, 48 \n"
         " vslh %v13, %v13, %v21 \n"
         " addi %r9, %r4, 80 \n"
         " lvx %v11, 0, %r8 \n"
         " vslh %v5, %v5, %v21 \n"
         " lvx %v0, 0, %r9 \n"
         " addi %r11, %r10, 64 \n"
         " vsplth %v2, %v8, 2 \n"
         " lvx %v7, 0, %r11 \n"
         " vslh %v12, %v12, %v21 \n"
         " addi %r9, %r4, 96 \n"
         " vmhraddshs %v24, %v13, %v6, %v1 \n"
         " addi %r8, %r4, 32 \n"
         " vsplth %v17, %v8, 5 \n"
         " lvx %v13, 0, %r9 \n"
         " vslh %v11, %v11, %v21 \n"
         " addi %r4, %r4, 64 \n"
         " lvx %v10, 0, %r8 \n"
         " vslh %v0, %v0, %v21 \n"
         " addi %r9, %r10, 48 \n"
         " vmhraddshs %v31, %v12, %v6, %v1 \n"
         " lvx %v4, 0, %r9 \n"
         " addi %r10, %r10, 16 \n"
         " vmhraddshs %v26, %v0, %v7, %v1 \n"
         " lvx %v9, 0, %r4 \n"
         " vsplth %v16, %v8, 3 \n"
         " vmhraddshs %v22, %v11, %v7, %v1 \n"
         " lvx %v6, 0, %r10 \n"
         " lvsl %v19, 0, %r5 \n"
         " vsubshs %v12, %v1, %v24 \n"
         " lvsl %v0, %r6, %r5 \n"
         " vsplth %v11, %v8, 1 \n"
         " vslh %v10, %v10, %v21 \n"
         " vmrghb %v19, %v3, %v19 \n"
         " lvx %v15, 0, %r5 \n"
         " vslh %v13, %v13, %v21 \n"
         " vmrghb %v3, %v3, %v0 \n"
         " li %r9, 4 \n"
         " vmhraddshs %v14, %v2, %v31, %v12 \n"
         " vsplth %v7, %v8, 0 \n"
         " vmhraddshs %v23, %v13, %v4, %v1 \n"
         " vsplth %v18, %v8, 4 \n"
         " vmhraddshs %v27, %v10, %v4, %v1 \n"
         " vspltw %v8, %v8, 3 \n"
         " vmhraddshs %v12, %v17, %v22, %v26 \n"
         " vperm %v15, %v15, %v1, %v19 \n"
         " vslh %v9, %v9, %v21 \n"
         " vmhraddshs %v10, %v5, %v6, %v1 \n"
         " vspltish %v21, 6 \n"
         " vmhraddshs %v30, %v9, %v6, %v1 \n"
         " vmhraddshs %v26, %v16, %v26, %v22 \n"
         " vmhraddshs %v24, %v2, %v24, %v31 \n"
         " vmhraddshs %v31, %v11, %v23, %v27 \n"
         " vsubshs %v0, %v1, %v23 \n"
         " vaddshs %v23, %v14, %v12 \n"
         " vmhraddshs %v9, %v11, %v27, %v0 \n"
         " vsubshs %v12, %v14, %v12 \n"
         " vaddshs %v6, %v10, %v30 \n"
         " vsubshs %v14, %v24, %v26 \n"
         " vaddshs %v24, %v24, %v26 \n"
         " vsubshs %v13, %v10, %v30 \n"
         " vaddshs %v26, %v6, %v31 \n"
         " vsubshs %v31, %v6, %v31 \n"
         " vaddshs %v6, %v13, %v9 \n"
         " vsubshs %v13, %v13, %v9 \n"
         " vsubshs %v9, %v14, %v12 \n"
         " vaddshs %v12, %v14, %v12 \n"
         " vmhraddshs %v30, %v7, %v9, %v13 \n"
         " vmhraddshs %v25, %v18, %v12, %v6 \n"
         " vmhraddshs %v28, %v18, %v9, %v13 \n"
         " vmhraddshs %v29, %v7, %v12, %v6 \n"
         " vaddshs %v10, %v26, %v24 \n"
         " vsubshs %v5, %v31, %v23 \n"
         " vsubshs %v13, %v26, %v24 \n"
         " vaddshs %v4, %v31, %v23 \n"
         " vmrglh %v26, %v30, %v25 \n"
         " vmrglh %v31, %v10, %v5 \n"
         " vmrglh %v22, %v29, %v28 \n"
         " vmrghh %v30, %v30, %v25 \n"
         " vmrglh %v24, %v4, %v13 \n"
         " vmrghh %v10, %v10, %v5 \n"
         " vmrghh %v23, %v4, %v13 \n"
         " vmrghh %v27, %v29, %v28 \n"
         " vmrglh %v29, %v10, %v30 \n"
         " vmrglh %v4, %v31, %v26 \n"
         " vmrglh %v13, %v22, %v24 \n"
         " vmrghh %v10, %v10, %v30 \n"
         " vmrghh %v25, %v22, %v24 \n"
         " vmrglh %v24, %v4, %v13 \n"
         " vmrghh %v5, %v27, %v23 \n"
         " vmrglh %v28, %v27, %v23 \n"
         " vsubshs %v0, %v1, %v24 \n"
         " vmrghh %v30, %v31, %v26 \n"
         " vmrglh %v31, %v10, %v5 \n"
         " vmrglh %v26, %v30, %v25 \n"
         " vmrglh %v22, %v29, %v28 \n"
         " vmhraddshs %v14, %v2, %v31, %v0 \n"
         " vmrghh %v23, %v4, %v13 \n"
         " vmhraddshs %v24, %v2, %v24, %v31 \n"
         " vmhraddshs %v12, %v17, %v22, %v26 \n"
         " vmrghh %v27, %v29, %v28 \n"
         " vmhraddshs %v26, %v16, %v26, %v22 \n"
         " vmrghh %v0, %v10, %v5 \n"
         " vmhraddshs %v31, %v11, %v23, %v27 \n"
         " vmrghh %v30, %v30, %v25 \n"
         " vsubshs %v13, %v1, %v23 \n"
         " vaddshs %v10, %v0, %v8 \n"
         " vaddshs %v23, %v14, %v12 \n"
         " vsubshs %v12, %v14, %v12 \n"
         " vaddshs %v6, %v10, %v30 \n"
         " vsubshs %v14, %v24, %v26 \n"
         " vmhraddshs %v9, %v11, %v27, %v13 \n"
         " vaddshs %v24, %v24, %v26 \n"
         " vaddshs %v26, %v6, %v31 \n"
         " vsubshs %v13, %v10, %v30 \n"
         " vaddshs %v10, %v26, %v24 \n"
         " vsubshs %v31, %v6, %v31 \n"
         " vaddshs %v6, %v13, %v9 \n"
         " vsrah %v10, %v10, %v21 \n"
         " vsubshs %v13, %v13, %v9 \n"
         " vaddshs %v0, %v15, %v10 \n"
         " vsubshs %v9, %v14, %v12 \n"
         " vaddshs %v12, %v14, %v12 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " vaddshs %v4, %v31, %v23 \n"
         " vmhraddshs %v29, %v7, %v12, %v6 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " vsubshs %v5, %v31, %v23 \n"
         " lvx %v15, 0, %r5 \n"
         " vmhraddshs %v30, %v7, %v9, %v13 \n"
         " vsrah %v22, %v4, %v21 \n"
         " vperm %v15, %v15, %v1, %v3 \n"
         " vmhraddshs %v28, %v18, %v9, %v13 \n"
         " vsrah %v31, %v29, %v21 \n"
         " vsubshs %v13, %v26, %v24 \n"
         " vaddshs %v0, %v15, %v31 \n"
         " vsrah %v27, %v30, %v21 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " vsrah %v30, %v5, %v21 \n"
         " stvewx %v15, 0, %r5 \n"
         " vsrah %v26, %v28, %v21 \n"
         " stvewx %v15, %r9, %r5 \n"
         " vmhraddshs %v25, %v18, %v12, %v6 \n"
         " add %r5, %r5, %r6 \n"
         " vsrah %v24, %v13, %v21 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v19 \n"
         " vsrah %v23, %v25, %v21 \n"
         " vaddshs %v0, %v15, %v27 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v3 \n"
         " vaddshs %v0, %v15, %v22 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v19 \n"
         " vaddshs %v0, %v15, %v30 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v3 \n"
         " vaddshs %v0, %v15, %v26 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v19 \n"
         " vaddshs %v0, %v15, %v23 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v3 \n"
         " vaddshs %v0, %v15, %v24 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"

         "# addi %r0, %r1, 192 \n"
         "# bl _restv21 \n"
         "# lwz %r0, 196(%r1) \n"
         "# mtlr %r0 \n"
         "# la %r1, 192(%r1) \n"

         " addi %r9, %r4, 16 \n"
         " stvx %v1, 0, %r4 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r4, 32 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r9, %r4, 48 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r4, -64 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r9, %r4, -48 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r4, -32 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r4, %r4, -16 \n"
         " stvx %v1, 0, %r4 \n"
    );
}

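/*
 * Note on the scan-table patch below: an index j in the 8x8 block encodes
 * row = j >> 3 and column = j & 7, so (j >> 3) | ((j & 7) << 3) swaps the
 * two, e.g. j = 10 (row 1, column 2) becomes 17 (row 2, column 1).  This
 * compensates for the AltiVec IDCT working on a transposed coefficient
 * block.
 */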
void mpeg2_idct_altivec_init (void)
{
    extern uint8_t mpeg2_scan_norm[64];
    extern uint8_t mpeg2_scan_alt[64];
    int i, j;

    i = constants[0][0];	/* just pretending - keeps gcc happy */

    /* the altivec idct uses a transposed input, so we patch scan tables */
    for (i = 0; i < 64; i++) {
        j = mpeg2_scan_norm[i];
        mpeg2_scan_norm[i] = (j >> 3) | ((j & 7) << 3);
        j = mpeg2_scan_alt[i];
        mpeg2_scan_alt[i] = (j >> 3) | ((j & 7) << 3);
    }
}

#endif /* ARCH_PPC */

#else /* __ALTIVEC__ */

#include <string.h>	/* memset */

#define vector_s16_t vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int

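/*
 * IDCT_HALF is one 1-D pass of the 8-point IDCT over eight vectors (one
 * vector per row or column), written as four butterfly stages that use
 * saturating adds/subtracts plus vec_mradds (multiply high, round, add
 * with saturation) for the rotations.
 */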
#define IDCT_HALF \
    /* 1st stage */ \
    t1 = vec_mradds (a1, vx7, vx1); \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
    t7 = vec_mradds (a2, vx5, vx3); \
    t3 = vec_mradds (ma2, vx3, vx5); \
    \
    /* 2nd stage */ \
    t5 = vec_adds (vx0, vx4); \
    t0 = vec_subs (vx0, vx4); \
    t2 = vec_mradds (a0, vx6, vx2); \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
    t6 = vec_adds (t8, t3); \
    t3 = vec_subs (t8, t3); \
    t8 = vec_subs (t1, t7); \
    t1 = vec_adds (t1, t7); \
    \
    /* 3rd stage */ \
    t7 = vec_adds (t5, t2); \
    t2 = vec_subs (t5, t2); \
    t5 = vec_adds (t0, t4); \
    t0 = vec_subs (t0, t4); \
    t4 = vec_subs (t8, t3); \
    t3 = vec_adds (t8, t3); \
    \
    /* 4th stage */ \
    vy0 = vec_adds (t7, t1); \
    vy7 = vec_subs (t7, t1); \
    vy1 = vec_mradds (c4, t3, t5); \
    vy6 = vec_mradds (mc4, t3, t5); \
    vy2 = vec_mradds (c4, t4, t0); \
    vy5 = vec_mradds (mc4, t4, t0); \
    vy3 = vec_adds (t2, t6); \
    vy4 = vec_subs (t2, t6);

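/*
 * IDCT runs the full 2-D transform on an 8x8 block: the input rows are
 * shifted left by 4 and prescaled by constants[1..4] via vec_mradds, one
 * IDCT_HALF pass is applied, the block is transposed with vec_mergeh /
 * vec_mergel (adding the rounding bias to the first vector on the way),
 * a second IDCT_HALF pass is applied, and the results are shifted right
 * by 6.
 */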
#define IDCT \
    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \
    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \
    vector_u16_t shift; \
    \
    c4 = vec_splat (constants[0], 0); \
    a0 = vec_splat (constants[0], 1); \
    a1 = vec_splat (constants[0], 2); \
    a2 = vec_splat (constants[0], 3); \
    mc4 = vec_splat (constants[0], 4); \
    ma2 = vec_splat (constants[0], 5); \
    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \
    \
    zero = vec_splat_s16 (0); \
    shift = vec_splat_u16 (4); \
    \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
    \
    IDCT_HALF \
    \
    vx0 = vec_mergeh (vy0, vy4); \
    vx1 = vec_mergel (vy0, vy4); \
    vx2 = vec_mergeh (vy1, vy5); \
    vx3 = vec_mergel (vy1, vy5); \
    vx4 = vec_mergeh (vy2, vy6); \
    vx5 = vec_mergel (vy2, vy6); \
    vx6 = vec_mergeh (vy3, vy7); \
    vx7 = vec_mergel (vy3, vy7); \
    \
    vy0 = vec_mergeh (vx0, vx4); \
    vy1 = vec_mergel (vx0, vx4); \
    vy2 = vec_mergeh (vx1, vx5); \
    vy3 = vec_mergel (vx1, vx5); \
    vy4 = vec_mergeh (vx2, vx6); \
    vy5 = vec_mergel (vx2, vx6); \
    vy6 = vec_mergeh (vx3, vx7); \
    vy7 = vec_mergel (vx3, vx7); \
    \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
    vx1 = vec_mergel (vy0, vy4); \
    vx2 = vec_mergeh (vy1, vy5); \
    vx3 = vec_mergel (vy1, vy5); \
    vx4 = vec_mergeh (vy2, vy6); \
    vx5 = vec_mergel (vy2, vy6); \
    vx6 = vec_mergeh (vy3, vy7); \
    vx7 = vec_mergel (vy3, vy7); \
    \
    IDCT_HALF \
    \
    shift = vec_splat_u16 (6); \
    vx0 = vec_sra (vy0, shift); \
    vx1 = vec_sra (vy1, shift); \
    vx2 = vec_sra (vy2, shift); \
    vx3 = vec_sra (vy3, shift); \
    vx4 = vec_sra (vy4, shift); \
    vx5 = vec_sra (vy5, shift); \
    vx6 = vec_sra (vy6, shift); \
    vx7 = vec_sra (vy7, shift);

static const vector_s16_t constants[5] = {
    (vector_s16_t)(23170, 13573,  6518, 21895, -23170, -21895,    32,    31),
    (vector_s16_t)(16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725),
    (vector_s16_t)(22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521),
    (vector_s16_t)(21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692),
    (vector_s16_t)(19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722)
};

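/*
 * Intrinsics version of mpeg2_idct_copy_altivec: compute the 2-D IDCT,
 * pack each output row to unsigned bytes with saturation, store 8 bytes
 * per row into dest (advancing by stride), then clear the coefficient
 * block.
 */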
void mpeg2_idct_copy_altivec (vector_s16_t * const block, unsigned char * dest,
                              const int stride)
{
    vector_u8_t tmp;

    IDCT

#define COPY(dest,src) \
    tmp = vec_packsu (src, src); \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    COPY (dest, vx0)	dest += stride;
    COPY (dest, vx1)	dest += stride;
    COPY (dest, vx2)	dest += stride;
    COPY (dest, vx3)	dest += stride;
    COPY (dest, vx4)	dest += stride;
    COPY (dest, vx5)	dest += stride;
    COPY (dest, vx6)	dest += stride;
    COPY (dest, vx7)

    memset (block, 0, 64 * sizeof (signed short));
}

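/*
 * Intrinsics version of mpeg2_idct_add_altivec: compute the 2-D IDCT and
 * add it to the existing dest pixels with unsigned saturation, then clear
 * the coefficient block ('last' is unused).  perm0/perm1 are permute
 * vectors built with vec_lvsl for even and odd rows: merging the 0xff
 * filler indices (which select bytes from the zero vector) with the
 * alignment indices turns the 8 dest bytes of each row into zero-extended
 * 16-bit values; this assumes each 8-byte row of dest stays within one
 * 16-byte line, i.e. dest is at least 8-byte aligned.
 */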
void mpeg2_idct_add_altivec (const int last, vector_s16_t * const block,
                             unsigned char * dest, const int stride)
{
    vector_u8_t tmp;
    vector_s16_t tmp2, tmp3;
    vector_u8_t perm0;
    vector_u8_t perm1;
    vector_u8_t p0, p1, p;

    IDCT

    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

#define ADD(dest,src,perm) \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
    tmp = vec_ld (0, dest); \
    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \
    tmp3 = vec_adds (tmp2, src); \
    tmp = vec_packsu (tmp3, tmp3); \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0)	dest += stride;
    ADD (dest, vx1, perm1)	dest += stride;
    ADD (dest, vx2, perm0)	dest += stride;
    ADD (dest, vx3, perm1)	dest += stride;
    ADD (dest, vx4, perm0)	dest += stride;
    ADD (dest, vx5, perm1)	dest += stride;
    ADD (dest, vx6, perm0)	dest += stride;
    ADD (dest, vx7, perm1)

    memset (block, 0, 64 * sizeof (signed short));
}

#endif /* __ALTIVEC__ */