mplayer.hg: file libmpeg2/idct_altivec.c @ 9857:89b48bc6c441
"Importing libmpeg2 from mpeg2dec-0.3.1"

author    arpi
date      Sun, 06 Apr 2003 16:41:49 +0000
children  aeea70a0e72c

/*
 * idct_altivec.c
 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

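/*
 * Two implementations follow: when the compiler is not in AltiVec mode
 * (__ALTIVEC__ undefined), pregenerated inline asm for the two IDCT
 * entry points is used; with AltiVec support, the intrinsics version
 * after the #else is built instead.
 */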
#ifndef __ALTIVEC__

#include "config.h"

#ifdef ARCH_PPC

#include <inttypes.h>

#include "mpeg2.h"
#include "mpeg2_internal.h"
#include "attributes.h"

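/*
 * IDCT coefficients, apparently in signed 1.15 fixed point: row 0
 * holds cos(pi/4)*32768 = 23170, tan(pi/8)*32768 = 13573,
 * tan(pi/16)*32768 = 6518 and tan(3*pi/16)*32768 = 21895, their
 * negatives, and the {32, 31} rounding-bias pair; rows 1-4 look like
 * the per-row prescale factors folded into the first 1-D pass.
 */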
static const int16_t constants[5][8] ATTR_ALIGN(16) = {
    {23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
    {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
    {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
    {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692},
    {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
};

/*
 * The asm code is generated with:
 *
 * gcc-2.95 -fvec -D__ALTIVEC__ -O9 -fomit-frame-pointer -mregnames -S
 *     idct_altivec.c
 *
 * awk '{args=""; len=split ($2, arg, ",");
 *     for (i=1; i<=len; i++) { a=arg[i]; if (i<len) a=a",";
 *         args = args sprintf ("%-6s", a) }
 *     printf ("\t\"\t%-16s%-24s\\n\"\n", $1, args) }' idct_altivec.s |
 * unexpand -a
 *
 * I then do some simple trimming on the function prologs/epilogues.
 */

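/*
 * 2-D IDCT of the 8x8 block (r3), with the result saturated to 0..255
 * and stored to dest (r4), 8 bytes per row, rows stride (r5) bytes
 * apart; the coefficient block is cleared before returning.  The "#"
 * lines are the prolog/epilog that the trimming commented out.
 */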
void mpeg2_idct_copy_altivec (int16_t * block, uint8_t * dest, int stride)
{
    asm (" \n"
         "# stwu %r1, -128(%r1) \n"
         "# mflr %r0 \n"
         "# stw %r0, 132(%r1) \n"
         "# addi %r0, %r1, 128 \n"
         "# bl _savev25 \n"

         " addi %r9, %r3, 112 \n"
         " vspltish %v25, 4 \n"
         " vxor %v13, %v13, %v13 \n"
         " lis %r10, constants@ha \n"
         " lvx %v1, 0, %r9 \n"
         " la %r10, constants@l(%r10) \n"
         " lvx %v5, 0, %r3 \n"
         " addi %r9, %r3, 16 \n"
         " lvx %v8, 0, %r10 \n"
         " addi %r11, %r10, 32 \n"
         " lvx %v12, 0, %r9 \n"
         " lvx %v6, 0, %r11 \n"
         " addi %r8, %r3, 48 \n"
         " vslh %v1, %v1, %v25 \n"
         " addi %r9, %r3, 80 \n"
         " lvx %v11, 0, %r8 \n"
         " vslh %v5, %v5, %v25 \n"
         " lvx %v0, 0, %r9 \n"
         " addi %r11, %r10, 64 \n"
         " vsplth %v3, %v8, 2 \n"
         " lvx %v7, 0, %r11 \n"
         " addi %r9, %r3, 96 \n"
         " vslh %v12, %v12, %v25 \n"
         " vmhraddshs %v27, %v1, %v6, %v13 \n"
         " addi %r8, %r3, 32 \n"
         " vsplth %v2, %v8, 5 \n"
         " lvx %v1, 0, %r9 \n"
         " vslh %v11, %v11, %v25 \n"
         " addi %r3, %r3, 64 \n"
         " lvx %v9, 0, %r8 \n"
         " addi %r9, %r10, 48 \n"
         " vslh %v0, %v0, %v25 \n"
         " lvx %v4, 0, %r9 \n"
         " vmhraddshs %v31, %v12, %v6, %v13 \n"
         " addi %r10, %r10, 16 \n"
         " vmhraddshs %v30, %v0, %v7, %v13 \n"
         " lvx %v10, 0, %r3 \n"
         " vsplth %v19, %v8, 3 \n"
         " vmhraddshs %v15, %v11, %v7, %v13 \n"
         " lvx %v12, 0, %r10 \n"
         " vsplth %v6, %v8, 4 \n"
         " vslh %v1, %v1, %v25 \n"
         " vsplth %v11, %v8, 1 \n"
         " li %r9, 4 \n"
         " vslh %v9, %v9, %v25 \n"
         " vsplth %v7, %v8, 0 \n"
         " vmhraddshs %v18, %v1, %v4, %v13 \n"
         " vspltw %v8, %v8, 3 \n"
         " vsubshs %v0, %v13, %v27 \n"
         " vmhraddshs %v1, %v9, %v4, %v13 \n"
         " vmhraddshs %v17, %v3, %v31, %v0 \n"
         " vmhraddshs %v4, %v2, %v15, %v30 \n"
         " vslh %v10, %v10, %v25 \n"
         " vmhraddshs %v9, %v5, %v12, %v13 \n"
         " vspltish %v25, 6 \n"
         " vmhraddshs %v5, %v10, %v12, %v13 \n"
         " vmhraddshs %v28, %v19, %v30, %v15 \n"
         " vmhraddshs %v27, %v3, %v27, %v31 \n"
         " vsubshs %v0, %v13, %v18 \n"
         " vmhraddshs %v18, %v11, %v18, %v1 \n"
         " vaddshs %v30, %v17, %v4 \n"
         " vmhraddshs %v12, %v11, %v1, %v0 \n"
         " vsubshs %v4, %v17, %v4 \n"
         " vaddshs %v10, %v9, %v5 \n"
         " vsubshs %v17, %v27, %v28 \n"
         " vaddshs %v27, %v27, %v28 \n"
         " vsubshs %v1, %v9, %v5 \n"
         " vaddshs %v28, %v10, %v18 \n"
         " vsubshs %v18, %v10, %v18 \n"
         " vaddshs %v10, %v1, %v12 \n"
         " vsubshs %v1, %v1, %v12 \n"
         " vsubshs %v12, %v17, %v4 \n"
         " vaddshs %v4, %v17, %v4 \n"
         " vmhraddshs %v5, %v7, %v12, %v1 \n"
         " vmhraddshs %v26, %v6, %v4, %v10 \n"
         " vmhraddshs %v29, %v6, %v12, %v1 \n"
         " vmhraddshs %v14, %v7, %v4, %v10 \n"
         " vsubshs %v12, %v18, %v30 \n"
         " vaddshs %v9, %v28, %v27 \n"
         " vaddshs %v16, %v18, %v30 \n"
         " vsubshs %v10, %v28, %v27 \n"
         " vmrglh %v31, %v9, %v12 \n"
         " vmrglh %v30, %v5, %v26 \n"
         " vmrglh %v15, %v14, %v29 \n"
         " vmrghh %v5, %v5, %v26 \n"
         " vmrglh %v27, %v16, %v10 \n"
         " vmrghh %v9, %v9, %v12 \n"
         " vmrghh %v18, %v16, %v10 \n"
         " vmrghh %v1, %v14, %v29 \n"
         " vmrglh %v14, %v9, %v5 \n"
         " vmrglh %v16, %v31, %v30 \n"
         " vmrglh %v10, %v15, %v27 \n"
         " vmrghh %v9, %v9, %v5 \n"
         " vmrghh %v26, %v15, %v27 \n"
         " vmrglh %v27, %v16, %v10 \n"
         " vmrghh %v12, %v1, %v18 \n"
         " vmrglh %v29, %v1, %v18 \n"
         " vsubshs %v0, %v13, %v27 \n"
         " vmrghh %v5, %v31, %v30 \n"
         " vmrglh %v31, %v9, %v12 \n"
         " vmrglh %v30, %v5, %v26 \n"
         " vmrglh %v15, %v14, %v29 \n"
         " vmhraddshs %v17, %v3, %v31, %v0 \n"
         " vmrghh %v18, %v16, %v10 \n"
         " vmhraddshs %v27, %v3, %v27, %v31 \n"
         " vmhraddshs %v4, %v2, %v15, %v30 \n"
         " vmrghh %v1, %v14, %v29 \n"
         " vmhraddshs %v28, %v19, %v30, %v15 \n"
         " vmrghh %v0, %v9, %v12 \n"
         " vsubshs %v13, %v13, %v18 \n"
         " vmrghh %v5, %v5, %v26 \n"
         " vmhraddshs %v18, %v11, %v18, %v1 \n"
         " vaddshs %v9, %v0, %v8 \n"
         " vaddshs %v30, %v17, %v4 \n"
         " vmhraddshs %v12, %v11, %v1, %v13 \n"
         " vsubshs %v4, %v17, %v4 \n"
         " vaddshs %v10, %v9, %v5 \n"
         " vsubshs %v17, %v27, %v28 \n"
         " vaddshs %v27, %v27, %v28 \n"
         " vsubshs %v1, %v9, %v5 \n"
         " vaddshs %v28, %v10, %v18 \n"
         " vsubshs %v18, %v10, %v18 \n"
         " vaddshs %v10, %v1, %v12 \n"
         " vsubshs %v1, %v1, %v12 \n"
         " vsubshs %v12, %v17, %v4 \n"
         " vaddshs %v4, %v17, %v4 \n"
         " vaddshs %v9, %v28, %v27 \n"
         " vmhraddshs %v14, %v7, %v4, %v10 \n"
         " vsrah %v9, %v9, %v25 \n"
         " vmhraddshs %v5, %v7, %v12, %v1 \n"
         " vpkshus %v0, %v9, %v9 \n"
         " vmhraddshs %v29, %v6, %v12, %v1 \n"
         " stvewx %v0, 0, %r4 \n"
         " vaddshs %v16, %v18, %v30 \n"
         " vsrah %v31, %v14, %v25 \n"
         " stvewx %v0, %r9, %r4 \n"
         " add %r4, %r4, %r5 \n"
         " vsrah %v15, %v16, %v25 \n"
         " vpkshus %v0, %v31, %v31 \n"
         " vsrah %v1, %v5, %v25 \n"
         " stvewx %v0, 0, %r4 \n"
         " vsubshs %v12, %v18, %v30 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vmhraddshs %v26, %v6, %v4, %v10 \n"
         " vpkshus %v0, %v1, %v1 \n"
         " add %r4, %r4, %r5 \n"
         " vsrah %v5, %v12, %v25 \n"
         " stvewx %v0, 0, %r4 \n"
         " vsrah %v30, %v29, %v25 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vsubshs %v10, %v28, %v27 \n"
         " vpkshus %v0, %v15, %v15 \n"
         " add %r4, %r4, %r5 \n"
         " stvewx %v0, 0, %r4 \n"
         " vsrah %v18, %v26, %v25 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vsrah %v27, %v10, %v25 \n"
         " vpkshus %v0, %v5, %v5 \n"
         " add %r4, %r4, %r5 \n"
         " stvewx %v0, 0, %r4 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vpkshus %v0, %v30, %v30 \n"
         " add %r4, %r4, %r5 \n"
         " stvewx %v0, 0, %r4 \n"
         " stvewx %v0, %r9, %r4 \n"
         " vpkshus %v0, %v18, %v18 \n"
         " add %r4, %r4, %r5 \n"
         " stvewx %v0, 0, %r4 \n"
         " stvewx %v0, %r9, %r4 \n"
         " add %r4, %r4, %r5 \n"
         " vpkshus %v0, %v27, %v27 \n"
         " stvewx %v0, 0, %r4 \n"
         " stvewx %v0, %r9, %r4 \n"

         "# addi %r0, %r1, 128 \n"
         "# bl _restv25 \n"
         "# lwz %r0, 132(%r1) \n"
         "# mtlr %r0 \n"
         "# la %r1, 128(%r1) \n"

         " vxor %v1, %v1, %v1 \n"
         " addi %r9, %r3, 16 \n"
         " stvx %v1, 0, %r3 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r3, 32 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r9, %r3, 48 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r3, -64 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r9, %r3, -48 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r3, -32 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r3, %r3, -16 \n"
         " stvx %v1, 0, %r3 \n"
         );
}

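/*
 * Same 2-D IDCT, but the result is added to the prediction already in
 * dest (r3 = last is unused here, r4 = block, r5 = dest, r6 = stride);
 * the possibly unaligned 8-byte prediction rows are read with
 * lvsl/vperm, and the block is again cleared at the end.
 */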
void mpeg2_idct_add_altivec (int last, int16_t * block,
                             uint8_t * dest, int stride)
{
    asm (" \n"
         "# stwu %r1, -192(%r1) \n"
         "# mflr %r0 \n"
         "# stw %r0, 196(%r1) \n"
         "# addi %r0, %r1, 192 \n"
         "# bl _savev21 \n"

         " addi %r9, %r4, 112 \n"
         " vspltish %v21, 4 \n"
         " vxor %v1, %v1, %v1 \n"
         " lvx %v13, 0, %r9 \n"
         " lis %r10, constants@ha \n"
         " vspltisw %v3, -1 \n"
         " la %r10, constants@l(%r10) \n"
         " lvx %v5, 0, %r4 \n"
         " addi %r9, %r4, 16 \n"
         " lvx %v8, 0, %r10 \n"
         " lvx %v12, 0, %r9 \n"
         " addi %r11, %r10, 32 \n"
         " lvx %v6, 0, %r11 \n"
         " addi %r8, %r4, 48 \n"
         " vslh %v13, %v13, %v21 \n"
         " addi %r9, %r4, 80 \n"
         " lvx %v11, 0, %r8 \n"
         " vslh %v5, %v5, %v21 \n"
         " lvx %v0, 0, %r9 \n"
         " addi %r11, %r10, 64 \n"
         " vsplth %v2, %v8, 2 \n"
         " lvx %v7, 0, %r11 \n"
         " vslh %v12, %v12, %v21 \n"
         " addi %r9, %r4, 96 \n"
         " vmhraddshs %v24, %v13, %v6, %v1 \n"
         " addi %r8, %r4, 32 \n"
         " vsplth %v17, %v8, 5 \n"
         " lvx %v13, 0, %r9 \n"
         " vslh %v11, %v11, %v21 \n"
         " addi %r4, %r4, 64 \n"
         " lvx %v10, 0, %r8 \n"
         " vslh %v0, %v0, %v21 \n"
         " addi %r9, %r10, 48 \n"
         " vmhraddshs %v31, %v12, %v6, %v1 \n"
         " lvx %v4, 0, %r9 \n"
         " addi %r10, %r10, 16 \n"
         " vmhraddshs %v26, %v0, %v7, %v1 \n"
         " lvx %v9, 0, %r4 \n"
         " vsplth %v16, %v8, 3 \n"
         " vmhraddshs %v22, %v11, %v7, %v1 \n"
         " lvx %v6, 0, %r10 \n"
         " lvsl %v19, 0, %r5 \n"
         " vsubshs %v12, %v1, %v24 \n"
         " lvsl %v0, %r6, %r5 \n"
         " vsplth %v11, %v8, 1 \n"
         " vslh %v10, %v10, %v21 \n"
         " vmrghb %v19, %v3, %v19 \n"
         " lvx %v15, 0, %r5 \n"
         " vslh %v13, %v13, %v21 \n"
         " vmrghb %v3, %v3, %v0 \n"
         " li %r9, 4 \n"
         " vmhraddshs %v14, %v2, %v31, %v12 \n"
         " vsplth %v7, %v8, 0 \n"
         " vmhraddshs %v23, %v13, %v4, %v1 \n"
         " vsplth %v18, %v8, 4 \n"
         " vmhraddshs %v27, %v10, %v4, %v1 \n"
         " vspltw %v8, %v8, 3 \n"
         " vmhraddshs %v12, %v17, %v22, %v26 \n"
         " vperm %v15, %v15, %v1, %v19 \n"
         " vslh %v9, %v9, %v21 \n"
         " vmhraddshs %v10, %v5, %v6, %v1 \n"
         " vspltish %v21, 6 \n"
         " vmhraddshs %v30, %v9, %v6, %v1 \n"
         " vmhraddshs %v26, %v16, %v26, %v22 \n"
         " vmhraddshs %v24, %v2, %v24, %v31 \n"
         " vmhraddshs %v31, %v11, %v23, %v27 \n"
         " vsubshs %v0, %v1, %v23 \n"
         " vaddshs %v23, %v14, %v12 \n"
         " vmhraddshs %v9, %v11, %v27, %v0 \n"
         " vsubshs %v12, %v14, %v12 \n"
         " vaddshs %v6, %v10, %v30 \n"
         " vsubshs %v14, %v24, %v26 \n"
         " vaddshs %v24, %v24, %v26 \n"
         " vsubshs %v13, %v10, %v30 \n"
         " vaddshs %v26, %v6, %v31 \n"
         " vsubshs %v31, %v6, %v31 \n"
         " vaddshs %v6, %v13, %v9 \n"
         " vsubshs %v13, %v13, %v9 \n"
         " vsubshs %v9, %v14, %v12 \n"
         " vaddshs %v12, %v14, %v12 \n"
         " vmhraddshs %v30, %v7, %v9, %v13 \n"
         " vmhraddshs %v25, %v18, %v12, %v6 \n"
         " vmhraddshs %v28, %v18, %v9, %v13 \n"
         " vmhraddshs %v29, %v7, %v12, %v6 \n"
         " vaddshs %v10, %v26, %v24 \n"
         " vsubshs %v5, %v31, %v23 \n"
         " vsubshs %v13, %v26, %v24 \n"
         " vaddshs %v4, %v31, %v23 \n"
         " vmrglh %v26, %v30, %v25 \n"
         " vmrglh %v31, %v10, %v5 \n"
         " vmrglh %v22, %v29, %v28 \n"
         " vmrghh %v30, %v30, %v25 \n"
         " vmrglh %v24, %v4, %v13 \n"
         " vmrghh %v10, %v10, %v5 \n"
         " vmrghh %v23, %v4, %v13 \n"
         " vmrghh %v27, %v29, %v28 \n"
         " vmrglh %v29, %v10, %v30 \n"
         " vmrglh %v4, %v31, %v26 \n"
         " vmrglh %v13, %v22, %v24 \n"
         " vmrghh %v10, %v10, %v30 \n"
         " vmrghh %v25, %v22, %v24 \n"
         " vmrglh %v24, %v4, %v13 \n"
         " vmrghh %v5, %v27, %v23 \n"
         " vmrglh %v28, %v27, %v23 \n"
         " vsubshs %v0, %v1, %v24 \n"
         " vmrghh %v30, %v31, %v26 \n"
         " vmrglh %v31, %v10, %v5 \n"
         " vmrglh %v26, %v30, %v25 \n"
         " vmrglh %v22, %v29, %v28 \n"
         " vmhraddshs %v14, %v2, %v31, %v0 \n"
         " vmrghh %v23, %v4, %v13 \n"
         " vmhraddshs %v24, %v2, %v24, %v31 \n"
         " vmhraddshs %v12, %v17, %v22, %v26 \n"
         " vmrghh %v27, %v29, %v28 \n"
         " vmhraddshs %v26, %v16, %v26, %v22 \n"
         " vmrghh %v0, %v10, %v5 \n"
         " vmhraddshs %v31, %v11, %v23, %v27 \n"
         " vmrghh %v30, %v30, %v25 \n"
         " vsubshs %v13, %v1, %v23 \n"
         " vaddshs %v10, %v0, %v8 \n"
         " vaddshs %v23, %v14, %v12 \n"
         " vsubshs %v12, %v14, %v12 \n"
         " vaddshs %v6, %v10, %v30 \n"
         " vsubshs %v14, %v24, %v26 \n"
         " vmhraddshs %v9, %v11, %v27, %v13 \n"
         " vaddshs %v24, %v24, %v26 \n"
         " vaddshs %v26, %v6, %v31 \n"
         " vsubshs %v13, %v10, %v30 \n"
         " vaddshs %v10, %v26, %v24 \n"
         " vsubshs %v31, %v6, %v31 \n"
         " vaddshs %v6, %v13, %v9 \n"
         " vsrah %v10, %v10, %v21 \n"
         " vsubshs %v13, %v13, %v9 \n"
         " vaddshs %v0, %v15, %v10 \n"
         " vsubshs %v9, %v14, %v12 \n"
         " vaddshs %v12, %v14, %v12 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " vaddshs %v4, %v31, %v23 \n"
         " vmhraddshs %v29, %v7, %v12, %v6 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " vsubshs %v5, %v31, %v23 \n"
         " lvx %v15, 0, %r5 \n"
         " vmhraddshs %v30, %v7, %v9, %v13 \n"
         " vsrah %v22, %v4, %v21 \n"
         " vperm %v15, %v15, %v1, %v3 \n"
         " vmhraddshs %v28, %v18, %v9, %v13 \n"
         " vsrah %v31, %v29, %v21 \n"
         " vsubshs %v13, %v26, %v24 \n"
         " vaddshs %v0, %v15, %v31 \n"
         " vsrah %v27, %v30, %v21 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " vsrah %v30, %v5, %v21 \n"
         " stvewx %v15, 0, %r5 \n"
         " vsrah %v26, %v28, %v21 \n"
         " stvewx %v15, %r9, %r5 \n"
         " vmhraddshs %v25, %v18, %v12, %v6 \n"
         " add %r5, %r5, %r6 \n"
         " vsrah %v24, %v13, %v21 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v19 \n"
         " vsrah %v23, %v25, %v21 \n"
         " vaddshs %v0, %v15, %v27 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v3 \n"
         " vaddshs %v0, %v15, %v22 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v19 \n"
         " vaddshs %v0, %v15, %v30 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v3 \n"
         " vaddshs %v0, %v15, %v26 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v19 \n"
         " vaddshs %v0, %v15, %v23 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"
         " add %r5, %r5, %r6 \n"
         " lvx %v15, 0, %r5 \n"
         " vperm %v15, %v15, %v1, %v3 \n"
         " vaddshs %v0, %v15, %v24 \n"
         " vpkshus %v15, %v0, %v0 \n"
         " stvewx %v15, 0, %r5 \n"
         " stvewx %v15, %r9, %r5 \n"

         "# addi %r0, %r1, 192 \n"
         "# bl _restv21 \n"
         "# lwz %r0, 196(%r1) \n"
         "# mtlr %r0 \n"
         "# la %r1, 192(%r1) \n"

         " addi %r9, %r4, 16 \n"
         " stvx %v1, 0, %r4 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r4, 32 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r9, %r4, 48 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r4, -64 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r9, %r4, -48 \n"
         " stvx %v1, 0, %r9 \n"
         " addi %r11, %r4, -32 \n"
         " stvx %v1, 0, %r11 \n"
         " addi %r4, %r4, -16 \n"
         " stvx %v1, 0, %r4 \n"
         );
}

void mpeg2_idct_altivec_init (void)
{
    extern uint8_t mpeg2_scan_norm[64];
    extern uint8_t mpeg2_scan_alt[64];
    int i, j;

    i = constants[0][0];	/* just pretending - keeps gcc happy */

    /* the altivec idct uses a transposed input, so we patch scan tables */
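    /* (j >> 3) | ((j & 7) << 3) swaps the row and the column of entry j,
       e.g. j = 1 (row 0, column 1) becomes 8 (row 1, column 0) */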
    for (i = 0; i < 64; i++) {
        j = mpeg2_scan_norm[i];
        mpeg2_scan_norm[i] = (j >> 3) | ((j & 7) << 3);
        j = mpeg2_scan_alt[i];
        mpeg2_scan_alt[i] = (j >> 3) | ((j & 7) << 3);
    }
}

#endif /* ARCH_PPC */

#else /* __ALTIVEC__ */

#define vector_s16_t vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int

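/*
 * One pass of the 8-point IDCT over eight vectors (vx0..vx7 in,
 * vy0..vy7 out), as a four-stage butterfly; IDCT below runs it twice,
 * once per dimension, with a transpose in between.
 */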
#define IDCT_HALF \
    /* 1st stage */ \
    t1 = vec_mradds (a1, vx7, vx1); \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
    t7 = vec_mradds (a2, vx5, vx3); \
    t3 = vec_mradds (ma2, vx3, vx5); \
    \
    /* 2nd stage */ \
    t5 = vec_adds (vx0, vx4); \
    t0 = vec_subs (vx0, vx4); \
    t2 = vec_mradds (a0, vx6, vx2); \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
    t6 = vec_adds (t8, t3); \
    t3 = vec_subs (t8, t3); \
    t8 = vec_subs (t1, t7); \
    t1 = vec_adds (t1, t7); \
    \
    /* 3rd stage */ \
    t7 = vec_adds (t5, t2); \
    t2 = vec_subs (t5, t2); \
    t5 = vec_adds (t0, t4); \
    t0 = vec_subs (t0, t4); \
    t4 = vec_subs (t8, t3); \
    t3 = vec_adds (t8, t3); \
    \
    /* 4th stage */ \
    vy0 = vec_adds (t7, t1); \
    vy7 = vec_subs (t7, t1); \
    vy1 = vec_mradds (c4, t3, t5); \
    vy6 = vec_mradds (mc4, t3, t5); \
    vy2 = vec_mradds (c4, t4, t0); \
    vy5 = vec_mradds (mc4, t4, t0); \
    vy3 = vec_adds (t2, t6); \
    vy4 = vec_subs (t2, t6);

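/*
 * Full 2-D IDCT: shift the input up by 4 and prescale it with
 * constants[1..4], run IDCT_HALF on the rows, transpose with the
 * three vec_mergeh/vec_mergel passes (adding the {32, 31} rounding
 * bias to vx0 on the last one), run IDCT_HALF on the columns and
 * shift the result back down by 6.
 */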
#define IDCT \
    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \
    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \
    vector_u16_t shift; \
    \
    c4 = vec_splat (constants[0], 0); \
    a0 = vec_splat (constants[0], 1); \
    a1 = vec_splat (constants[0], 2); \
    a2 = vec_splat (constants[0], 3); \
    mc4 = vec_splat (constants[0], 4); \
    ma2 = vec_splat (constants[0], 5); \
    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \
    \
    zero = vec_splat_s16 (0); \
    shift = vec_splat_u16 (4); \
    \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
    \
    IDCT_HALF \
    \
    vx0 = vec_mergeh (vy0, vy4); \
    vx1 = vec_mergel (vy0, vy4); \
    vx2 = vec_mergeh (vy1, vy5); \
    vx3 = vec_mergel (vy1, vy5); \
    vx4 = vec_mergeh (vy2, vy6); \
    vx5 = vec_mergel (vy2, vy6); \
    vx6 = vec_mergeh (vy3, vy7); \
    vx7 = vec_mergel (vy3, vy7); \
    \
    vy0 = vec_mergeh (vx0, vx4); \
    vy1 = vec_mergel (vx0, vx4); \
    vy2 = vec_mergeh (vx1, vx5); \
    vy3 = vec_mergel (vx1, vx5); \
    vy4 = vec_mergeh (vx2, vx6); \
    vy5 = vec_mergel (vx2, vx6); \
    vy6 = vec_mergeh (vx3, vx7); \
    vy7 = vec_mergel (vx3, vx7); \
    \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
    vx1 = vec_mergel (vy0, vy4); \
    vx2 = vec_mergeh (vy1, vy5); \
    vx3 = vec_mergel (vy1, vy5); \
    vx4 = vec_mergeh (vy2, vy6); \
    vx5 = vec_mergel (vy2, vy6); \
    vx6 = vec_mergeh (vy3, vy7); \
    vx7 = vec_mergel (vy3, vy7); \
    \
    IDCT_HALF \
    \
    shift = vec_splat_u16 (6); \
    vx0 = vec_sra (vy0, shift); \
    vx1 = vec_sra (vy1, shift); \
    vx2 = vec_sra (vy2, shift); \
    vx3 = vec_sra (vy3, shift); \
    vx4 = vec_sra (vy4, shift); \
    vx5 = vec_sra (vy5, shift); \
    vx6 = vec_sra (vy6, shift); \
    vx7 = vec_sra (vy7, shift);

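/*
 * Note: the parenthesized vector literals below are the old
 * gcc-2.95-era AltiVec syntax; newer compilers expect curly-brace
 * initializers here.  These are the same constants as in the asm
 * version above.
 */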
static const vector_s16_t constants[5] = {
    (vector_s16_t)(23170, 13573, 6518, 21895, -23170, -21895, 32, 31),
    (vector_s16_t)(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725),
    (vector_s16_t)(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521),
    (vector_s16_t)(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692),
    (vector_s16_t)(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)
};

void mpeg2_idct_copy_altivec (vector_s16_t * const block, unsigned char * dest,
                              const int stride)
{
    vector_u8_t tmp;

    IDCT

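/* pack one row of results to unsigned bytes with saturation, then store
   its 8 bytes as two 32-bit elements; vec_ste stores to the
   element-aligned address, so dest looks like it is assumed to be at
   least 4-byte aligned here */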
#define COPY(dest,src) \
    tmp = vec_packsu (src, src); \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    COPY (dest, vx0)	dest += stride;
    COPY (dest, vx1)	dest += stride;
    COPY (dest, vx2)	dest += stride;
    COPY (dest, vx3)	dest += stride;
    COPY (dest, vx4)	dest += stride;
    COPY (dest, vx5)	dest += stride;
    COPY (dest, vx6)	dest += stride;
    COPY (dest, vx7)

    memset (block, 0, 64 * sizeof (signed short));
}

void mpeg2_idct_add_altivec (const int last, vector_s16_t * const block,
                             unsigned char * dest, const int stride)
{
    vector_u8_t tmp;
    vector_s16_t tmp2, tmp3;
    vector_u8_t perm0;
    vector_u8_t perm1;
    vector_u8_t p0, p1, p;

    IDCT

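    /* Build permute masks that zero-extend 8 prediction bytes to signed
       shorts: vec_mergeh interleaves 0xff index bytes (which select
       bytes of the zero vector, the second vec_perm operand in ADD
       below) with the vec_lvsl alignment indices.  Two masks, since
       dest and dest + stride may have different alignment. */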
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

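/* load the (possibly unaligned) prediction row, zero-extend its 8 bytes
   to signed shorts through the permute mask (zero comes from the IDCT
   macro), add the IDCT output with saturation, and store back as in
   COPY above */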
#define ADD(dest,src,perm) \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
    tmp = vec_ld (0, dest); \
    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \
    tmp3 = vec_adds (tmp2, src); \
    tmp = vec_packsu (tmp3, tmp3); \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0)	dest += stride;
    ADD (dest, vx1, perm1)	dest += stride;
    ADD (dest, vx2, perm0)	dest += stride;
    ADD (dest, vx3, perm1)	dest += stride;
    ADD (dest, vx4, perm0)	dest += stride;
    ADD (dest, vx5, perm1)	dest += stride;
    ADD (dest, vx6, perm0)	dest += stride;
    ADD (dest, vx7, perm1)

    memset (block, 0, 64 * sizeof (signed short));
}

#endif /* __ALTIVEC__ */