Mercurial > libavcodec.hg
annotate sparc/simple_idct_vis.c @ 11032:01bd040f8607 libavcodec
Unroll main loop so the edge==0 case is separate.
This allows many things to be simplified away.
h264 decoder is overall 1% faster with a mbaff sample and
0.1% slower with the cathedral sample, probably because the slow loop
filter code must be loaded into the code cache for each first MB of each
row but isn't used for the following MBs.
author | michael |
---|---|
date | Thu, 28 Jan 2010 01:24:25 +0000 |
parents | 34a65026fa06 |
children | 98970e51365a |
rev | line source |
---|---|
5618 | 1 /* |
2 * SPARC VIS optimized inverse DCT | |
3 * Copyright (c) 2007 Denes Balatoni < dbalatoni XatX interware XdotX hu > | |
4 * | |
5 * I did consult the following fine web page about dct | |
6 * http://www.geocities.com/ssavekar/dct.htm | |
7 * | |
8 * This file is part of FFmpeg. | |
9 * | |
10 * FFmpeg is free software; you can redistribute it and/or | |
11 * modify it under the terms of the GNU Lesser General Public | |
12 * License as published by the Free Software Foundation; either | |
13 * version 2.1 of the License, or (at your option) any later version. | |
14 * | |
15 * FFmpeg is distributed in the hope that it will be useful, | |
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 * Lesser General Public License for more details. | |
19 * | |
20 * You should have received a copy of the GNU Lesser General Public | |
21 * License along with FFmpeg; if not, write to the Free Software | |
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 */ | |
24 | |
6763 | 25 #include "libavcodec/dsputil.h" |
5618 | 26 |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
8285
diff
changeset
|
/*
 * Multiplier table for IDCT4ROWS: seven constants, each repeated four times
 * so that one 64-bit load fills every 16-bit lane of a register.
 * INIT_IDCT loads the seven groups, in order, into %f32, %f34, ... %f44.
 */
27 static const DECLARE_ALIGNED_8(int16_t, coeffs)[28] = { |
5618 | 28 - 1259,- 1259,- 1259,- 1259,
29 - 4989,- 4989,- 4989,- 4989, | |
30 -11045,-11045,-11045,-11045, | |
31 -19195,-19195,-19195,-19195, | |
32 -29126,-29126,-29126,-29126, | |
33 25080, 25080, 25080, 25080, | |
34 12785, 12785, 12785, 12785 | |
35 }; | |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
8285
diff
changeset
|
/*
 * 65536>>6 (= 1024) in all four lanes. ff_simple_idct_vis passes this table
 * as asm operand %0, so INIT_IDCT places it in %f46, where SCALEROWS uses it
 * as the multiplier for the final results.
 */
36 static const DECLARE_ALIGNED_8(uint16_t, scale)[4] = { |
5618 | 37 65536>>6, 65536>>6, 65536>>6, 65536>>6
38 }; | |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
8285
diff
changeset
|
/*
 * 1<<5 (= 32) in all four lanes. The put/add variants pass this table as asm
 * operand %0, so INIT_IDCT places it in %f46; their second-pass ADDROUNDER
 * definition adds %f46 to %f28.
 */
39 static const DECLARE_ALIGNED_8(uint16_t, rounder)[4] = { |
5618 | 40 1<<5, 1<<5, 1<<5, 1<<5
41 }; | |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
8285
diff
changeset
|
/*
 * 1<<14 in all four lanes. ff_simple_idct_add_vis passes this table as asm
 * operand %5; ADDPIXELSCLAMPED loads it into %f18 and uses it as the fmul8x16
 * multiplier that widens the destination pixels before the IDCT output is
 * added to them.
 */
42 static const DECLARE_ALIGNED_8(uint16_t, expand)[4] = { |
5618 | 43 1<<14, 1<<14, 1<<14, 1<<14
44 }; | |
45 | |
/*
 * Common asm prologue.
 * Loads seven doublewords from the address in operand %1 into
 * %f32, %f34, ... %f44, one doubleword from the address in operand %0 into
 * %f46, and clears %f62 (used as the zero operand of the fcmpd tests in
 * IDCT4ROWS).
 */
46 #define INIT_IDCT \ | |
47 "ldd [%1], %%f32 \n\t"\ | |
48 "ldd [%1+8], %%f34 \n\t"\ | |
49 "ldd [%1+16], %%f36 \n\t"\ | |
50 "ldd [%1+24], %%f38 \n\t"\ | |
51 "ldd [%1+32], %%f40 \n\t"\ | |
52 "ldd [%1+40], %%f42 \n\t"\ | |
53 "ldd [%1+48], %%f44 \n\t"\ | |
54 "ldd [%0], %%f46 \n\t"\ | |
55 "fzero %%f62 \n\t"\ | |
56 | |
/*
 * LOADSCALE(in): load eight doublewords from the address expression `in`
 * (a string spliced into the asm text) at byte offsets 0, 16, ... 112 into
 * %f0, %f2, ... %f14, then add every register to itself four times in a row
 * with fpadd16, i.e. scale each loaded value by 16.
 */
57 #define LOADSCALE(in) \ | |
58 "ldd [" in "], %%f0 \n\t"\ | |
59 "ldd [" in "+16], %%f2 \n\t"\ | |
60 "ldd [" in "+32], %%f4 \n\t"\ | |
61 "ldd [" in "+48], %%f6 \n\t"\ | |
62 "ldd [" in "+64], %%f8 \n\t"\ | |
63 "ldd [" in "+80], %%f10 \n\t"\ | |
64 "ldd [" in "+96], %%f12 \n\t"\ | |
65 "ldd [" in "+112], %%f14 \n\t"\ | |
66 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ | |
67 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ | |
68 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ | |
69 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ | |
70 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ | |
71 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ | |
72 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ | |
73 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ | |
74 \ | |
75 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ | |
76 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ | |
77 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ | |
78 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ | |
79 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ | |
80 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ | |
81 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ | |
82 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ | |
83 \ | |
84 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ | |
85 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ | |
86 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ | |
87 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ | |
88 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ | |
89 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ | |
90 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ | |
91 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ | |
92 \ | |
93 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ | |
94 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ | |
95 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ | |
96 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ | |
97 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ | |
98 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ | |
99 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ | |
100 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ | |
101 | |
/*
 * LOAD(in): load eight consecutive doublewords (byte offsets 0, 8, ... 56
 * from the address expression `in`) into %f16, %f18, ... %f30, which are the
 * input registers of TRANSPOSE. No scaling is applied.
 */
102 #define LOAD(in) \ | |
103 "ldd [" in "], %%f16 \n\t"\ | |
104 "ldd [" in "+8], %%f18 \n\t"\ | |
105 "ldd [" in "+16], %%f20 \n\t"\ | |
106 "ldd [" in "+24], %%f22 \n\t"\ | |
107 "ldd [" in "+32], %%f24 \n\t"\ | |
108 "ldd [" in "+40], %%f26 \n\t"\ | |
109 "ldd [" in "+48], %%f28 \n\t"\ | |
110 "ldd [" in "+56], %%f30 \n\t"\ | |
111 | |
/*
 * Rearranges the data held in %f16..%f31 with three rounds of fpmerge
 * (%f16..%f31 -> %f0..%f15 -> %f16..%f31 -> %f0..%f15). The result ends up in
 * %f0, %f2, ... %f14, the input registers of IDCT4ROWS.
 * NOTE(review): presumably the three merge rounds amount to a transpose of
 * 16-bit elements, as the macro name says - verify against the VIS manual.
 */
112 #define TRANSPOSE \ | |
113 "fpmerge %%f16, %%f24, %%f0 \n\t"\ | |
114 "fpmerge %%f20, %%f28, %%f2 \n\t"\ | |
115 "fpmerge %%f17, %%f25, %%f4 \n\t"\ | |
116 "fpmerge %%f21, %%f29, %%f6 \n\t"\ | |
117 "fpmerge %%f18, %%f26, %%f8 \n\t"\ | |
118 "fpmerge %%f22, %%f30, %%f10 \n\t"\ | |
119 "fpmerge %%f19, %%f27, %%f12 \n\t"\ | |
120 "fpmerge %%f23, %%f31, %%f14 \n\t"\ | |
121 \ | |
122 "fpmerge %%f0, %%f2, %%f16 \n\t"\ | |
123 "fpmerge %%f1, %%f3, %%f18 \n\t"\ | |
124 "fpmerge %%f4, %%f6, %%f20 \n\t"\ | |
125 "fpmerge %%f5, %%f7, %%f22 \n\t"\ | |
126 "fpmerge %%f8, %%f10, %%f24 \n\t"\ | |
127 "fpmerge %%f9, %%f11, %%f26 \n\t"\ | |
128 "fpmerge %%f12, %%f14, %%f28 \n\t"\ | |
129 "fpmerge %%f13, %%f15, %%f30 \n\t"\ | |
130 \ | |
131 "fpmerge %%f16, %%f17, %%f0 \n\t"\ | |
132 "fpmerge %%f18, %%f19, %%f2 \n\t"\ | |
133 "fpmerge %%f20, %%f21, %%f4 \n\t"\ | |
134 "fpmerge %%f22, %%f23, %%f6 \n\t"\ | |
135 "fpmerge %%f24, %%f25, %%f8 \n\t"\ | |
136 "fpmerge %%f26, %%f27, %%f10 \n\t"\ | |
137 "fpmerge %%f28, %%f29, %%f12 \n\t"\ | |
138 "fpmerge %%f30, %%f31, %%f14 \n\t"\ | |
139 | |
/*
 * One IDCT pass over the eight input registers %f0, %f2, ... %f14, using the
 * multipliers in %f32..%f44 (loaded by INIT_IDCT) and %f62 as zero.
 *
 * Each product is formed from an fmul8ulx16 / fmul8sux16 pair whose two
 * partial results are accumulated separately. ADDROUNDER is expanded right
 * after the first group of multiplies, so it must be defined (possibly as
 * empty) at every place this macro is used.
 *
 * While the first group is computed, %f4|%f6, %f8|%f10 and %f12|%f14 are
 * compared against %f62 into %fcc0, %fcc1 and %fcc2. The fbe at the start of
 * groups 2, 3 and 4 skips a group whose inputs compared equal to zero. The
 * instruction written directly after each fbe finishes the preceding group
 * and is executed whether or not the branch is taken (presumably as the SPARC
 * branch delay slot).
 * NOTE(review): the zero test is a floating-point compare, so presumably a
 * bit pattern equal to -0.0 also takes the skip branch - confirm whether such
 * input can occur.
 *
 * Results of the final butterfly: sums in %f16, %f20, %f24, %f28 and
 * differences in %f48, %f50, %f52, %f54.
 * The macro defines the numeric local labels 3, 4 and 5.
 */
140 #define IDCT4ROWS \ | |
141 /* 1. column */\ | |
142 "fmul8ulx16 %%f0, %%f38, %%f28 \n\t"\ | |
143 "for %%f4, %%f6, %%f60 \n\t"\ | |
144 "fmul8ulx16 %%f2, %%f32, %%f18 \n\t"\ | |
145 "fmul8ulx16 %%f2, %%f36, %%f22 \n\t"\ | |
146 "fmul8ulx16 %%f2, %%f40, %%f26 \n\t"\ | |
147 "fmul8ulx16 %%f2, %%f44, %%f30 \n\t"\ | |
148 \ | |
149 ADDROUNDER\ | |
150 \ | |
151 "fmul8sux16 %%f0, %%f38, %%f48 \n\t"\ | |
152 "fcmpd %%fcc0, %%f62, %%f60 \n\t"\ | |
153 "for %%f8, %%f10, %%f60 \n\t"\ | |
154 "fmul8sux16 %%f2, %%f32, %%f50 \n\t"\ | |
155 "fmul8sux16 %%f2, %%f36, %%f52 \n\t"\ | |
156 "fmul8sux16 %%f2, %%f40, %%f54 \n\t"\ | |
157 "fmul8sux16 %%f2, %%f44, %%f56 \n\t"\ | |
158 \ | |
159 "fpadd16 %%f48, %%f28, %%f28 \n\t"\ | |
160 "fcmpd %%fcc1, %%f62, %%f60 \n\t"\ | |
161 "for %%f12, %%f14, %%f60 \n\t"\ | |
162 "fpadd16 %%f50, %%f18, %%f18 \n\t"\ | |
163 "fpadd16 %%f52, %%f22, %%f22 \n\t"\ | |
164 "fpadd16 %%f54, %%f26, %%f26 \n\t"\ | |
165 "fpadd16 %%f56, %%f30, %%f30 \n\t"\ | |
166 \ | |
167 "fpadd16 %%f28, %%f0, %%f16 \n\t"\ | |
168 "fcmpd %%fcc2, %%f62, %%f60 \n\t"\ | |
169 "fpadd16 %%f28, %%f0, %%f20 \n\t"\ | |
170 "fpadd16 %%f28, %%f0, %%f24 \n\t"\ | |
171 "fpadd16 %%f28, %%f0, %%f28 \n\t"\ | |
172 "fpadd16 %%f18, %%f2, %%f18 \n\t"\ | |
173 "fpadd16 %%f22, %%f2, %%f22 \n\t"\ | |
174 /* 2. column */\ | |
175 "fbe %%fcc0, 3f \n\t"\ | |
176 "fpadd16 %%f26, %%f2, %%f26 \n\t"\ | |
177 "fmul8ulx16 %%f4, %%f34, %%f48 \n\t"\ | |
178 "fmul8ulx16 %%f4, %%f42, %%f50 \n\t"\ | |
179 "fmul8ulx16 %%f6, %%f36, %%f52 \n\t"\ | |
180 "fmul8ulx16 %%f6, %%f44, %%f54 \n\t"\ | |
181 "fmul8ulx16 %%f6, %%f32, %%f56 \n\t"\ | |
182 "fmul8ulx16 %%f6, %%f40, %%f58 \n\t"\ | |
183 \ | |
184 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ | |
185 "fpadd16 %%f20, %%f50, %%f20 \n\t"\ | |
186 "fpsub16 %%f24, %%f50, %%f24 \n\t"\ | |
187 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ | |
188 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ | |
189 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ | |
190 "fpsub16 %%f26, %%f56, %%f26 \n\t"\ | |
191 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ | |
192 \ | |
193 "fmul8sux16 %%f4, %%f34, %%f48 \n\t"\ | |
194 "fmul8sux16 %%f4, %%f42, %%f50 \n\t"\ | |
195 "fmul8sux16 %%f6, %%f36, %%f52 \n\t"\ | |
196 "fmul8sux16 %%f6, %%f44, %%f54 \n\t"\ | |
197 "fmul8sux16 %%f6, %%f32, %%f56 \n\t"\ | |
198 "fmul8sux16 %%f6, %%f40, %%f58 \n\t"\ | |
199 \ | |
200 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ | |
201 "fpadd16 %%f20, %%f50, %%f20 \n\t"\ | |
202 "fpsub16 %%f24, %%f50, %%f24 \n\t"\ | |
203 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ | |
204 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ | |
205 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ | |
206 "fpsub16 %%f26, %%f56, %%f26 \n\t"\ | |
207 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ | |
208 \ | |
209 "fpadd16 %%f16, %%f4, %%f16 \n\t"\ | |
210 "fpsub16 %%f28, %%f4, %%f28 \n\t"\ | |
211 "fpadd16 %%f18, %%f6, %%f18 \n\t"\ | |
212 "fpsub16 %%f26, %%f6, %%f26 \n\t"\ | |
213 /* 3. column */\ | |
214 "3: \n\t"\ | |
215 "fbe %%fcc1, 4f \n\t"\ | |
216 "fpsub16 %%f30, %%f6, %%f30 \n\t"\ | |
217 "fmul8ulx16 %%f8, %%f38, %%f48 \n\t"\ | |
218 "fmul8ulx16 %%f10, %%f40, %%f50 \n\t"\ | |
219 "fmul8ulx16 %%f10, %%f32, %%f52 \n\t"\ | |
220 "fmul8ulx16 %%f10, %%f44, %%f54 \n\t"\ | |
221 "fmul8ulx16 %%f10, %%f36, %%f56 \n\t"\ | |
222 \ | |
223 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ | |
224 "fpsub16 %%f20, %%f48, %%f20 \n\t"\ | |
225 "fpsub16 %%f24, %%f48, %%f24 \n\t"\ | |
226 "fpadd16 %%f28, %%f48, %%f28 \n\t"\ | |
227 "fpadd16 %%f18, %%f50, %%f18 \n\t"\ | |
228 "fpsub16 %%f22, %%f52, %%f22 \n\t"\ | |
229 "fpadd16 %%f26, %%f54, %%f26 \n\t"\ | |
230 "fpadd16 %%f30, %%f56, %%f30 \n\t"\ | |
231 \ | |
232 "fmul8sux16 %%f8, %%f38, %%f48 \n\t"\ | |
233 "fmul8sux16 %%f10, %%f40, %%f50 \n\t"\ | |
234 "fmul8sux16 %%f10, %%f32, %%f52 \n\t"\ | |
235 "fmul8sux16 %%f10, %%f44, %%f54 \n\t"\ | |
236 "fmul8sux16 %%f10, %%f36, %%f56 \n\t"\ | |
237 \ | |
238 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ | |
239 "fpsub16 %%f20, %%f48, %%f20 \n\t"\ | |
240 "fpsub16 %%f24, %%f48, %%f24 \n\t"\ | |
241 "fpadd16 %%f28, %%f48, %%f28 \n\t"\ | |
242 "fpadd16 %%f18, %%f50, %%f18 \n\t"\ | |
243 "fpsub16 %%f22, %%f52, %%f22 \n\t"\ | |
244 "fpadd16 %%f26, %%f54, %%f26 \n\t"\ | |
245 "fpadd16 %%f30, %%f56, %%f30 \n\t"\ | |
246 \ | |
247 "fpadd16 %%f16, %%f8, %%f16 \n\t"\ | |
248 "fpsub16 %%f20, %%f8, %%f20 \n\t"\ | |
249 "fpsub16 %%f24, %%f8, %%f24 \n\t"\ | |
250 "fpadd16 %%f28, %%f8, %%f28 \n\t"\ | |
251 "fpadd16 %%f18, %%f10, %%f18 \n\t"\ | |
252 "fpsub16 %%f22, %%f10, %%f22 \n\t"\ | |
253 /* 4. column */\ | |
254 "4: \n\t"\ | |
255 "fbe %%fcc2, 5f \n\t"\ | |
256 "fpadd16 %%f30, %%f10, %%f30 \n\t"\ | |
257 "fmul8ulx16 %%f12, %%f42, %%f48 \n\t"\ | |
258 "fmul8ulx16 %%f12, %%f34, %%f50 \n\t"\ | |
259 "fmul8ulx16 %%f14, %%f44, %%f52 \n\t"\ | |
260 "fmul8ulx16 %%f14, %%f40, %%f54 \n\t"\ | |
261 "fmul8ulx16 %%f14, %%f36, %%f56 \n\t"\ | |
262 "fmul8ulx16 %%f14, %%f32, %%f58 \n\t"\ | |
263 \ | |
264 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ | |
265 "fpsub16 %%f20, %%f50, %%f20 \n\t"\ | |
266 "fpadd16 %%f24, %%f50, %%f24 \n\t"\ | |
267 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ | |
268 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ | |
269 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ | |
270 "fpadd16 %%f26, %%f56, %%f26 \n\t"\ | |
271 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ | |
272 \ | |
273 "fmul8sux16 %%f12, %%f42, %%f48 \n\t"\ | |
274 "fmul8sux16 %%f12, %%f34, %%f50 \n\t"\ | |
275 "fmul8sux16 %%f14, %%f44, %%f52 \n\t"\ | |
276 "fmul8sux16 %%f14, %%f40, %%f54 \n\t"\ | |
277 "fmul8sux16 %%f14, %%f36, %%f56 \n\t"\ | |
278 "fmul8sux16 %%f14, %%f32, %%f58 \n\t"\ | |
279 \ | |
280 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ | |
281 "fpsub16 %%f20, %%f50, %%f20 \n\t"\ | |
282 "fpadd16 %%f24, %%f50, %%f24 \n\t"\ | |
283 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ | |
284 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ | |
285 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ | |
286 "fpadd16 %%f26, %%f56, %%f26 \n\t"\ | |
287 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ | |
288 \ | |
289 "fpsub16 %%f20, %%f12, %%f20 \n\t"\ | |
290 "fpadd16 %%f24, %%f12, %%f24 \n\t"\ | |
291 "fpsub16 %%f22, %%f14, %%f22 \n\t"\ | |
292 "fpadd16 %%f26, %%f14, %%f26 \n\t"\ | |
293 "fpsub16 %%f30, %%f14, %%f30 \n\t"\ | |
294 /* final butterfly */\ | |
295 "5: \n\t"\ | |
296 "fpsub16 %%f16, %%f18, %%f48 \n\t"\ | |
297 "fpsub16 %%f20, %%f22, %%f50 \n\t"\ | |
298 "fpsub16 %%f24, %%f26, %%f52 \n\t"\ | |
299 "fpsub16 %%f28, %%f30, %%f54 \n\t"\ | |
300 "fpadd16 %%f16, %%f18, %%f16 \n\t"\ | |
301 "fpadd16 %%f20, %%f22, %%f20 \n\t"\ | |
302 "fpadd16 %%f24, %%f26, %%f24 \n\t"\ | |
303 "fpadd16 %%f28, %%f30, %%f28 \n\t"\ | |
304 | |
/*
 * STOREROWS(out): store the eight IDCT4ROWS results at a 16-byte stride from
 * the address expression `out`: the sums %f16, %f20, %f24, %f28 go to offsets
 * 0, 16, 32, 48 and the differences %f54, %f52, %f50, %f48 to offsets
 * 64, 80, 96, 112 (note the reversed register order for the differences).
 */
305 #define STOREROWS(out) \ | |
306 "std %%f48, [" out "+112] \n\t"\ | |
307 "std %%f50, [" out "+96] \n\t"\ | |
308 "std %%f52, [" out "+80] \n\t"\ | |
309 "std %%f54, [" out "+64] \n\t"\ | |
310 "std %%f16, [" out "] \n\t"\ | |
311 "std %%f20, [" out "+16] \n\t"\ | |
312 "std %%f24, [" out "+32] \n\t"\ | |
313 "std %%f28, [" out "+48] \n\t"\ | |
314 | |
/*
 * Multiplies each of the eight IDCT4ROWS result registers, in place, by %f46
 * using fmul8sux16. Only ff_simple_idct_vis uses this macro, and there %f46
 * holds the `scale` table loaded by INIT_IDCT.
 */
315 #define SCALEROWS \ | |
316 "fmul8sux16 %%f46, %%f48, %%f48 \n\t"\ | |
317 "fmul8sux16 %%f46, %%f50, %%f50 \n\t"\ | |
318 "fmul8sux16 %%f46, %%f52, %%f52 \n\t"\ | |
319 "fmul8sux16 %%f46, %%f54, %%f54 \n\t"\ | |
320 "fmul8sux16 %%f46, %%f16, %%f16 \n\t"\ | |
321 "fmul8sux16 %%f46, %%f20, %%f20 \n\t"\ | |
322 "fmul8sux16 %%f46, %%f24, %%f24 \n\t"\ | |
323 "fmul8sux16 %%f46, %%f28, %%f28 \n\t"\ | |
324 | |
/*
 * PUTPIXELSCLAMPED(dest): pack the eight IDCT4ROWS results with fpack16 and
 * store one 32-bit word per picture row at byte offset `dest` (a string,
 * "0" or "4" at the call sites).
 * Row pointers are asm operands %3 and %5..%11 as set up by
 * ff_simple_idct_put_vis. Rows receive, in order, the packed
 * %f16, %f20, %f24, %f28, %f54, %f52, %f50, %f48.
 * Overwrites %f0..%f14.
 */
325 #define PUTPIXELSCLAMPED(dest) \ | |
326 "fpack16 %%f48, %%f14 \n\t"\ | |
327 "fpack16 %%f50, %%f12 \n\t"\ | |
328 "fpack16 %%f16, %%f0 \n\t"\ | |
329 "fpack16 %%f20, %%f2 \n\t"\ | |
330 "fpack16 %%f24, %%f4 \n\t"\ | |
331 "fpack16 %%f28, %%f6 \n\t"\ | |
332 "fpack16 %%f54, %%f8 \n\t"\ | |
333 "fpack16 %%f52, %%f10 \n\t"\ | |
334 "st %%f0, [%3+" dest "] \n\t"\ | |
335 "st %%f2, [%5+" dest "] \n\t"\ | |
336 "st %%f4, [%6+" dest "] \n\t"\ | |
337 "st %%f6, [%7+" dest "] \n\t"\ | |
338 "st %%f8, [%8+" dest "] \n\t"\ | |
339 "st %%f10, [%9+" dest "] \n\t"\ | |
340 "st %%f12, [%10+" dest "] \n\t"\ | |
341 "st %%f14, [%11+" dest "] \n\t"\ | |
342 | |
/*
 * ADDPIXELSCLAMPED(dest): add the eight IDCT4ROWS results to the existing
 * picture and store the packed sum back.
 * Steps: load the multiplier from the address in operand %5 into %f18
 * (the `expand` table in ff_simple_idct_add_vis); load one 32-bit word per
 * row from operands %3 and %6..%12 at byte offset `dest`; widen the pixels
 * with fmul8x16; add %f16, %f20, %f24, %f28, %f54, %f52, %f50, %f48 in row
 * order; fpack16; store to the same addresses.
 * Overwrites %f0..%f14 and %f18.
 */
343 #define ADDPIXELSCLAMPED(dest) \ | |
344 "ldd [%5], %%f18 \n\t"\ | |
345 "ld [%3+" dest"], %%f0 \n\t"\ | |
346 "ld [%6+" dest"], %%f2 \n\t"\ | |
347 "ld [%7+" dest"], %%f4 \n\t"\ | |
348 "ld [%8+" dest"], %%f6 \n\t"\ | |
349 "ld [%9+" dest"], %%f8 \n\t"\ | |
350 "ld [%10+" dest"], %%f10 \n\t"\ | |
351 "ld [%11+" dest"], %%f12 \n\t"\ | |
352 "ld [%12+" dest"], %%f14 \n\t"\ | |
353 "fmul8x16 %%f0, %%f18, %%f0 \n\t"\ | |
354 "fmul8x16 %%f2, %%f18, %%f2 \n\t"\ | |
355 "fmul8x16 %%f4, %%f18, %%f4 \n\t"\ | |
356 "fmul8x16 %%f6, %%f18, %%f6 \n\t"\ | |
357 "fmul8x16 %%f8, %%f18, %%f8 \n\t"\ | |
358 "fmul8x16 %%f10, %%f18, %%f10 \n\t"\ | |
359 "fmul8x16 %%f12, %%f18, %%f12 \n\t"\ | |
360 "fmul8x16 %%f14, %%f18, %%f14 \n\t"\ | |
361 "fpadd16 %%f0, %%f16, %%f0 \n\t"\ | |
362 "fpadd16 %%f2, %%f20, %%f2 \n\t"\ | |
363 "fpadd16 %%f4, %%f24, %%f4 \n\t"\ | |
364 "fpadd16 %%f6, %%f28, %%f6 \n\t"\ | |
365 "fpadd16 %%f8, %%f54, %%f8 \n\t"\ | |
366 "fpadd16 %%f10, %%f52, %%f10 \n\t"\ | |
367 "fpadd16 %%f12, %%f50, %%f12 \n\t"\ | |
368 "fpadd16 %%f14, %%f48, %%f14 \n\t"\ | |
369 "fpack16 %%f0, %%f0 \n\t"\ | |
370 "fpack16 %%f2, %%f2 \n\t"\ | |
371 "fpack16 %%f4, %%f4 \n\t"\ | |
372 "fpack16 %%f6, %%f6 \n\t"\ | |
373 "fpack16 %%f8, %%f8 \n\t"\ | |
374 "fpack16 %%f10, %%f10 \n\t"\ | |
375 "fpack16 %%f12, %%f12 \n\t"\ | |
376 "fpack16 %%f14, %%f14 \n\t"\ | |
377 "st %%f0, [%3+" dest "] \n\t"\ | |
378 "st %%f2, [%6+" dest "] \n\t"\ | |
379 "st %%f4, [%7+" dest "] \n\t"\ | |
380 "st %%f6, [%8+" dest "] \n\t"\ | |
381 "st %%f8, [%9+" dest "] \n\t"\ | |
382 "st %%f10, [%10+" dest "] \n\t"\ | |
383 "st %%f12, [%11+" dest "] \n\t"\ | |
384 "st %%f14, [%12+" dest "] \n\t"\ | |
385 | |
386 | |
8285
197fe6f703a3
Remove useless inline qualifier, fixes linking with gcc 4.3.
diego
parents:
8031
diff
changeset
|
/**
 * Inverse DCT of the coefficient block at `data`; the result is written back
 * over `data`.
 *
 * Asm operands: %0 = scale, %1 = coeffs, %2 = data, %3 = temp (an 8*8 int16_t
 * scratch buffer on the stack).
 *
 * First pass: LOADSCALE + IDCT4ROWS is run for byte offset 8 and then for
 * byte offset 0 of data. The first run is stored to temp+8 with STOREROWS;
 * of the second run only the four difference registers are stored (temp+64,
 * +80, +96, +112) while its four sums stay in %f16/%f20/%f24/%f28.
 * Second pass: the four sums of the first run are reloaded from temp+8, +24,
 * +40, +56 into %f18/%f22/%f26/%f30, then TRANSPOSE, IDCT4ROWS, SCALEROWS and
 * STOREROWS to data+0; the other half is fetched with LOAD from temp+64 and
 * stored to data+8.
 *
 * ADDROUNDER is defined empty inside this function and is not redefined, so
 * no rounding constant is added in either pass.
 *
 * out1..out4 are dummy outputs tied to the four inputs through matching
 * constraints; their values are never read.
 *
 * NOTE(review): the asm statement declares no clobbers although it writes FP
 * registers, %fcc0-%fcc2 and memory (data, temp) - confirm this is safe with
 * the compilers in use.
 */
387 void ff_simple_idct_vis(DCTELEM *data) { |
5618 | 388 int out1, out2, out3, out4;
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
8285
diff
changeset
|
389 DECLARE_ALIGNED_8(int16_t, temp)[8*8];
5618 | 390
8031 | 391 __asm__ volatile(
5618 | 392 INIT_IDCT
393 | |
394 #define ADDROUNDER | |
395 | |
396 // shift right 16-4=12 | |
397 LOADSCALE("%2+8") | |
398 IDCT4ROWS | |
399 STOREROWS("%3+8") | |
400 LOADSCALE("%2+0") | |
401 IDCT4ROWS | |
402 "std %%f48, [%3+112] \n\t" | |
403 "std %%f50, [%3+96] \n\t" | |
404 "std %%f52, [%3+80] \n\t" | |
405 "std %%f54, [%3+64] \n\t" | |
406 | |
407 // shift right 16+4 | |
408 "ldd [%3+8], %%f18 \n\t" | |
409 "ldd [%3+24], %%f22 \n\t" | |
410 "ldd [%3+40], %%f26 \n\t" | |
411 "ldd [%3+56], %%f30 \n\t" | |
412 TRANSPOSE | |
413 IDCT4ROWS | |
414 SCALEROWS | |
415 STOREROWS("%2+0") | |
416 LOAD("%3+64") | |
417 TRANSPOSE | |
418 IDCT4ROWS | |
419 SCALEROWS | |
420 STOREROWS("%2+8") | |
421 | |
422 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4) | |
423 : "0" (scale), "1" (coeffs), "2" (data), "3" (temp) | |
424 ); | |
425 } | |
426 | |
/**
 * Inverse DCT of the block at `data`, with the packed result stored to eight
 * picture rows starting at `dest` and spaced `line_size` bytes apart.
 *
 * Asm operands: %0 = rounder, %1 = coeffs, %2 = data, %3 = dest,
 * %4 = line_size; %5..%11 (r1..r7) are computed in the asm as
 * dest + 1*line_size ... dest + 7*line_size.
 *
 * The GSR register is written with 0x8 first (presumably the scale setting
 * that fpack16 in PUTPIXELSCLAMPED depends on - TODO confirm).
 *
 * The first pass uses `data` itself as scratch space (STOREROWS("%2+8") and
 * the explicit std to data+64..+112), so the coefficient block is
 * overwritten. The second pass stores 4 bytes per row at offset 0 and then
 * at offset 4 with PUTPIXELSCLAMPED.
 *
 * ADDROUNDER: the first pass expands whatever definition is still active from
 * earlier in the file (the empty one from ff_simple_idct_vis); before the
 * second pass it is redefined to add %f46 (the rounder table) to %f28.
 * NOTE(review): relying on the earlier definition is order-dependent;
 * ff_simple_idct_add_vis redefines it explicitly instead.
 *
 * NOTE(review): the asm statement declares no clobbers although it writes FP
 * registers, %fcc0-%fcc2, %gsr and memory - confirm this is safe with the
 * compilers in use.
 */
427 void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data) { | |
428 int out1, out2, out3, out4, out5; | |
429 int r1, r2, r3, r4, r5, r6, r7; | |
430 | |
8031 | 431 __asm__ volatile(
5618 | 432 "wr %%g0, 0x8, %%gsr \n\t"
433 | |
434 INIT_IDCT | |
435 | |
436 "add %3, %4, %5 \n\t" | |
437 "add %5, %4, %6 \n\t" | |
438 "add %6, %4, %7 \n\t" | |
439 "add %7, %4, %8 \n\t" | |
440 "add %8, %4, %9 \n\t" | |
441 "add %9, %4, %10 \n\t" | |
442 "add %10, %4, %11 \n\t" | |
443 | |
444 // shift right 16-4=12 | |
445 LOADSCALE("%2+8") | |
446 IDCT4ROWS | |
447 STOREROWS("%2+8") | |
448 LOADSCALE("%2+0") | |
449 IDCT4ROWS | |
450 "std %%f48, [%2+112] \n\t" | |
451 "std %%f50, [%2+96] \n\t" | |
452 "std %%f52, [%2+80] \n\t" | |
453 "std %%f54, [%2+64] \n\t" | |
454 | |
455 #undef ADDROUNDER | |
456 #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t" | |
457 | |
458 // shift right 16+4 | |
459 "ldd [%2+8], %%f18 \n\t" | |
460 "ldd [%2+24], %%f22 \n\t" | |
461 "ldd [%2+40], %%f26 \n\t" | |
462 "ldd [%2+56], %%f30 \n\t" | |
463 TRANSPOSE | |
464 IDCT4ROWS | |
465 PUTPIXELSCLAMPED("0") | |
466 LOAD("%2+64") | |
467 TRANSPOSE | |
468 IDCT4ROWS | |
469 PUTPIXELSCLAMPED("4") | |
470 | |
471 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), | |
472 "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7) | |
473 : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size) | |
474 ); | |
475 } | |
476 | |
/**
 * Inverse DCT of the block at `data`, with the result added to the existing
 * contents of eight picture rows starting at `dest` and spaced `line_size`
 * bytes apart.
 *
 * Asm operands: %0 = rounder, %1 = coeffs, %2 = data, %3 = dest,
 * %4 = line_size, %5 = expand; %6..%12 (r1..r7) are computed in the asm as
 * dest + 1*line_size ... dest + 7*line_size.
 *
 * The GSR register is written with 0x8 first (presumably the scale setting
 * that fpack16 in ADDPIXELSCLAMPED depends on - TODO confirm).
 *
 * The first pass runs with ADDROUNDER defined empty and uses `data` itself as
 * scratch space, so the coefficient block is overwritten. For the second
 * pass ADDROUNDER is redefined to add %f46 (the rounder table) to %f28, and
 * ADDPIXELSCLAMPED updates 4 bytes per row at offset 0 and then at offset 4.
 *
 * NOTE(review): the asm statement declares no clobbers although it writes FP
 * registers, %fcc0-%fcc2, %gsr and memory - confirm this is safe with the
 * compilers in use.
 */
477 void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data) { | |
478 int out1, out2, out3, out4, out5, out6; | |
479 int r1, r2, r3, r4, r5, r6, r7; | |
480 | |
8031 | 481 __asm__ volatile(
5618 | 482 "wr %%g0, 0x8, %%gsr \n\t"
483 | |
484 INIT_IDCT | |
485 | |
486 "add %3, %4, %6 \n\t" | |
487 "add %6, %4, %7 \n\t" | |
488 "add %7, %4, %8 \n\t" | |
489 "add %8, %4, %9 \n\t" | |
490 "add %9, %4, %10 \n\t" | |
491 "add %10, %4, %11 \n\t" | |
492 "add %11, %4, %12 \n\t" | |
493 | |
494 #undef ADDROUNDER | |
495 #define ADDROUNDER | |
496 | |
497 // shift right 16-4=12 | |
498 LOADSCALE("%2+8") | |
499 IDCT4ROWS | |
500 STOREROWS("%2+8") | |
501 LOADSCALE("%2+0") | |
502 IDCT4ROWS | |
503 "std %%f48, [%2+112] \n\t" | |
504 "std %%f50, [%2+96] \n\t" | |
505 "std %%f52, [%2+80] \n\t" | |
506 "std %%f54, [%2+64] \n\t" | |
507 | |
508 #undef ADDROUNDER | |
509 #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t" | |
510 | |
511 // shift right 16+4 | |
512 "ldd [%2+8], %%f18 \n\t" | |
513 "ldd [%2+24], %%f22 \n\t" | |
514 "ldd [%2+40], %%f26 \n\t" | |
515 "ldd [%2+56], %%f30 \n\t" | |
516 TRANSPOSE | |
517 IDCT4ROWS | |
518 ADDPIXELSCLAMPED("0") | |
519 LOAD("%2+64") | |
520 TRANSPOSE | |
521 IDCT4ROWS | |
522 ADDPIXELSCLAMPED("4") | |
523 | |
524 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6), | |
525 "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7) | |
526 : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size), "5" (expand) | |
527 ); | |
528 } |