armv4l/simple_idct_armv6.S @ 4427:765df9cbb2b3 (libavcodec)

ARMv6 SIMD IDCT

author:   mru
date:     Sun, 28 Jan 2007 21:32:08 +0000
parents:
children: cab2986ffc0b
comparison: 4426:65ef5fd314ad -> 4427:765df9cbb2b3

/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2007 Mans Rullgard <mru@inprovide.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define W1  22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2  21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3  19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4  16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W5  12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6   8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7   4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11
#define COL_SHIFT 20

#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W42 (W4 | (W2 << 16))
#define W42n (-W4&0xffff | (-W2 << 16))
#define W46 (W4 | (W6 << 16))
#define W57 (W5 | (W7 << 16))
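
/*
  The W1..W7 values above are the usual simple_idct fixed-point weights,
  round(cos(i*M_PI/16) * sqrt(2) * (1 << 14)) for i = 1..7; W4 is kept at
  16383 rather than the rounded 16384.  W13/W26/W42/W42n/W46/W57 pack two
  weights into one word so the ARMv6 dual 16-bit multiplies
  (SMUAD/SMLAD/SMUSD/SMUSDX) can handle a coefficient pair per instruction.

  Reference-only C sketch, not part of the build; pack16(), w() and
  smlad_ref() are hypothetical helpers used purely for illustration:

      #include <math.h>
      #include <stdint.h>

      // pack two signed 16-bit values into one word, low half first
      static uint32_t pack16(int lo, int hi)
      {
          return (uint16_t)lo | ((uint32_t)(uint16_t)hi << 16);
      }

      // weight i, as in the comments on W1..W7 above
      static int w(int i)
      {
          return (int)(cos(i * M_PI / 16) * sqrt(2) * (1 << 14) + 0.5);
      }

      // what SMLAD acc, x, y computes: acc + x.lo*y.lo + x.hi*y.hi
      static int32_t smlad_ref(uint32_t x, uint32_t y, int32_t acc)
      {
          return acc + (int16_t)x * (int16_t)y
                     + (int16_t)(x >> 16) * (int16_t)(y >> 16);
      }
*/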

        .text
        .align
w13:    .long W13
w26:    .long W26
w42:    .long W42
w42n:   .long W42n
w46:    .long W46
w57:    .long W57

/*
  Compute partial IDCT of single row.
  shift = left-shift amount
  a1 = source address

  Output in registers v1--v8
*/
        .macro idct_row shift
        ldr    a3, [a1]               /* a3 = row[2,0] */
        ldr    ip, [pc, #(w42-.-8)]   /* ip = W4 | (W2 << 16) */
        ldr    lr, [pc, #(w46-.-8)]   /* lr = W4 | (W6 << 16) */
        ldr    a4, [a1, #8]           /* a4 = row[3,1] */
        mov    a2, #(1<<(\shift-1))
        smlad  v1, a3, ip, a2
        smlsd  v4, a3, ip, a2
        ldr    ip, [pc, #(w13-.-8)]   /* ip = W1 | (W3 << 16) */
        ldr    v7, [pc, #(w57-.-8)]   /* v7 = W5 | (W7 << 16) */
        smlad  v2, a3, lr, a2
        smlsd  v3, a3, lr, a2

        smuad  v5, a4, ip             /* v5 = B0 = W1*row[1] + W3*row[3] */
        smusdx fp, a4, v7             /* fp = B3 = W7*row[1] - W5*row[3] */
        ldr    lr, [a1, #12]          /* lr = row[7,5] */
        pkhtb  a3, ip, v7, asr #16    /* a3 = W7 | (W3 << 16) */
        pkhbt  a2, ip, v7, lsl #16    /* a2 = W1 | (W5 << 16) */
        smlad  v5, lr, v7, v5         /* B0 += W5*row[5] + W7*row[7] */
        smusdx v7, a4, a2             /* v7 = B2 = W5*row[1] - W1*row[3] */
        smusdx v6, a3, a4             /* v6 = -B1 = W7*row[3] - W3*row[1] */

        smlad  v7, lr, a3, v7         /* B2 += W7*row[5] + W3*row[7] */
        ldr    a3, [a1, #4]           /* a3 = row[6,4] */
        ldr    a4, [pc, #(w42n-.-8)]  /* a4 = -W4 | (-W2 << 16) */
        smlsdx fp, lr, ip, fp         /* B3 += W3*row[5] - W1*row[7] */
        ldr    ip, [pc, #(w46-.-8)]   /* ip = W4 | (W6 << 16) */
        smlad  v6, lr, a2, v6         /* B1 -= W1*row[5] + W5*row[7] */

        smlad  v2, a3, a4, v2         /* A1 += -W4*row[4] - W2*row[6] */
        smlsd  v3, a3, a4, v3         /* A2 += -W4*row[4] + W2*row[6] */
        smlad  v1, a3, ip, v1         /* A0 += W4*row[4] + W6*row[6] */
        smlsd  v4, a3, ip, v4         /* A3 += W4*row[4] - W6*row[6] */
        .endm
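
/*
  Reference-only C model of the idct_row macro above (for reading along,
  not for building).  It assumes the halfword layout implied by the load
  comments: the even coefficients sit in src[0..3] and the odd ones in
  src[4..7], i.e. src = { row0, row2, row4, row6, row1, row3, row5, row7 }.
  Note that v6 accumulates -B1, which is why the finish macros below
  subtract where an add might be expected, and vice versa.

      #include <stdint.h>

      static void idct_row_ref(const int16_t *src, int shift,
                               int A[4], int B[4])
      {
          const int round = 1 << (shift - 1);
          int r0 = src[0], r2 = src[1], r4 = src[2], r6 = src[3];
          int r1 = src[4], r3 = src[5], r5 = src[6], r7 = src[7];

          A[0] = round + W4*r0 + W2*r2 + W4*r4 + W6*r6;  // v1
          A[1] = round + W4*r0 + W6*r2 - W4*r4 - W2*r6;  // v2
          A[2] = round + W4*r0 - W6*r2 - W4*r4 + W2*r6;  // v3
          A[3] = round + W4*r0 - W2*r2 + W4*r4 - W6*r6;  // v4

          B[0] = W1*r1 + W3*r3 + W5*r5 + W7*r7;          // v5
          B[1] = W3*r1 - W7*r3 - W1*r5 - W5*r7;          // v6 holds -B[1]
          B[2] = W5*r1 - W1*r3 + W7*r5 + W3*r7;          // v7
          B[3] = W7*r1 - W5*r3 + W3*r5 - W1*r7;          // fp
      }
*/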

/*
  Compute final part of IDCT single row without shift.
  Input in registers v1--v8
  Output in registers ip, v1--v3, lr, v5--v7
*/
        .macro idct_finish
        add    ip, v1, v5             /* ip = A0 + B0 */
        sub    lr, v1, v5             /* lr = A0 - B0 */
        sub    v1, v2, v6             /* v1 = A1 + B1 */
        add    v5, v2, v6             /* v5 = A1 - B1 */
        add    v2, v3, v7             /* v2 = A2 + B2 */
        sub    v6, v3, v7             /* v6 = A2 - B2 */
        add    v3, v4, fp             /* v3 = A3 + B3 */
        sub    v7, v4, fp             /* v7 = A3 - B3 */
        .endm

/*
  Compute final part of IDCT single row.
  shift = right-shift amount
  Input/output in registers v1--v8
*/
        .macro idct_finish_shift shift
        add    a4, v1, v5             /* a4 = A0 + B0 */
        sub    a3, v1, v5             /* a3 = A0 - B0 */
        mov    v1, a4, asr #\shift
        mov    v5, a3, asr #\shift

        sub    a4, v2, v6             /* a4 = A1 + B1 */
        add    a3, v2, v6             /* a3 = A1 - B1 */
        mov    v2, a4, asr #\shift
        mov    v6, a3, asr #\shift

        add    a4, v3, v7             /* a4 = A2 + B2 */
        sub    a3, v3, v7             /* a3 = A2 - B2 */
        mov    v3, a4, asr #\shift
        mov    v7, a3, asr #\shift

        add    a4, v4, fp             /* a4 = A3 + B3 */
        sub    a3, v4, fp             /* a3 = A3 - B3 */
        mov    v4, a4, asr #\shift
        mov    fp, a3, asr #\shift
        .endm
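
/*
  idct_finish, idct_finish_shift and idct_finish_shift_sat (below) all
  perform the same butterfly; they differ only in the post-processing
  (none, arithmetic shift, or shift plus 8-bit saturation).  Reference-only
  C sketch of idct_finish_shift, continuing the hypothetical idct_row_ref()
  model above; remember that register v6 holds -B[1], so the +/- of that
  pair look swapped in the assembly:

      static void idct_finish_shift_ref(const int A[4], const int B[4],
                                        int shift, int out[8])
      {
          out[0] = (A[0] + B[0]) >> shift;  // v1
          out[7] = (A[0] - B[0]) >> shift;  // v5
          out[1] = (A[1] + B[1]) >> shift;  // v2
          out[6] = (A[1] - B[1]) >> shift;  // v6
          out[2] = (A[2] + B[2]) >> shift;  // v3
          out[5] = (A[2] - B[2]) >> shift;  // v7
          out[3] = (A[3] + B[3]) >> shift;  // v4
          out[4] = (A[3] - B[3]) >> shift;  // fp
      }
*/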

/*
  Compute final part of IDCT single row, saturating results at 8 bits.
  shift = right-shift amount
  Input/output in registers v1--v8
*/
        .macro idct_finish_shift_sat shift
        add    a4, v1, v5             /* a4 = A0 + B0 */
        sub    ip, v1, v5             /* ip = A0 - B0 */
        usat   v1, #8, a4, asr #\shift
        usat   v5, #8, ip, asr #\shift

        sub    a4, v2, v6             /* a4 = A1 + B1 */
        add    ip, v2, v6             /* ip = A1 - B1 */
        usat   v2, #8, a4, asr #\shift
        usat   v6, #8, ip, asr #\shift

        add    a4, v3, v7             /* a4 = A2 + B2 */
        sub    ip, v3, v7             /* ip = A2 - B2 */
        usat   v3, #8, a4, asr #\shift
        usat   v7, #8, ip, asr #\shift

        add    a4, v4, fp             /* a4 = A3 + B3 */
        sub    ip, v4, fp             /* ip = A3 - B3 */
        usat   v4, #8, a4, asr #\shift
        usat   fp, #8, ip, asr #\shift
        .endm
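
/*
  USAT Rd, #8, Rm, ASR #shift shifts and clamps to the unsigned 8-bit range
  in one instruction.  Roughly, in C:

      static unsigned usat8_asr_ref(int x, int shift)
      {
          x >>= shift;              // arithmetic shift, as in the asm
          if (x < 0)   return 0;    // saturate low
          if (x > 255) return 255;  // saturate high
          return x;
      }
*/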

/*
  Compute IDCT of single row, storing as column.
  a1 = source
  a2 = dest
*/
        .align
        .func idct_row_armv6
idct_row_armv6:
        stmfd  sp!, {a2, lr}

        idct_row ROW_SHIFT
        ldr    a2, [sp], #4
        idct_finish_shift ROW_SHIFT

        strh   v1, [a2]
        strh   v2, [a2, #(16*2)]
        strh   v3, [a2, #(16*4)]
        strh   v4, [a2, #(16*6)]
        strh   fp, [a2, #(16*1)]
        strh   v7, [a2, #(16*3)]
        strh   v6, [a2, #(16*5)]
        strh   v5, [a2, #(16*7)]

        ldr    pc, [sp], #4
        .endfunc
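
/*
  The stores above place the row result in one int16_t column of the
  destination block (each #16 step is one 8-element line), but the lines
  are visited in the interleaved order 0,2,4,6,1,3,5,7 rather than 0..7.
  This appears to match both the even/odd source layout the idct_row macro
  expects and the order in which the idct_rows driver below steps through
  its source lines, so the column pass reads the data back in the layout it
  wants.  Rough mapping, with dst pointing at column c of int16_t block[8][8]
  and out[] as in the sketches above:

      // block[0][c] = out[0];  block[2][c] = out[1];
      // block[4][c] = out[2];  block[6][c] = out[3];
      // block[1][c] = out[4];  block[3][c] = out[5];
      // block[5][c] = out[6];  block[7][c] = out[7];
*/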

/*
  Compute IDCT of single column, read as row.
  a1 = source
  a2 = dest
*/
        .align
        .func idct_col_armv6
idct_col_armv6:
        stmfd  sp!, {a2, lr}

        idct_row COL_SHIFT
        ldr    a2, [sp], #4
        idct_finish_shift COL_SHIFT

        strh   v1, [a2]
        strh   v2, [a2, #(16*1)]
        strh   v3, [a2, #(16*2)]
        strh   v4, [a2, #(16*3)]
        strh   fp, [a2, #(16*4)]
        strh   v7, [a2, #(16*5)]
        strh   v6, [a2, #(16*6)]
        strh   v5, [a2, #(16*7)]

        ldr    pc, [sp], #4
        .endfunc

/*
  Compute IDCT of single column, read as row, store saturated 8-bit.
  a1 = source
  a2 = dest
  a3 = line size
*/
        .align
        .func idct_col_put_armv6
idct_col_put_armv6:
        stmfd  sp!, {a2, a3, lr}

        idct_row COL_SHIFT
        ldmfd  sp!, {a2, a3}
        idct_finish_shift_sat COL_SHIFT

        strb   v1, [a2], a3
        strb   v2, [a2], a3
        strb   v3, [a2], a3
        strb   v4, [a2], a3
        strb   fp, [a2], a3
        strb   v7, [a2], a3
        strb   v6, [a2], a3
        strb   v5, [a2], a3

        sub    a2, a2, a3, lsl #3

        ldr    pc, [sp], #4
        .endfunc

/*
  Compute IDCT of single column, read as row, add/store saturated 8-bit.
  a1 = source
  a2 = dest
  a3 = line size
*/
        .align
        .func idct_col_add_armv6
idct_col_add_armv6:
        stmfd  sp!, {a2, a3, lr}

        idct_row COL_SHIFT
        ldmfd  sp!, {a2, a3}
        idct_finish

        ldrb   a4, [a2]
        ldrb   v4, [a2, a3]
        ldrb   fp, [a2, a3, lsl #2]
        add    ip, a4, ip, asr #COL_SHIFT
        usat   ip, #8, ip
        add    v1, v4, v1, asr #COL_SHIFT
        strb   ip, [a2], a3
        ldrb   ip, [a2, a3]
        usat   v1, #8, v1
        ldrb   fp, [a2, a3, lsl #2]
        add    v2, ip, v2, asr #COL_SHIFT
        usat   v2, #8, v2
        strb   v1, [a2], a3
        ldrb   a4, [a2, a3]
        ldrb   ip, [a2, a3, lsl #2]
        strb   v2, [a2], a3
        ldrb   v4, [a2, a3]
        ldrb   v1, [a2, a3, lsl #2]
        add    v3, a4, v3, asr #COL_SHIFT
        usat   v3, #8, v3
        add    v7, v4, v7, asr #COL_SHIFT
        usat   v7, #8, v7
        add    v6, fp, v6, asr #COL_SHIFT
        usat   v6, #8, v6
        add    v5, ip, v5, asr #COL_SHIFT
        usat   v5, #8, v5
        add    lr, v1, lr, asr #COL_SHIFT
        usat   lr, #8, lr
        strb   v3, [a2], a3
        strb   v7, [a2], a3
        strb   v6, [a2], a3
        strb   v5, [a2], a3
        strb   lr, [a2], a3

        sub    a2, a2, a3, lsl #3

        ldr    pc, [sp], #4
        .endfunc
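
/*
  Per output pixel this routine loads the existing destination byte, adds
  the IDCT value shifted down by COL_SHIFT and saturates to 8 bits, with
  the loads for later rows interleaved, apparently to hide load latency.
  Reference-only C sketch of the per-column work, with out[] as produced by
  the unshifted idct_finish butterfly; the final sub restores a2 so the
  caller can step to the next column:

      static void idct_col_add_ref(const int out[8], unsigned char *dest,
                                   int line_size)
      {
          int i, v;
          for (i = 0; i < 8; i++) {
              v = dest[i * line_size] + (out[i] >> COL_SHIFT);
              if (v < 0)   v = 0;
              if (v > 255) v = 255;
              dest[i * line_size] = v;
          }
      }
*/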

/*
  Compute 8 IDCT row transforms.
  func = IDCT row->col function
  width = width of columns in bytes
*/
        .macro idct_rows func width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        sub    a1, a1, #(16*5)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func

        sub    a1, a1, #(16*7)
        .endm
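
/*
  The add/sub sequence on a1 walks the 16-byte source lines in the order
  0, 2, 4, 6, 1, 3, 5, 7 (byte offsets 0, 32, 64, 96, 16, 48, 80, 112) and
  finally rewinds a1 to its starting value, while a2 advances by one column
  step per call.  Reference-only C sketch of the driver, simplified to the
  int16_t-to-int16_t passes (width = 2, i.e. one int16_t column per call);
  idct_fn and idct_rows_ref are hypothetical names:

      #include <stdint.h>

      typedef void (*idct_fn)(const int16_t *src, int16_t *dst);

      static void idct_rows_ref(idct_fn func, const int16_t *src, int16_t *dst)
      {
          static const int line_order[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
          int i;
          for (i = 0; i < 8; i++)
              func(src + 8 * line_order[i], dst + i);
      }
*/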

        .align
        .global ff_simple_idct_armv6
        .func ff_simple_idct_armv6
/* void ff_simple_idct_armv6(DCTELEM *data); */
ff_simple_idct_armv6:
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub    sp, sp, #128

        mov    a2, sp
        idct_rows idct_row_armv6, 2
        mov    a2, a1
        mov    a1, sp
        idct_rows idct_col_armv6, 2

        add    sp, sp, #128
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc
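
/*
  Rough outline of the two-pass structure above, treating idct_row_armv6
  and idct_col_armv6 as functions of (src, dst) as in the hypothetical
  idct_rows_ref() sketch (the real code passes them in a1/a2 through the
  idct_rows macro):

      // int16_t tmp[64];                          -- the 128-byte stack buffer
      // idct_rows_ref(idct_row_armv6, data, tmp); -- pass 1: lines of data
      //                                              become columns of tmp
      // idct_rows_ref(idct_col_armv6, tmp, data); -- pass 2: lines of tmp
      //                                              become columns of data
*/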

        .align
        .global ff_simple_idct_add_armv6
        .func ff_simple_idct_add_armv6
/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
ff_simple_idct_add_armv6:
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub    sp, sp, #128

        mov    a1, a3
        mov    a2, sp
        idct_rows idct_row_armv6, 2
        mov    a1, sp
        ldr    a2, [sp, #128]
        ldr    a3, [sp, #(128+4)]
        idct_rows idct_col_add_armv6, 1

        add    sp, sp, #(128+8)
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc

        .align
        .global ff_simple_idct_put_armv6
        .func ff_simple_idct_put_armv6
/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
ff_simple_idct_put_armv6:
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
        sub    sp, sp, #128

        mov    a1, a3
        mov    a2, sp
        idct_rows idct_row_armv6, 2
        mov    a1, sp
        ldr    a2, [sp, #128]
        ldr    a3, [sp, #(128+4)]
        idct_rows idct_col_put_armv6, 1

        add    sp, sp, #(128+8)
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
        .endfunc