Mercurial > libavcodec.hg
annotate armv4l/simple_idct_armv6.S @ 6323:e6da66f378c7 libavcodec
mpegvideo.h has two function declarations with the 'inline' specifier
but no definition for those functions. The C standard requires a
definition to appear in the same translation unit for any function
declared with 'inline'. Most of the files including mpegvideo.h do not
define those functions. Fix this by removing the 'inline' specifiers
from the header.
patch by Uoti Urpala
author | diego |
---|---|
date | Sun, 03 Feb 2008 17:54:30 +0000 |
parents | 744e91a36a23 |
children | 316762ae96a7 |
rev | line source |
---|---|
4427 | 1 /* |
2 * Simple IDCT | |
3 * | |
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> | |
5220 | 5 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com> |
4427 | 6 * |
7 * This file is part of FFmpeg. | |
8 * | |
9 * FFmpeg is free software; you can redistribute it and/or | |
10 * modify it under the terms of the GNU Lesser General Public | |
11 * License as published by the Free Software Foundation; either | |
12 * version 2.1 of the License, or (at your option) any later version. | |
13 * | |
14 * FFmpeg is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 * Lesser General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU Lesser General Public | |
20 * License along with FFmpeg; if not, write to the Free Software | |
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 */ | |
23 | |
24 #define W1 22725 /* i = 1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
25 #define W2 21407 /* i = 2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
26 #define W3 19266 /* i = 3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
27 #define W4 16383 /* i = 4: the formula cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 gives 16384; NOTE(review): 16383 is presumably deliberate - confirm */ | |
28 #define W5 12873 /* i = 5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
29 #define W6 8867 /* i = 6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
30 #define W7 4520 /* i = 7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
31 #define ROW_SHIFT 11 /* shift passed to idct_row/idct_row4/idct_finish_shift in the row pass */ | |
32 #define COL_SHIFT 20 /* shift passed to idct_row and the finish step in the column pass */ | |
33 | |
34 #define W13 (W1 | (W3 << 16)) /* W1 in bits 0-15, W3 in bits 16-31 */ | |
35 #define W26 (W2 | (W6 << 16)) /* W2 in bits 0-15, W6 in bits 16-31 */ | |
36 #define W42 (W4 | (W2 << 16)) /* W4 in bits 0-15, W2 in bits 16-31 */ | |
37 #define W42n (-W4&0xffff | (-W2 << 16)) /* -W4 masked to 16 bits in bits 0-15, -W2 in bits 16-31 */ | |
38 #define W46 (W4 | (W6 << 16)) /* W4 in bits 0-15, W6 in bits 16-31 */ | |
39 #define W57 (W5 | (W7 << 16)) /* W5 in bits 0-15, W7 in bits 16-31 */ | |
40 | |
41 .text | |
42 .align | |
43 w13: .long W13 /* packed coefficient words, fetched with pc-relative ldr rX, [pc, #(label-.-8)] */ | |
44 w26: .long W26 /* NOTE(review): no load of w26 appears in the code visible here */ | |
45 w42: .long W42 /* W4 and W2: even-part multipliers for row[0] and row[2] */ | |
46 w42n: .long W42n /* -W4 and -W2: used for the row[4] and row[6] terms of A1/A2 */ | |
47 w46: .long W46 /* W4 and W6 */ | |
48 w57: .long W57 /* W5 and W7 */ | |
49 | |
50 /* | |
51 Compute partial IDCT of single row: even sums A0..A3 and odd sums B0..B3. | |
52 shift = shift amount; used here only to form the rounding bias 1 << (shift-1) | |
53 a1 = source address (row[6,4] is read from a1+4, row[7,5] from a1+12) | |
4458 | 54 a3 = row[2,0] <= 2 cycles |
4457 | 55 a4 = row[3,1] |
4458 | 56 ip = w42 <= 2 cycles |
4427 | 57 row[x,y] means x in the high halfword, y in the low halfword. Clobbers a2, a3, a4, ip, lr. |
58 Output in registers v1--v8 (fp is v8): v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3 | |
59 */ | |
60 .macro idct_row shift | |
4483 | 61 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ |
62 mov a2, #(1<<(\shift-1)) /* a2 = rounding bias */ | |
63 smlad v1, a3, ip, a2 /* v1 = A0 = W4*row[0] + W2*row[2] + bias */ | |
64 smlsd v4, a3, ip, a2 /* v4 = A3 = W4*row[0] - W2*row[2] + bias */ | |
65 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ | |
66 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ | |
67 smlad v2, a3, lr, a2 /* v2 = A1 = W4*row[0] + W6*row[2] + bias */ | |
68 smlsd v3, a3, lr, a2 /* v3 = A2 = W4*row[0] - W6*row[2] + bias */ | |
69 | |
70 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ | |
71 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ | |
4427 | 72 ldr lr, [a1, #12] /* lr = row[7,5] */ |
73 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ | |
74 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ | |
4434 | 75 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ |
4427 | 76 smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */ |
77 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ | |
78 | |
4434 | 79 ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */ |
4427 | 80 smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */ |
81 ldr a3, [a1, #4] /* a3 = row[6,4] */ | |
82 smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */ | |
83 ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */ | |
84 smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] (v6 holds -B1, so the term is added) */ | |
85 | |
86 smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */ | |
87 smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */ | |
88 smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ | |
89 smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ | |
90 .endm | |
91 | |
92 /* | |
4452 | 93 Compute partial IDCT of half row: same as idct_row but with the row[4]..row[7] terms left out. |
94 shift = shift amount; used here only to form the rounding bias 1 << (shift-1) | |
95 a3 = row[2,0] | |
96 a4 = row[3,1] | |
4460 | 97 ip = w42 |
4452 | 98 Nothing is read from memory except the coefficient words. Clobbers a2, a3, ip, lr. |
99 Output in registers v1--v8 (fp is v8): v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3 | |
100 */ | |
101 .macro idct_row4 shift | |
4483 | 102 ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ |
103 ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ | |
104 mov a2, #(1<<(\shift-1)) /* a2 = rounding bias */ | |
105 smlad v1, a3, ip, a2 /* v1 = A0 = W4*row[0] + W2*row[2] + bias */ | |
106 smlsd v4, a3, ip, a2 /* v4 = A3 = W4*row[0] - W2*row[2] + bias */ | |
107 ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ | |
108 smlad v2, a3, lr, a2 /* v2 = A1 = W4*row[0] + W6*row[2] + bias */ | |
109 smlsd v3, a3, lr, a2 /* v3 = A2 = W4*row[0] - W6*row[2] + bias */ | |
110 smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ | |
111 smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ | |
4452 | 112 pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ |
113 pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ | |
114 smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ | |
115 smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ | |
116 .endm | |
117 | |
118 /* | |
4427 | 119 Compute final part of IDCT single row without shift: combine the A and B sums into the eight results. |
120 Input in registers v1--v8 (v1..v4 = A0..A3, v5 = B0, v6 = -B1, v7 = B2, fp = B3) | |
121 Output in registers ip, v1--v3, lr, v5--v7 | |
122 */ | |
123 .macro idct_finish | |
124 add ip, v1, v5 /* ip = A0 + B0 */ | |
125 sub lr, v1, v5 /* lr = A0 - B0 */ | |
126 sub v1, v2, v6 /* v1 = A1 + B1 (v6 holds -B1) */ | |
127 add v5, v2, v6 /* v5 = A1 - B1 */ | |
128 add v2, v3, v7 /* v2 = A2 + B2 */ | |
129 sub v6, v3, v7 /* v6 = A2 - B2 */ | |
130 add v3, v4, fp /* v3 = A3 + B3 */ | |
131 sub v7, v4, fp /* v7 = A3 - B3 */ | |
132 .endm | |
133 | |
134 /* | |
135 Compute final part of IDCT single row: combine the A and B sums and shift each result right. | |
136 shift = right-shift amount (arithmetic shift) | |
137 Input/output in registers v1--v8 (fp is v8); on input v6 holds -B1. Clobbers a3, a4. | |
138 */ | |
139 .macro idct_finish_shift shift | |
140 add a4, v1, v5 /* a4 = A0 + B0 */ | |
141 sub a3, v1, v5 /* a3 = A0 - B0 */ | |
142 mov v1, a4, asr #\shift /* v1 = (A0 + B0) >> shift */ | |
143 mov v5, a3, asr #\shift /* v5 = (A0 - B0) >> shift */ | |
144 | |
145 sub a4, v2, v6 /* a4 = A1 + B1 (v6 holds -B1) */ | |
146 add a3, v2, v6 /* a3 = A1 - B1 */ | |
147 mov v2, a4, asr #\shift /* v2 = (A1 + B1) >> shift */ | |
148 mov v6, a3, asr #\shift /* v6 = (A1 - B1) >> shift */ | |
149 | |
150 add a4, v3, v7 /* a4 = A2 + B2 */ | |
151 sub a3, v3, v7 /* a3 = A2 - B2 */ | |
152 mov v3, a4, asr #\shift /* v3 = (A2 + B2) >> shift */ | |
153 mov v7, a3, asr #\shift /* v7 = (A2 - B2) >> shift */ | |
154 | |
155 add a4, v4, fp /* a4 = A3 + B3 */ | |
156 sub a3, v4, fp /* a3 = A3 - B3 */ | |
157 mov v4, a4, asr #\shift /* v4 = (A3 + B3) >> shift */ | |
158 mov fp, a3, asr #\shift /* fp = (A3 - B3) >> shift */ | |
159 .endm | |
160 | |
161 /* | |
162 Compute final part of IDCT single row, saturating results at 8 bits (unsigned, 0..255). | |
163 shift = right-shift amount (arithmetic shift, applied before saturation) | |
164 Input/output in registers v1--v8 (fp is v8); on input v6 holds -B1. Clobbers a4, ip. | |
165 */ | |
166 .macro idct_finish_shift_sat shift | |
167 add a4, v1, v5 /* a4 = A0 + B0 */ | |
168 sub ip, v1, v5 /* ip = A0 - B0 */ | |
169 usat v1, #8, a4, asr #\shift /* v1 = sat8((A0 + B0) >> shift) */ | |
170 usat v5, #8, ip, asr #\shift /* v5 = sat8((A0 - B0) >> shift) */ | |
171 | |
172 sub a4, v2, v6 /* a4 = A1 + B1 (v6 holds -B1) */ | |
173 add ip, v2, v6 /* ip = A1 - B1 */ | |
174 usat v2, #8, a4, asr #\shift /* v2 = sat8((A1 + B1) >> shift) */ | |
175 usat v6, #8, ip, asr #\shift /* v6 = sat8((A1 - B1) >> shift) */ | |
176 | |
177 add a4, v3, v7 /* a4 = A2 + B2 */ | |
178 sub ip, v3, v7 /* ip = A2 - B2 */ | |
179 usat v3, #8, a4, asr #\shift /* v3 = sat8((A2 + B2) >> shift) */ | |
180 usat v7, #8, ip, asr #\shift /* v7 = sat8((A2 - B2) >> shift) */ | |
181 | |
182 add a4, v4, fp /* a4 = A3 + B3 */ | |
183 sub ip, v4, fp /* ip = A3 - B3 */ | |
184 usat v4, #8, a4, asr #\shift /* v4 = sat8((A3 + B3) >> shift) */ | |
185 usat fp, #8, ip, asr #\shift /* fp = sat8((A3 - B3) >> shift) */ | |
186 .endm | |
187 | |
188 /* | |
189 Compute IDCT of single row, storing as column. | |
190 a1 = source | |
191 a2 = dest | |
192 */ | |
193 .align | |
4867
97d82c7585b4
add .type foo, %function directives for the benefit of debuggers
mru
parents:
4483
diff
changeset
|
194 .type idct_row_armv6, %function |
4427 | 195 .func idct_row_armv6 |
196 idct_row_armv6: /* in: a1 = source row, a2 = dest; writes 8 halfwords at a 16-byte stride; a1 and a2 are unchanged on return */ | |
4483 | 197 str lr, [sp, #-4]! /* push return address; lr is used as a scratch register below */ |
198 | |
199 ldr lr, [a1, #12] /* lr = row[7,5] */ | |
200 ldr ip, [a1, #4] /* ip = row[6,4] */ | |
4452 | 201 ldr a4, [a1, #8] /* a4 = row[3,1] */ |
202 ldr a3, [a1] /* a3 = row[2,0] */ | |
4483 | 203 orrs lr, lr, ip /* lr = row[7,5] | row[6,4]; Z set when row[4..7] are all zero */ |
204 cmpeq lr, a4 /* if so, also require row[3,1] == 0 */ | |
205 cmpeq lr, a3, lsr #16 /* and row[2] == 0 */ | |
4452 | 206 beq 1f /* only row[0] can be non-zero: take the shortcut at 1: */ |
4483 | 207 str a2, [sp, #-4]! /* push dest; a2 is clobbered by idct_row/idct_row4 */ |
208 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ | |
209 cmp lr, #0 /* row[4..7] all zero? */ | |
210 beq 2f /* yes: use the half-row version */ | |
4427 | 211 |
4483 | 212 idct_row ROW_SHIFT |
213 b 3f | |
4452 | 214 |
4483 | 215 2: idct_row4 ROW_SHIFT |
4452 | 216 |
217 3: ldr a2, [sp], #4 /* pop dest */ | |
4483 | 218 idct_finish_shift ROW_SHIFT |
4427 | 219 |
220 strh v1, [a2] /* (A0 + B0) >> ROW_SHIFT */ | |
221 strh v2, [a2, #(16*2)] /* (A1 + B1) >> ROW_SHIFT */ | |
222 strh v3, [a2, #(16*4)] /* (A2 + B2) >> ROW_SHIFT */ | |
223 strh v4, [a2, #(16*6)] /* (A3 + B3) >> ROW_SHIFT */ | |
224 strh fp, [a2, #(16*1)] /* (A3 - B3) >> ROW_SHIFT */ | |
225 strh v7, [a2, #(16*3)] /* (A2 - B2) >> ROW_SHIFT */ | |
226 strh v6, [a2, #(16*5)] /* (A1 - B1) >> ROW_SHIFT */ | |
227 strh v5, [a2, #(16*7)] /* (A0 - B0) >> ROW_SHIFT */ | |
228 | |
229 ldr pc, [sp], #4 /* return: pop saved lr into pc */ | |
4452 | 230 |
231 1: mov a3, a3, lsl #3 /* shortcut: every output is row[0] << 3 (row[2] is zero here) */ | |
232 strh a3, [a2] | |
233 strh a3, [a2, #(16*2)] | |
234 strh a3, [a2, #(16*4)] | |
235 strh a3, [a2, #(16*6)] | |
236 strh a3, [a2, #(16*1)] | |
237 strh a3, [a2, #(16*3)] | |
238 strh a3, [a2, #(16*5)] | |
239 strh a3, [a2, #(16*7)] | |
4483 | 240 ldr pc, [sp], #4 /* return: pop saved lr into pc */ |
4427 | 241 .endfunc |
242 | |
243 /* | |
244 Compute IDCT of single column, read as row. | |
245 a1 = source | |
246 a2 = dest | |
247 */ | |
248 .align | |
4867
97d82c7585b4
add .type foo, %function directives for the benefit of debuggers
mru
parents:
4483
diff
changeset
|
249 .type idct_col_armv6, %function |
4427 | 250 .func idct_col_armv6 |
251 idct_col_armv6: /* in: a1 = source, a2 = dest; writes 8 halfwords at a 16-byte stride; a1 and a2 are unchanged on return */ | |
252 stmfd sp!, {a2, lr} /* save dest (a2 is clobbered by idct_row) and return address */ | |
253 | |
4457 | 254 ldr a3, [a1] /* a3 = row[2,0] */ |
4483 | 255 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ |
4457 | 256 ldr a4, [a1, #8] /* a4 = row[3,1] */ |
4427 | 257 idct_row COL_SHIFT |
258 ldr a2, [sp], #4 /* pop dest */ | |
259 idct_finish_shift COL_SHIFT | |
260 | |
261 strh v1, [a2] /* (A0 + B0) >> COL_SHIFT */ | |
262 strh v2, [a2, #(16*1)] /* (A1 + B1) >> COL_SHIFT */ | |
263 strh v3, [a2, #(16*2)] /* (A2 + B2) >> COL_SHIFT */ | |
264 strh v4, [a2, #(16*3)] /* (A3 + B3) >> COL_SHIFT */ | |
265 strh fp, [a2, #(16*4)] /* (A3 - B3) >> COL_SHIFT */ | |
266 strh v7, [a2, #(16*5)] /* (A2 - B2) >> COL_SHIFT */ | |
267 strh v6, [a2, #(16*6)] /* (A1 - B1) >> COL_SHIFT */ | |
268 strh v5, [a2, #(16*7)] /* (A0 - B0) >> COL_SHIFT */ | |
269 | |
270 ldr pc, [sp], #4 /* return: pop saved lr into pc */ | |
271 .endfunc | |
272 | |
273 /* | |
274 Compute IDCT of single column, read as row, store saturated 8-bit. | |
275 a1 = source | |
276 a2 = dest | |
277 a3 = line size | |
278 */ | |
279 .align | |
4867
97d82c7585b4
add .type foo, %function directives for the benefit of debuggers
mru
parents:
4483
diff
changeset
|
280 .type idct_col_put_armv6, %function |
4427 | 281 .func idct_col_put_armv6 |
282 idct_col_put_armv6: /* in: a1 = source, a2 = dest, a3 = line size; writes 8 bytes, one per line; a1, a2, a3 are unchanged on return */ | |
283 stmfd sp!, {a2, a3, lr} /* save dest and line size (both clobbered by idct_row) and return address */ | |
284 | |
4457 | 285 ldr a3, [a1] /* a3 = row[2,0] */ |
4483 | 286 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ |
4457 | 287 ldr a4, [a1, #8] /* a4 = row[3,1] */ |
4427 | 288 idct_row COL_SHIFT |
289 ldmfd sp!, {a2, a3} /* restore dest and line size */ | |
290 idct_finish_shift_sat COL_SHIFT | |
291 | |
292 strb v1, [a2], a3 /* sat8((A0 + B0) >> COL_SHIFT); a2 += line size */ | |
293 strb v2, [a2], a3 /* sat8((A1 + B1) >> COL_SHIFT) */ | |
294 strb v3, [a2], a3 /* sat8((A2 + B2) >> COL_SHIFT) */ | |
295 strb v4, [a2], a3 /* sat8((A3 + B3) >> COL_SHIFT) */ | |
296 strb fp, [a2], a3 /* sat8((A3 - B3) >> COL_SHIFT) */ | |
297 strb v7, [a2], a3 /* sat8((A2 - B2) >> COL_SHIFT) */ | |
298 strb v6, [a2], a3 /* sat8((A1 - B1) >> COL_SHIFT) */ | |
299 strb v5, [a2], a3 /* sat8((A0 - B0) >> COL_SHIFT) */ | |
300 | |
301 sub a2, a2, a3, lsl #3 /* undo the eight post-increments: a2 is back at its entry value */ | |
302 | |
303 ldr pc, [sp], #4 /* return: pop saved lr into pc */ | |
304 .endfunc | |
305 | |
306 /* | |
307 Compute IDCT of single column, read as row, add/store saturated 8-bit. | |
308 a1 = source | |
309 a2 = dest | |
310 a3 = line size | |
311 */ | |
312 .align | |
4867
97d82c7585b4
add .type foo, %function directives for the benefit of debuggers
mru
parents:
4483
diff
changeset
|
313 .type idct_col_add_armv6, %function |
4427 | 314 .func idct_col_add_armv6 |
315 idct_col_add_armv6: /* in: a1 = source, a2 = dest, a3 = line size; dst[k] below is the byte at entry a2 + k * line size; a1, a2, a3 are unchanged on return */ | |
316 stmfd sp!, {a2, a3, lr} /* save dest and line size (both clobbered by idct_row) and return address */ | |
317 | |
4457 | 318 ldr a3, [a1] /* a3 = row[2,0] */ |
4483 | 319 ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ |
4457 | 320 ldr a4, [a1, #8] /* a4 = row[3,1] */ |
4427 | 321 idct_row COL_SHIFT |
322 ldmfd sp!, {a2, a3} /* restore dest and line size */ | |
323 idct_finish /* results now in ip, v1, v2, v3, v7, v6, v5, lr (unshifted) */ | |
324 | |
325 ldrb a4, [a2] /* a4 = dst[0] */ | |
326 ldrb v4, [a2, a3] /* v4 = dst[1] */ | |
327 ldrb fp, [a2, a3, lsl #2] /* fp = dst[4]; NOTE(review): fp is loaded again below before it is read, so this load looks redundant - confirm */ | |
328 add ip, a4, ip, asr #COL_SHIFT /* ip = dst[0] + ((A0 + B0) >> COL_SHIFT) */ | |
329 usat ip, #8, ip /* clamp to 0..255 */ | |
330 add v1, v4, v1, asr #COL_SHIFT /* v1 = dst[1] + ((A1 + B1) >> COL_SHIFT) */ | |
331 strb ip, [a2], a3 /* store dst[0]; a2 now points at dst[1] */ | |
332 ldrb ip, [a2, a3] /* ip = dst[2] */ | |
333 usat v1, #8, v1 /* clamp to 0..255 */ | |
334 ldrb fp, [a2, a3, lsl #2] /* fp = dst[5] */ | |
335 add v2, ip, v2, asr #COL_SHIFT /* v2 = dst[2] + ((A2 + B2) >> COL_SHIFT) */ | |
336 usat v2, #8, v2 /* clamp to 0..255 */ | |
337 strb v1, [a2], a3 /* store dst[1]; a2 now points at dst[2] */ | |
338 ldrb a4, [a2, a3] /* a4 = dst[3] */ | |
339 ldrb ip, [a2, a3, lsl #2] /* ip = dst[6] */ | |
340 strb v2, [a2], a3 /* store dst[2]; a2 now points at dst[3] */ | |
341 ldrb v4, [a2, a3] /* v4 = dst[4] */ | |
342 ldrb v1, [a2, a3, lsl #2] /* v1 = dst[7] */ | |
343 add v3, a4, v3, asr #COL_SHIFT /* v3 = dst[3] + ((A3 + B3) >> COL_SHIFT) */ | |
344 usat v3, #8, v3 /* clamp to 0..255 */ | |
345 add v7, v4, v7, asr #COL_SHIFT /* v7 = dst[4] + ((A3 - B3) >> COL_SHIFT) */ | |
346 usat v7, #8, v7 /* clamp to 0..255 */ | |
347 add v6, fp, v6, asr #COL_SHIFT /* v6 = dst[5] + ((A2 - B2) >> COL_SHIFT) */ | |
348 usat v6, #8, v6 /* clamp to 0..255 */ | |
349 add v5, ip, v5, asr #COL_SHIFT /* v5 = dst[6] + ((A1 - B1) >> COL_SHIFT) */ | |
350 usat v5, #8, v5 /* clamp to 0..255 */ | |
351 add lr, v1, lr, asr #COL_SHIFT /* lr = dst[7] + ((A0 - B0) >> COL_SHIFT) */ | |
352 usat lr, #8, lr /* clamp to 0..255 */ | |
353 strb v3, [a2], a3 /* store dst[3] */ | |
354 strb v7, [a2], a3 /* store dst[4] */ | |
355 strb v6, [a2], a3 /* store dst[5] */ | |
356 strb v5, [a2], a3 /* store dst[6] */ | |
357 strb lr, [a2], a3 /* store dst[7] */ | |
358 | |
359 sub a2, a2, a3, lsl #3 /* undo the eight post-increments: a2 is back at its entry value */ | |
360 | |
361 ldr pc, [sp], #4 /* return: pop saved lr into pc */ | |
362 .endfunc | |
363 | |
364 /* | |
365 Compute 8 IDCT row transforms by calling func eight times. | |
366 func = IDCT row->col function; it is called with a1 = source, a2 = dest and must return with both unchanged | |
367 width = width of columns in bytes (added to a2 after each of the first seven calls). Source rows are taken at byte offsets 0, 32, 64, 96, 16, 48, 80, 112 from a1. On exit a1 is back at its entry value and a2 = entry a2 + 7*width. | |
368 */ | |
369 .macro idct_rows func width | |
370 bl \func /* source offset 0 */ | |
371 add a1, a1, #(16*2) | |
372 add a2, a2, #\width | |
373 bl \func /* source offset 32 */ | |
374 add a1, a1, #(16*2) | |
375 add a2, a2, #\width | |
376 bl \func /* source offset 64 */ | |
377 add a1, a1, #(16*2) | |
378 add a2, a2, #\width | |
379 bl \func /* source offset 96 */ | |
380 sub a1, a1, #(16*5) /* step back from offset 96 to offset 16 */ | |
381 add a2, a2, #\width | |
382 bl \func /* source offset 16 */ | |
383 add a1, a1, #(16*2) | |
384 add a2, a2, #\width | |
385 bl \func /* source offset 48 */ | |
386 add a1, a1, #(16*2) | |
387 add a2, a2, #\width | |
388 bl \func /* source offset 80 */ | |
389 add a1, a1, #(16*2) | |
390 add a2, a2, #\width | |
391 bl \func /* source offset 112 */ | |
392 | |
393 sub a1, a1, #(16*7) /* restore a1 to its entry value */ | |
394 .endm | |
395 | |
396 .align | |
397 .global ff_simple_idct_armv6 | |
4867
97d82c7585b4
add .type foo, %function directives for the benefit of debuggers
mru
parents:
4483
diff
changeset
|
398 .type ff_simple_idct_armv6, %function |
4427 | 399 .func ff_simple_idct_armv6 |
400 /* void ff_simple_idct_armv6(DCTELEM *data); in-place: row pass into a stack buffer, column pass back into data */ | |
401 ff_simple_idct_armv6: | |
402 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} /* save the registers the row/column helpers overwrite, plus return address */ | |
403 sub sp, sp, #128 /* 128-byte temporary buffer on the stack */ | |
404 | |
405 mov a2, sp /* row pass: a1 = data, a2 = temporary buffer */ | |
406 idct_rows idct_row_armv6, 2 /* dest advances 2 bytes (one halfword) per call */ | |
407 mov a2, a1 /* idct_rows restored a1, so a2 = data */ | |
408 mov a1, sp /* column pass: a1 = temporary buffer, a2 = data */ | |
409 idct_rows idct_col_armv6, 2 | |
410 | |
411 add sp, sp, #128 /* release the temporary buffer */ | |
412 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ | |
413 .endfunc | |
414 | |
415 .align | |
416 .global ff_simple_idct_add_armv6 | |
4867
97d82c7585b4
add .type foo, %function directives for the benefit of debuggers
mru
parents:
4483
diff
changeset
|
417 .type ff_simple_idct_add_armv6, %function |
4427 | 418 .func ff_simple_idct_add_armv6 |
419 /* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); row pass into a stack buffer, then each column result is added to dest and saturated to 8 bits */ | |
420 ff_simple_idct_add_armv6: | |
421 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 (dest) and a2 (line_size) are saved so they can be reloaded after the row pass */ | |
422 sub sp, sp, #128 /* 128-byte temporary buffer on the stack */ | |
423 | |
424 mov a1, a3 /* row pass: a1 = data */ | |
425 mov a2, sp /* a2 = temporary buffer */ | |
426 idct_rows idct_row_armv6, 2 /* dest advances 2 bytes (one halfword) per call */ | |
427 mov a1, sp /* column pass: a1 = temporary buffer */ | |
428 ldr a2, [sp, #128] /* a2 = saved dest */ | |
429 ldr a3, [sp, #(128+4)] /* a3 = saved line_size */ | |
430 idct_rows idct_col_add_armv6, 1 /* dest advances 1 byte per call */ | |
431 | |
432 add sp, sp, #(128+8) /* release the buffer and drop the saved a1/a2 */ | |
433 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ | |
434 .endfunc | |
435 | |
436 .align | |
437 .global ff_simple_idct_put_armv6 | |
4867
97d82c7585b4
add .type foo, %function directives for the benefit of debuggers
mru
parents:
4483
diff
changeset
|
438 .type ff_simple_idct_put_armv6, %function |
4427 | 439 .func ff_simple_idct_put_armv6 |
440 /* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); row pass into a stack buffer, then each column result is saturated to 8 bits and stored to dest */ | |
441 ff_simple_idct_put_armv6: | |
442 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 (dest) and a2 (line_size) are saved so they can be reloaded after the row pass */ | |
443 sub sp, sp, #128 /* 128-byte temporary buffer on the stack */ | |
444 | |
445 mov a1, a3 /* row pass: a1 = data */ | |
446 mov a2, sp /* a2 = temporary buffer */ | |
447 idct_rows idct_row_armv6, 2 /* dest advances 2 bytes (one halfword) per call */ | |
448 mov a1, sp /* column pass: a1 = temporary buffer */ | |
449 ldr a2, [sp, #128] /* a2 = saved dest */ | |
450 ldr a3, [sp, #(128+4)] /* a3 = saved line_size */ | |
451 idct_rows idct_col_put_armv6, 1 /* dest advances 1 byte per call */ | |
452 | |
453 add sp, sp, #(128+8) /* release the buffer and drop the saved a1/a2 */ | |
454 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ | |
455 .endfunc | |