Mercurial > libavcodec.hg
annotate arm/simple_idct_neon.S @ 11980:263b4ef7ad87 libavcodec
tablegen: implement and use WRITE_ARRAY macros
Two macros (WRITE_ARRAY and WRITE_ARRAY_2D) take the prefix (modifiers)
(not all tables are static, and they might not be constant either), the
type, and the name of the array. It'll be copied with the same name and type,
and with the correct size of the currently-defined object.
author | flameeyes |
---|---|
date | Sun, 27 Jun 2010 12:21:12 +0000 |
parents | 361a5fcb4393 |
children | 17a110bfdeb6 |
rev | line source |
---|---|
8335 | 1 /* |
2 * ARM NEON IDCT | |
3 * | |
4 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
5 * | |
6 * Based on Simple IDCT | |
7 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> | |
8 * | |
9 * This file is part of FFmpeg. | |
10 * | |
11 * FFmpeg is free software; you can redistribute it and/or | |
12 * modify it under the terms of the GNU Lesser General Public | |
13 * License as published by the Free Software Foundation; either | |
14 * version 2.1 of the License, or (at your option) any later version. | |
15 * | |
16 * FFmpeg is distributed in the hope that it will be useful, | |
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
19 * Lesser General Public License for more details. | |
20 * | |
21 * You should have received a copy of the GNU Lesser General Public | |
22 * License along with FFmpeg; if not, write to the Free Software | |
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
24 */ | |
25 | |
26 #include "asm.S" | |
27 | |
28 #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
29 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
30 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
31 #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5; NOTE(review): the formula gives 16384 for i=4, 16383 is presumably deliberate - confirm | |
32 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
33 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
34 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
35 #define W4c ((1<<(COL_SHIFT-1))/W4) /* column rounding bias pre-divided by W4: idct_col4_neon adds it to col[0] before multiplying by W4 */ | |
36 #define ROW_SHIFT 11 /* right shift applied to the row-pass sums (vshrn in idct_row4_neon) */ | |
37 #define COL_SHIFT 20 /* total right shift of the column-pass sums: 16 in idct_col4_neon (vaddhn/vsubhn), COL_SHIFT-16 in the store helpers */ | |
38 | |
39 #define w1 d0[0] /* w1..w7, w4c: scalar lanes of d0/d1, which idct_start loads from idct_coeff_neon */ | |
40 #define w2 d0[1] | |
41 #define w3 d0[2] | |
42 #define w4 d0[3] | |
43 #define w5 d1[0] | |
44 #define w6 d1[1] | |
45 #define w7 d1[2] | |
46 #define w4c d1[3] | |
47 | |
48 .macro idct_col4_top /* shared first stage; in: d4/d6/d8 = col[1]/col[2]/col[3], q15 = biased W4 * col[0]; out: even-index sums in q11-q14, odd-index sums in q9, q10, q5, q6; clobbers q7, q8 */ | |
49 vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ | |
50 vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ | |
51 vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ | |
52 vadd.i32 q11, q15, q7 /* q11 = q15 + W2 * col[2] */ | |
53 vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ | |
54 vadd.i32 q12, q15, q8 /* q12 = q15 + W6 * col[2] */ | |
55 vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ | |
56 vsub.i32 q13, q15, q8 /* q13 = q15 - W6 * col[2] */ | |
57 vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ | |
58 vsub.i32 q14, q15, q7 /* q14 = q15 - W2 * col[2] */ | |
59 | |
60 vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ | |
61 vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ | |
62 vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ | |
63 vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ | |
64 .endm | |
65 | |
66 .text | |
67 .align 6 | |
68 | |
9724 | 69 function idct_row4_pld_neon /* prefetch eight lines starting at r0 with stride r1; clobbers r3 */ |
70 pld [r0] /* line 0 */ | |
71 add r3, r0, r1, lsl #2 /* r3 = r0 + 4 * r1 */ | |
72 pld [r0, r1] /* line 1 */ | |
73 pld [r0, r1, lsl #1] /* line 2 */ | |
74 pld [r3, -r1] /* line 3 */ | |
75 pld [r3] /* line 4 */ | |
76 pld [r3, r1] /* line 5 */ | |
77 add r3, r3, r1, lsl #1 /* r3 = r0 + 6 * r1 */ | |
78 pld [r3] /* line 6 */ | |
79 pld [r3, r1] /* line 7 */ | |
11443 | 80 endfunc /* NOTE(review): there is no bx lr above, so this appears to fall through into idct_row4_neon below - confirm endfunc (asm.S) emits no code */ |
9724 | 81 |
8335 | 82 function idct_row4_neon /* first pass over 64 bytes of coefficients at r2 (128-bit aligned): transform, >> ROW_SHIFT, transpose, store in place; r2 ends 64 bytes further on; clobbers r3, r4, q1-q15 */ |
83 vmov.i32 q15, #(1<<(ROW_SHIFT-1)) /* q15 = rounding constant for the final >> ROW_SHIFT */ | |
84 vld1.64 {d2-d5}, [r2,:128]! /* d2 = col[0], d3 = col[4], d4 = col[1], d5 = col[5] (naming follows the comments below) */ | |
85 vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ | |
86 vld1.64 {d6,d7}, [r2,:128]! /* d6 = col[2], d7 = col[6] */ | |
87 vorr d10, d3, d5 /* start OR-ing col[4..7] for the all-zero test */ | |
88 vld1.64 {d8,d9}, [r2,:128]! /* d8 = col[3], d9 = col[7] */ | |
89 add r2, r2, #-64 /* rewind r2 to the start of the 64 bytes just loaded */ | |
90 | |
91 vorr d11, d7, d9 | |
92 vorr d10, d10, d11 /* d10 = OR of col[4], col[5], col[6], col[7] */ | |
93 vmov r3, r4, d10 /* copy to core registers so orrs can set the flags */ | |
94 | |
95 idct_col4_top | |
96 | |
97 orrs r3, r3, r4 /* Z set when col[4..7] are all zero */ | |
98 beq 1f /* nothing to add from col[4..7] */ | |
99 | |
100 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
101 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
102 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
103 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
104 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
105 vadd.i32 q11, q11, q7 /* q11 += W4 * col[4] */ | |
106 vsub.i32 q12, q12, q7 /* q12 -= W4 * col[4] */ | |
107 vsub.i32 q13, q13, q7 /* q13 -= W4 * col[4] */ | |
108 vadd.i32 q14, q14, q7 /* q14 += W4 * col[4] */ | |
109 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
110 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
111 vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ | |
112 vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ | |
113 vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ | |
114 vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ | |
115 vadd.i32 q11, q11, q7 /* q11 += W6 * col[6] */ | |
116 vsub.i32 q12, q12, q8 /* q12 -= W2 * col[6] */ | |
117 vadd.i32 q13, q13, q8 /* q13 += W2 * col[6] */ | |
118 vsub.i32 q14, q14, q7 /* q14 -= W6 * col[6] */ | |
119 | |
120 1: vadd.i32 q3, q11, q9 /* even + odd sums give the first four outputs */ | |
121 vadd.i32 q4, q12, q10 | |
122 vshrn.i32 d2, q3, #ROW_SHIFT /* truncating shift, narrowed to 16 bits; rounding was preloaded into q15 */ | |
123 vshrn.i32 d4, q4, #ROW_SHIFT | |
124 vadd.i32 q7, q13, q5 | |
125 vadd.i32 q8, q14, q6 | |
126 vtrn.16 d2, d4 /* the vtrn.16 / vtrn.32 steps transpose the 4x4 block held in d2, d4, d6, d8 */ | |
127 vshrn.i32 d6, q7, #ROW_SHIFT | |
128 vshrn.i32 d8, q8, #ROW_SHIFT | |
129 vsub.i32 q14, q14, q6 /* even - odd sums give the remaining four outputs */ | |
130 vsub.i32 q11, q11, q9 | |
131 vtrn.16 d6, d8 | |
132 vsub.i32 q13, q13, q5 | |
133 vshrn.i32 d3, q14, #ROW_SHIFT | |
134 vtrn.32 d2, d6 | |
135 vsub.i32 q12, q12, q10 | |
136 vtrn.32 d4, d8 | |
137 vshrn.i32 d5, q13, #ROW_SHIFT | |
138 vshrn.i32 d7, q12, #ROW_SHIFT | |
139 vshrn.i32 d9, q11, #ROW_SHIFT | |
140 | |
141 vtrn.16 d3, d5 /* same 4x4 transpose for d3, d5, d7, d9 */ | |
142 vtrn.16 d7, d9 | |
143 vtrn.32 d3, d7 | |
144 vtrn.32 d5, d9 | |
145 | |
146 vst1.64 {d2-d5}, [r2,:128]! /* write the results back over the 64 bytes that were read */ | |
147 vst1.64 {d6-d9}, [r2,:128]! /* r2 is now 64 bytes past where it was on entry */ | |
148 | |
149 bx lr | |
11443 | 150 endfunc |
8335 | 151 |
152 function idct_col4_neon /* second pass on four 16-bit lanes: reads col[0..7] from r2 at a 16-byte stride, skips all-zero col[4..7]; results >> 16 are left in d2-d9; r2 ends 128 bytes further on; clobbers r4-r7, ip, q1-q15 */ | |
153 mov ip, #16 /* ip = byte stride between successive col[k] loads */ | |
154 vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ | |
155 vdup.16 d30, w4c /* d30 = rounding bias W4c in every lane */ | |
156 vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */ | |
157 vadd.i16 d30, d30, d2 /* d30 = col[0] + W4c */ | |
158 vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */ | |
159 vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ | |
160 vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */ | |
161 | |
162 ldrd r4, [r2] /* r4:r5 = the 64 bits of col[4], for the zero test */ | |
163 ldrd r6, [r2, #16] /* r6:r7 = the 64 bits of col[5] */ | |
164 orrs r4, r4, r5 /* Z set when col[4] is all zero; the flags are consumed after the macro */ | |
165 | |
166 idct_col4_top | |
167 addeq r2, r2, #16 /* col[4] is zero: step over it without loading */ | |
168 beq 1f | |
169 | |
170 vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */ | |
171 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
172 vadd.i32 q11, q11, q7 /* q11 += W4 * col[4] */ | |
173 vsub.i32 q12, q12, q7 /* q12 -= W4 * col[4] */ | |
174 vsub.i32 q13, q13, q7 /* q13 -= W4 * col[4] */ | |
175 vadd.i32 q14, q14, q7 /* q14 += W4 * col[4] */ | |
176 | |
177 1: orrs r6, r6, r7 /* Z set when col[5] is all zero */ | |
178 ldrd r4, [r2, #16] /* r4:r5 = col[6] (r2 points at col[5]); ldrd leaves the flags alone */ | |
179 addeq r2, r2, #16 /* col[5] is zero: step over it */ | |
180 beq 2f | |
181 | |
182 vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */ | |
183 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
184 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
185 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
186 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
187 | |
188 2: orrs r4, r4, r5 /* Z set when col[6] is all zero */ | |
189 ldrd r4, [r2, #16] /* r4:r5 = col[7] (r2 points at col[6]) */ | |
190 addeq r2, r2, #16 /* col[6] is zero: step over it */ | |
191 beq 3f | |
192 | |
193 vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */ | |
194 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
195 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
196 vadd.i32 q11, q11, q7 /* q11 += W6 * col[6] */ | |
197 vsub.i32 q14, q14, q7 /* q14 -= W6 * col[6] */ | |
198 vsub.i32 q12, q12, q8 /* q12 -= W2 * col[6] */ | |
199 vadd.i32 q13, q13, q8 /* q13 += W2 * col[6] */ | |
200 | |
201 3: orrs r4, r4, r5 /* Z set when col[7] is all zero */ | |
202 addeq r2, r2, #16 /* col[7] is zero: step over it */ | |
203 beq 4f | |
204 | |
205 vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ | |
206 vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ | |
207 vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ | |
208 vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ | |
209 vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ | |
210 | |
211 4: vaddhn.i32 d2, q11, q9 /* d2 = (q11 + q9) >> 16; the callers' store helpers apply the remaining COL_SHIFT-16 */ | |
212 vaddhn.i32 d3, q12, q10 /* d3 = (q12 + q10) >> 16 */ | |
213 vaddhn.i32 d4, q13, q5 /* d4 = (q13 + q5) >> 16 */ | |
214 vaddhn.i32 d5, q14, q6 /* d5 = (q14 + q6) >> 16 */ | |
215 vsubhn.i32 d9, q11, q9 /* d9 = (q11 - q9) >> 16 */ | |
216 vsubhn.i32 d8, q12, q10 /* d8 = (q12 - q10) >> 16 */ | |
217 vsubhn.i32 d7, q13, q5 /* d7 = (q13 - q5) >> 16 */ | |
218 vsubhn.i32 d6, q14, q6 /* d6 = (q14 - q6) >> 16 */ | |
219 | |
220 bx lr | |
11443 | 221 endfunc |
8335 | 222 |
223 .align 6 | |
224 | |
225 function idct_col4_st8_neon /* finish the shift on q1-q4, saturate to unsigned 8 bit and store 4 bytes to each of 8 lines at r0 (stride r1); r0 ends 8 * r1 further on */ | |
226 vqshrun.s16 d2, q1, #COL_SHIFT-16 /* d2 = lines 0 and 1, four bytes each */ | |
227 vqshrun.s16 d3, q2, #COL_SHIFT-16 /* d3 = lines 2 and 3 */ | |
228 vqshrun.s16 d4, q3, #COL_SHIFT-16 /* d4 = lines 4 and 5 */ | |
229 vqshrun.s16 d5, q4, #COL_SHIFT-16 /* d5 = lines 6 and 7 */ | |
230 vst1.32 {d2[0]}, [r0,:32], r1 /* each store writes one 32-bit lane, then r0 += r1 */ | |
231 vst1.32 {d2[1]}, [r0,:32], r1 | |
232 vst1.32 {d3[0]}, [r0,:32], r1 | |
233 vst1.32 {d3[1]}, [r0,:32], r1 | |
234 vst1.32 {d4[0]}, [r0,:32], r1 | |
235 vst1.32 {d4[1]}, [r0,:32], r1 | |
236 vst1.32 {d5[0]}, [r0,:32], r1 | |
237 vst1.32 {d5[1]}, [r0,:32], r1 | |
238 | |
239 bx lr | |
11443 | 240 endfunc |
8335 | 241 |
242 .section .rodata | |
243 .align 4 /* 2^4 = 16-byte alignment on ARM gas, matching the :128 hint on the vld1 in idct_start */ | |
8506 | 244 idct_coeff_neon: |
245 .short W1, W2, W3, W4, W5, W6, W7, W4c /* eight halfwords in the same order as the w1..w7, w4c lane aliases (d0[0]..d1[3]) */ | |
8335 | 246 .previous |
247 | |
248 .macro idct_start data /* common prologue; \data = register holding the coefficient block pointer; pairs with idct_end */ | |
249 push {r4-r7, lr} /* r4-r7 are overwritten by idct_row4_neon / idct_col4_neon */ | |
250 pld [\data] /* prefetch the first 64 bytes of coefficients */ | |
251 pld [\data, #64] /* and the next 64 */ | |
252 vpush {d8-d15} /* the helpers overwrite q4-q7 */ | |
8507
779a9c93bf61
ARM: work around linker bug with movw/movt relocations in shared libs
mru
parents:
8506
diff
changeset
|
253 movrel r3, idct_coeff_neon /* r3 = address of the coefficient table (movrel is presumably a macro from asm.S - confirm) */ |
8335 | 254 vld1.64 {d0,d1}, [r3,:128] /* d0/d1 = W1..W7, W4c, read through the w1..w4c aliases */ |
255 .endm | |
256 | |
257 .macro idct_end /* common epilogue: undoes idct_start and returns to the caller */ | |
258 vpop {d8-d15} /* restore the NEON registers saved by idct_start */ | |
259 pop {r4-r7, pc} /* restore r4-r7; loading the saved lr into pc returns */ | |
260 .endm | |
261 | |
262 /* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */ | |
263 function ff_simple_idct_put_neon, export=1 /* r0 = dst, r1 = line_size, r2 = data; IDCT of the block at data, result stored to dst as saturated bytes */ | |
264 idct_start r2 | |
265 | |
9724 | 266 bl idct_row4_pld_neon /* prefetch dst lines; appears to continue into idct_row4_neon for the first 64 bytes of data */ |
8335 | 267 bl idct_row4_neon /* first pass on the second 64 bytes */ |
268 add r2, r2, #-128 /* r2 back to the start of data */ | |
269 bl idct_col4_neon /* second pass, first group of four lanes */ | |
270 bl idct_col4_st8_neon /* store 4 bytes on each of 8 lines */ | |
271 sub r0, r0, r1, lsl #3 /* undo the 8 line steps made by the store helper */ | |
272 add r0, r0, #4 /* move 4 bytes right in dst */ | |
273 add r2, r2, #-120 /* r2 = data + 8 bytes: second group of four lanes */ | |
274 bl idct_col4_neon | |
275 bl idct_col4_st8_neon | |
276 | |
277 idct_end | |
11443 | 278 endfunc |
8335 | 279 |
280 .align 6 | |
281 | |
282 function idct_col4_add8_neon /* finish the shift on q1-q4, add to the 4 bytes already at each of 8 lines of r0 (stride r1), saturate to unsigned 8 bit and store back; r0 ends 8 * r1 further on; clobbers ip, d10-d13 */ | |
283 mov ip, r0 /* ip = store pointer; r0 is used for the loads */ | |
284 | |
285 vld1.32 {d10[0]}, [r0,:32], r1 /* d10 = existing bytes of lines 0 and 1 */ | |
286 vshr.s16 q1, q1, #COL_SHIFT-16 /* arithmetic shift completes the COL_SHIFT scaling */ | |
287 vld1.32 {d10[1]}, [r0,:32], r1 | |
288 vshr.s16 q2, q2, #COL_SHIFT-16 | |
289 vld1.32 {d11[0]}, [r0,:32], r1 /* d11 = lines 2 and 3 */ | |
290 vshr.s16 q3, q3, #COL_SHIFT-16 | |
291 vld1.32 {d11[1]}, [r0,:32], r1 | |
292 vshr.s16 q4, q4, #COL_SHIFT-16 | |
293 vld1.32 {d12[0]}, [r0,:32], r1 /* d12 = lines 4 and 5 */ | |
294 vaddw.u8 q1, q1, d10 /* widen the bytes to 16 bit and add the IDCT output */ | |
295 vld1.32 {d12[1]}, [r0,:32], r1 | |
296 vaddw.u8 q2, q2, d11 | |
297 vld1.32 {d13[0]}, [r0,:32], r1 /* d13 = lines 6 and 7 */ | |
298 vqmovun.s16 d2, q1 /* saturate the signed 16-bit sums to unsigned 8 bit */ | |
299 vld1.32 {d13[1]}, [r0,:32], r1 | |
300 vaddw.u8 q3, q3, d12 | |
301 vst1.32 {d2[0]}, [ip,:32], r1 /* stores follow the same line order through ip */ | |
302 vqmovun.s16 d3, q2 | |
303 vst1.32 {d2[1]}, [ip,:32], r1 | |
304 vaddw.u8 q4, q4, d13 | |
305 vst1.32 {d3[0]}, [ip,:32], r1 | |
306 vqmovun.s16 d4, q3 | |
307 vst1.32 {d3[1]}, [ip,:32], r1 | |
308 vqmovun.s16 d5, q4 | |
309 vst1.32 {d4[0]}, [ip,:32], r1 | |
310 vst1.32 {d4[1]}, [ip,:32], r1 | |
311 vst1.32 {d5[0]}, [ip,:32], r1 | |
312 vst1.32 {d5[1]}, [ip,:32], r1 | |
313 | |
314 bx lr | |
11443 | 315 endfunc |
8335 | 316 |
317 /* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */ | |
318 function ff_simple_idct_add_neon, export=1 /* r0 = dst, r1 = line_size, r2 = data; IDCT of the block at data, result added to the bytes already in dst with saturation */ | |
319 idct_start r2 | |
320 | |
9724 | 321 bl idct_row4_pld_neon /* prefetch dst lines; appears to continue into idct_row4_neon for the first 64 bytes of data */ |
8335 | 322 bl idct_row4_neon /* first pass on the second 64 bytes */ |
323 add r2, r2, #-128 /* r2 back to the start of data */ | |
324 bl idct_col4_neon /* second pass, first group of four lanes */ | |
325 bl idct_col4_add8_neon /* add into 4 bytes on each of 8 lines */ | |
326 sub r0, r0, r1, lsl #3 /* undo the 8 line steps made by the add helper */ | |
327 add r0, r0, #4 /* move 4 bytes right in dst */ | |
328 add r2, r2, #-120 /* r2 = data + 8 bytes: second group of four lanes */ | |
329 bl idct_col4_neon | |
330 bl idct_col4_add8_neon | |
331 | |
332 idct_end | |
11443 | 333 endfunc |
8335 | 334 |
335 .align 6 | |
336 | |
337 function idct_col4_st16_neon /* finish the shift on q1-q4 and store d2-d9 as 16-bit values to r2 at a 16-byte stride; r2 ends 128 bytes further on; clobbers ip */ | |
338 mov ip, #16 /* ip = byte stride between stores, same as the loads in idct_col4_neon */ | |
339 | |
340 vshr.s16 q1, q1, #COL_SHIFT-16 /* arithmetic shift completes the COL_SHIFT scaling */ | |
341 vshr.s16 q2, q2, #COL_SHIFT-16 | |
342 vst1.64 {d2}, [r2,:64], ip /* d2..d9 are stored in register order, one per 16-byte step */ | |
343 vshr.s16 q3, q3, #COL_SHIFT-16 | |
344 vst1.64 {d3}, [r2,:64], ip | |
345 vshr.s16 q4, q4, #COL_SHIFT-16 | |
346 vst1.64 {d4}, [r2,:64], ip | |
347 vst1.64 {d5}, [r2,:64], ip | |
348 vst1.64 {d6}, [r2,:64], ip | |
349 vst1.64 {d7}, [r2,:64], ip | |
350 vst1.64 {d8}, [r2,:64], ip | |
351 vst1.64 {d9}, [r2,:64], ip | |
352 | |
353 bx lr | |
11443 | 354 endfunc |
8335 | 355 |
356 /* void ff_simple_idct_neon(DCTELEM *data); */ | |
357 function ff_simple_idct_neon, export=1 /* r0 = data; in-place IDCT, the 16-bit results overwrite the coefficient block */ | |
358 idct_start r0 | |
359 | |
360 mov r2, r0 /* the helpers take the coefficient pointer in r2 */ | |
361 bl idct_row4_neon /* first pass, first 64 bytes (no destination prefetch in this variant) */ | |
362 bl idct_row4_neon /* first pass, second 64 bytes */ | |
363 add r2, r2, #-128 /* r2 back to the start of data */ | |
364 bl idct_col4_neon /* second pass, first group of four lanes; advances r2 by 128 */ | |
365 add r2, r2, #-128 /* rewind so the store overwrites what was just read */ | |
366 bl idct_col4_st16_neon | |
367 add r2, r2, #-120 /* r2 = data + 8 bytes: second group of four lanes */ | |
368 bl idct_col4_neon | |
369 add r2, r2, #-128 /* rewind again for the store */ | |
370 bl idct_col4_st16_neon | |
371 | |
372 idct_end | |
11443 | 373 endfunc |