Mercurial > libavcodec.hg
annotate arm/simple_idct_neon.S @ 11562:66f3d2ff88b7 libavcodec
H264: Copy h264dsp when creating new slice threads
Fixes slice multithreading (broken in r22565)
Fixes issue1815
author | astrange |
---|---|
date | Wed, 31 Mar 2010 03:55:42 +0000 |
parents | 361a5fcb4393 |
children | 17a110bfdeb6 |
rev | line source |
---|---|
8335 | 1 /* |
2 * ARM NEON IDCT | |
3 * | |
4 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
5 * | |
6 * Based on Simple IDCT | |
7 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> | |
8 * | |
9 * This file is part of FFmpeg. | |
10 * | |
11 * FFmpeg is free software; you can redistribute it and/or | |
12 * modify it under the terms of the GNU Lesser General Public | |
13 * License as published by the Free Software Foundation; either | |
14 * version 2.1 of the License, or (at your option) any later version. | |
15 * | |
16 * FFmpeg is distributed in the hope that it will be useful, | |
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
19 * Lesser General Public License for more details. | |
20 * | |
21 * You should have received a copy of the GNU Lesser General Public | |
22 * License along with FFmpeg; if not, write to the Free Software | |
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
24 */ | |
25 | |
26 #include "asm.S" | |
27 | |
/*
 * Fixed-point IDCT coefficients W1..W7, scaled by 1<<14 as given by the
 * formula in the per-line comments (i is the index in the macro name).
 * W4 is defined as 16383, one less than the 16384 that formula yields for
 * i = 4.  NOTE(review): presumably deliberate, to match the C Simple IDCT
 * this file is based on -- confirm before "correcting" it.
 *
 * W4c is the column-pass rounding bias pre-divided by W4; with integer
 * division it evaluates to 32.  idct_col4_neon adds it to col[0] before the
 * multiplication by W4, so the bias costs no separate 32-bit addition.
 *
 * ROW_SHIFT / COL_SHIFT are the right shifts applied to the results of the
 * row pass and the column pass respectively.
 */
28 #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
29 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
30 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
31 #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
32 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
33 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
34 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
35 #define W4c ((1<<(COL_SHIFT-1))/W4) | |
36 #define ROW_SHIFT 11 | |
37 #define COL_SHIFT 20 | |
38 | |
/*
 * NEON scalar operands for the coefficients.  The lane order matches the
 * order of the .short entries in idct_coeff_neon, which idct_start loads
 * into d0-d1; d0/d1 must therefore stay intact for the whole transform.
 */
39 #define w1 d0[0] | |
40 #define w2 d0[1] | |
41 #define w3 d0[2] | |
42 #define w4 d0[3] | |
43 #define w5 d1[0] | |
44 #define w6 d1[1] | |
45 #define w7 d1[2] | |
46 #define w4c d1[3] | |
47 | |
/*
 * idct_col4_top: first half of the 1-D IDCT butterfly for four transforms
 * processed in parallel (one per 16-bit lane), using coefficients 0-3 only.
 *
 * In:   q15 = W4 * col[0] plus the rounding bias (set up by the caller)
 *       d4  = col[1], d6 = col[2], d8 = col[3]
 *       d0-d1 = coefficient table (w1..w7)
 * Out:  even part:  q11 = q15 + W2*col[2]     q12 = q15 + W6*col[2]
 *                   q13 = q15 - W6*col[2]     q14 = q15 - W2*col[2]
 *       odd part:   q9  = W1*col[1] + W3*col[3]
 *                   q10 = W3*col[1] - W7*col[3]
 *                   q5  = W5*col[1] - W1*col[3]
 *                   q6  = W7*col[1] - W5*col[3]
 * Clobbers: q7, q8.
 *
 * The macro consists of NEON multiply/add/subtract instructions only, so the
 * ARM condition flags set before it are still valid after it; idct_col4_neon
 * depends on this.  Multiplies and adds are interleaved on purpose; keep the
 * instruction order when editing.
 */
48 .macro idct_col4_top | |
49 vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ | |
50 vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ | |
51 vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ | |
52 vadd.i32 q11, q15, q7 | |
53 vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ | |
54 vadd.i32 q12, q15, q8 | |
55 vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ | |
56 vsub.i32 q13, q15, q8 | |
57 vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ | |
58 vsub.i32 q14, q15, q7 | |
59 | |
60 vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ | |
61 vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ | |
62 vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ | |
63 vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ | |
64 .endm | |
65 | |
66 .text | |
67 .align 6 | |
68 | |
/*
 * idct_row4_pld_neon: issue preload hints for eight lines starting at r0 with
 * a stride of r1 bytes (r0 + k*r1 for k = 0..7), then run the row pass.
 *
 * In:       r0 = base address, r1 = stride in bytes (the exported callers
 *           pass dst and line_size here); r2 and d0-d1 as for idct_row4_neon
 * Clobbers: r3, plus everything idct_row4_neon clobbers
 *
 * There is deliberately no return instruction: execution continues into
 * idct_row4_neon, which immediately follows, and that routine's "bx lr"
 * returns to the caller.  This is why the exported put/add entry points call
 * this routine once and idct_row4_neon once to get two row passes.
 * NOTE(review): this relies on the endfunc macro (defined outside this file)
 * emitting no instructions, and on nothing being inserted between the two
 * functions -- confirm.
 */
9724 | 69 function idct_row4_pld_neon |
70 pld [r0] | |
71 add r3, r0, r1, lsl #2 /* r3 = r0 + 4*r1 */ | |
72 pld [r0, r1] | |
73 pld [r0, r1, lsl #1] | |
74 pld [r3, -r1] | |
75 pld [r3] | |
76 pld [r3, r1] | |
77 add r3, r3, r1, lsl #1 /* r3 = r0 + 6*r1 */ | |
78 pld [r3] | |
79 pld [r3, r1] | |
11443 | 80 endfunc |
9724 | 81 |
/*
 * idct_row4_neon: first (row) pass for four 1-D transforms in parallel, in
 * place on the 64 bytes at r2.
 *
 * In:   r2    = 16-byte aligned pointer to 64 bytes of 16-bit coefficients
 *       d0-d1 = coefficient table (see idct_start)
 * The eight loaded D registers are consumed as
 *       d2 = col[0], d4 = col[1], d6 = col[2], d8 = col[3],
 *       d3 = col[4], d5 = col[5], d7 = col[6], d9 = col[7]
 * where each of the four 16-bit lanes belongs to a different transform.
 * Out:  the 64 bytes are overwritten with the results, shifted right by
 *       ROW_SHIFT and transposed so that every 16-byte group holds the eight
 *       outputs of one transform; r2 is left 64 bytes past its entry value.
 * Clobbers: r3, r4, flags, q1-q15.
 *
 * The instruction order is hand-interleaved; keep it when editing.
 */
8335 | 82 function idct_row4_neon |
83 vmov.i32 q15, #(1<<(ROW_SHIFT-1)) /* rounding bias */ | |
84 vld1.64 {d2-d5}, [r2,:128]! | |
85 vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ | |
86 vld1.64 {d6,d7}, [r2,:128]! | |
87 vorr d10, d3, d5 | |
88 vld1.64 {d8,d9}, [r2,:128]! | |
89 add r2, r2, #-64 /* rewind to the start of the block */ | |
90 | |
91 vorr d11, d7, d9 | |
92 vorr d10, d10, d11 /* d10 = col[4] | col[5] | col[6] | col[7] */ | |
93 vmov r3, r4, d10 | |
94 | |
95 idct_col4_top | |
96 | |
/* If col[4..7] are zero in all four lanes, skip their contribution. */
97 orrs r3, r3, r4 | |
98 beq 1f | |
99 | |
100 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
101 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
102 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
103 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
104 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
105 vadd.i32 q11, q11, q7 | |
106 vsub.i32 q12, q12, q7 | |
107 vsub.i32 q13, q13, q7 | |
108 vadd.i32 q14, q14, q7 | |
109 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
110 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
111 vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ | |
112 vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ | |
113 vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ | |
114 vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ | |
115 vadd.i32 q11, q11, q7 | |
116 vsub.i32 q12, q12, q8 | |
117 vadd.i32 q13, q13, q8 | |
118 vsub.i32 q14, q14, q7 | |
119 | |
/*
 * Final butterfly: outputs 0-3 are even + odd (into d2, d4, d6, d8), outputs
 * 4-7 are even - odd in reverse pairing (into d3, d5, d7, d9).  Each group of
 * four D registers is transposed 4x4 with the vtrn.16 / vtrn.32 pairs.
 */
120 1: vadd.i32 q3, q11, q9 | |
121 vadd.i32 q4, q12, q10 | |
122 vshrn.i32 d2, q3, #ROW_SHIFT | |
123 vshrn.i32 d4, q4, #ROW_SHIFT | |
124 vadd.i32 q7, q13, q5 | |
125 vadd.i32 q8, q14, q6 | |
126 vtrn.16 d2, d4 | |
127 vshrn.i32 d6, q7, #ROW_SHIFT | |
128 vshrn.i32 d8, q8, #ROW_SHIFT | |
129 vsub.i32 q14, q14, q6 | |
130 vsub.i32 q11, q11, q9 | |
131 vtrn.16 d6, d8 | |
132 vsub.i32 q13, q13, q5 | |
133 vshrn.i32 d3, q14, #ROW_SHIFT | |
134 vtrn.32 d2, d6 | |
135 vsub.i32 q12, q12, q10 | |
136 vtrn.32 d4, d8 | |
137 vshrn.i32 d5, q13, #ROW_SHIFT | |
138 vshrn.i32 d7, q12, #ROW_SHIFT | |
139 vshrn.i32 d9, q11, #ROW_SHIFT | |
140 | |
141 vtrn.16 d3, d5 | |
142 vtrn.16 d7, d9 | |
143 vtrn.32 d3, d7 | |
144 vtrn.32 d5, d9 | |
145 | |
146 vst1.64 {d2-d5}, [r2,:128]! | |
147 vst1.64 {d6-d9}, [r2,:128]! | |
148 | |
149 bx lr | |
11443 | 150 endfunc |
8335 | 151 |
/*
 * idct_col4_neon: second (column) pass for four columns in parallel.
 *
 * In:   r2    = 8-byte aligned pointer to the first of eight groups of four
 *               16-bit values, the groups being 16 bytes apart; group k is
 *               called col[k] in the comments below
 *       d0-d1 = coefficient table (see idct_start)
 * Out:  d2..d9 = outputs 0..7 (four lanes each), taken as the high 16 bits
 *               of the 32-bit sums (vaddhn / vsubhn), i.e. already shifted
 *               right by 16; the callers' store helpers apply the remaining
 *               COL_SHIFT-16
 *       r2    = entry value + 128 (advanced by 16 per group, whether the
 *               group was loaded or skipped)
 * Clobbers: r4-r7, ip, flags, q1-q15.
 *
 * col[4..7] are first fetched into integer registers with ldrd and a group
 * is loaded into NEON and accumulated only if it is non-zero.  Each test
 * result (Z flag) is consumed by the addeq/beq pair that follows; the first
 * one is carried across idct_col4_top, which contains no flag-setting
 * instructions.  Each ldrd for the next group is placed before r2 is
 * adjusted, hence its fixed #16 offset.
 */
152 function idct_col4_neon | |
153 mov ip, #16 | |
154 vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ | |
155 vdup.16 d30, w4c | |
156 vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */ | |
157 vadd.i16 d30, d30, d2 | |
158 vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */ | |
159 vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<(COL_SHIFT-1))/W4) */ | |
160 vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */ | |
161 | |
162 ldrd r4, [r2] | |
163 ldrd r6, [r2, #16] | |
164 orrs r4, r4, r5 | |
165 | |
166 idct_col4_top | |
167 addeq r2, r2, #16 | |
168 beq 1f | |
169 | |
170 vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */ | |
171 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
172 vadd.i32 q11, q11, q7 | |
173 vsub.i32 q12, q12, q7 | |
174 vsub.i32 q13, q13, q7 | |
175 vadd.i32 q14, q14, q7 | |
176 | |
177 1: orrs r6, r6, r7 | |
178 ldrd r4, [r2, #16] | |
179 addeq r2, r2, #16 | |
180 beq 2f | |
181 | |
182 vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */ | |
183 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
184 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
185 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
186 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
187 | |
188 2: orrs r4, r4, r5 | |
189 ldrd r4, [r2, #16] | |
190 addeq r2, r2, #16 | |
191 beq 3f | |
192 | |
193 vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */ | |
194 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
195 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
196 vadd.i32 q11, q11, q7 | |
197 vsub.i32 q14, q14, q7 | |
198 vsub.i32 q12, q12, q8 | |
199 vadd.i32 q13, q13, q8 | |
200 | |
201 3: orrs r4, r4, r5 | |
202 addeq r2, r2, #16 | |
203 beq 4f | |
204 | |
205 vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ | |
206 vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ | |
207 vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ | |
208 vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ | |
209 vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ | |
210 | |
/* Outputs 0-3 = even + odd, outputs 7-4 = even - odd; keep the high halves. */
211 4: vaddhn.i32 d2, q11, q9 | |
212 vaddhn.i32 d3, q12, q10 | |
213 vaddhn.i32 d4, q13, q5 | |
214 vaddhn.i32 d5, q14, q6 | |
215 vsubhn.i32 d9, q11, q9 | |
216 vsubhn.i32 d8, q12, q10 | |
217 vsubhn.i32 d7, q13, q5 | |
218 vsubhn.i32 d6, q14, q6 | |
219 | |
220 bx lr | |
11443 | 221 endfunc |
8335 | 222 |
223 .align 6 | |
224 | |
/*
 * idct_col4_st8_neon: finish the column pass and store it as 8-bit samples.
 *
 * In:   d2..d9 = outputs 0..7 from idct_col4_neon (four 16-bit lanes each)
 *       r0     = 4-byte aligned destination, r1 = stride in bytes
 * Each value is shifted right by the remaining COL_SHIFT-16 bits and
 * saturated to the unsigned 8-bit range (vqshrun); four bytes are then
 * written to each of eight consecutive lines.
 * Out:  r0 = entry value + 8*r1.
 * Clobbers: d2-d5.
 *
 * The narrowing is done in place: every vqshrun writes a D register whose
 * previous contents have already been consumed by it or by an earlier
 * vqshrun, so the four instructions must stay in this order.
 */
225 function idct_col4_st8_neon | |
226 vqshrun.s16 d2, q1, #COL_SHIFT-16 | |
227 vqshrun.s16 d3, q2, #COL_SHIFT-16 | |
228 vqshrun.s16 d4, q3, #COL_SHIFT-16 | |
229 vqshrun.s16 d5, q4, #COL_SHIFT-16 | |
230 vst1.32 {d2[0]}, [r0,:32], r1 | |
231 vst1.32 {d2[1]}, [r0,:32], r1 | |
232 vst1.32 {d3[0]}, [r0,:32], r1 | |
233 vst1.32 {d3[1]}, [r0,:32], r1 | |
234 vst1.32 {d4[0]}, [r0,:32], r1 | |
235 vst1.32 {d4[1]}, [r0,:32], r1 | |
236 vst1.32 {d5[0]}, [r0,:32], r1 | |
237 vst1.32 {d5[1]}, [r0,:32], r1 | |
238 | |
239 bx lr | |
11443 | 240 endfunc |
8335 | 241 |
/*
 * Coefficient table: eight 16-bit values that idct_start loads into d0-d1
 * with a single vld1.64.  The entry order must match the w1..w7 / w4c lane
 * aliases defined near the top of the file.  The table is aligned to 16
 * bytes (.align takes a power-of-two exponent on ARM) because that load
 * uses the :128 alignment qualifier.
 */
242 .section .rodata | |
243 .align 4 | |
8506 | 244 idct_coeff_neon: |
245 .short W1, W2, W3, W4, W5, W6, W7, W4c | |
8335 | 246 .previous |
247 | |
/*
 * idct_start data: common prologue of the exported entry points.
 *
 * \data is the register holding the coefficient block pointer.
 * - saves r4-r7 and lr (the helpers clobber r4-r7 and are reached with bl)
 * - issues preload hints for \data and \data + 64
 * - saves d8-d15, which the helpers overwrite as part of q4-q7
 * - loads the eight coefficients of idct_coeff_neon into d0-d1
 * Clobbers: r3.  Must be paired with idct_end, which undoes both saves in
 * reverse order and returns.
 */
248 .macro idct_start data | |
249 push {r4-r7, lr} | |
250 pld [\data] | |
251 pld [\data, #64] | |
252 vpush {d8-d15} | |
8507
779a9c93bf61
ARM: work around linker bug with movw/movt relocations in shared libs
mru
parents:
8506
diff
changeset
|
253 movrel r3, idct_coeff_neon |
8335 | 254 vld1.64 {d0,d1}, [r3,:128] |
255 .endm | |
256 | |
/*
 * idct_end: common epilogue matching idct_start.  Restores d8-d15, then
 * r4-r7, and returns to the caller by popping the saved lr into pc.
 */
257 .macro idct_end | |
258 vpop {d8-d15} | |
259 pop {r4-r7, pc} | |
260 .endm | |
261 | |
262 /* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */ | |
/*
 * In: r0 = dst, r1 = line_size, r2 = data.
 * Inverse-transforms the 128-byte coefficient block at data and writes the
 * result to dst as saturated 8-bit samples, eight lines of eight bytes.
 * The row pass is done in place, so the block at data is modified.
 *
 * Sequence: two row passes of 64 bytes each (the first is entered through
 * idct_row4_pld_neon, which also prefetches dst and then continues into
 * idct_row4_neon), followed by two column passes of four columns each, each
 * one stored with idct_col4_st8_neon.
 */
263 function ff_simple_idct_put_neon, export=1 | |
264 idct_start r2 | |
265 | |
9724 | 266 bl idct_row4_pld_neon |
8335 | 267 bl idct_row4_neon |
268 add r2, r2, #-128 /* back to the start of data */ | |
269 bl idct_col4_neon | |
270 bl idct_col4_st8_neon | |
271 sub r0, r0, r1, lsl #3 /* back up the eight lines just written */ | |
272 add r0, r0, #4 /* next four output bytes of each line */ | |
273 add r2, r2, #-120 /* data + 8: next four columns */ | |
274 bl idct_col4_neon | |
275 bl idct_col4_st8_neon | |
276 | |
277 idct_end | |
11443 | 278 endfunc |
8335 | 279 |
280 .align 6 | |
281 | |
/*
 * idct_col4_add8_neon: finish the column pass and add it to existing 8-bit
 * samples.
 *
 * In:   d2..d9 = outputs 0..7 from idct_col4_neon (four 16-bit lanes each)
 *       r0     = 4-byte aligned destination, r1 = stride in bytes
 * For each of eight consecutive lines, four bytes are loaded, widened and
 * added (vaddw.u8) to the results shifted right by the remaining
 * COL_SHIFT-16 bits; the sums are saturated to the unsigned 8-bit range
 * (vqmovun) and stored back to the same addresses.
 * Out:  r0 = entry value + 8*r1 (r0 walks the loads; ip, a copy of the
 *       entry value of r0, walks the stores).
 * Clobbers: ip, q1-q4, d10-d13.
 *
 * Loads, arithmetic and stores are interleaved on purpose; keep the order.
 */
282 function idct_col4_add8_neon | |
283 mov ip, r0 | |
284 | |
285 vld1.32 {d10[0]}, [r0,:32], r1 | |
286 vshr.s16 q1, q1, #COL_SHIFT-16 | |
287 vld1.32 {d10[1]}, [r0,:32], r1 | |
288 vshr.s16 q2, q2, #COL_SHIFT-16 | |
289 vld1.32 {d11[0]}, [r0,:32], r1 | |
290 vshr.s16 q3, q3, #COL_SHIFT-16 | |
291 vld1.32 {d11[1]}, [r0,:32], r1 | |
292 vshr.s16 q4, q4, #COL_SHIFT-16 | |
293 vld1.32 {d12[0]}, [r0,:32], r1 | |
294 vaddw.u8 q1, q1, d10 | |
295 vld1.32 {d12[1]}, [r0,:32], r1 | |
296 vaddw.u8 q2, q2, d11 | |
297 vld1.32 {d13[0]}, [r0,:32], r1 | |
298 vqmovun.s16 d2, q1 | |
299 vld1.32 {d13[1]}, [r0,:32], r1 | |
300 vaddw.u8 q3, q3, d12 | |
301 vst1.32 {d2[0]}, [ip,:32], r1 | |
302 vqmovun.s16 d3, q2 | |
303 vst1.32 {d2[1]}, [ip,:32], r1 | |
304 vaddw.u8 q4, q4, d13 | |
305 vst1.32 {d3[0]}, [ip,:32], r1 | |
306 vqmovun.s16 d4, q3 | |
307 vst1.32 {d3[1]}, [ip,:32], r1 | |
308 vqmovun.s16 d5, q4 | |
309 vst1.32 {d4[0]}, [ip,:32], r1 | |
310 vst1.32 {d4[1]}, [ip,:32], r1 | |
311 vst1.32 {d5[0]}, [ip,:32], r1 | |
312 vst1.32 {d5[1]}, [ip,:32], r1 | |
313 | |
314 bx lr | |
11443 | 315 endfunc |
8335 | 316 |
317 /* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */ | |
/*
 * In: r0 = dst, r1 = line_size, r2 = data.
 * Inverse-transforms the 128-byte coefficient block at data and adds the
 * result to the 8-bit samples already at dst (eight lines of eight bytes),
 * with unsigned saturation.  The row pass is done in place, so the block at
 * data is modified.
 *
 * Sequence: two row passes of 64 bytes each (the first is entered through
 * idct_row4_pld_neon, which also prefetches dst and then continues into
 * idct_row4_neon), followed by two column passes of four columns each, each
 * one combined with dst by idct_col4_add8_neon.
 */
318 function ff_simple_idct_add_neon, export=1 | |
319 idct_start r2 | |
320 | |
9724 | 321 bl idct_row4_pld_neon |
8335 | 322 bl idct_row4_neon |
323 add r2, r2, #-128 /* back to the start of data */ | |
324 bl idct_col4_neon | |
325 bl idct_col4_add8_neon | |
326 sub r0, r0, r1, lsl #3 /* back up the eight lines just processed */ | |
327 add r0, r0, #4 /* next four bytes of each line */ | |
328 add r2, r2, #-120 /* data + 8: next four columns */ | |
329 bl idct_col4_neon | |
330 bl idct_col4_add8_neon | |
331 | |
332 idct_end | |
11443 | 333 endfunc |
8335 | 334 |
335 .align 6 | |
336 | |
/*
 * idct_col4_st16_neon: finish the column pass and store it as 16-bit values.
 *
 * In:   d2..d9 = outputs 0..7 from idct_col4_neon (four 16-bit lanes each)
 *       r2     = 8-byte aligned destination of output 0
 * Each value is arithmetically shifted right by the remaining COL_SHIFT-16
 * bits; d2..d9 are then written as eight 8-byte groups, 16 bytes apart.
 * Out:  r2 = entry value + 128.
 * Clobbers: ip, q1-q4.
 */
337 function idct_col4_st16_neon | |
338 mov ip, #16 | |
339 | |
340 vshr.s16 q1, q1, #COL_SHIFT-16 | |
341 vshr.s16 q2, q2, #COL_SHIFT-16 | |
342 vst1.64 {d2}, [r2,:64], ip | |
343 vshr.s16 q3, q3, #COL_SHIFT-16 | |
344 vst1.64 {d3}, [r2,:64], ip | |
345 vshr.s16 q4, q4, #COL_SHIFT-16 | |
346 vst1.64 {d4}, [r2,:64], ip | |
347 vst1.64 {d5}, [r2,:64], ip | |
348 vst1.64 {d6}, [r2,:64], ip | |
349 vst1.64 {d7}, [r2,:64], ip | |
350 vst1.64 {d8}, [r2,:64], ip | |
351 vst1.64 {d9}, [r2,:64], ip | |
352 | |
353 bx lr | |
11443 | 354 endfunc |
8335 | 355 |
356 /* void ff_simple_idct_neon(DCTELEM *data); */ | |
/*
 * In: r0 = data.
 * Inverse-transforms the 128-byte coefficient block at data in place,
 * leaving 16-bit results.
 *
 * The helpers take the block pointer in r2, so r0 is copied there first.
 * Sequence: two row passes of 64 bytes each, then two column passes of four
 * columns each.  idct_col4_neon leaves r2 128 bytes past where it started,
 * so r2 is rewound by 128 before each idct_col4_st16_neon to store over the
 * groups that were just read.
 */
357 function ff_simple_idct_neon, export=1 | |
358 idct_start r0 | |
359 | |
360 mov r2, r0 | |
361 bl idct_row4_neon | |
362 bl idct_row4_neon | |
363 add r2, r2, #-128 /* back to the start of data */ | |
364 bl idct_col4_neon | |
365 add r2, r2, #-128 /* store over the columns just read */ | |
366 bl idct_col4_st16_neon | |
367 add r2, r2, #-120 /* data + 8: next four columns */ | |
368 bl idct_col4_neon | |
369 add r2, r2, #-128 /* store over the columns just read */ | |
370 bl idct_col4_st16_neon | |
371 | |
372 idct_end | |
11443 | 373 endfunc |