Mercurial > libavcodec.hg
comparison arm/mdct_neon.S @ 10153:7a63015e4627 libavcodec
ARM: NEON optimised FFT and MDCT
Vorbis and AC3 ~3x faster.
Parts by Naotoshi Nojiri, naonoj gmail
author | mru |
---|---|
date | Thu, 10 Sep 2009 08:50:03 +0000 |
parents | |
children | 75bab19c59a2 |
comparison
equal
deleted
inserted
replaced
10152:ed85bbd5dccb | 10153:7a63015e4627 |
---|---|
1 /* | |
2 * ARM NEON optimised MDCT | |
3 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
22 #include "asm.S" | |
23 | |
24 .fpu neon | |
25 .text | |
26 | |
@ ff_imdct_half_neon
@
@ Three stages, all visible below:
@   1. pre-rotation: multiply input samples by the tcos/tsin tables and
@      scatter the resulting pairs into the output buffer at offsets taken
@      from revtab;
@   2. call ff_fft_calc_neon with r0 = context + 16 and r1 = output buffer;
@   3. post-rotation: rotate the FFT result in place by tcos/tsin, working
@      outwards from the middle of the buffer in both directions.
@
@ In:    r0 = context pointer; fields read at +4 (nbits), +8 (tcos),
@             +12 (tsin), +24 (revtab); r0+16 is handed to ff_fft_calc_neon
@        r1 = output buffer (accessed with :64 / :128 alignment hints)
@        r2 = input buffer  (accessed with :128 alignment hints)
@ Saved: r4-r8, lr are pushed on entry and restored on exit.
@ Uses:  r0-r3, r12, d0-d7, d16-d25, flags (plus whatever
@        ff_fft_calc_neon clobbers).
@ NOTE(review): presumably the C prototype is
@   void ff_imdct_half_neon(MDCTContext *s, FFTSample *out, const FFTSample *in)
@ -- confirm against the C declaration.
27 function ff_imdct_half_neon, export=1
28 push {r4-r8,lr}
29
30 mov r12, #1
31 ldr lr, [r0, #4] @ nbits
32 ldr r4, [r0, #8] @ tcos
33 ldr r5, [r0, #12] @ tsin
34 ldr r3, [r0, #24] @ revtab
35 lsl r12, r12, lr @ n = 1 << nbits
36 lsr lr, r12, #2 @ n4 = n >> 2
37 add r7, r2, r12, lsl #1 @ r7 = input + 2*n bytes
38 mov r12, #-16 @ post-index step for the descending input pointer
39 sub r7, r7, #16 @ r7 -> last 16 bytes below input + 2*n
40
@ Pipeline prologue: load the first data/twiddle set and start the first
@ two products so the loop body can overlap loads with arithmetic.
41 vld1.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
42 vld1.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
43 vld1.32 {d2}, [r4,:64]! @ d2=c0,c1
44 vld1.32 {d3}, [r5,:64]! @ d3=s0,s1
45 vuzp.32 d17, d16 @ d16 = n0,n1 (the x lanes end up in d17)
46 vuzp.32 d0, d1 @ d0 = m0,m1 (the x lanes end up in d1)
47 vmul.f32 d6, d16, d2 @ d6 = n*c
48 vmul.f32 d7, d0, d2 @ d7 = m*c
49 1:
50 subs lr, lr, #2 @ two output pairs per iteration; Z is tested by beq below
51 ldr r6, [r3], #4 @ one 32-bit word = two 16-bit revtab entries
52 vmul.f32 d4, d0, d3 @ d4 = m*s
53 vmul.f32 d5, d16, d3 @ d5 = n*s
54 vsub.f32 d4, d6, d4 @ d4 = n*c - m*s
55 vadd.f32 d5, d5, d7 @ d5 = n*s + m*c
56 uxtah r8, r1, r6, ror #16 @ r8 = output + upper halfword of the revtab word
57 uxtah r6, r1, r6 @ r6 = output + lower halfword of the revtab word
@ NOTE(review): the halfwords are added unscaled, so revtab presumably
@ holds byte offsets here -- confirm against the table initialisation.
58 beq 1f @ last pair: skip the loads for a next iteration
59 vld1.32 {d16-d17},[r7,:128],r12 @ next four floats, walking down from the end
60 vld1.32 {d0-d1}, [r2,:128]! @ next four floats, walking up from the start
61 vuzp.32 d17, d16
62 vld1.32 {d2}, [r4,:64]! @ next two tcos values
63 vuzp.32 d0, d1
64 vmul.f32 d6, d16, d2 @ start the next iteration's n*c
65 vld1.32 {d3}, [r5,:64]! @ next two tsin values
66 vmul.f32 d7, d0, d2 @ start the next iteration's m*c
67 vst2.32 {d4[0],d5[0]}, [r6,:64] @ store pair 0 (d4[0], d5[0]) at r6
68 vst2.32 {d4[1],d5[1]}, [r8,:64] @ store pair 1 (d4[1], d5[1]) at r8
69 b 1b
70 1:
71 vst2.32 {d4[0],d5[0]}, [r6,:64] @ pipeline epilogue: store the final two pairs
72 vst2.32 {d4[1],d5[1]}, [r8,:64]
73
74 mov r4, r0 @ keep context in r4 across the call
75 mov r6, r1 @ keep output pointer in r6 across the call
76 add r0, r0, #16 @ argument for ff_fft_calc_neon: context + 16
77 bl ff_fft_calc_neon
78
79 mov r12, #1
80 ldr lr, [r4, #4] @ nbits
81 ldr r5, [r4, #12] @ tsin
82 ldr r4, [r4, #8] @ tcos
83 lsl r12, r12, lr @ n = 1 << nbits
84 lsr lr, r12, #3 @ n8 = n >> 3
85
@ Set up two pointer sets that start at the middle and move apart:
@   upward:   r4 (tcos), r5 (tsin), r6 (data loads), r8 (data stores)
@   downward: r1 (tcos), r2 (tsin), r3 (data loads), r0 (data stores)
86 add r4, r4, lr, lsl #2 @ r4 = tcos + 4*n8 bytes
87 add r5, r5, lr, lsl #2 @ r5 = tsin + 4*n8 bytes
88 add r6, r6, lr, lsl #3 @ r6 = output + 8*n8 bytes
89 sub r1, r4, #8 @ two floats below the tcos midpoint
90 sub r2, r5, #8 @ two floats below the tsin midpoint
91 sub r3, r6, #16 @ 16 bytes below the output midpoint
92
93 mov r7, #-16 @ step for the descending 16-byte data accesses
94 mov r12, #-8 @ step for the descending 8-byte table accesses
95 mov r8, r6 @ ascending store pointer
96 mov r0, r3 @ descending store pointer
97
98 vld1.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
99 vld1.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
100 vld1.32 {d18}, [r2,:64], r12 @ d18=s1,s0
101 vuzp.32 d20, d21 @ de-interleave the upper block
102 vuzp.32 d0, d1 @ de-interleave the lower block
103 1:
104 subs lr, lr, #2 @ Z is tested by the beq below (NEON ops leave flags alone)
105 vmul.f32 d7, d0, d18
106 vld1.32 {d19}, [r5,:64]! @ d19=s2,s3
107 vmul.f32 d4, d1, d18
108 vld1.32 {d16}, [r1,:64], r12 @ d16=c1,c0
109 vmul.f32 d5, d21, d19
110 vld1.32 {d17}, [r4,:64]! @ d17=c2,c3
111 vmul.f32 d6, d20, d19
112 vmul.f32 d22, d1, d16
113 vmul.f32 d23, d21, d17
114 vmul.f32 d24, d0, d16
115 vmul.f32 d25, d20, d17
116 vadd.f32 d7, d7, d22 @ d7 = d0*d18 + d1*d16
117 vadd.f32 d6, d6, d23 @ d6 = d20*d19 + d21*d17
118 vsub.f32 d4, d4, d24 @ d4 = d1*d18 - d0*d16
119 vsub.f32 d5, d5, d25 @ d5 = d21*d19 - d20*d17
120 beq 1f @ last block: skip the loads for a next iteration
121 vld1.32 {d0-d1}, [r3,:128], r7 @ load the next lower block
122 vld1.32 {d20-d21},[r6,:128]! @ load the next upper block
123 vld1.32 {d18}, [r2,:64], r12
124 vuzp.32 d20, d21
125 vuzp.32 d0, d1
@ Reorder the four result registers into memory order before storing.
126 vrev64.32 q3, q3 @ swap the two lanes within d6 and within d7
127 vtrn.32 d4, d6
128 vtrn.32 d5, d7
129 vswp d5, d6
130 vst1.32 {d4-d5}, [r0,:128], r7 @ store the lower block, then step down
131 vst1.32 {d6-d7}, [r8,:128]! @ store the upper block, then step up
132 b 1b
133 1:
134 vrev64.32 q3, q3 @ same reorder and store for the final pair of blocks
135 vtrn.32 d4, d6
136 vtrn.32 d5, d7
137 vswp d5, d6
138 vst1.32 {d4-d5}, [r0,:128]
139 vst1.32 {d6-d7}, [r8,:128]
140
141 pop {r4-r8,pc} @ restore saved registers and return
142 .endfunc
143 | |
@ ff_imdct_calc_neon
@
@ Calls ff_imdct_half_neon with its output pointer advanced by n bytes
@ (n = 1 << the word at [r0, #4]), then fills the two outer parts of the
@ output buffer from the data on either side of output + 2*n:
@   - the n bytes below output + 2*n are read backwards, sign-flipped and
@     written forwards starting at output;
@   - the n bytes from output + 2*n upwards are read forwards and written
@     backwards, ending at output + 4*n.
@ Only the r0 and r5 store pointers are written through in the loop, so
@ the region between output + n and output + 3*n is not modified here.
@
@ In:    r0 = context pointer (passed through unchanged to ff_imdct_half_neon)
@        r1 = output buffer (accessed with :64 / :128 alignment hints)
@        r2 = input buffer (passed through unchanged to ff_imdct_half_neon)
@ Saved: r4-r6, lr are pushed on entry and restored on exit.
@ Uses:  r0-r3, d0-d5, d30, flags (plus whatever ff_imdct_half_neon clobbers).
@ NOTE(review): presumably the C prototype is
@   void ff_imdct_calc_neon(MDCTContext *s, FFTSample *out, const FFTSample *in)
@ -- confirm against the C declaration.
144 function ff_imdct_calc_neon, export=1
145 push {r4-r6,lr}
146
147 ldr r3, [r0, #4] @ same field ff_imdct_half_neon reads as nbits
148 mov r4, #1
149 mov r5, r1 @ r5 = start of output buffer, kept across the call
150 lsl r4, r4, r3 @ r4 = n = 1 << nbits
151 add r1, r1, r4 @ half transform writes from output + n bytes
152
153 bl ff_imdct_half_neon @ r4/r5 survive: the callee pushes and pops r4-r8
154
155 add r0, r5, r4, lsl #2 @ r0 = output + 4*n bytes
156 add r1, r5, r4, lsl #1 @ r1 = output + 2*n bytes (ascending read pointer)
157 sub r0, r0, #8 @ r0 -> 8 bytes below output + 4*n (descending store pointer)
158 sub r2, r1, #16 @ r2 -> 16 bytes below output + 2*n (descending read pointer)
159 mov r3, #-16 @ step for the descending 16-byte loads
160 mov r6, #-8 @ step for the descending 8-byte stores
161 vmov.i32 d30, #1<<31 @ sign-bit mask in both 32-bit lanes
162 1:
163 vld1.32 {d0-d1}, [r2,:128], r3 @ four floats from below output + 2*n, stepping down
164 pld [r0, #-16] @ preload hint near the descending store pointer
165 vrev64.32 q0, q0 @ swap the two lanes inside d0 and inside d1
166 vld1.32 {d2-d3}, [r1,:128]! @ four floats from output + 2*n upwards
167 veor d4, d1, d30 @ d4 = d1 with sign bits flipped (d halves swapped)
168 pld [r2, #-16] @ preload hint ahead of the descending read pointer
169 vrev64.32 q1, q1 @ swap the two lanes inside d2 and inside d3
170 veor d5, d0, d30 @ d5 = d0 with sign bits flipped
171 vst1.32 {d2}, [r0,:64], r6 @ store d2, then d3 at the next lower address:
172 vst1.32 {d3}, [r0,:64], r6 @ together a reversed copy of the four floats
173 vst1.32 {d4-d5}, [r5,:128]! @ reversed, negated floats written forwards from output
174 subs r4, r4, #16 @ 16 bytes handled per direction per iteration
175 bgt 1b @ repeat while bytes remain
176
177 pop {r4-r6,pc} @ restore saved registers and return
178 .endfunc