/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text
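
@ All pixels* macros below share one calling convention:
@   r0 = dst, r1 = src, r2 = line stride in bytes, r3 = block height.
@ r3 must be a multiple of the rows handled per iteration (4 for the
@ plain copies, 2 for the interpolating variants).  Loads may be
@ unaligned, but the :128/:64 qualifiers on the stores require a
@ suitably aligned dst.

@ pixels16: copy (avg=0) or average into dst (avg=1) a 16-pixel-wide
@ block, four rows per iteration.  vrhadd.u8 is the rounded average
@ (a + b + 1) >> 1 of the new and the existing pixels.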
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm
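
@ pixels16_x2: horizontal half-pel interpolation.  Each output pixel is
@ the average of a source pixel and its right-hand neighbour; vext
@ builds the one-byte-shifted copy.  The no_rnd variant substitutes the
@ truncating vhadd.u8 for the rounding vrhadd.u8.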
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm
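
@ pixels16_y2: vertical half-pel interpolation.  Two source pointers
@ (r1 and ip) walk the even and odd rows with a doubled stride (lr),
@ and each output row averages two vertically adjacent input rows.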
        .macro pixels16_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], lr
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        pop             {pc}
        .endm
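
@ pixels16_xy2: diagonal half-pel interpolation.  Each output pixel is
@ the four-tap average (a + b + c + d + 2) >> 2 of a 2x2 source quad:
@ vaddl.u8 widens and adds horizontal pairs, vadd.u16 combines the two
@ rows, and vrshrn.u16 narrows with rounding.  The no_rnd variant adds
@ a bias of 1 (kept in q13) and narrows with the truncating vshrn.u16,
@ giving (a + b + c + d + 1) >> 2 instead.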
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0-d2},   [r1], lr
        vld1.64         {d4-d6},   [ip], lr
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], lr
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [ip], lr
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
        .endm
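
@ pixels8: plain copy of an 8-pixel-wide block, four rows per iteration.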
        .macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm
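
@ pixels8_x2: 8-wide horizontal half-pel.  vswp pairs the two rows into
@ q0 (unshifted) and q1 (shifted by one byte) so a single \vhadd
@ averages both rows at once.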
        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm
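
@ pixels8_y2: 8-wide vertical half-pel, same two-pointer scheme as
@ pixels16_y2.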
        .macro pixels8_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0}, [r1], lr
        vld1.64         {d1}, [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], lr
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}
        .endm
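
@ pixels8_xy2: 8-wide diagonal half-pel, same four-tap averaging as
@ pixels16_xy2 with the no_rnd bias held in q11.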
        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], lr
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [ip], lr
        vadd.u16        q10, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        pop             {pc}
        .endm
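
@ pixfunc expands to one exported function wrapping a macro above, e.g.
@ ff_put_pixels16_neon; pixfunc2 emits both the rounding default and
@ the _no_rnd variant.  The generated symbols follow dsputil's
@ op_pixels_func shape, roughly (a sketch, parameter names assumed):
@   void ff_put_pixels16_neon(uint8_t *dst, const uint8_t *src,
@                             int line_size, int h);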
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
        .endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm
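
@ The h264 qpel mc00 cases are plain full-pel copies/averages at a
@ fixed height, so each wrapper below just sets r3 and falls through
@ into the pixfunc-generated function emitted immediately after it.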
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
        .endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
        .endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
        .endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1