arm/dsputil_neon_s.S @ 8359:9281a8a9387a (libavcodec)

ARM: replace "armv4l" with "arm"

author    mru
date      Wed, 17 Dec 2008 00:54:54 +0000
parents   armv4l/dsputil_neon_s.S@6bdd6dfc3574
children  639169d7fad5
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text

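@ The pixel macros below follow the dsputil put/avg_pixels calling
@ convention: r0 = dst block, r1 = src pixels, r2 = line size (stride
@ for both src and dst), r3 = number of rows.
@
@ pixels16: copy a 16-byte-wide block, four rows per iteration.  With
@ avg=1 the source is instead averaged (with rounding) into the current
@ destination contents, giving the avg_pixels16 variant.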
.macro  pixels16        avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

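@ pixels16_x2: horizontal half-pel interpolation, dst[x] = avg(src[x],
@ src[x+1]).  \vhadd selects rounding (vrhadd.u8) or truncating
@ (vhadd.u8) averaging for the normal and _no_rnd variants.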
.macro  pixels16_x2     vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

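@ pixels16_y2: vertical half-pel interpolation, averaging each source
@ row with the one below it.  Two source pointers (r1 and ip = r1 + r2)
@ advance by twice the stride, producing two output rows per iteration.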
.macro  pixels16_y2     vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], lr
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        pop             {pc}
.endm

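@ pixels16_xy2: diagonal half-pel interpolation over the 2x2
@ neighbourhood, dst[x] = (a + b + c + d + 2) >> 2.  The widened
@ per-row sums are kept in q8-q11 and reused for the next output row.
@ With no_rnd=1 a bias of 1 is added via q13 and the truncating
@ vshrn.u16 is used, giving (a + b + c + d + 1) >> 2 instead.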
.macro  pixels16_xy2    vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0-d2},   [r1], lr
        vld1.64         {d4-d6},   [ip], lr
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], lr
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [ip], lr
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
.endm

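@ 8-byte-wide versions of the same operations.
@ pixels8: plain copy of an 8-byte-wide block, four rows per iteration.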
.macro  pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

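@ pixels8_x2: horizontal half-pel averaging of src[x] and src[x+1];
@ two rows are packed into one q register pair (note the vswp) so a
@ single \vhadd handles both.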
.macro  pixels8_x2      vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

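@ pixels8_y2: vertical half-pel averaging of each row with the next,
@ using the same two-pointer scheme as pixels16_y2.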
.macro  pixels8_y2      vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0}, [r1], lr
        vld1.64         {d1}, [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], lr
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}
.endm

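@ pixels8_xy2: diagonal half-pel interpolation for 8-byte-wide blocks,
@ analogous to pixels16_xy2: the 2x2 neighbourhood sum is shifted right
@ by two, with the no_rnd variant adding a bias of 1 via q11 and using
@ the truncating vshrn.u16.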
.macro  pixels8_xy2     vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], lr
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [ip], lr
        vadd.u16        q10, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        pop             {pc}
.endm

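@ pixfunc expands to one exported function ff_<pfx><name><suf>_neon
@ whose body is a single invocation of the matching macro above, with
@ \rnd_op and \args passed through as that macro's arguments.
@ pixfunc2 emits both the rounding version and, from its extra
@ arguments, the _no_rnd version.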
.macro  pixfunc         pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
        .endfunc
.endm

.macro  pixfunc2        pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm

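@ The h264 qpel mc00 cases are plain block copies: each stub below only
@ loads the fixed block height into r3 and then falls through into the
@ ff_put/avg_pixels* function emitted immediately after it.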
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
        .endfunc

        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
        .endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
        .endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2, _no_rnd, vshrn.u16, 1