Mercurial repository: libavcodec.hg
Changeset: 8359:9281a8a9387a — comparison view of arm/dsputil_neon_s.S
Summary:   ARM: replace "armv4l" with "arm"
Author:    mru
Date:      Wed, 17 Dec 2008 00:54:54 +0000
Parent:    armv4l/dsputil_neon_s.S @ 6bdd6dfc3574 (file renamed from armv4l/ to arm/)
Children:  639169d7fad5
Compared against parent changeset 8358:c30b92cf446b (equal / deleted / inserted / replaced diff markers apply).
1 /* | |
2 * ARM NEON optimised DSP functions | |
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
22 #include "asm.S" | |
23 | |
24 preserve8 | |
25 .fpu neon | |
26 .text | |
27 | |
/* Copy (avg=0) or average (avg=1) a 16-byte-wide block of r3 rows.
 * C-equivalent: void op_pixels16(uint8_t *dst, const uint8_t *src,
 *                                int stride, int h)
 * In:  r0 = dst (16-byte aligned — required by the ,:128 stores)
 *      r1 = src (no alignment assumed)
 *      r2 = stride in bytes, r3 = h (code requires a multiple of 4)
 * avg=1 re-reads dst through ip and stores the rounding average
 * (src + dst + 1) >> 1 via vrhadd.u8.
 * Clobbers: r3, ip, q0-q3, q8-q11; returns with bx lr. */
.macro pixels16 avg=0
.if \avg
        mov             ip, r0                  @ ip = second walk over dst for averaging
.endif
1:      vld1.64         {d0, d1},  [r1], r2     @ load 4 source rows of 16 bytes each
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]        @ prefetch the 4 rows after this group
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2     @ interleave dst loads with the
        vrhadd.u8       q0,  q0,  q8            @ rounding averages to hide latency
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4            @ 4 rows consumed per iteration
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
58 | |
/* Horizontal half-pel: dst[i] = avg(src[i], src[i+1]) over 16 bytes/row.
 * In:  r0 = dst (16-byte aligned), r1 = src, r2 = stride,
 *      r3 = h (must be a multiple of 2 — two rows per iteration).
 * \vhadd selects rounding (vrhadd.u8, default: +1 before >>1) or
 * truncating (vhadd.u8) averaging for the no_rnd variants.
 * Loads 24 bytes (d0-d2) per row so the byte-shifted copy has src[16]. */
.macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1       @ q1 = row shifted left by one byte
        \vhadd          q0,  q0,  q1            @ average pixel with right neighbour
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
74 | |
/* Vertical half-pel: dst row n = avg(src row n, src row n+1), 16 bytes wide.
 * In:  r0 = dst (16-byte aligned), r1 = src, r2 = stride,
 *      r3 = h (multiple of 2). \vhadd as in pixels16_x2.
 * r1 walks the even source rows and ip = r1 + stride the odd ones, each
 * stepping 2*stride (lr), so every loaded row is reused for two outputs:
 * q2 = avg(row n, row n+1), then q3 = avg(row n+2, row n+1). */
.macro pixels16_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2            @ ip = pointer to odd rows
        lsl             lr,  r2,  #1            @ lr = 2 * stride
        vld1.64         {d0, d1},  [r1], lr     @ prime the pipeline: q0 = row 0
        vld1.64         {d2, d3},  [ip], lr     @                     q1 = row 1
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1            @ output row n
        vld1.64         {d0, d1},  [r1], lr     @ q0 = row n+2 (q1 still row n+1)
        \vhadd          q3,  q0,  q1            @ output row n+1
        vld1.64         {d2, d3},  [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        pop             {pc}
.endm
93 | |
/* 2x2 box filter (half-pel in both x and y), 16 bytes wide:
 *   dst[x] = (a + b + c + d + bias) >> 2
 * where a,b are horizontally adjacent pixels of one source row and c,d of
 * the next. Rounding variant (default): bias 2 comes from vrshrn.u16 #2.
 * no_rnd=1 variant: adds the constant 1 (q13) then truncates with vshrn.
 * In: r0 = dst (16-byte aligned), r1 = src, r2 = stride, r3 = h (even,
 * loop exits on bgt). Software-pipelined: q8/q10 hold the widened
 * horizontal pair sums of the r1-side row, q9/q11 those of the ip-side
 * row, so each row's sums are computed once and used for two outputs.
 * Clobbers r3, ip, lr(saved), q0-q3, q8-q15 (and q13 when no_rnd). */
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1            @ lr = 2 * stride
        add             ip,  r1,  r2            @ ip walks the odd rows
        vld1.64         {d0-d2},   [r1], lr     @ 17+ bytes: row and its x+1 shift
        vld1.64         {d4-d6},   [ip], lr
.if \no_rnd
        vmov.i16        q13, #1                 @ +1 bias for the truncating shift
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1       @ row shifted left one byte
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2            @ widened sums src[x]+src[x+1],
        vaddl.u8        q10, d1,  d3            @ low/high half of the even row
        vaddl.u8        q9,  d4,  d6            @ same for the odd row
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], lr     @ next even row
        vadd.u16        q12, q8,  q9            @ 2x2 sum for output row n (low)
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11           @ 2x2 sum, high half
        \vshrn          d28, q12, #2            @ narrow back to 8 bit: sum >> 2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30           @ refresh even-row pair sums
        vld1.64         {d2-d4},   [ip], lr     @ next odd row
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9            @ 2x2 sum for output row n+1
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4            @ refresh odd-row pair sums
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
.endm
147 | |
/* Plain copy of an 8-byte-wide block, 4 rows per iteration.
 * In:  r0 = dst (8-byte aligned — ,:64 stores), r1 = src,
 *      r2 = stride, r3 = h (multiple of 4).
 * 8-column counterpart of pixels16 (no avg variant here). */
.macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]        @ prefetch the following rows
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
165 | |
/* Horizontal half-pel, 8 bytes wide: dst[i] = avg(src[i], src[i+1]).
 * In:  r0 = dst (8-byte aligned), r1 = src, r2 = stride,
 *      r3 = h (multiple of 2). \vhadd as in pixels16_x2.
 * Two rows are packed into one q register (vswp) so a single \vhadd
 * averages both rows at once. Loads 16 bytes/row to get src[8]. */
.macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1}, [r1], r2
        vext.8          d1,  d0,  d1,  #1       @ d1 = row 0 shifted by one byte
        vld1.64         {d2, d3}, [r1], r2
        vext.8          d3,  d2,  d3,  #1       @ d3 = row 1 shifted by one byte
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2                 @ q0 = rows, q1 = shifted rows
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
181 | |
/* Vertical half-pel, 8 bytes wide: dst row n = avg(src row n, src row n+1).
 * In:  r0 = dst (8-byte aligned), r1 = src, r2 = stride,
 *      r3 = h (multiple of 2). \vhadd as in pixels16_x2.
 * Same even/odd two-pointer pipeline as pixels16_y2, with d registers. */
.macro pixels8_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2            @ ip = odd rows
        lsl             lr,  r2,  #1            @ lr = 2 * stride
        vld1.64         {d0}, [r1], lr          @ d0 = row 0
        vld1.64         {d1}, [ip], lr          @ d1 = row 1
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1            @ output row n
        vld1.64         {d0}, [r1], lr          @ d0 = row n+2
        \vhadd          d5,  d0,  d1            @ output row n+1
        vld1.64         {d1}, [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}
.endm
200 | |
/* 2x2 box filter, 8 bytes wide — see pixels16_xy2 for the algorithm.
 * In:  r0 = dst (8-byte aligned), r1 = src, r2 = stride, r3 = h (even).
 * Pipelined: q8 holds the widened horizontal pair sums of the current
 * even (r1-side) row, q9 those of the odd (ip-side) row; each is
 * recomputed once per new row and used for two outputs.
 * Rounding via \vshrn (vrshrn.u16 default); no_rnd=1 adds q11 = 1 and
 * truncates, matching the no_rnd averaging convention. */
.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1            @ lr = 2 * stride
        add             ip,  r1,  r2            @ ip walks the odd rows
        vld1.64         {d0, d1}, [r1], lr      @ row 0 and the byte after it
        vld1.64         {d2, d3}, [ip], lr      @ row 1
.if \no_rnd
        vmov.i16        q11, #1                 @ +1 bias for truncating shift
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1       @ row 0 shifted left one byte
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4            @ pair sums src[x]+src[x+1], row 0
        vaddl.u8        q9,  d2,  d6            @ same for row 1
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1}, [r1], lr      @ next even row
        pld             [r1]
        vadd.u16        q10, q8,  q9            @ 2x2 sum for output row n
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4            @ refresh even-row pair sums
        \vshrn          d5,  q10, #2            @ narrow: sum >> 2
        vld1.64         {d2, d3}, [ip], lr      @ next odd row
        vadd.u16        q10, q8,  q9            @ 2x2 sum for output row n+1
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6            @ refresh odd-row pair sums
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        pop             {pc}
.endm
240 | |
/* Emit one exported function ff_<pfx><name><suf>_neon whose body is the
 * macro <name> expanded with the optional rounding-operator arguments
 * (e.g. a vhadd/vshrn mnemonic and the no_rnd flag). */
.macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
.endfunc
.endm
246 | |
/* Emit both variants of a pixel op: the default (rounding) version with
 * no extra arguments, and the second version built from <args> — here
 * always the _no_rnd suffix plus its truncating operator. */
.macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
.endm
251 | |
/* H.264 qpel mc00 (full-pel copy/average) entry points.  Each sets the
 * fixed row count in r3 and then FALLS THROUGH — there is deliberately
 * no return before .endfunc — into the pixels function instantiated
 * immediately below it, which takes h in r3. Keep the ordering: moving
 * an instantiation breaks the fall-through. */
function ff_put_h264_qpel16_mc00_neon, export=1
        mov   r3, #16                           @ h = 16; fall into ff_put_pixels16_neon
.endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov   r3, #16                           @ h = 16; fall into ff_avg_pixels16_neon
.endfunc

        pixfunc  avg_ pixels16,, 1              @ empty suf, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov   r3, #8                            @ h = 8; fall into ff_put_pixels8_neon
.endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1