comparison arm/dsputil_neon_s.S @ 9581:2b3b9358bee7 libavcodec

ARM: Use fewer registers in NEON put_pixels _y2 and _xy2. Approved by Mans on IRC.
author conrad
date Wed, 29 Apr 2009 11:38:09 +0000
parents 51e8f5ab8f1e
children 5cca2790d582
comparing 9580:51e8f5ab8f1e (parent) with 9581:2b3b9358bee7 (this revision)
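The change is purely an addressing rework. The old code kept a second source pointer in ip (r1 + r2) and a doubled stride in lr (r2 << 1), which meant lr had to be pushed and popped; the new code advances the single pointer r1 by the stride after every load and prefetches the row after next with pld [r1, r2], so neither ip nor lr is needed and the push/pop pair goes away. A rough C model of the two traversal schemes follows; it is not part of the patch, and the argument order (block, pixels, line_size, h) simply mirrors the registers r0-r3 that the NEON code appears to receive.

/* Rough C model (illustrative only): both variants read the same source
 * rows, two rows per iteration (h is even).  Register mapping assumed:
 * r0=block, r1=pixels, r2=line_size, r3=h. */
#include <stdint.h>
#include <string.h>

void copy_two_pointers(uint8_t *block, const uint8_t *pixels,
                       int line_size, int h)
{
    const uint8_t *p0 = pixels;              /* r1                */
    const uint8_t *p1 = pixels + line_size;  /* ip = r1 + r2      */
    const int      s2 = 2 * line_size;       /* lr = r2 << 1      */

    for (int y = 0; y < h; y += 2) {
        memcpy(block,             p0, 16);
        memcpy(block + line_size, p1, 16);
        p0    += s2;
        p1    += s2;
        block += s2;
    }
}

void copy_one_pointer(uint8_t *block, const uint8_t *pixels,
                      int line_size, int h)
{
    for (int y = 0; y < h; y += 2) {         /* only r1 advances  */
        memcpy(block, pixels, 16);  pixels += line_size;  block += line_size;
        memcpy(block, pixels, 16);  pixels += line_size;  block += line_size;
    }
}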
@@ -71,22 +71,19 @@
         bne             1b
         bx              lr
 .endm

 .macro  pixels16_y2     vhadd=vrhadd.u8
-        push            {lr}
-        add             ip,  r1,  r2
-        lsl             lr,  r2,  #1
-        vld1.64         {d0, d1}, [r1], lr
-        vld1.64         {d2, d3}, [ip], lr
+        vld1.64         {d0, d1}, [r1], r2
+        vld1.64         {d2, d3}, [r1], r2
 1:      subs            r3,  r3,  #2
         \vhadd          q2,  q0,  q1
-        vld1.64         {d0, d1}, [r1], lr
+        vld1.64         {d0, d1}, [r1], r2
         \vhadd          q3,  q0,  q1
-        vld1.64         {d2, d3}, [ip], lr
+        vld1.64         {d2, d3}, [r1], r2
         pld             [r1]
-        pld             [ip]
+        pld             [r1, r2]
         vst1.64         {d4, d5}, [r0,:128], r2
         vst1.64         {d6, d7}, [r0,:128], r2
         bne             1b
-        pop             {pc}
+        bx              lr
 .endm
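For reference, the _y2 operation averages each source pixel with the pixel one row below it (the vertical half-pel case of the motion-compensation copy). The vhadd macro argument selects the rounding: the default vrhadd.u8 computes (a+b+1)>>1, while the no-rounding variants pass vhadd.u8, which computes (a+b)>>1. A hedged C sketch of the 16-pixel-wide case, showing only the semantics implied by the asm above, not the code FFmpeg actually uses:

#include <stdint.h>

/* Illustrative reference for pixels16_y2: rnd=1 models vrhadd.u8,
 * rnd=0 models vhadd.u8 (the no_rnd variants). */
void put_pixels16_y2_ref(uint8_t *block, const uint8_t *pixels,
                         int line_size, int h, int rnd)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            block[x] = (pixels[x] + pixels[x + line_size] + rnd) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}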
@@ -93,25 +90,22 @@

 .macro  pixels16_xy2    vshrn=vrshrn.u16 no_rnd=0
-        push            {lr}
-        lsl             lr,  r2,  #1
-        add             ip,  r1,  r2
-        vld1.64         {d0-d2},  [r1], lr
-        vld1.64         {d4-d6},  [ip], lr
+        vld1.64         {d0-d2},  [r1], r2
+        vld1.64         {d4-d6},  [r1], r2
 .if \no_rnd
         vmov.i16        q13, #1
 .endif
         pld             [r1]
-        pld             [ip]
+        pld             [r1, r2]
         vext.8          q1,  q0,  q1,  #1
         vext.8          q3,  q2,  q3,  #1
         vaddl.u8        q8,  d0,  d2
         vaddl.u8        q10, d1,  d3
         vaddl.u8        q9,  d4,  d6
         vaddl.u8        q11, d5,  d7
 1:      subs            r3,  r3,  #2
-        vld1.64         {d0-d2},  [r1], lr
+        vld1.64         {d0-d2},  [r1], r2
         vadd.u16        q12, q8,  q9
         pld             [r1]
 .if \no_rnd
         vadd.u16        q12, q12, q13
 .endif
@@ -121,15 +115,15 @@
 .if \no_rnd
         vadd.u16        q1,  q1,  q13
 .endif
         \vshrn          d29, q1,  #2
         vaddl.u8        q8,  d0,  d30
-        vld1.64         {d2-d4},  [ip], lr
+        vld1.64         {d2-d4},  [r1], r2
         vaddl.u8        q10, d1,  d31
         vst1.64         {d28,d29}, [r0,:128], r2
         vadd.u16        q12, q8,  q9
-        pld             [ip]
+        pld             [r1, r2]
 .if \no_rnd
         vadd.u16        q12, q12, q13
 .endif
         vext.8          q2,  q1,  q2,  #1
         vadd.u16        q0,  q10, q11
@@ -140,11 +134,11 @@
         \vshrn          d31, q0,  #2
         vaddl.u8        q9,  d2,  d4
         vaddl.u8        q11, d3,  d5
         vst1.64         {d30,d31}, [r0,:128], r2
         bgt             1b
-        pop             {pc}
+        bx              lr
 .endm

 .macro  pixels8
 1:      vld1.64         {d0}, [r1], r2
         vld1.64         {d1}, [r1], r2
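The _xy2 operation averages a 2x2 source neighbourhood (the half-pel offset in both directions). In the asm, vext.8 ... #1 forms each row shifted left by one byte, vaddl.u8 widens and adds it to the unshifted row to get the per-row horizontal pair sums, vadd.u16 adds the sums of two adjacent rows, and the \vshrn argument narrows back to bytes: the default vrshrn.u16 #2 yields (sum+2)>>2, while the no_rnd variant first adds the q13 = 1 bias and then uses a truncating vshrn.u16 #2, giving (sum+1)>>2. A hedged C sketch of the same arithmetic (names illustrative, not the library's reference code):

#include <stdint.h>

/* Illustrative reference for pixels16_xy2; no_rnd selects the
 * (sum+1)>>2 variant, otherwise (sum+2)>>2. */
void put_pixels16_xy2_ref(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h, int no_rnd)
{
    const int bias = no_rnd ? 1 : 2;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            int sum = pixels[x]             + pixels[x + 1]
                    + pixels[x + line_size] + pixels[x + line_size + 1];
            block[x] = (sum + bias) >> 2;
        }
        block  += line_size;
        pixels += line_size;
    }
}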
@@ -178,22 +172,19 @@
         bne             1b
         bx              lr
 .endm

 .macro  pixels8_y2      vhadd=vrhadd.u8
-        push            {lr}
-        add             ip,  r1,  r2
-        lsl             lr,  r2,  #1
-        vld1.64         {d0}, [r1], lr
-        vld1.64         {d1}, [ip], lr
+        vld1.64         {d0}, [r1], r2
+        vld1.64         {d1}, [r1], r2
 1:      subs            r3,  r3,  #2
         \vhadd          d4,  d0,  d1
-        vld1.64         {d0}, [r1], lr
+        vld1.64         {d0}, [r1], r2
         \vhadd          d5,  d0,  d1
-        vld1.64         {d1}, [ip], lr
+        vld1.64         {d1}, [r1], r2
         pld             [r1]
-        pld             [ip]
+        pld             [r1, r2]
         vst1.64         {d4}, [r0,:64], r2
         vst1.64         {d5}, [r0,:64], r2
         bne             1b
-        pop             {pc}
+        bx              lr
 .endm
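The pixels8_y2 and pixels8_xy2 macros below are the 8-byte-wide versions of the same operations, working on d registers instead of q registers. These routines are reached through dsputil function-pointer tables; a hedged usage sketch follows. The op_pixels_func signature matches the register usage seen in the asm (r0=block, r1=pixels, r2=line_size, r3=h), the symbol name stands for the kind of function the pixfunc macro at the end of the file emits, and linking against the NEON object is assumed.

#include <stdint.h>

typedef void (*op_pixels_func)(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h);

/* Illustrative prototype for a symbol emitted by the pixfunc macro. */
void ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h);

/* Copy an 8x8 block at a vertical half-pel offset, the way the
 * motion-compensation code would call through its table. */
void copy_block8_halfpel_down(uint8_t *dst, const uint8_t *src, int stride)
{
    op_pixels_func op = ff_put_pixels8_y2_neon;
    op(dst, src, stride, 8);   /* h is even, matching "subs r3, r3, #2" */
}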
@@ -200,44 +191,41 @@

 .macro  pixels8_xy2     vshrn=vrshrn.u16 no_rnd=0
-        push            {lr}
-        lsl             lr,  r2,  #1
-        add             ip,  r1,  r2
-        vld1.64         {d0, d1}, [r1], lr
-        vld1.64         {d2, d3}, [ip], lr
+        vld1.64         {d0, d1}, [r1], r2
+        vld1.64         {d2, d3}, [r1], r2
 .if \no_rnd
         vmov.i16        q11, #1
 .endif
         pld             [r1]
-        pld             [ip]
+        pld             [r1, r2]
         vext.8          d4,  d0,  d1,  #1
         vext.8          d6,  d2,  d3,  #1
         vaddl.u8        q8,  d0,  d4
         vaddl.u8        q9,  d2,  d6
 1:      subs            r3,  r3,  #2
-        vld1.64         {d0, d1}, [r1], lr
+        vld1.64         {d0, d1}, [r1], r2
         pld             [r1]
         vadd.u16        q10, q8,  q9
         vext.8          d4,  d0,  d1,  #1
 .if \no_rnd
         vadd.u16        q10, q10, q11
 .endif
         vaddl.u8        q8,  d0,  d4
         \vshrn          d5,  q10, #2
-        vld1.64         {d2, d3}, [ip], lr
+        vld1.64         {d2, d3}, [r1], r2
         vadd.u16        q10, q8,  q9
-        pld             [ip]
+        pld             [r1, r2]
 .if \no_rnd
         vadd.u16        q10, q10, q11
 .endif
         vst1.64         {d5}, [r0,:64], r2
         \vshrn          d7,  q10, #2
         vext.8          d6,  d2,  d3,  #1
         vaddl.u8        q9,  d2,  d6
         vst1.64         {d7}, [r0,:64], r2
         bgt             1b
-        pop             {pc}
+        bx              lr
 .endm

 .macro  pixfunc pfx name suf rnd_op args:vararg
 function ff_\pfx\name\suf\()_neon, export=1
         \name           \rnd_op \args