Mercurial > libavcodec.hg
comparison arm/dsputil_neon_s.S @ 9581:2b3b9358bee7 libavcodec
ARM: Use fewer registers in NEON put_pixels _y2 and _xy2
Approved by Mans on IRC
author | conrad |
---|---|
date | Wed, 29 Apr 2009 11:38:09 +0000 |
parents | 51e8f5ab8f1e |
children | 5cca2790d582 |
comparison
equal
deleted
inserted
replaced
9580:51e8f5ab8f1e | 9581:2b3b9358bee7 |
---|---|
71 bne 1b | 71 bne 1b |
72 bx lr | 72 bx lr |
73 .endm | 73 .endm |
74 | 74 |
75 .macro pixels16_y2 vhadd=vrhadd.u8 | 75 .macro pixels16_y2 vhadd=vrhadd.u8 |
76 push {lr} | 76 vld1.64 {d0, d1}, [r1], r2 |
77 add ip, r1, r2 | 77 vld1.64 {d2, d3}, [r1], r2 |
78 lsl lr, r2, #1 | |
79 vld1.64 {d0, d1}, [r1], lr | |
80 vld1.64 {d2, d3}, [ip], lr | |
81 1: subs r3, r3, #2 | 78 1: subs r3, r3, #2 |
82 \vhadd q2, q0, q1 | 79 \vhadd q2, q0, q1 |
83 vld1.64 {d0, d1}, [r1], lr | 80 vld1.64 {d0, d1}, [r1], r2 |
84 \vhadd q3, q0, q1 | 81 \vhadd q3, q0, q1 |
85 vld1.64 {d2, d3}, [ip], lr | 82 vld1.64 {d2, d3}, [r1], r2 |
86 pld [r1] | 83 pld [r1] |
87 pld [ip] | 84 pld [r1, r2] |
88 vst1.64 {d4, d5}, [r0,:128], r2 | 85 vst1.64 {d4, d5}, [r0,:128], r2 |
89 vst1.64 {d6, d7}, [r0,:128], r2 | 86 vst1.64 {d6, d7}, [r0,:128], r2 |
90 bne 1b | 87 bne 1b |
91 pop {pc} | 88 bx lr |
92 .endm | 89 .endm |
93 | 90 |
94 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 | 91 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 |
95 push {lr} | 92 vld1.64 {d0-d2}, [r1], r2 |
96 lsl lr, r2, #1 | 93 vld1.64 {d4-d6}, [r1], r2 |
97 add ip, r1, r2 | |
98 vld1.64 {d0-d2}, [r1], lr | |
99 vld1.64 {d4-d6}, [ip], lr | |
100 .if \no_rnd | 94 .if \no_rnd |
101 vmov.i16 q13, #1 | 95 vmov.i16 q13, #1 |
102 .endif | 96 .endif |
103 pld [r1] | 97 pld [r1] |
104 pld [ip] | 98 pld [r1, r2] |
105 vext.8 q1, q0, q1, #1 | 99 vext.8 q1, q0, q1, #1 |
106 vext.8 q3, q2, q3, #1 | 100 vext.8 q3, q2, q3, #1 |
107 vaddl.u8 q8, d0, d2 | 101 vaddl.u8 q8, d0, d2 |
108 vaddl.u8 q10, d1, d3 | 102 vaddl.u8 q10, d1, d3 |
109 vaddl.u8 q9, d4, d6 | 103 vaddl.u8 q9, d4, d6 |
110 vaddl.u8 q11, d5, d7 | 104 vaddl.u8 q11, d5, d7 |
111 1: subs r3, r3, #2 | 105 1: subs r3, r3, #2 |
112 vld1.64 {d0-d2}, [r1], lr | 106 vld1.64 {d0-d2}, [r1], r2 |
113 vadd.u16 q12, q8, q9 | 107 vadd.u16 q12, q8, q9 |
114 pld [r1] | 108 pld [r1] |
115 .if \no_rnd | 109 .if \no_rnd |
116 vadd.u16 q12, q12, q13 | 110 vadd.u16 q12, q12, q13 |
117 .endif | 111 .endif |
121 .if \no_rnd | 115 .if \no_rnd |
122 vadd.u16 q1, q1, q13 | 116 vadd.u16 q1, q1, q13 |
123 .endif | 117 .endif |
124 \vshrn d29, q1, #2 | 118 \vshrn d29, q1, #2 |
125 vaddl.u8 q8, d0, d30 | 119 vaddl.u8 q8, d0, d30 |
126 vld1.64 {d2-d4}, [ip], lr | 120 vld1.64 {d2-d4}, [r1], r2 |
127 vaddl.u8 q10, d1, d31 | 121 vaddl.u8 q10, d1, d31 |
128 vst1.64 {d28,d29}, [r0,:128], r2 | 122 vst1.64 {d28,d29}, [r0,:128], r2 |
129 vadd.u16 q12, q8, q9 | 123 vadd.u16 q12, q8, q9 |
130 pld [ip] | 124 pld [r1, r2] |
131 .if \no_rnd | 125 .if \no_rnd |
132 vadd.u16 q12, q12, q13 | 126 vadd.u16 q12, q12, q13 |
133 .endif | 127 .endif |
134 vext.8 q2, q1, q2, #1 | 128 vext.8 q2, q1, q2, #1 |
135 vadd.u16 q0, q10, q11 | 129 vadd.u16 q0, q10, q11 |
140 \vshrn d31, q0, #2 | 134 \vshrn d31, q0, #2 |
141 vaddl.u8 q9, d2, d4 | 135 vaddl.u8 q9, d2, d4 |
142 vaddl.u8 q11, d3, d5 | 136 vaddl.u8 q11, d3, d5 |
143 vst1.64 {d30,d31}, [r0,:128], r2 | 137 vst1.64 {d30,d31}, [r0,:128], r2 |
144 bgt 1b | 138 bgt 1b |
145 pop {pc} | 139 bx lr |
146 .endm | 140 .endm |
147 | 141 |
148 .macro pixels8 | 142 .macro pixels8 |
149 1: vld1.64 {d0}, [r1], r2 | 143 1: vld1.64 {d0}, [r1], r2 |
150 vld1.64 {d1}, [r1], r2 | 144 vld1.64 {d1}, [r1], r2 |
178 bne 1b | 172 bne 1b |
179 bx lr | 173 bx lr |
180 .endm | 174 .endm |
181 | 175 |
182 .macro pixels8_y2 vhadd=vrhadd.u8 | 176 .macro pixels8_y2 vhadd=vrhadd.u8 |
183 push {lr} | 177 vld1.64 {d0}, [r1], r2 |
184 add ip, r1, r2 | 178 vld1.64 {d1}, [r1], r2 |
185 lsl lr, r2, #1 | |
186 vld1.64 {d0}, [r1], lr | |
187 vld1.64 {d1}, [ip], lr | |
188 1: subs r3, r3, #2 | 179 1: subs r3, r3, #2 |
189 \vhadd d4, d0, d1 | 180 \vhadd d4, d0, d1 |
190 vld1.64 {d0}, [r1], lr | 181 vld1.64 {d0}, [r1], r2 |
191 \vhadd d5, d0, d1 | 182 \vhadd d5, d0, d1 |
192 vld1.64 {d1}, [ip], lr | 183 vld1.64 {d1}, [r1], r2 |
193 pld [r1] | 184 pld [r1] |
194 pld [ip] | 185 pld [r1, r2] |
195 vst1.64 {d4}, [r0,:64], r2 | 186 vst1.64 {d4}, [r0,:64], r2 |
196 vst1.64 {d5}, [r0,:64], r2 | 187 vst1.64 {d5}, [r0,:64], r2 |
197 bne 1b | 188 bne 1b |
198 pop {pc} | 189 bx lr |
199 .endm | 190 .endm |
200 | 191 |
201 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 | 192 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 |
202 push {lr} | 193 vld1.64 {d0, d1}, [r1], r2 |
203 lsl lr, r2, #1 | 194 vld1.64 {d2, d3}, [r1], r2 |
204 add ip, r1, r2 | |
205 vld1.64 {d0, d1}, [r1], lr | |
206 vld1.64 {d2, d3}, [ip], lr | |
207 .if \no_rnd | 195 .if \no_rnd |
208 vmov.i16 q11, #1 | 196 vmov.i16 q11, #1 |
209 .endif | 197 .endif |
210 pld [r1] | 198 pld [r1] |
211 pld [ip] | 199 pld [r1, r2] |
212 vext.8 d4, d0, d1, #1 | 200 vext.8 d4, d0, d1, #1 |
213 vext.8 d6, d2, d3, #1 | 201 vext.8 d6, d2, d3, #1 |
214 vaddl.u8 q8, d0, d4 | 202 vaddl.u8 q8, d0, d4 |
215 vaddl.u8 q9, d2, d6 | 203 vaddl.u8 q9, d2, d6 |
216 1: subs r3, r3, #2 | 204 1: subs r3, r3, #2 |
217 vld1.64 {d0, d1}, [r1], lr | 205 vld1.64 {d0, d1}, [r1], r2 |
218 pld [r1] | 206 pld [r1] |
219 vadd.u16 q10, q8, q9 | 207 vadd.u16 q10, q8, q9 |
220 vext.8 d4, d0, d1, #1 | 208 vext.8 d4, d0, d1, #1 |
221 .if \no_rnd | 209 .if \no_rnd |
222 vadd.u16 q10, q10, q11 | 210 vadd.u16 q10, q10, q11 |
223 .endif | 211 .endif |
224 vaddl.u8 q8, d0, d4 | 212 vaddl.u8 q8, d0, d4 |
225 \vshrn d5, q10, #2 | 213 \vshrn d5, q10, #2 |
226 vld1.64 {d2, d3}, [ip], lr | 214 vld1.64 {d2, d3}, [r1], r2 |
227 vadd.u16 q10, q8, q9 | 215 vadd.u16 q10, q8, q9 |
228 pld [ip] | 216 pld [r1, r2] |
229 .if \no_rnd | 217 .if \no_rnd |
230 vadd.u16 q10, q10, q11 | 218 vadd.u16 q10, q10, q11 |
231 .endif | 219 .endif |
232 vst1.64 {d5}, [r0,:64], r2 | 220 vst1.64 {d5}, [r0,:64], r2 |
233 \vshrn d7, q10, #2 | 221 \vshrn d7, q10, #2 |
234 vext.8 d6, d2, d3, #1 | 222 vext.8 d6, d2, d3, #1 |
235 vaddl.u8 q9, d2, d6 | 223 vaddl.u8 q9, d2, d6 |
236 vst1.64 {d7}, [r0,:64], r2 | 224 vst1.64 {d7}, [r0,:64], r2 |
237 bgt 1b | 225 bgt 1b |
238 pop {pc} | 226 bx lr |
239 .endm | 227 .endm |
240 | 228 |
241 .macro pixfunc pfx name suf rnd_op args:vararg | 229 .macro pixfunc pfx name suf rnd_op args:vararg |
242 function ff_\pfx\name\suf\()_neon, export=1 | 230 function ff_\pfx\name\suf\()_neon, export=1 |
243 \name \rnd_op \args | 231 \name \rnd_op \args |