annotate arm/dsputil_neon.S @ 12530:63edd10ad4bc libavcodec tip

Try to fix crashes introduced by r25218. r25218 made assumptions about the existence of past reference frames that weren't necessarily true.
author darkshikari
date Tue, 28 Sep 2010 09:06:22 +0000
parents 659f16d04776
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
1 /*
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
2 * ARM NEON optimised DSP functions
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
4 *
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
5 * This file is part of FFmpeg.
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
6 *
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
11 *
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
15 * Lesser General Public License for more details.
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
16 *
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
20 */
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
21
10046
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
22 #include "config.h"
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
23 #include "asm.S"
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
24
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
25 preserve8
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
26 .text
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
27
11807
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
28 function ff_clear_block_neon, export=1
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
29 vmov.i16 q0, #0
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
30 .rept 8
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
31 vst1.16 {q0}, [r0,:128]!
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
32 .endr
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
33 bx lr
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
34 endfunc
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
35
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
36 function ff_clear_blocks_neon, export=1
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
37 vmov.i16 q0, #0
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
38 .rept 8*6
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
39 vst1.16 {q0}, [r0,:128]!
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
40 .endr
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
41 bx lr
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
42 endfunc
659f16d04776 ARM: NEON clear_block[s]
mru
parents: 11443
diff changeset
43
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
44 .macro pixels16 avg=0
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
45 .if \avg
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
46 mov ip, r0
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
47 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
48 1: vld1.64 {d0, d1}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
49 vld1.64 {d2, d3}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
50 vld1.64 {d4, d5}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
51 pld [r1, r2, lsl #2]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
52 vld1.64 {d6, d7}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
53 pld [r1]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
54 pld [r1, r2]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
55 pld [r1, r2, lsl #1]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
56 .if \avg
9451
93c20dd3da43 Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents: 9345
diff changeset
57 vld1.64 {d16,d17}, [ip,:128], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
58 vrhadd.u8 q0, q0, q8
9451
93c20dd3da43 Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents: 9345
diff changeset
59 vld1.64 {d18,d19}, [ip,:128], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
60 vrhadd.u8 q1, q1, q9
9451
93c20dd3da43 Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents: 9345
diff changeset
61 vld1.64 {d20,d21}, [ip,:128], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
62 vrhadd.u8 q2, q2, q10
9451
93c20dd3da43 Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents: 9345
diff changeset
63 vld1.64 {d22,d23}, [ip,:128], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
64 vrhadd.u8 q3, q3, q11
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
65 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
66 subs r3, r3, #4
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
67 vst1.64 {d0, d1}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
68 vst1.64 {d2, d3}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
69 vst1.64 {d4, d5}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
70 vst1.64 {d6, d7}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
71 bne 1b
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
72 bx lr
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
73 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
74
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
75 .macro pixels16_x2 vhadd=vrhadd.u8
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
76 1: vld1.64 {d0-d2}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
77 vld1.64 {d4-d6}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
78 pld [r1]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
79 pld [r1, r2]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
80 subs r3, r3, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
81 vext.8 q1, q0, q1, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
82 \vhadd q0, q0, q1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
83 vext.8 q3, q2, q3, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
84 \vhadd q2, q2, q3
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
85 vst1.64 {d0, d1}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
86 vst1.64 {d4, d5}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
87 bne 1b
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
88 bx lr
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
89 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
90
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
91 .macro pixels16_y2 vhadd=vrhadd.u8
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
92 vld1.64 {d0, d1}, [r1], r2
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
93 vld1.64 {d2, d3}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
94 1: subs r3, r3, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
95 \vhadd q2, q0, q1
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
96 vld1.64 {d0, d1}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
97 \vhadd q3, q0, q1
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
98 vld1.64 {d2, d3}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
99 pld [r1]
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
100 pld [r1, r2]
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
101 vst1.64 {d4, d5}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
102 vst1.64 {d6, d7}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
103 bne 1b
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
104 bx lr
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
105 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
106
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
107 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
108 vld1.64 {d0-d2}, [r1], r2
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
109 vld1.64 {d4-d6}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
110 .if \no_rnd
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
111 vmov.i16 q13, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
112 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
113 pld [r1]
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
114 pld [r1, r2]
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
115 vext.8 q1, q0, q1, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
116 vext.8 q3, q2, q3, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
117 vaddl.u8 q8, d0, d2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
118 vaddl.u8 q10, d1, d3
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
119 vaddl.u8 q9, d4, d6
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
120 vaddl.u8 q11, d5, d7
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
121 1: subs r3, r3, #2
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
122 vld1.64 {d0-d2}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
123 vadd.u16 q12, q8, q9
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
124 pld [r1]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
125 .if \no_rnd
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
126 vadd.u16 q12, q12, q13
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
127 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
128 vext.8 q15, q0, q1, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
129 vadd.u16 q1 , q10, q11
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
130 \vshrn d28, q12, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
131 .if \no_rnd
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
132 vadd.u16 q1, q1, q13
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
133 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
134 \vshrn d29, q1, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
135 vaddl.u8 q8, d0, d30
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
136 vld1.64 {d2-d4}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
137 vaddl.u8 q10, d1, d31
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
138 vst1.64 {d28,d29}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
139 vadd.u16 q12, q8, q9
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
140 pld [r1, r2]
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
141 .if \no_rnd
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
142 vadd.u16 q12, q12, q13
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
143 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
144 vext.8 q2, q1, q2, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
145 vadd.u16 q0, q10, q11
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
146 \vshrn d30, q12, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
147 .if \no_rnd
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
148 vadd.u16 q0, q0, q13
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
149 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
150 \vshrn d31, q0, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
151 vaddl.u8 q9, d2, d4
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
152 vaddl.u8 q11, d3, d5
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
153 vst1.64 {d30,d31}, [r0,:128], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
154 bgt 1b
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
155 bx lr
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
156 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
157
10375
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
158 .macro pixels8 avg=0
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
159 1: vld1.64 {d0}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
160 vld1.64 {d1}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
161 vld1.64 {d2}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
162 pld [r1, r2, lsl #2]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
163 vld1.64 {d3}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
164 pld [r1]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
165 pld [r1, r2]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
166 pld [r1, r2, lsl #1]
10375
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
167 .if \avg
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
168 vld1.64 {d4}, [r0,:64], r2
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
169 vrhadd.u8 d0, d0, d4
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
170 vld1.64 {d5}, [r0,:64], r2
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
171 vrhadd.u8 d1, d1, d5
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
172 vld1.64 {d6}, [r0,:64], r2
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
173 vrhadd.u8 d2, d2, d6
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
174 vld1.64 {d7}, [r0,:64], r2
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
175 vrhadd.u8 d3, d3, d7
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
176 sub r0, r0, r2, lsl #2
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
177 .endif
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
178 subs r3, r3, #4
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
179 vst1.64 {d0}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
180 vst1.64 {d1}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
181 vst1.64 {d2}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
182 vst1.64 {d3}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
183 bne 1b
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
184 bx lr
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
185 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
186
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
187 .macro pixels8_x2 vhadd=vrhadd.u8
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
188 1: vld1.64 {d0, d1}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
189 vext.8 d1, d0, d1, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
190 vld1.64 {d2, d3}, [r1], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
191 vext.8 d3, d2, d3, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
192 pld [r1]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
193 pld [r1, r2]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
194 subs r3, r3, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
195 vswp d1, d2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
196 \vhadd q0, q0, q1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
197 vst1.64 {d0}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
198 vst1.64 {d1}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
199 bne 1b
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
200 bx lr
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
201 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
202
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
203 .macro pixels8_y2 vhadd=vrhadd.u8
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
204 vld1.64 {d0}, [r1], r2
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
205 vld1.64 {d1}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
206 1: subs r3, r3, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
207 \vhadd d4, d0, d1
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
208 vld1.64 {d0}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
209 \vhadd d5, d0, d1
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
210 vld1.64 {d1}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
211 pld [r1]
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
212 pld [r1, r2]
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
213 vst1.64 {d4}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
214 vst1.64 {d5}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
215 bne 1b
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
216 bx lr
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
217 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
218
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
219 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
220 vld1.64 {d0, d1}, [r1], r2
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
221 vld1.64 {d2, d3}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
222 .if \no_rnd
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
223 vmov.i16 q11, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
224 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
225 pld [r1]
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
226 pld [r1, r2]
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
227 vext.8 d4, d0, d1, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
228 vext.8 d6, d2, d3, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
229 vaddl.u8 q8, d0, d4
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
230 vaddl.u8 q9, d2, d6
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
231 1: subs r3, r3, #2
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
232 vld1.64 {d0, d1}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
233 pld [r1]
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
234 vadd.u16 q10, q8, q9
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
235 vext.8 d4, d0, d1, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
236 .if \no_rnd
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
237 vadd.u16 q10, q10, q11
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
238 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
239 vaddl.u8 q8, d0, d4
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
240 \vshrn d5, q10, #2
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
241 vld1.64 {d2, d3}, [r1], r2
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
242 vadd.u16 q10, q8, q9
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
243 pld [r1, r2]
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
244 .if \no_rnd
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
245 vadd.u16 q10, q10, q11
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
246 .endif
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
247 vst1.64 {d5}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
248 \vshrn d7, q10, #2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
249 vext.8 d6, d2, d3, #1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
250 vaddl.u8 q9, d2, d6
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
251 vst1.64 {d7}, [r0,:64], r2
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
252 bgt 1b
9581
2b3b9358bee7 ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents: 9580
diff changeset
253 bx lr
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
254 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
255
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
256 .macro pixfunc pfx name suf rnd_op args:vararg
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
257 function ff_\pfx\name\suf\()_neon, export=1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
258 \name \rnd_op \args
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
259 endfunc
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
260 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
261
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
262 .macro pixfunc2 pfx name args:vararg
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
263 pixfunc \pfx \name
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
264 pixfunc \pfx \name \args
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
265 .endm
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
266
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
267 function ff_put_h264_qpel16_mc00_neon, export=1
10376
5c5b864d66e1 ARM: whitespace cosmetics
mru
parents: 10375
diff changeset
268 mov r3, #16
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
269 endfunc
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
270
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
271 pixfunc put_ pixels16
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
272 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
273 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
274 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
275
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
276 function ff_avg_h264_qpel16_mc00_neon, export=1
10376
5c5b864d66e1 ARM: whitespace cosmetics
mru
parents: 10375
diff changeset
277 mov r3, #16
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
278 endfunc
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
279
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
280 pixfunc avg_ pixels16,, 1
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
281
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
282 function ff_put_h264_qpel8_mc00_neon, export=1
10376
5c5b864d66e1 ARM: whitespace cosmetics
mru
parents: 10375
diff changeset
283 mov r3, #8
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
284 endfunc
8334
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
285
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
286 pixfunc put_ pixels8
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
287 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
288 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
6bdd6dfc3574 ARM: NEON optimised put_pixels functions
mru
parents:
diff changeset
289 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
8492
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
290
10375
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
@ ff_avg_h264_qpel8_mc00_neon
@ Entry stub: loads the constant 8 into r3 and contains no return or branch
@ before endfunc, so execution runs on into the code assembled next, i.e.
@ whatever the pixfunc invocation below emits.
@ NOTE(review): presumably that is the avg_pixels8 routine, taking its row
@ count in r3; the pixfunc macro is defined outside this excerpt - confirm.
291 function ff_avg_h264_qpel8_mc00_neon, export=1
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
292 mov r3, #8
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
293 endfunc
10375
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
294
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
@ Must stay directly after the stub above: the stub falls through into it.
295 pixfunc avg_ pixels8,, 1
199949177888 ARM: NEON avg_pixels8 and avg_h264_qpel8_mc00
mru
parents: 10360
diff changeset
296
9580
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
@ ff_put_pixels_clamped_neon
@ Narrows 64 signed 16-bit values to unsigned bytes with saturation
@ (vqmovun.s16 clamps each value to 0..255) and writes them out as eight
@ rows of 8 bytes.
@ In:  r0 = source, 64 x int16 (128 bytes) read sequentially; the :128
@           qualifier requires 16-byte alignment
@      r1 = destination; every row store carries a :64 (8-byte) qualifier
@      r2 = byte offset added to r1 after each row store
@ Out: nothing returned; r0 ends up advanced by 128, r1 by 8*r2
@ Clobbers: d0-d7, q8-q15
@ NOTE(review): the (block, pixels, line_size) reading of r0-r2 is inferred
@ from register usage only - confirm against the C prototype.
297 function ff_put_pixels_clamped_neon, export=1
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
@ Each load brings in 32 bytes (two rows); loads are interleaved with the
@ narrowing of rows that are already in registers.
298 vld1.64 {d16-d19}, [r0,:128]!
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
299 vqmovun.s16 d0, q8
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
300 vld1.64 {d20-d23}, [r0,:128]!
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
301 vqmovun.s16 d1, q9
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
302 vld1.64 {d24-d27}, [r0,:128]!
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
303 vqmovun.s16 d2, q10
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
304 vld1.64 {d28-d31}, [r0,:128]!
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
305 vqmovun.s16 d3, q11
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
@ All input is loaded; finished rows are stored while the remaining rows
@ are still being narrowed.
306 vst1.64 {d0}, [r1,:64], r2
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
307 vqmovun.s16 d4, q12
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
308 vst1.64 {d1}, [r1,:64], r2
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
309 vqmovun.s16 d5, q13
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
310 vst1.64 {d2}, [r1,:64], r2
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
311 vqmovun.s16 d6, q14
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
312 vst1.64 {d3}, [r1,:64], r2
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
313 vqmovun.s16 d7, q15
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
314 vst1.64 {d4}, [r1,:64], r2
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
315 vst1.64 {d5}, [r1,:64], r2
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
316 vst1.64 {d6}, [r1,:64], r2
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
317 vst1.64 {d7}, [r1,:64], r2
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
318 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
319 endfunc
9580
51e8f5ab8f1e ARM: NEON put_pixels_clamped
conrad
parents: 9451
diff changeset
320
9345
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
@ ff_put_signed_pixels_clamped_neon
@ Converts 64 signed 16-bit values to bytes and writes them as eight rows of
@ 8 bytes. Each value is first saturated to -128..127 (vqmovn.s16) and then
@ has 128 added as a byte (vadd.u8 with d31), which maps it onto 0..255.
@ In:  r0 = source, 64 x int16 read one 16-byte row per load; the :128
@           qualifier requires 16-byte alignment
@      r1 = destination; every row store carries a :64 (8-byte) qualifier
@      r2 = byte offset added to r1 after each row store
@ Out: nothing returned; r0 ends up advanced by 128, r1 by 8*r2
@ Clobbers: d0-d7, q8-q13, d31
@ NOTE(review): the (block, pixels, line_size) reading of r0-r2 is inferred
@ from register usage only - confirm against the C prototype.
321 function ff_put_signed_pixels_clamped_neon, export=1
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
@ d31 = 128 in every byte lane: the bias applied after narrowing.
322 vmov.u8 d31, #128
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
323 vld1.64 {d16-d17}, [r0,:128]!
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
324 vqmovn.s16 d0, q8
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
325 vld1.64 {d18-d19}, [r0,:128]!
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
326 vqmovn.s16 d1, q9
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
@ q8 and q9 are reused for rows 2 and 3: rows 0 and 1 have already been
@ narrowed into d0/d1 at this point.
327 vld1.64 {d16-d17}, [r0,:128]!
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
328 vqmovn.s16 d2, q8
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
329 vld1.64 {d18-d19}, [r0,:128]!
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
330 vadd.u8 d0, d0, d31
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
331 vld1.64 {d20-d21}, [r0,:128]!
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
332 vadd.u8 d1, d1, d31
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
333 vld1.64 {d22-d23}, [r0,:128]!
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
334 vadd.u8 d2, d2, d31
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
@ Rows 0-2 are complete; store them while rows 3-5 are narrowed and the
@ last two input rows are loaded.
335 vst1.64 {d0}, [r1,:64], r2
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
336 vqmovn.s16 d3, q9
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
337 vst1.64 {d1}, [r1,:64], r2
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
338 vqmovn.s16 d4, q10
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
339 vst1.64 {d2}, [r1,:64], r2
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
340 vqmovn.s16 d5, q11
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
341 vld1.64 {d24-d25}, [r0,:128]!
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
342 vadd.u8 d3, d3, d31
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
343 vld1.64 {d26-d27}, [r0,:128]!
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
344 vadd.u8 d4, d4, d31
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
345 vadd.u8 d5, d5, d31
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
346 vst1.64 {d3}, [r1,:64], r2
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
347 vqmovn.s16 d6, q12
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
348 vst1.64 {d4}, [r1,:64], r2
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
349 vqmovn.s16 d7, q13
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
350 vst1.64 {d5}, [r1,:64], r2
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
351 vadd.u8 d6, d6, d31
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
352 vadd.u8 d7, d7, d31
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
353 vst1.64 {d6}, [r1,:64], r2
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
354 vst1.64 {d7}, [r1,:64], r2
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
355 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
356 endfunc
9345
e0a7a6338526 ARM: NEON optimized put_signed_pixels_clamped
conrad
parents: 9344
diff changeset
357
9344
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
@ ff_add_pixels_clamped_neon
@ Adds 64 signed 16-bit values to the eight 8-byte rows already present at
@ the destination and writes the sums back clamped to 0..255.
@ Per row: vaddw.u8 widens the 8 existing bytes (as unsigned) and adds them
@ to the 16-bit values; vqmovun.s16 narrows the sums with unsigned saturation.
@ In:  r0 = 64 x int16 read sequentially; the :128 qualifier requires
@           16-byte alignment
@      r1 = pixel rows, read and rewritten in place; accesses carry a :64
@           (8-byte) qualifier
@      r2 = byte offset between consecutive rows
@ Out: nothing returned; r0 ends up advanced by 128, r1 and r3 by 8*r2
@ Clobbers: r3, d0-d7, d16-d19
@ NOTE(review): the (block, pixels, line_size) reading of r0-r2 is inferred
@ from register usage only - confirm against the C prototype.
358 function ff_add_pixels_clamped_neon, export=1
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
@ r3 = write pointer. It starts equal to r1 and trails it: pixel loads run
@ ahead through r1 while finished rows are stored through r3.
359 mov r3, r1
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
360 vld1.64 {d16}, [r1,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
361 vld1.64 {d0-d1}, [r0,:128]!
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
362 vaddw.u8 q0, q0, d16
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
363 vld1.64 {d17}, [r1,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
364 vld1.64 {d2-d3}, [r0,:128]!
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
365 vqmovun.s16 d0, q0
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
366 vld1.64 {d18}, [r1,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
367 vaddw.u8 q1, q1, d17
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
368 vld1.64 {d4-d5}, [r0,:128]!
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
369 vaddw.u8 q2, q2, d18
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
370 vst1.64 {d0}, [r3,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
371 vqmovun.s16 d2, q1
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
372 vld1.64 {d19}, [r1,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
373 vld1.64 {d6-d7}, [r0,:128]!
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
374 vaddw.u8 q3, q3, d19
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
375 vqmovun.s16 d4, q2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
376 vst1.64 {d2}, [r3,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
@ Rows 4-7 start here and reuse q0-q3 / d16-d19. Each register is reloaded
@ only after its previous contents were consumed or stored; the stores of
@ rows 2 and 3 (d4, d6) are still interleaved below.
377 vld1.64 {d16}, [r1,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
378 vqmovun.s16 d6, q3
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
379 vld1.64 {d0-d1}, [r0,:128]!
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
380 vaddw.u8 q0, q0, d16
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
381 vst1.64 {d4}, [r3,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
382 vld1.64 {d17}, [r1,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
383 vld1.64 {d2-d3}, [r0,:128]!
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
384 vaddw.u8 q1, q1, d17
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
385 vst1.64 {d6}, [r3,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
386 vqmovun.s16 d0, q0
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
387 vld1.64 {d18}, [r1,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
388 vld1.64 {d4-d5}, [r0,:128]!
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
389 vaddw.u8 q2, q2, d18
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
390 vst1.64 {d0}, [r3,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
391 vqmovun.s16 d2, q1
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
392 vld1.64 {d19}, [r1,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
393 vqmovun.s16 d4, q2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
394 vld1.64 {d6-d7}, [r0,:128]!
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
395 vaddw.u8 q3, q3, d19
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
396 vst1.64 {d2}, [r3,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
397 vqmovun.s16 d6, q3
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
398 vst1.64 {d4}, [r3,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
399 vst1.64 {d6}, [r3,:64], r2
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
400 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
401 endfunc
9344
9ea1ea6db616 ARM: NEON optimised add_pixels_clamped
mru
parents: 8698
diff changeset
402
8492
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ ff_float_to_int16_neon
@ Converts an array of floats to 16-bit integers. Each float is converted to
@ signed fixed point with 16 fractional bits (vcvt.s32.f32 ..., #16) and the
@ upper 16 bits of every 32-bit lane are kept (vshrn.s32 ..., #16).
@ In:  r0 = destination (int16), written 16 bytes at a time with a :128
@           qualifier (16-byte alignment)
@      r1 = source (float), read 16 bytes at a time with a :128 qualifier
@      r2 = element count
@ Out: nothing returned; r0 and r1 are advanced past the processed data
@ Clobbers: r2, ip, flags, q0-q3, q8, q9
@ The first 8 elements are always loaded and converted, and all work is done
@ in groups of 8, so the count appears to have to be a non-zero multiple of
@ 8 - TODO confirm with callers.
403 function ff_float_to_int16_neon, export=1
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ Prologue: convert the first 8 elements into q8/q9. They stay "pending"
@ (converted but not yet narrowed or stored). r2 = elements beyond those 8.
404 subs r2, r2, #8
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
405 vld1.64 {d0-d1}, [r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
406 vcvt.s32.f32 q8, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
407 vld1.64 {d2-d3}, [r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
408 vcvt.s32.f32 q9, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ Flags are still those of the subs above: count was exactly 8.
409 beq 3f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ ip = remaining count rounded down to a multiple of 16; zero means fewer
@ than 16 remain, so skip the main loop.
410 bics ip, r2, #15
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
411 beq 2f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ Main loop: every pass stores 16 results (the pending 8 plus 8 newly
@ loaded) and converts the following 8 into q8/q9 as the new pending group.
412 1: subs ip, ip, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
413 vshrn.s32 d4, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
414 vld1.64 {d0-d1}, [r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
415 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
416 vshrn.s32 d5, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
417 vld1.64 {d2-d3}, [r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
418 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
419 vshrn.s32 d6, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
420 vst1.64 {d4-d5}, [r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
421 vshrn.s32 d7, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
422 vld1.64 {d16-d17},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
423 vcvt.s32.f32 q8, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
424 vld1.64 {d18-d19},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
425 vcvt.s32.f32 q9, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
426 vst1.64 {d6-d7}, [r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ Tests the Z flag set by "subs ip" at the top of the loop.
427 bne 1b
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ Anything left beyond the pending 8? If not, just flush them at 3.
428 ands r2, r2, #15
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
429 beq 3f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ Tail, 16 outstanding: narrow the pending 8, load and convert 8 more,
@ store all 16.
430 2: vld1.64 {d0-d1}, [r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
431 vshrn.s32 d4, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
432 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
433 vld1.64 {d2-d3}, [r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
434 vshrn.s32 d5, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
435 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
436 vshrn.s32 d6, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
437 vst1.64 {d4-d5}, [r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
438 vshrn.s32 d7, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
439 vst1.64 {d6-d7}, [r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
440 bx lr
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
@ Tail, 8 outstanding: narrow and store the pending q8/q9 only.
441 3: vshrn.s32 d4, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
442 vshrn.s32 d5, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
443 vst1.64 {d4-d5}, [r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
444 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
445 endfunc
8492
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
446
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
447 function ff_float_to_int16_interleave_neon, export=1
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
448 cmp r3, #2
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
449 ldrlt r1, [r1]
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
450 blt ff_float_to_int16_neon
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
451 bne 4f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
452
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
453 ldr r3, [r1]
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
454 ldr r1, [r1, #4]
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
455
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
456 subs r2, r2, #8
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
457 vld1.64 {d0-d1}, [r3,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
458 vcvt.s32.f32 q8, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
459 vld1.64 {d2-d3}, [r3,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
460 vcvt.s32.f32 q9, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
461 vld1.64 {d20-d21},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
462 vcvt.s32.f32 q10, q10, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
463 vld1.64 {d22-d23},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
464 vcvt.s32.f32 q11, q11, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
465 beq 3f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
466 bics ip, r2, #15
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
467 beq 2f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
468 1: subs ip, ip, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
469 vld1.64 {d0-d1}, [r3,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
470 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
471 vsri.32 q10, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
472 vld1.64 {d2-d3}, [r3,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
473 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
474 vld1.64 {d24-d25},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
475 vcvt.s32.f32 q12, q12, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
476 vld1.64 {d26-d27},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
477 vsri.32 q11, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
478 vst1.64 {d20-d21},[r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
479 vcvt.s32.f32 q13, q13, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
480 vst1.64 {d22-d23},[r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
481 vsri.32 q12, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
482 vld1.64 {d16-d17},[r3,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
483 vsri.32 q13, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
484 vst1.64 {d24-d25},[r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
485 vcvt.s32.f32 q8, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
486 vld1.64 {d18-d19},[r3,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
487 vcvt.s32.f32 q9, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
488 vld1.64 {d20-d21},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
489 vcvt.s32.f32 q10, q10, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
490 vld1.64 {d22-d23},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
491 vcvt.s32.f32 q11, q11, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
492 vst1.64 {d26-d27},[r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
493 bne 1b
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
494 ands r2, r2, #15
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
495 beq 3f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
496 2: vsri.32 q10, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
497 vld1.64 {d0-d1}, [r3,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
498 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
499 vld1.64 {d2-d3}, [r3,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
500 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
501 vld1.64 {d24-d25},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
502 vcvt.s32.f32 q12, q12, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
503 vsri.32 q11, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
504 vld1.64 {d26-d27},[r1,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
505 vcvt.s32.f32 q13, q13, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
506 vst1.64 {d20-d21},[r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
507 vsri.32 q12, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
508 vst1.64 {d22-d23},[r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
509 vsri.32 q13, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
510 vst1.64 {d24-d27},[r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
511 bx lr
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
512 3: vsri.32 q10, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
513 vsri.32 q11, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
514 vst1.64 {d20-d23},[r0,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
515 bx lr
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
516
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
517 4: push {r4-r8,lr}
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
518 cmp r3, #4
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
519 lsl ip, r3, #1
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
520 blt 4f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
521
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
522 @ 4 channels
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
523 5: ldmia r1!, {r4-r7}
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
524 mov lr, r2
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
525 mov r8, r0
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
526 vld1.64 {d16-d17},[r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
527 vcvt.s32.f32 q8, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
528 vld1.64 {d18-d19},[r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
529 vcvt.s32.f32 q9, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
530 vld1.64 {d20-d21},[r6,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
531 vcvt.s32.f32 q10, q10, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
532 vld1.64 {d22-d23},[r7,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
533 vcvt.s32.f32 q11, q11, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
534 6: subs lr, lr, #8
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
535 vld1.64 {d0-d1}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
536 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
537 vsri.32 q9, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
538 vld1.64 {d2-d3}, [r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
539 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
540 vsri.32 q11, q10, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
541 vld1.64 {d4-d5}, [r6,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
542 vcvt.s32.f32 q2, q2, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
543 vzip.32 d18, d22
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
544 vld1.64 {d6-d7}, [r7,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
545 vcvt.s32.f32 q3, q3, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
546 vzip.32 d19, d23
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
547 vst1.64 {d18}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
548 vsri.32 q1, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
549 vst1.64 {d22}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
550 vsri.32 q3, q2, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
551 vst1.64 {d19}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
552 vzip.32 d2, d6
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
553 vst1.64 {d23}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
554 vzip.32 d3, d7
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
555 beq 7f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
556 vld1.64 {d16-d17},[r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
557 vcvt.s32.f32 q8, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
558 vst1.64 {d2}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
559 vld1.64 {d18-d19},[r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
560 vcvt.s32.f32 q9, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
561 vst1.64 {d6}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
562 vld1.64 {d20-d21},[r6,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
563 vcvt.s32.f32 q10, q10, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
564 vst1.64 {d3}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
565 vld1.64 {d22-d23},[r7,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
566 vcvt.s32.f32 q11, q11, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
567 vst1.64 {d7}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
568 b 6b
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
569 7: vst1.64 {d2}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
570 vst1.64 {d6}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
571 vst1.64 {d3}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
572 vst1.64 {d7}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
573 subs r3, r3, #4
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
574 popeq {r4-r8,pc}
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
575 cmp r3, #4
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
576 add r0, r0, #8
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
577 bge 5b
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
578
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
579 @ 2 channels
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
580 4: cmp r3, #2
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
581 blt 4f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
582 ldmia r1!, {r4-r5}
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
583 mov lr, r2
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
584 mov r8, r0
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
585 tst lr, #8
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
586 vld1.64 {d16-d17},[r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
587 vcvt.s32.f32 q8, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
588 vld1.64 {d18-d19},[r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
589 vcvt.s32.f32 q9, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
590 vld1.64 {d20-d21},[r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
591 vcvt.s32.f32 q10, q10, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
592 vld1.64 {d22-d23},[r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
593 vcvt.s32.f32 q11, q11, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
594 beq 6f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
595 subs lr, lr, #8
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
596 beq 7f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
597 vsri.32 d18, d16, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
598 vsri.32 d19, d17, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
599 vld1.64 {d16-d17},[r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
600 vcvt.s32.f32 q8, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
601 vst1.32 {d18[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
602 vsri.32 d22, d20, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
603 vst1.32 {d18[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
604 vsri.32 d23, d21, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
605 vst1.32 {d19[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
606 vst1.32 {d19[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
607 vld1.64 {d18-d19},[r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
608 vcvt.s32.f32 q9, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
609 vst1.32 {d22[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
610 vst1.32 {d22[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
611 vld1.64 {d20-d21},[r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
612 vcvt.s32.f32 q10, q10, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
613 vst1.32 {d23[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
614 vst1.32 {d23[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
615 vld1.64 {d22-d23},[r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
616 vcvt.s32.f32 q11, q11, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
617 6: subs lr, lr, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
618 vld1.64 {d0-d1}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
619 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
620 vsri.32 d18, d16, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
621 vld1.64 {d2-d3}, [r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
622 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
623 vsri.32 d19, d17, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
624 vld1.64 {d4-d5}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
625 vcvt.s32.f32 q2, q2, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
626 vld1.64 {d6-d7}, [r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
627 vcvt.s32.f32 q3, q3, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
628 vst1.32 {d18[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
629 vsri.32 d22, d20, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
630 vst1.32 {d18[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
631 vsri.32 d23, d21, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
632 vst1.32 {d19[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
633 vsri.32 d2, d0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
634 vst1.32 {d19[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
635 vsri.32 d3, d1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
636 vst1.32 {d22[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
637 vsri.32 d6, d4, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
638 vst1.32 {d22[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
639 vsri.32 d7, d5, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
640 vst1.32 {d23[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
641 vst1.32 {d23[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
642 beq 6f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
643 vld1.64 {d16-d17},[r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
644 vcvt.s32.f32 q8, q8, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
645 vst1.32 {d2[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
646 vst1.32 {d2[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
647 vld1.64 {d18-d19},[r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
648 vcvt.s32.f32 q9, q9, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
649 vst1.32 {d3[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
650 vst1.32 {d3[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
651 vld1.64 {d20-d21},[r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
652 vcvt.s32.f32 q10, q10, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
653 vst1.32 {d6[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
654 vst1.32 {d6[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
655 vld1.64 {d22-d23},[r5,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
656 vcvt.s32.f32 q11, q11, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
657 vst1.32 {d7[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
658 vst1.32 {d7[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
659 bgt 6b
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
660 6: vst1.32 {d2[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
661 vst1.32 {d2[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
662 vst1.32 {d3[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
663 vst1.32 {d3[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
664 vst1.32 {d6[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
665 vst1.32 {d6[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
666 vst1.32 {d7[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
667 vst1.32 {d7[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
668 b 8f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
669 7: vsri.32 d18, d16, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
670 vsri.32 d19, d17, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
671 vst1.32 {d18[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
672 vsri.32 d22, d20, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
673 vst1.32 {d18[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
674 vsri.32 d23, d21, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
675 vst1.32 {d19[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
676 vst1.32 {d19[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
677 vst1.32 {d22[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
678 vst1.32 {d22[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
679 vst1.32 {d23[0]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
680 vst1.32 {d23[1]}, [r8], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
681 8: subs r3, r3, #2
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
682 add r0, r0, #4
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
683 popeq {r4-r8,pc}
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
684
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
685 @ 1 channel
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
686 4: ldr r4, [r1],#4
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
687 tst r2, #8
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
688 mov lr, r2
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
689 mov r5, r0
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
690 vld1.64 {d0-d1}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
691 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
692 vld1.64 {d2-d3}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
693 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
694 bne 8f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
695 6: subs lr, lr, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
696 vld1.64 {d4-d5}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
697 vcvt.s32.f32 q2, q2, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
698 vld1.64 {d6-d7}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
699 vcvt.s32.f32 q3, q3, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
700 vst1.16 {d0[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
701 vst1.16 {d0[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
702 vst1.16 {d1[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
703 vst1.16 {d1[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
704 vst1.16 {d2[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
705 vst1.16 {d2[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
706 vst1.16 {d3[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
707 vst1.16 {d3[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
708 beq 7f
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
709 vld1.64 {d0-d1}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
710 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
711 vld1.64 {d2-d3}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
712 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
713 7: vst1.16 {d4[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
714 vst1.16 {d4[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
715 vst1.16 {d5[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
716 vst1.16 {d5[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
717 vst1.16 {d6[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
718 vst1.16 {d6[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
719 vst1.16 {d7[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
720 vst1.16 {d7[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
721 bgt 6b
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
722 pop {r4-r8,pc}
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
723 8: subs lr, lr, #8
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
724 vst1.16 {d0[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
725 vst1.16 {d0[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
726 vst1.16 {d1[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
727 vst1.16 {d1[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
728 vst1.16 {d2[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
729 vst1.16 {d2[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
730 vst1.16 {d3[1]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
731 vst1.16 {d3[3]}, [r5,:16], ip
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
732 popeq {r4-r8,pc}
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
733 vld1.64 {d0-d1}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
734 vcvt.s32.f32 q0, q0, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
735 vld1.64 {d2-d3}, [r4,:128]!
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
736 vcvt.s32.f32 q1, q1, #16
639169d7fad5 ARM: NEON optimised float_to_int16
mru
parents: 8359
diff changeset
737 b 6b
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
738 endfunc
8697
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
739
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/*
 * ff_vector_fmul_neon: element-wise single-precision multiply, in place.
 *
 * Register use, as seen in the code below:
 *   r0  read pointer for the first operand; r3 is a copy of its entry
 *       value and is the write pointer, so the products overwrite the
 *       buffer that r0 was reading
 *   r1  read pointer for the second operand
 *   r2  element (float) count: 8 are consumed up front, 16 per main-loop
 *       iteration, 8 in the tail
 *   ip  main-loop counter
 * Presumably called as (float *dst, const float *src, int len) -- confirm
 * against the C prototype.
 * Assumes the count is a non-zero multiple of 8 and that both pointers are
 * 16-byte aligned (every access carries a :128 alignment qualifier);
 * neither is checked here.
 * The code is software pipelined: q8/q9 always hold 8 products that have
 * been computed but not yet stored.
 */
740 function ff_vector_fmul_neon, export=1
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* r3 = write pointer, starting at the r0 buffer */
741 mov r3, r0
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* the flags set here are consumed by "beq 3f" below; the NEON loads and
   multiplies in between do not touch the condition flags */
742 subs r2, r2, #8
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
743 vld1.64 {d0-d3}, [r0,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
744 vld1.64 {d4-d7}, [r1,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* q8,q9 = products of the first 8 elements (stored later) */
745 vmul.f32 q8, q0, q2
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
746 vmul.f32 q9, q1, q3
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* the count was exactly 8: only the final store is left */
747 beq 3f
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* ip = remaining count rounded down to a multiple of 16 */
748 bics ip, r2, #15
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* fewer than 16 left: go straight to the 8-element tail */
749 beq 2f
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* main loop: 16 elements per iteration */
750 1: subs ip, ip, #16
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
751 vld1.64 {d0-d1}, [r0,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
752 vld1.64 {d4-d5}, [r1,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
753 vmul.f32 q10, q0, q2
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
754 vld1.64 {d2-d3}, [r0,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
755 vld1.64 {d6-d7}, [r1,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
756 vmul.f32 q11, q1, q3
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* store the 8 products that were pending in q8,q9 */
757 vst1.64 {d16-d19},[r3,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
758 vld1.64 {d0-d1}, [r0,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
759 vld1.64 {d4-d5}, [r1,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
760 vmul.f32 q8, q0, q2
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
761 vld1.64 {d2-d3}, [r0,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
762 vld1.64 {d6-d7}, [r1,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
763 vmul.f32 q9, q1, q3
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* store the first 8 products of this iteration; the second 8 stay
   pending in q8,q9 */
764 vst1.64 {d20-d23},[r3,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* tests the flags left by "subs ip" at the top of the loop */
765 bne 1b
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* anything left over after the 16-wide loop? */
766 ands r2, r2, #15
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
767 beq 3f
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* tail: 8 more elements, interleaved with storing the pending q8,q9 */
768 2: vld1.64 {d0-d1}, [r0,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
769 vld1.64 {d4-d5}, [r1,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* q8 must be written out before the multiply below overwrites it */
770 vst1.64 {d16-d17},[r3,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
771 vmul.f32 q8, q0, q2
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
772 vld1.64 {d2-d3}, [r0,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
773 vld1.64 {d6-d7}, [r1,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* likewise q9 */
774 vst1.64 {d18-d19},[r3,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
775 vmul.f32 q9, q1, q3
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
/* flush the last 8 pending products */
776 3: vst1.64 {d16-d19},[r3,:128]!
307b176f91e7 ARM: NEON optimised vector_fmul
mru
parents: 8492
diff changeset
777 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
778 endfunc
8698
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
779
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/*
 * ff_vector_fmul_window_neon: combines two float vectors with a window and
 * a bias, writing one output stream forwards and one backwards.
 *
 * Register use, as seen in the code below (n = the count loaded into lr):
 *   r0  output, written forwards; ip = r0 + 8*n - 16 is written backwards
 *   r1  first source, read forwards
 *   r2  second source, read backwards starting at r2 + 4*n - 16
 *   r3  window, read forwards; r4 = r3 + 8*n - 16 is read backwards
 *   q8  bias, one float replicated into all four lanes
 *   lr  n, the loop counter, reduced by 4 per iteration
 *   r5  scratch, then the constant -16 used as the backwards step
 *
 * Per group of 4 floats, with a = data from r1, b = data from r2 in
 * reversed lane order, wf = data from r3 and wb = data from r4 in reversed
 * lane order:
 *   forward store  (r0): bias + a*wb - b*wf
 *   backward store (ip): bias + a*wf + b*wb, lanes reversed before storing
 *
 * Presumably called as (float *dst, const float *src0, const float *src1,
 * const float *win, float add_bias, int len) -- confirm against the C
 * prototype.
 * VFP and NOVFP are macros defined outside this block; presumably each
 * keeps its line only for the hard-float or soft-float argument convention
 * respectively -- confirm.
 * Assumes n is a non-zero multiple of 4 and that every pointer, including
 * the computed end pointers, is 16-byte aligned (:128 qualifiers); neither
 * is checked here.
 */
780 function ff_vector_fmul_window_neon, export=1
9969
5cca2790d582 ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
mru
parents: 9581
diff changeset
/* VFP variant: replicate lane 0 of d0 into all four lanes of q8 */
781 VFP vdup.32 q8, d0[0]
5cca2790d582 ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
mru
parents: 9581
diff changeset
/* NOVFP variant: replicate the word at [sp] into all four lanes of q8;
   done before the push, while sp still has its entry value */
782 NOVFP vld1.32 {d16[],d17[]}, [sp,:32]
8698
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
783 push {r4,r5,lr}
9969
5cca2790d582 ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
mru
parents: 9581
diff changeset
/* the push moved sp down by 12 bytes, so this reads the word that was at
   [sp] on entry */
784 VFP ldr lr, [sp, #12]
5cca2790d582 ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
mru
parents: 9581
diff changeset
/* ... and this reads the word that was at [sp, #4] on entry, i.e. the one
   after the bias word loaded above */
785 NOVFP ldr lr, [sp, #16]
8698
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* this and the next two instructions give r2 = r2 + 4*lr - 16 */
786 sub r2, r2, #8
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
787 sub r5, lr, #2
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
788 add r2, r2, r5, lsl #2
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* r4 = r3 + 8*lr - 16 */
789 add r4, r3, r5, lsl #3
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* ip = r0 + 8*lr - 16 */
790 add ip, r0, r5, lsl #3
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* post-index step for the three pointers that walk backwards */
791 mov r5, #-16
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* preload the first 4 floats of each of the four input streams */
792 vld1.64 {d0,d1}, [r1,:128]!
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
793 vld1.64 {d2,d3}, [r2,:128], r5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
794 vld1.64 {d4,d5}, [r3,:128]!
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
795 vld1.64 {d6,d7}, [r4,:128], r5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* loop: 4 floats per stream per iteration; the flags set here are tested
   by "beq 2f" below (the NEON instructions in between leave them alone) */
796 1: subs lr, lr, #4
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* q11 and q10 are the two accumulators; both start from the bias */
797 vmov q11, q8
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
798 vmla.f32 d22, d0, d4
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
799 vmov q10, q8
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
800 vmla.f32 d23, d1, d5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* swap the two floats inside each half of q3 (data loaded via r4); the
   halves themselves are crossed over by pairing d0 with d7, d1 with d6 */
801 vrev64.32 q3, q3
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
802 vmla.f32 d20, d0, d7
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* same lane swap for q1 (data loaded via r2) */
803 vrev64.32 q1, q1
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
804 vmla.f32 d21, d1, d6
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* last group: finish at label 2 without loading any more input */
805 beq 2f
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* finish the accumulations while loading the next group; the next r2 and
   r3 data are staged in q9 and q12 because q1 and q2 are still being read */
806 vmla.f32 d22, d3, d7
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
807 vld1.64 {d0,d1}, [r1,:128]!
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
808 vmla.f32 d23, d2, d6
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
809 vld1.64 {d18,d19},[r2,:128], r5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
810 vmls.f32 d20, d3, d4
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
811 vld1.64 {d24,d25},[r3,:128]!
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
812 vmls.f32 d21, d2, d5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
813 vld1.64 {d6,d7}, [r4,:128], r5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* move the staged r2 data into place for the next iteration */
814 vmov q1, q9
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* this vrev64 plus the vswp below reverse all four lanes of q11 */
815 vrev64.32 q11, q11
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* move the staged r3 data into place for the next iteration */
816 vmov q2, q12
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
817 vswp d22, d23
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* forward result, pointer moves up */
818 vst1.64 {d20,d21},[r0,:128]!
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* reversed result, pointer moves down by 16 bytes */
819 vst1.64 {d22,d23},[ip,:128], r5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
820 b 1b
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* final group: same arithmetic and stores as in the loop, minus the loads */
821 2: vmla.f32 d22, d3, d7
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
822 vmla.f32 d23, d2, d6
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
823 vmls.f32 d20, d3, d4
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
824 vmls.f32 d21, d2, d5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
825 vrev64.32 q11, q11
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
826 vswp d22, d23
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
827 vst1.64 {d20,d21},[r0,:128]!
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
828 vst1.64 {d22,d23},[ip,:128], r5
24a7b5d0eb27 ARM: NEON optimised vector_fmul_window
mru
parents: 8697
diff changeset
/* restore r4,r5 and return: pc receives the lr value pushed on entry */
829 pop {r4,r5,pc}
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
830 endfunc
10046
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
831
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
832 #if CONFIG_VORBIS_DECODER
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
833 function ff_vorbis_inverse_coupling_neon, export=1
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
834 vmov.i32 q10, #1<<31
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
835 subs r2, r2, #4
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
836 mov r3, r0
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
837 mov r12, r1
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
838 beq 3f
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
839
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
840 vld1.32 {d24-d25},[r1,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
841 vld1.32 {d22-d23},[r0,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
842 vcle.s32 q8, q12, #0
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
843 vand q9, q11, q10
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
844 veor q12, q12, q9
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
845 vand q2, q12, q8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
846 vbic q3, q12, q8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
847 vadd.f32 q12, q11, q2
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
848 vsub.f32 q11, q11, q3
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
849 1: vld1.32 {d2-d3}, [r1,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
850 vld1.32 {d0-d1}, [r0,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
851 vcle.s32 q8, q1, #0
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
852 vand q9, q0, q10
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
853 veor q1, q1, q9
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
854 vst1.32 {d24-d25},[r3, :128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
855 vst1.32 {d22-d23},[r12,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
856 vand q2, q1, q8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
857 vbic q3, q1, q8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
858 vadd.f32 q1, q0, q2
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
859 vsub.f32 q0, q0, q3
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
860 subs r2, r2, #8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
861 ble 2f
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
862 vld1.32 {d24-d25},[r1,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
863 vld1.32 {d22-d23},[r0,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
864 vcle.s32 q8, q12, #0
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
865 vand q9, q11, q10
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
866 veor q12, q12, q9
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
867 vst1.32 {d2-d3}, [r3, :128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
868 vst1.32 {d0-d1}, [r12,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
869 vand q2, q12, q8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
870 vbic q3, q12, q8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
871 vadd.f32 q12, q11, q2
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
872 vsub.f32 q11, q11, q3
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
873 b 1b
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
874
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
875 2: vst1.32 {d2-d3}, [r3, :128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
876 vst1.32 {d0-d1}, [r12,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
877 bxlt lr
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
878
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
879 3: vld1.32 {d2-d3}, [r1,:128]
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
880 vld1.32 {d0-d1}, [r0,:128]
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
881 vcle.s32 q8, q1, #0
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
882 vand q9, q0, q10
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
883 veor q1, q1, q9
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
884 vand q2, q1, q8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
885 vbic q3, q1, q8
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
886 vadd.f32 q1, q0, q2
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
887 vsub.f32 q0, q0, q3
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
888 vst1.32 {d2-d3}, [r0,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
889 vst1.32 {d0-d1}, [r1,:128]!
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
890 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
891 endfunc
10046
1e651d94b35f ARM: NEON optimised vorbis_inverse_coupling
mru
parents: 9969
diff changeset
892 #endif
10221
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
893
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ ff_vector_fmul_scalar_neon: multiply a vector of floats by one scalar.
@   r0  = output pointer (stored through, post-incremented)
@   r1  = input pointer (loaded from, post-incremented)
@   len = element count (alias for r2 or r3, see below)
@   q8  = the scalar broadcast to all four lanes
@ Every load/store carries a :128 alignment qualifier, so both pointers must be
@ 16-byte aligned.
@ NOTE(review): VFP/NOVFP are line-prefix macros defined outside this chunk;
@ presumably they select the hard-float (scalar in d0[0], len in r2) versus
@ soft-float (scalar in r2, len in r3) argument layout -- confirm.
@ The main loop handles 16 floats per iteration, the tail loop 4 floats per
@ iteration. The tail loop tests its counter only after processing a group, so
@ a remainder is rounded up to a whole group of 4 and a count of 0 would still
@ process one group.
894 function ff_vector_fmul_scalar_neon, export=1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
895 VFP len .req r2
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
896 NOVFP len .req r3
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
897 VFP vdup.32 q8, d0[0]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
898 NOVFP vdup.32 q8, r2
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ r12 = len rounded down to a multiple of 16; if that is zero go straight to the tail loop
899 bics r12, len, #15
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
900 beq 3f
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ prime the pipeline with the first 8 floats
901 vld1.32 {q0},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
902 vld1.32 {q1},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ main loop: loads, multiplies and stores of one 16-float group are interleaved
903 1: vmul.f32 q0, q0, q8
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
904 vld1.32 {q2},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
905 vmul.f32 q1, q1, q8
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
906 vld1.32 {q3},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
907 vmul.f32 q2, q2, q8
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
908 vst1.32 {q0},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
909 vmul.f32 q3, q3, q8
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
910 vst1.32 {q1},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
911 subs r12, r12, #16
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
912 beq 2f
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ more groups remain: preload the next q0/q1 while draining q2/q3
913 vld1.32 {q0},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
914 vst1.32 {q2},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
915 vld1.32 {q1},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
916 vst1.32 {q3},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
917 b 1b
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ last 16-float group: store the pending q2/q3 without loading further input
918 2: vst1.32 {q2},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
919 vst1.32 {q3},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ len = leftover count (len mod 16); return if nothing is left
920 ands len, len, #15
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
921 bxeq lr
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ tail loop: 4 floats per iteration
922 3: vld1.32 {q0},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
923 vmul.f32 q0, q0, q8
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
924 vst1.32 {q0},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
925 subs len, len, #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
926 bgt 3b
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
927 bx lr
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
928 .unreq len
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
929 endfunc
10221
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
930
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ ff_vector_fmul_sv_scalar_2_neon: out = in * scalar * sv, where sv is reached
@ through a table of pointers, each pointing at a pair of floats.
@   r0  = output pointer (stored through, 64-bit aligned, post-incremented)
@   r1  = input pointer (loaded from, 64-bit aligned, post-incremented)
@   r2  = pointer to the table of pointers; one 32-bit entry is consumed per
@         two output floats
@   r3  = element count
@   d16 = the scalar broadcast to both lanes
@ NOTE(review): VFP/NOVFP are line-prefix macros defined outside this chunk;
@ presumably hard-float takes the scalar in d0[0] with the count in r3, while
@ soft-float takes the scalar in r3 and the count from the stack -- confirm.
@ Each iteration handles 4 floats. The loop exits only when the counter reaches
@ exactly zero (beq), so the count must be a positive multiple of 4.
931 function ff_vector_fmul_sv_scalar_2_neon, export=1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
932 VFP vdup.32 d16, d0[0]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ soft-float path: broadcast the scalar from r3 before r3 is reloaded with the count
933 NOVFP vdup.32 d16, r3
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
934 NOVFP ldr r3, [sp]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ prime the pipeline with the first 4 input floats
935 vld1.32 {d0},[r1,:64]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
936 vld1.32 {d1},[r1,:64]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ the flags set here are consumed by the beq below; nothing in between alters them
937 1: subs r3, r3, #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
938 vmul.f32 d4, d0, d16
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
939 vmul.f32 d5, d1, d16
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ fetch the next two table entries and load the float pair each one points at
940 ldr r12, [r2], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
941 vld1.32 {d2},[r12,:64]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
942 ldr r12, [r2], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
943 vld1.32 {d3},[r12,:64]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
944 vmul.f32 d4, d4, d2
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
945 vmul.f32 d5, d5, d3
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
946 beq 2f
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ more data remains: preload the next input before storing the current result
947 vld1.32 {d0},[r1,:64]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
948 vld1.32 {d1},[r1,:64]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
949 vst1.32 {d4},[r0,:64]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
950 vst1.32 {d5},[r0,:64]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
951 b 1b
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ final group: store the result without reading further input
952 2: vst1.32 {d4},[r0,:64]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
953 vst1.32 {d5},[r0,:64]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
954 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
955 endfunc
10221
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
956
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ ff_vector_fmul_sv_scalar_4_neon: out = in * scalar * sv, where sv is reached
@ through a table of pointers, each pointing at four floats.
@   r0  = output pointer (stored through, 128-bit aligned, post-incremented)
@   r1  = input pointer (loaded from, 128-bit aligned, post-incremented)
@   r2  = pointer to the table of pointers; one 32-bit entry is consumed per
@         four output floats, and each target is loaded with a :128 qualifier
@   r3  = element count
@   q10 = the scalar broadcast to all four lanes
@ NOTE(review): VFP/NOVFP are line-prefix macros defined outside this chunk;
@ presumably hard-float takes the scalar in d0[0] with the count in r3, while
@ soft-float takes the scalar in r3 and the count from the stack -- confirm.
@ lr is saved on the stack and reused as the main-loop counter; every return
@ path pops the saved value straight into pc.
@ The main loop handles 8 floats per iteration, the tail loop 4 per iteration.
@ The tail loop tests its counter only after processing a group.
957 function ff_vector_fmul_sv_scalar_4_neon, export=1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
958 VFP vdup.32 q10, d0[0]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
959 NOVFP vdup.32 q10, r3
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ the stack argument is read before the push below moves sp
960 NOVFP ldr r3, [sp]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
961 push {lr}
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ lr = count rounded down to a multiple of 8; if that is zero use only the tail loop
962 bics lr, r3, #7
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
963 beq 3f
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ prime the pipeline with the first 8 input floats
964 vld1.32 {q0},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
965 vld1.32 {q2},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ main loop: fetch two table entries and the four floats behind each
966 1: ldr r12, [r2], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
967 vld1.32 {q1},[r12,:128]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
968 ldr r12, [r2], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
969 vld1.32 {q3},[r12,:128]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
970 vmul.f32 q8, q0, q10
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
971 vmul.f32 q8, q8, q1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
972 vmul.f32 q9, q2, q10
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
973 vmul.f32 q9, q9, q3
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
974 subs lr, lr, #8
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
975 beq 2f
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ more groups remain: preload the next input before storing the current result
976 vld1.32 {q0},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
977 vld1.32 {q2},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
978 vst1.32 {q8},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
979 vst1.32 {q9},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
980 b 1b
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ last 8-float group: store the result without reading further input
981 2: vst1.32 {q8},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
982 vst1.32 {q9},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ r3 = leftover count (count mod 8); return if nothing is left
983 ands r3, r3, #7
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
984 popeq {pc}
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ tail loop: 4 floats per iteration
985 3: vld1.32 {q0},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
986 ldr r12, [r2], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
987 vld1.32 {q1},[r12,:128]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
988 vmul.f32 q0, q0, q10
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
989 vmul.f32 q0, q0, q1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
990 vst1.32 {q0},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
991 subs r3, r3, #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
992 bgt 3b
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
993 pop {pc}
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
994 endfunc
10221
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
995
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ ff_sv_fmul_scalar_2_neon: gather float pairs through a table of pointers,
@ multiply them by one scalar and write them out contiguously.
@   r0  = output pointer (stored through, 128-bit aligned, post-incremented)
@   r1  = pointer to the table of pointers; one 32-bit entry is consumed per
@         two output floats, each target loaded with a :64 qualifier
@   len = element count (alias for r2 or r3, see below)
@   q8  = the scalar broadcast to all four lanes
@ NOTE(review): VFP/NOVFP are line-prefix macros defined outside this chunk;
@ presumably they select the hard-float (scalar in d0[0], len in r2) versus
@ soft-float (scalar in r2, len in r3) argument layout -- confirm.
@ Each iteration produces 4 floats. The loop exits only when the counter
@ reaches exactly zero (beq), so the count must be a positive multiple of 4.
996 function ff_sv_fmul_scalar_2_neon, export=1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
997 VFP len .req r2
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
998 NOVFP len .req r3
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
999 VFP vdup.32 q8, d0[0]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1000 NOVFP vdup.32 q8, r2
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ prime the pipeline: gather the first two float pairs into d0 and d1 (= q0)
1001 ldr r12, [r1], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1002 vld1.32 {d0},[r12,:64]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1003 ldr r12, [r1], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1004 vld1.32 {d1},[r12,:64]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1005 1: vmul.f32 q1, q0, q8
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1006 subs len, len, #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1007 beq 2f
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ more data remains: gather the next two pairs before storing the current result
1008 ldr r12, [r1], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1009 vld1.32 {d0},[r12,:64]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1010 ldr r12, [r1], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1011 vld1.32 {d1},[r12,:64]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1012 vst1.32 {q1},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1013 b 1b
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ final group: store the result without reading further table entries
1014 2: vst1.32 {q1},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1015 bx lr
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1016 .unreq len
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
1017 endfunc
10221
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1018
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ ff_sv_fmul_scalar_4_neon: gather groups of four floats through a table of
@ pointers, multiply them by one scalar and write them out contiguously.
@   r0  = output pointer (stored through, 128-bit aligned, post-incremented)
@   r1  = pointer to the table of pointers; one 32-bit entry is consumed per
@         four output floats, each target loaded with a :128 qualifier
@   len = element count (alias for r2 or r3, see below)
@   q8  = the scalar broadcast to all four lanes
@ NOTE(review): VFP/NOVFP are line-prefix macros defined outside this chunk;
@ presumably they select the hard-float (scalar in d0[0], len in r2) versus
@ soft-float (scalar in r2, len in r3) argument layout -- confirm.
@ Simple non-pipelined loop, 4 floats per iteration. The counter is tested only
@ after a group has been processed, so at least one group is always handled.
1019 function ff_sv_fmul_scalar_4_neon, export=1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1020 VFP len .req r2
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1021 NOVFP len .req r3
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1022 VFP vdup.32 q8, d0[0]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1023 NOVFP vdup.32 q8, r2
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ fetch the next table entry and load the four floats it points at
1024 1: ldr r12, [r1], #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1025 vld1.32 {q0},[r12,:128]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1026 vmul.f32 q0, q0, q8
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1027 vst1.32 {q0},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1028 subs len, len, #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1029 bgt 1b
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1030 bx lr
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1031 .unreq len
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
1032 endfunc
10221
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1033
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ ff_butterflies_float_neon: in-place sum/difference of two float vectors.
@   r0 = first vector; each element is replaced by (first + second)
@   r1 = second vector; each element is replaced by (first - second)
@   r2 = element count
@ Both pointers are accessed with a :128 qualifier (16-byte alignment).
@ 4 floats per iteration. The counter is tested only after a group has been
@ processed, so at least one group is always handled.
1034 function ff_butterflies_float_neon, export=1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ loads do not advance the pointers; the stores below write back and post-increment
1035 1: vld1.32 {q0},[r0,:128]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1036 vld1.32 {q1},[r1,:128]
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
@ the difference goes to a fresh register so q1 can then be overwritten with the sum
1037 vsub.f32 q2, q0, q1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1038 vadd.f32 q1, q0, q1
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1039 vst1.32 {q2},[r1,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1040 vst1.32 {q1},[r0,:128]!
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1041 subs r2, r2, #4
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1042 bgt 1b
2791393081ff ARM: NEON optimisations for some dsputil functions
mru
parents: 10047
diff changeset
1043 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
1044 endfunc
10228
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1045
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
@ ff_scalarproduct_float_neon: dot product of two float vectors.
@   r0 = first vector  (loaded from, 128-bit aligned, post-incremented)
@   r1 = second vector (loaded from, 128-bit aligned, post-incremented)
@   r2 = element count
@ The result is left in d0[0]; on NOVFP builds it is also copied to r0.
@ NOTE(review): VFP/NOVFP are line-prefix macros defined outside this chunk;
@ presumably NOVFP marks the soft-float ABI where floats return in r0 -- confirm.
@ Four partial sums are accumulated in the lanes of q2 and only combined at the
@ end, so the summation order differs from a sequential scalar loop.
@ 4 floats per iteration; the counter is tested after each group.
1046 function ff_scalarproduct_float_neon, export=1
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
@ q2 = four-lane accumulator, cleared to zero
1047 vmov.f32 q2, #0.0
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1048 1: vld1.32 {q0},[r0,:128]!
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1049 vld1.32 {q1},[r1,:128]!
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1050 vmla.f32 q2, q0, q1
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1051 subs r2, r2, #4
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1052 bgt 1b
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
@ horizontal reduction: add the two halves of q2, then add the two remaining lanes
1053 vadd.f32 d0, d4, d5
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1054 vpadd.f32 d0, d0, d0
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1055 NOVFP vmov.32 r0, d0[0]
b783894a1c62 ARM: NEON optimised scalarproduct_float
mru
parents: 10221
diff changeset
1056 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
1057 endfunc
10253
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1058
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
@ ff_int32_to_float_fmul_scalar_neon: convert signed 32-bit integers to float
@ and multiply each by one scalar.
@   r0  = output pointer (floats; stored through, 128-bit aligned)
@   r1  = input pointer (int32; loaded from, 128-bit aligned)
@   len = element count (alias for r2 or r3, see below)
@   q0  = the scalar broadcast to all four lanes
@ NOTE(review): VFP/NOVFP are line-prefix macros defined outside this chunk;
@ presumably they select the hard-float (scalar in d0[0], len in r2) versus
@ soft-float (scalar in r2, len in r3) argument layout -- confirm.
@ Software pipelined, 8 elements per iteration. The loop exits only when the
@ counter reaches exactly zero (beq), so the count must be a positive multiple
@ of 8.
1059 function ff_int32_to_float_fmul_scalar_neon, export=1
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1060 VFP vdup.32 q0, d0[0]
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1061 VFP len .req r2
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1062 NOVFP vdup.32 q0, r2
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1063 NOVFP len .req r3
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1064
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
@ prime the pipeline: load and convert the first 8 integers
1065 vld1.32 {q1},[r1,:128]!
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1066 vcvt.f32.s32 q3, q1
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1067 vld1.32 {q2},[r1,:128]!
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1068 vcvt.f32.s32 q8, q2
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
@ the flags set here are consumed by the beq below; nothing in between alters them
1069 1: subs len, len, #8
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1070 pld [r1, #16]
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1071 vmul.f32 q9, q3, q0
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1072 vmul.f32 q10, q8, q0
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1073 beq 2f
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
@ more data remains: load and convert the next 8 integers before storing the results
1074 vld1.32 {q1},[r1,:128]!
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1075 vcvt.f32.s32 q3, q1
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1076 vld1.32 {q2},[r1,:128]!
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1077 vcvt.f32.s32 q8, q2
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1078 vst1.32 {q9}, [r0,:128]!
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1079 vst1.32 {q10},[r0,:128]!
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1080 b 1b
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
@ final group: store the results without reading further input
1081 2: vst1.32 {q9}, [r0,:128]!
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1082 vst1.32 {q10},[r0,:128]!
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1083 bx lr
64dd9515b93b ARM: NEON optimised int32_to_float_fmul_scalar
mru
parents: 10228
diff changeset
1084 .unreq len
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
1085 endfunc
10274
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1086
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
@ ff_vector_fmul_reverse_neon: multiply one float vector element-wise by a
@ second vector read in reverse order.
@   r0 = output pointer (stored through, 128-bit aligned, post-incremented)
@   r1 = first input, read forwards (128-bit aligned, post-incremented)
@   r2 = second input base; it is moved to the last 8 floats of the vector and
@        then stepped backwards by 32 bytes per load
@   r3 = element count
@ Software pipelined, 8 floats per iteration. The loop exits only when the
@ counter reaches exactly zero (beq), so the count must be a positive multiple
@ of 8.
1087 function ff_vector_fmul_reverse_neon, export=1
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
@ r2 = r2 + count*4 - 32: address of the last 8 floats of the second input
1088 add r2, r2, r3, lsl #2
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1089 sub r2, r2, #32
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
@ r12 = post-index step for the backwards walk
1090 mov r12, #-32
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1091 vld1.32 {q0-q1}, [r1,:128]!
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1092 vld1.32 {q2-q3}, [r2,:128], r12
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1093 1: pld [r1, #32]
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
@ vrev64 swaps the two floats inside each d register; pairing d0..d3 with
@ d7..d4 below then completes the reversal of the 8-float group
1094 vrev64.32 q3, q3
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1095 vmul.f32 d16, d0, d7
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1096 vmul.f32 d17, d1, d6
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1097 pld [r2, #-32]
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1098 vrev64.32 q2, q2
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1099 vmul.f32 d18, d2, d5
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1100 vmul.f32 d19, d3, d4
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1101 subs r3, r3, #8
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1102 beq 2f
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
@ more data remains: preload the next inputs before storing the current result
1103 vld1.32 {q0-q1}, [r1,:128]!
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1104 vld1.32 {q2-q3}, [r2,:128], r12
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1105 vst1.32 {q8-q9}, [r0,:128]!
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1106 b 1b
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
@ final group: store the result without reading further input
1107 2: vst1.32 {q8-q9}, [r0,:128]!
bcf5c5551b3c ARM: NEON optimised vector_fmul_reverse
mru
parents: 10253
diff changeset
1108 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
1109 endfunc
10276
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1110
10302
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
@ ff_vector_fmul_add_neon: element-wise multiply-then-add over float vectors:
@ out = a * b + c.
@   r0   = output pointer (stored through, 128-bit aligned, post-incremented)
@   r1   = a (loaded from, 128-bit aligned, post-incremented)
@   r2   = b (loaded from, 128-bit aligned, post-incremented)
@   r3   = c (loaded from, 128-bit aligned, post-incremented)
@   [sp] = element count, loaded into r12
@ The product and the sum are separate vmul/vadd instructions, not a fused
@ multiply-accumulate.
@ Software pipelined, 8 floats per iteration. The loop exits only when the
@ counter reaches exactly zero (beq), so the count must be a positive multiple
@ of 8.
1111 function ff_vector_fmul_add_neon, export=1
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1112 ldr r12, [sp]
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
@ prime the pipeline: first 8 elements of a, b and c, and the first products
1113 vld1.32 {q0-q1}, [r1,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1114 vld1.32 {q8-q9}, [r2,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1115 vld1.32 {q2-q3}, [r3,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1116 vmul.f32 q10, q0, q8
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1117 vmul.f32 q11, q1, q9
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
@ q12/q13 = c + a*b for the current group
1118 1: vadd.f32 q12, q2, q10
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1119 vadd.f32 q13, q3, q11
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1120 pld [r1, #16]
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1121 pld [r2, #16]
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1122 pld [r3, #16]
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1123 subs r12, r12, #8
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1124 beq 2f
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
@ more data remains: load and multiply the next group before storing the current sums
1125 vld1.32 {q0}, [r1,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1126 vld1.32 {q8}, [r2,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1127 vmul.f32 q10, q0, q8
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1128 vld1.32 {q1}, [r1,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1129 vld1.32 {q9}, [r2,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1130 vmul.f32 q11, q1, q9
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1131 vld1.32 {q2-q3}, [r3,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1132 vst1.32 {q12-q13},[r0,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1133 b 1b
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
@ final group: store the sums without reading further input
1134 2: vst1.32 {q12-q13},[r0,:128]!
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1135 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
1136 endfunc
10302
6db89678b326 ARM: NEON optimised vector_fmul_add
mru
parents: 10276
diff changeset
1137
10276
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
@ ff_vector_clipf_neon: clamp every float of a vector between two bounds.
@   r0 = output pointer (stored through, 128-bit aligned, post-incremented)
@   r1 = input pointer (loaded from, 128-bit aligned, post-incremented)
@   r2 = element count at loop time
@   q0 = lower bound broadcast to all lanes (applied with vmax)
@   q1 = upper bound broadcast to all lanes (applied with vmin)
@ NOTE(review): VFP/NOVFP are line-prefix macros defined outside this chunk;
@ presumably hard-float takes the bounds in d0[0]/d0[1] with the count in r2,
@ while soft-float takes them in r2/r3 with the count on the stack -- confirm.
@ Each element is first limited from above (vmin with q1), then from below
@ (vmax with q0).
@ Software pipelined, 8 floats per iteration. The loop exits only when the
@ counter reaches exactly zero (beq), so the count must be a positive multiple
@ of 8.
1138 function ff_vector_clipf_neon, export=1
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
@ d0[1] is broadcast first because the next vdup overwrites q0, which contains d0
1139 VFP vdup.32 q1, d0[1]
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1140 VFP vdup.32 q0, d0[0]
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
@ soft-float path: r2 is broadcast before being reloaded with the stack argument
1141 NOVFP vdup.32 q0, r2
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1142 NOVFP vdup.32 q1, r3
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1143 NOVFP ldr r2, [sp]
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
@ prime the pipeline: load the first 8 floats and apply the upper bound
1144 vld1.f32 {q2},[r1,:128]!
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1145 vmin.f32 q10, q2, q1
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1146 vld1.f32 {q3},[r1,:128]!
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1147 vmin.f32 q11, q3, q1
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
@ apply the lower bound to the group that already had the upper bound applied
1148 1: vmax.f32 q8, q10, q0
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1149 vmax.f32 q9, q11, q0
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1150 subs r2, r2, #8
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1151 beq 2f
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
@ more data remains: load and upper-bound the next group before storing the current result
1152 vld1.f32 {q2},[r1,:128]!
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1153 vmin.f32 q10, q2, q1
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1154 vld1.f32 {q3},[r1,:128]!
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1155 vmin.f32 q11, q3, q1
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1156 vst1.f32 {q8},[r0,:128]!
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1157 vst1.f32 {q9},[r0,:128]!
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1158 b 1b
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
@ final group: store the result without reading further input
1159 2: vst1.f32 {q8},[r0,:128]!
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1160 vst1.f32 {q9},[r0,:128]!
06d4e87718b1 ARM: NEON optimised vector_clipf
mru
parents: 10274
diff changeset
1161 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 10376
diff changeset
1162 endfunc