armv4l/dsputil_vfp.S @ 8071:2487a9db02a0 (libavcodec)

ARM: move VFP DSP functions to dsputil_vfp.S
author:   mru
date:     Mon, 27 Oct 2008 00:25:16 +0000
parents:
children: 92008e82ce6c
/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * VFP is a floating-point coprocessor used in some ARM cores. VFP11 has
 * single-cycle throughput for almost all instructions (except double-precision
 * arithmetic), but rather high latency: 4 cycles for loads and 8 cycles for
 * arithmetic operations. Scheduling code to avoid pipeline stalls is therefore
 * very important for performance. Another interesting feature is that VFP has
 * independent load/store and arithmetic pipelines, so the two can work
 * simultaneously, yielding more than one operation per cycle. The load/store
 * pipeline can process 2 single-precision floating-point values per cycle and
 * supports bulk loads and stores of large register sets. Arithmetic operations
 * can be performed on vectors, which keeps the arithmetic pipeline busy while
 * the processor issues and executes other instructions. Detailed optimization
 * manuals can be found at http://www.arm.com
 */

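/*
 * Background note (added for exposition, not part of the original file):
 * VFP vector mode is controlled through the FPSCR register. Bits [18:16]
 * (LEN) hold the vector length minus one, and bits [21:20] (STRIDE) select
 * the register stride. The code below writes 3 into LEN so that each
 * arithmetic instruction operates on 4 consecutive single-precision
 * registers, and leaves STRIDE at 0 (stride 1).
 */
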
/**
 * ARM VFP optimized implementation of 'vector_fmul_c' function.
 * Assumes that len is a positive number and a multiple of 8.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
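/*
 * For reference (added here, not from the original file), the scalar C
 * routine this replaces looks roughly like vector_fmul_c in the same-era
 * dsputil.c:
 *
 *     static void vector_fmul_c(float *dst, const float *src, int len)
 *     {
 *         int i;
 *         for (i = 0; i < len; i++)
 *             dst[i] *= src[i];
 *     }
 *
 * i.e. dst is both input and output, which is why r0 is copied to r3 below
 * so that dst can be read through one pointer and written through another.
 */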
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}
        mov             r3,  r0
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16)    /* set vector size to 4 */
        fmxr            fpscr, r12

        fldmias         r3!, {s0-s3}
        fldmias         r1!, {s8-s11}
        fldmias         r3!, {s4-s7}
        fldmias         r1!, {s12-s15}
        fmuls           s8,  s0,  s8
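@ Note added for exposition: the loop below is software-pipelined using
@ conditional execution. After 'subs' updates the flags, the 'ge' forms
@ load and multiply the second group of 8 elements of the current chunk,
@ while the 'gt' forms already start fetching data for the next iteration,
@ overlapping the load/store and arithmetic pipelines as described above.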
1:
        subs            r2,  r2,  #16
        fmuls           s12, s4,  s12
        fldmiasge       r3!, {s16-s19}
        fldmiasge       r1!, {s24-s27}
        fldmiasge       r3!, {s20-s23}
        fldmiasge       r1!, {s28-s31}
        fmulsge         s24, s16, s24
        fstmias         r0!, {s8-s11}
        fstmias         r0!, {s12-s15}
        fmulsge         s28, s20, s28
        fldmiasgt       r3!, {s0-s3}
        fldmiasgt       r1!, {s8-s11}
        fldmiasgt       r3!, {s4-s7}
        fldmiasgt       r1!, {s12-s15}
        fmulsge         s8,  s0,  s8
        fstmiasge       r0!, {s24-s27}
        fstmiasge       r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16)    /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
        .endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assumes that len is a positive number and a multiple of 8.
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
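/*
 * For reference (again not part of the original file), the scalar routine
 * is roughly vector_fmul_reverse_c from the same-era dsputil.c:
 *
 *     static void vector_fmul_reverse_c(float *dst, const float *src0,
 *                                       const float *src1, int len)
 *     {
 *         int i;
 *         src1 += len - 1;
 *         for (i = 0; i < len; i++)
 *             dst[i] = src0[i] * src1[-i];
 *     }
 *
 * hence the 'add r2, r2, r3, lsl #2' below, which points r2 past the end
 * of src1 so it can be walked downwards with fldmdbs.
 */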
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}
        add             r2,  r2,  r3, lsl #2
        fldmdbs         r2!, {s0-s3}
        fldmias         r1!, {s8-s11}
        fldmdbs         r2!, {s4-s7}
        fldmias         r1!, {s12-s15}
        fmuls           s8,  s3,  s8
        fmuls           s9,  s2,  s9
        fmuls           s10, s1,  s10
        fmuls           s11, s0,  s11
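@ Note added for exposition: the same software-pipelining idea as in
@ ff_vector_fmul_vfp, but with individual scalar multiplies instead of
@ FPSCR vector mode, since the reversed src1 elements must be paired with
@ src0 in the opposite register order.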
1:
        subs            r3,  r3,  #16
        fldmdbsge       r2!, {s16-s19}
        fmuls           s12, s7,  s12
        fldmiasge       r1!, {s24-s27}
        fmuls           s13, s6,  s13
        fldmdbsge       r2!, {s20-s23}
        fmuls           s14, s5,  s14
        fldmiasge       r1!, {s28-s31}
        fmuls           s15, s4,  s15
        fmulsge         s24, s19, s24
        fldmdbsgt       r2!, {s0-s3}
        fmulsge         s25, s18, s25
        fstmias         r0!, {s8-s13}
        fmulsge         s26, s17, s26
        fldmiasgt       r1!, {s8-s11}
        fmulsge         s27, s16, s27
        fmulsge         s28, s23, s28
        fldmdbsgt       r2!, {s4-s7}
        fmulsge         s29, s22, s29
        fstmias         r0!, {s14-s15}
        fmulsge         s30, s21, s30
        fmulsge         s31, s20, s31
        fmulsge         s8,  s3,  s8
        fldmiasgt       r1!, {s12-s15}
        fmulsge         s9,  s2,  s9
        fmulsge         s10, s1,  s10
        fstmiasge       r0!, {s24-s27}
        fmulsge         s11, s0,  s11
        fstmiasge       r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
        .endfunc

#ifdef HAVE_ARMV6
/**
 * ARM VFP optimized float to int16 conversion.
 * Assumes that len is a positive number and a multiple of 8, that the
 * destination buffer is at least 4-byte aligned (8-byte alignment is
 * better for performance), and that the data is little-endian.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
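/*
 * For reference (not from the original file), the scalar version is
 * roughly:
 *
 *     static void float_to_int16_c(int16_t *dst, const float *src, int len)
 *     {
 *         int i;
 *         for (i = 0; i < len; i++)
 *             dst[i] = av_clip_int16(lrintf(src[i]));
 *     }
 *
 * Here ftosis converts using the current FPSCR rounding mode and ssat
 * provides the clipping to the signed 16-bit range.
 */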
function ff_float_to_int16_vfp, export=1
        push            {r4-r8,lr}
        vpush           {d8-d11}
        fldmias         r1!, {s16-s23}
        ftosis          s0,  s16
        ftosis          s1,  s17
        ftosis          s2,  s18
        ftosis          s3,  s19
        ftosis          s4,  s20
        ftosis          s5,  s21
        ftosis          s6,  s22
        ftosis          s7,  s23
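@ Note added for exposition: the loop below relies on the ARMv6 instructions
@ ssat (saturate to a signed 16-bit range) and pkhbt (pack the bottom
@ halfwords of two registers into one 32-bit word), which is why this
@ function is guarded by HAVE_ARMV6.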
1:
        subs            r2,  r2,  #8
        fmrrs           r3,  r4,  {s0, s1}
        fmrrs           r5,  r6,  {s2, s3}
        fmrrs           r7,  r8,  {s4, s5}
        fmrrs           ip,  lr,  {s6, s7}
        fldmiasgt       r1!, {s16-s23}
        ssat            r4,  #16, r4
        ssat            r3,  #16, r3
        ssat            r6,  #16, r6
        ssat            r5,  #16, r5
        pkhbt           r3,  r3,  r4, lsl #16
        pkhbt           r4,  r5,  r6, lsl #16
        ftosisgt        s0,  s16
        ftosisgt        s1,  s17
        ftosisgt        s2,  s18
        ftosisgt        s3,  s19
        ftosisgt        s4,  s20
        ftosisgt        s5,  s21
        ftosisgt        s6,  s22
        ftosisgt        s7,  s23
        ssat            r8,  #16, r8
        ssat            r7,  #16, r7
        ssat            lr,  #16, lr
        ssat            ip,  #16, ip
        pkhbt           r5,  r7,  r8, lsl #16
        pkhbt           r6,  ip,  lr, lsl #16
        stmia           r0!, {r3-r6}
        bgt             1b

        vpop            {d8-d11}
        pop             {r4-r8,pc}
        .endfunc
#endif