Mercurial > libavcodec.hg
comparison armv4l/dsputil_vfp.S @ 8071:2487a9db02a0 libavcodec
ARM: move VFP DSP functions to dsputil_vfp.S
author | mru |
---|---|
date | Mon, 27 Oct 2008 00:25:16 +0000 |
parents | |
children | 92008e82ce6c |
comparison
equal
deleted
inserted
replaced
8070:59be7e4941e8 | 8071:2487a9db02a0 |
---|---|
1 /* | |
2 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
21 #include "config.h" | |
22 #include "asm.S" | |
23 | |
24 /* | |
25 * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle | |
26 * throughput for almost all the instructions (except for double precision | |
27 * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles | |
28 * for arithmetic operations. Scheduling code to avoid pipeline stalls is very | |
29 * important for performance. One more interesting feature is that VFP has | |
30 * independent load/store and arithmetics pipelines, so it is possible to make | |
31 * them work simultaneously and get more than 1 operation per cycle. Load/store | |
32 * pipeline can process 2 single precision floating point values per cycle and | |
33 * supports bulk loads and stores for large sets of registers. Arithmetic operations | |
34 * can be done on vectors, which allows to keep the arithmetics pipeline busy, | |
35 * while the processor may issue and execute other instructions. Detailed | |
36 * optimization manuals can be found at http://www.arm.com | |
37 */ | |
38 | |
/**
 * ARM VFP optimized implementation of 'vector_fmul_c' function.
 * Assume that len is a positive number and is multiple of 8
 *
 * Register roles:
 *   r0 = dst (write cursor), r1 = src (read cursor),
 *   r2 = remaining element count, r3 = second read cursor into dst
 *   (dst is both an input and the output: dst[i] *= src[i]).
 *
 * FPSCR.LEN is set to 4 below, so each fmuls whose destination lies in
 * s8-s31 operates on a 4-element vector bank, while s0-s7 sources act
 * per the VFP short-vector rules. The loop is software-pipelined: two
 * 8-element chunks are in flight at once (s0-s15 bank vs. s16-s31 bank)
 * so loads, multiplies and stores overlap and hide VFP latency.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}                @ s16-s31 are callee-saved
        mov             r3, r0                  @ r3 = read cursor into dst
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16)    /* set vector size to 4 */
        fmxr            fpscr, r12

        @ Prime the pipeline: first 8 elements of dst and src.
        fldmias         r3!, {s0-s3}
        fldmias         r1!, {s8-s11}
        fldmias         r3!, {s4-s7}
        fldmias         r1!, {s12-s15}
        fmuls           s8, s0, s8              @ vector: s8-s11 = s0-s3 * s8-s11
1:
        subs            r2, r2, #16             @ flags steer all cond. ops below
        fmuls           s12, s4, s12            @ vector: s12-s15 = s4-s7 * s12-s15
        @ ge (>= 16 left): start the next 8 elements in the s16-s31 bank.
        fldmiasge       r3!, {s16-s19}
        fldmiasge       r1!, {s24-s27}
        fldmiasge       r3!, {s20-s23}
        fldmiasge       r1!, {s28-s31}
        fmulsge         s24, s16, s24
        fstmias         r0!, {s8-s11}           @ store current 8 results
        fstmias         r0!, {s12-s15}
        fmulsge         s28, s20, s28
        @ gt (another full iteration follows): refill the s0-s15 bank.
        fldmiasgt       r3!, {s0-s3}
        fldmiasgt       r1!, {s8-s11}
        fldmiasgt       r3!, {s4-s7}
        fldmiasgt       r1!, {s12-s15}
        fmulsge         s8, s0, s8
        fstmiasge       r0!, {s24-s27}
        fstmiasge       r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16)    /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
        .endfunc
81 | |
/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is multiple of 8
 *
 * Register roles:
 *   r0 = dst, r1 = src0 (walked forwards), r2 = src1 (advanced to its
 *   end, then walked backwards with fldmdbs), r3 = remaining count.
 *
 * Unlike ff_vector_fmul_vfp above, FPSCR vector mode is NOT enabled
 * here: every fmuls is a scalar multiply, and the reversal of src1 is
 * achieved by crossing operand indices within each 4-element group
 * (e.g. s8 *= s3, s9 *= s2, ...). The loop keeps two 8-element chunks
 * in flight (s0-s15 bank vs. s16-s31 bank), interleaving loads and
 * stores with the arithmetic to keep both VFP pipelines busy.
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}                @ s16-s31 are callee-saved
        add             r2, r2, r3, lsl #2      @ r2 = src1 + len (read downwards)
        @ Prime the pipeline: 4 from the tail of src1, 8 from src0.
        fldmdbs         r2!, {s0-s3}
        fldmias         r1!, {s8-s11}
        fldmdbs         r2!, {s4-s7}
        fldmias         r1!, {s12-s15}
        fmuls           s8, s3, s8              @ cross indices to reverse src1
        fmuls           s9, s2, s9
        fmuls           s10, s1, s10
        fmuls           s11, s0, s11
1:
        subs            r3, r3, #16             @ flags steer all cond. ops below
        @ ge (>= 16 left): start the next 8 elements in the s16-s31 bank.
        fldmdbsge       r2!, {s16-s19}
        fmuls           s12, s7, s12
        fldmiasge       r1!, {s24-s27}
        fmuls           s13, s6, s13
        fldmdbsge       r2!, {s20-s23}
        fmuls           s14, s5, s14
        fldmiasge       r1!, {s28-s31}
        fmuls           s15, s4, s15
        fmulsge         s24, s19, s24
        @ gt (another full iteration follows): refill the s0-s15 bank.
        fldmdbsgt       r2!, {s0-s3}
        fmulsge         s25, s18, s25
        fstmias         r0!, {s8-s13}           @ store split 6+2 for scheduling
        fmulsge         s26, s17, s26
        fldmiasgt       r1!, {s8-s11}
        fmulsge         s27, s16, s27
        fmulsge         s28, s23, s28
        fldmdbsgt       r2!, {s4-s7}
        fmulsge         s29, s22, s29
        fstmias         r0!, {s14-s15}          @ remaining 2 of current chunk
        fmulsge         s30, s21, s30
        fmulsge         s31, s20, s31
        fmulsge         s8, s3, s8
        fldmiasgt       r1!, {s12-s15}
        fmulsge         s9, s2, s9
        fmulsge         s10, s1, s10
        fstmiasge       r0!, {s24-s27}
        fmulsge         s11, s0, s11
        fstmiasge       r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
        .endfunc
134 | |
#ifdef HAVE_ARMV6
/**
 * ARM VFP optimized float to int16 conversion.
 * Assume that len is a positive number and is multiple of 8, destination
 * buffer is at least 4 bytes aligned (8 bytes alignment is better for
 * performance), little endian byte sex
 *
 * Register roles:
 *   r0 = dst (int16_t, write cursor), r1 = src (float, read cursor),
 *   r2 = remaining element count; r3-r8, ip, lr are integer scratch.
 *
 * Pipeline per 8 elements: ftosis converts float -> int32 (VFP
 * round-toward-zero form), fmrrs moves register pairs to the integer
 * core, ssat saturates to the signed 16-bit range, pkhbt packs two
 * halfwords into one word (low half in low 16 bits => little endian),
 * and stmia stores 4 words (= 8 int16 samples) at once. The next
 * chunk's loads/converts are predicated on gt and overlap the integer
 * work of the current chunk. ssat/pkhbt require ARMv6, hence the guard.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
function ff_float_to_int16_vfp, export=1
        push            {r4-r8,lr}              @ callee-saved integer regs
        vpush           {d8-d11}                @ s16-s23 are callee-saved
        @ Prime the pipeline: convert the first 8 floats.
        fldmias         r1!, {s16-s23}
        ftosis          s0, s16                 @ int32 = (int)truncf(float)
        ftosis          s1, s17
        ftosis          s2, s18
        ftosis          s3, s19
        ftosis          s4, s20
        ftosis          s5, s21
        ftosis          s6, s22
        ftosis          s7, s23
1:
        subs            r2, r2, #8              @ flags steer the cond. ops below
        @ Move converted ints to the integer core, two at a time.
        fmrrs           r3, r4, {s0, s1}
        fmrrs           r5, r6, {s2, s3}
        fmrrs           r7, r8, {s4, s5}
        fmrrs           ip, lr, {s6, s7}
        fldmiasgt       r1!, {s16-s23}          @ prefetch next 8 floats
        ssat            r4, #16, r4             @ clamp to [-32768, 32767]
        ssat            r3, #16, r3
        ssat            r6, #16, r6
        ssat            r5, #16, r5
        pkhbt           r3, r3, r4, lsl #16     @ pack 2 x int16 into one word
        pkhbt           r4, r5, r6, lsl #16
        ftosisgt        s0, s16                 @ convert next chunk (overlapped)
        ftosisgt        s1, s17
        ftosisgt        s2, s18
        ftosisgt        s3, s19
        ftosisgt        s4, s20
        ftosisgt        s5, s21
        ftosisgt        s6, s22
        ftosisgt        s7, s23
        ssat            r8, #16, r8
        ssat            r7, #16, r7
        ssat            lr, #16, lr
        ssat            ip, #16, ip
        pkhbt           r5, r7, r8, lsl #16
        pkhbt           r6, ip, lr, lsl #16
        stmia           r0!, {r3-r6}            @ store 8 int16 samples
        bgt             1b

        vpop            {d8-d11}
        pop             {r4-r8,pc}              @ restore and return
        .endfunc
#endif