libavcodec.hg: annotate arm/dsputil_vfp.S @ 11032:01bd040f8607
Unroll the main loop so the edge==0 case is separate. This allows many things to be simplified away.
The h264 decoder is overall 1% faster with a mbaff sample and 0.1% slower with the cathedral sample,
probably because the slow loop filter code must be loaded into the code cache for the first MB of
each row but isn't used for the following MBs.
author:   michael
date:     Thu, 28 Jan 2010 01:24:25 +0000
parents:  bdcc1c52f223
children: 361a5fcb4393

/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        .syntax unified
/*
 * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
 * throughput for almost all the instructions (except for double precision
 * arithmetic), but rather high latency. Latency is 4 cycles for loads and 8 cycles
 * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
 * important for performance. One more interesting feature is that VFP has
 * independent load/store and arithmetic pipelines, so it is possible to make
 * them work simultaneously and get more than 1 operation per cycle. The load/store
 * pipeline can process 2 single precision floating point values per cycle and
 * supports bulk loads and stores for large sets of registers. Arithmetic operations
 * can be done on vectors, which helps keep the arithmetic pipeline busy
 * while the processor issues and executes other instructions. Detailed
 * optimization manuals can be found at http://www.arm.com
 */
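
/*
 * Background (summarizing the ARM VFP documentation): the routines below rely on
 * the VFP "short vector" mode. The vector length comes from the LEN field of
 * FPSCR (bits 18:16), encoded as length - 1, so the sequence used below,
 *
 *     fmrx    r12, fpscr
 *     orr     r12, r12, #(3 << 16)    @ LEN = 3 -> vectors of 4 elements
 *     fmxr    fpscr, r12
 *
 * selects vectors of 4 single precision elements, and clearing the field
 * restores scalar operation. Operations whose destination lies in the first
 * register bank (s0-s7) stay scalar even while vector mode is enabled.
 */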

/**
 * ARM VFP optimized implementation of 'vector_fmul_c' function.
 * Assumes that len is a positive number and a multiple of 8.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}
        mov             r3,  r0
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
        fmxr            fpscr, r12

        vldmia          r3!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmia          r3!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8,  s0,  s8
1:
        subs            r2,  r2,  #16
        vmul.f32        s12, s4,  s12
        vldmiage        r3!, {s16-s19}
        vldmiage        r1!, {s24-s27}
        vldmiage        r3!, {s20-s23}
        vldmiage        r1!, {s28-s31}
        vmulge.f32      s24, s16, s24
        vstmia          r0!, {s8-s11}
        vstmia          r0!, {s12-s15}
        vmulge.f32      s28, s20, s28
        vldmiagt        r3!, {s0-s3}
        vldmiagt        r1!, {s8-s11}
        vldmiagt        r3!, {s4-s7}
        vldmiagt        r1!, {s12-s15}
        vmulge.f32      s8,  s0,  s8
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
.endfunc

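/*
 * Rough C equivalent of ff_vector_fmul_vfp above, matching FFmpeg's generic
 * vector_fmul_c fallback (reference sketch only; the name vector_fmul_ref is
 * illustrative). dst is both read and written, exactly as the assembly reads
 * through r3, a copy of r0:
 *
 *     static void vector_fmul_ref(float *dst, const float *src, int len)
 *     {
 *         int i;
 *         for (i = 0; i < len; i++)
 *             dst[i] *= src[i];
 *     }
 *
 * The assembly is software pipelined and decrements the length counter by 16
 * per loop iteration (subs r2, r2, #16), with conditional loads, multiplies and
 * stores covering the final groups of elements.
 */
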
/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assumes that len is a positive number and a multiple of 8.
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}
        add             r2,  r2,  r3, lsl #2
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8,  s3,  s8
        vmul.f32        s9,  s2,  s9
        vmul.f32        s10, s1,  s10
        vmul.f32        s11, s0,  s11
1:
        subs            r3,  r3,  #16
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7,  s12
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6,  s13
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5,  s14
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4,  s15
        vmulge.f32      s24, s19, s24
        vldmdbgt        r2!, {s0-s3}
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}
        vmulge.f32      s26, s17, s26
        vldmiagt        r1!, {s8-s11}
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        vldmdbgt        r2!, {s4-s7}
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8,  s3,  s8
        vldmiagt        r1!, {s12-s15}
        vmulge.f32      s9,  s2,  s9
        vmulge.f32      s10, s1,  s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0,  s11
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
.endfunc

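/*
 * Rough C equivalent of ff_vector_fmul_reverse_vfp above, matching FFmpeg's
 * generic vector_fmul_reverse_c fallback (reference sketch only; the name
 * vector_fmul_reverse_ref is illustrative). src1 is read backwards, which is
 * why the assembly advances r2 to the end of src1 (add r2, r2, r3, lsl #2) and
 * then loads with vldmdb:
 *
 *     static void vector_fmul_reverse_ref(float *dst, const float *src0,
 *                                         const float *src1, int len)
 *     {
 *         int i;
 *         for (i = 0; i < len; i++)
 *             dst[i] = src0[i] * src1[len - 1 - i];
 *     }
 */
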
#if HAVE_ARMV6
/**
 * ARM VFP optimized float to int16 conversion.
 * Assumes that len is a positive number and a multiple of 8; the destination
 * buffer is at least 4 bytes aligned (8 bytes alignment is better for
 * performance), little-endian byte order.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
function ff_float_to_int16_vfp, export=1
        push            {r4-r8,lr}
        vpush           {d8-d11}
        vldmia          r1!, {s16-s23}
        vcvt.s32.f32    s0,  s16
        vcvt.s32.f32    s1,  s17
        vcvt.s32.f32    s2,  s18
        vcvt.s32.f32    s3,  s19
        vcvt.s32.f32    s4,  s20
        vcvt.s32.f32    s5,  s21
        vcvt.s32.f32    s6,  s22
        vcvt.s32.f32    s7,  s23
1:
        subs            r2,  r2,  #8
        vmov            r3,  r4,  s0, s1
        vmov            r5,  r6,  s2, s3
        vmov            r7,  r8,  s4, s5
        vmov            ip,  lr,  s6, s7
        vldmiagt        r1!, {s16-s23}
        ssat            r4,  #16, r4
        ssat            r3,  #16, r3
        ssat            r6,  #16, r6
        ssat            r5,  #16, r5
        pkhbt           r3,  r3,  r4, lsl #16
        pkhbt           r4,  r5,  r6, lsl #16
        vcvtgt.s32.f32  s0,  s16
        vcvtgt.s32.f32  s1,  s17
        vcvtgt.s32.f32  s2,  s18
        vcvtgt.s32.f32  s3,  s19
        vcvtgt.s32.f32  s4,  s20
        vcvtgt.s32.f32  s5,  s21
        vcvtgt.s32.f32  s6,  s22
        vcvtgt.s32.f32  s7,  s23
        ssat            r8,  #16, r8
        ssat            r7,  #16, r7
        ssat            lr,  #16, lr
        ssat            ip,  #16, ip
        pkhbt           r5,  r7,  r8, lsl #16
        pkhbt           r6,  ip,  lr, lsl #16
        stmia           r0!, {r3-r6}
        bgt             1b

        vpop            {d8-d11}
        pop             {r4-r8,pc}
.endfunc
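
/*
 * Rough C equivalent of ff_float_to_int16_vfp above (reference sketch only; the
 * name float_to_int16_ref is illustrative). Each float is converted to a 32-bit
 * integer (vcvt.s32.f32, which rounds towards zero) and then saturated to the
 * int16_t range by the ssat instructions; the generic C fallback may round
 * differently (it typically uses lrintf()):
 *
 *     static void float_to_int16_ref(int16_t *dst, const float *src, int len)
 *     {
 *         int i;
 *         for (i = 0; i < len; i++) {
 *             int v = (int) src[i];
 *             if (v >  32767) v =  32767;
 *             if (v < -32768) v = -32768;
 *             dst[i] = (int16_t) v;
 *         }
 *     }
 */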
#endif