Mercurial > libavcodec.hg
comparison arm/synth_filter_neon.S @ 11593:b7fa70eabb1f libavcodec
ARM: NEON optimised synth_filter_float
2.7x faster DCA decoding on Cortex-A8
author | mru |
---|---|
date | Sat, 10 Apr 2010 16:27:56 +0000 |
parents | |
children | 475eab960878 |
comparison
equal
deleted
inserted
replaced
11592:18f17f44de37 | 11593:b7fa70eabb1f |
---|---|
1 /* | |
2 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
21 #include "asm.S" | |
22 | |
23 preserve8 | |
24 | |
@ ff_synth_filter_float_neon | |
@ | |
@ One pass of a 512-tap windowed synthesis filter over a 512-float ring | |
@ buffer, producing 32 output floats and 32 carried-over partial sums. | |
@ | |
@ Inputs as used by the code below (names taken from the existing | |
@ comments; exact C prototype not visible here - TODO confirm): | |
@   r0          passed through untouched to ff_imdct_half_neon | |
@   r1          synth_buf base (ring buffer of floats) | |
@   r2          pointer to synth_buf_offset (read, then updated in place) | |
@   r3          synth_buf2: a/b seeds are read from it, c/d sums written back | |
@   stack +0    window | |
@   stack +4    out | |
@   stack +8    in  (handed to ff_imdct_half_neon in r2) | |
@   stack +12   scale, immediately followed by bias (loaded together as d0) | |
@ | |
@ Per group of four lanes i, with j stepping by 64 floats over the ring: | |
@   a = synth_buf2[i]      - sum window[i+j]    * synth_buf[15-i+j] | |
@   b = synth_buf2[i+16]   + sum window[i+j+16] * synth_buf[i+j] | |
@   c =                      sum window[i+j+32] * synth_buf[16+i+j] | |
@   d =                      sum window[i+j+48] * synth_buf[31-i+j] | |
@   out[i] = a*scale + bias,  out[i+16] = b*scale + bias | |
@   synth_buf2[i] = c,        synth_buf2[i+16] = d | |
@ | |
@ Ten words are pushed here; r3 is popped again right after the call and | |
@ the remaining nine are popped by the final popeq. | |
25 function ff_synth_filter_float_neon, export=1 | |
26 push {r3-r11,lr} | |
27 | |
@ r1 = &synth_buf[offset]; new offset = (offset - 32) & 511 is stored back. | |
@ r4 = offset rounded down to a multiple of 64; the tap loop compares its | |
@ remaining-tap counter against r4 to decide when to wrap the ring pointers. | |
28 ldr r4, [r2] @ synth_buf_offset | |
29 add r1, r1, r4, lsl #2 @ synth_buf | |
30 sub r12, r4, #32 | |
31 bfc r12, #9, #23 | |
32 bic r4, r4, #63 | |
33 str r12, [r2] | |
34 | |
@ Stack arguments start at sp+10*4 while all ten pushed words are on the | |
@ stack, so sp+12*4 is the third stack argument.  r9 keeps the synth_buf | |
@ position in a callee-saved register across the call. | |
35 ldr r2, [sp, #12*4] @ in | |
36 mov r9, r1 @ synth_buf | |
37 | |
@ Called with r0 = caller's r0, r1 = &synth_buf[offset], r2 = in | |
@ (presumably writes its result at r1 - confirm against ff_imdct_half_neon). | |
38 bl ff_imdct_half_neon | |
@ Restore the caller's r3 (synth_buf2).  sp moves up by 4, so the stack | |
@ arguments are now at sp+9*4 onwards. | |
39 pop {r3} | |
40 | |
@ d0[0] = scale, d0[1] = bias.  r8 starts 12 floats past r9 and is used | |
@ for the element-reversed synth_buf reads. | |
41 ldr r5, [sp, #9*4] @ window | |
42 ldr r2, [sp, #10*4] @ out | |
43 vldr d0, [sp, #12*4] @ scale, bias | |
44 add r8, r9, #12*4 | |
45 | |
@ lr = post-increment stride for every load in the tap loop (64 floats). | |
@ r1 = outer loop count: 4 groups of 4 lanes. | |
46 mov lr, #64*4 | |
47 mov r1, #4 | |
@ Outer loop: one group of four lanes per iteration. | |
48 1: | |
@ Derived pointers for this group: | |
@   r9  -> synth_buf[i]        r10 -> synth_buf[i+16] | |
@   r8  -> reversed reads      r11 -> r8 + 16 floats (reversed reads) | |
@   r5  -> window[i]           r0  -> window[i+16] | |
@   r6  -> window[i+32]        r7  -> window[i+48] | |
49 add r10, r9, #16*4 @ synth_buf | |
50 add r11, r8, #16*4 | |
51 add r0, r5, #16*4 @ window | |
52 add r6, r5, #32*4 | |
53 add r7, r5, #48*4 | |
54 | |
@ Seed accumulators: a (q10) and b (q1) from synth_buf2[i] and | |
@ synth_buf2[i+16]; c (q2) and d (q3) start at zero.  r3 is left pointing | |
@ at synth_buf2[i+16] for the stores at label 3. | |
55 vld1.32 {q10}, [r3,:128] @ a | |
56 add r3, r3, #16*4 | |
57 vld1.32 {q1}, [r3,:128] @ b | |
58 vmov.f32 q2, #0.0 @ c | |
59 vmov.f32 q3, #0.0 @ d | |
60 | |
@ r12 = floats of the 512-float ring still to cover; drops by 64 per pass, | |
@ giving 8 passes. | |
61 mov r12, #512 | |
@ Tap loop.  Loads are interleaved with the multiply-accumulates and the | |
@ temporaries q8/q9/q11/q12 are reused, so instruction order matters. | |
@ vrev64 swaps the two floats inside each d register; pairing the low | |
@ window half with the high data half (and vice versa) completes the full | |
@ four-element reversal. | |
62 2: | |
63 vld1.32 {q9}, [r8, :128], lr | |
64 vrev64.32 q9, q9 | |
65 vld1.32 {q8}, [r5, :128], lr | |
66 vmls.f32 d20, d16, d19 | |
67 vld1.32 {q11}, [r0, :128], lr | |
68 vmls.f32 d21, d17, d18 | |
69 vld1.32 {q12}, [r9, :128], lr | |
70 vmla.f32 d2, d22, d24 | |
71 vld1.32 {q8}, [r6, :128], lr | |
72 vmla.f32 d3, d23, d25 | |
73 vld1.32 {q9}, [r10,:128], lr | |
74 vmla.f32 d4, d16, d18 | |
75 vld1.32 {q12}, [r11,:128], lr | |
76 vmla.f32 d5, d17, d19 | |
77 vrev64.32 q12, q12 | |
78 vld1.32 {q11}, [r7, :128], lr | |
79 vmla.f32 d6, d22, d25 | |
80 vmla.f32 d7, d23, d24 | |
@ Finished when r12 reaches 0.  When the remaining count equals r4 the | |
@ synth_buf pointers have reached the end of the ring, so step them back | |
@ by 512 floats; the window pointers keep advancing linearly. | |
81 subs r12, r12, #64 | |
82 beq 3f | |
83 cmp r12, r4 | |
84 bne 2b | |
85 sub r8, r8, #512*4 | |
86 sub r9, r9, #512*4 | |
87 sub r10, r10, #512*4 | |
88 sub r11, r11, #512*4 | |
89 b 2b | |
@ Finalise this group: q8 = bias + a*scale, q9 = bias + b*scale. | |
@ d goes to synth_buf2[i+16], c to synth_buf2[i]; q8 to out[i], | |
@ q9 to out[i+16]. | |
90 3: | |
91 vdup.32 q8, d0[1] | |
92 vdup.32 q9, d0[1] | |
93 vmla.f32 q8, q10, d0[0] | |
94 vmla.f32 q9, q1, d0[0] | |
95 vst1.32 {q3}, [r3,:128] | |
96 sub r3, r3, #16*4 | |
97 vst1.32 {q2}, [r3,:128] | |
98 vst1.32 {q8}, [r2,:128] | |
99 add r2, r2, #16*4 | |
100 vst1.32 {q9}, [r2,:128] | |
101 | |
@ After the fourth group, return: pops the nine words still on the stack | |
@ (r4-r11 and the saved lr into pc). | |
102 subs r1, r1, #1 | |
103 popeq {r4-r11,pc} | |
104 | |
@ Rewind for the next group.  With r4 == 0 the tap loop never wrapped, so | |
@ r8/r9 are 512 floats ahead and must be pulled back here; otherwise the | |
@ wrap inside the loop already did that.  r5 always advanced 512 floats. | |
@ Net movement per group: out, synth_buf2, window and r9 advance by four | |
@ floats; r8 retreats by four because it is read in reverse. | |
105 cmp r4, #0 | |
106 subeq r8, r8, #512*4 | |
107 subeq r9, r9, #512*4 | |
108 sub r5, r5, #512*4 | |
109 sub r2, r2, #12*4 @ out | |
110 add r3, r3, #4*4 @ synth_buf2 | |
111 add r5, r5, #4*4 @ window | |
112 add r9, r9, #4*4 @ synth_buf | |
113 sub r8, r8, #4*4 @ synth_buf | |
114 b 1b | |
115 endfunc |