annotate arm/vp3dsp_neon.S @ 12530:63edd10ad4bc libavcodec tip

Try to fix crashes introduced by r25218. r25218 made assumptions about the existence of past reference frames that weren't necessarily true.
author darkshikari
date Tue, 28 Sep 2010 09:06:22 +0000
parents 2064f8a1691e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9693
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
1 /*
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
2 * Copyright (c) 2009 David Conrad
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
3 *
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
4 * This file is part of FFmpeg.
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
5 *
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
10 *
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
14 * Lesser General Public License for more details.
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
15 *
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
19 */
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
20
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
21 #include "asm.S"
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
22
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
23 .section .rodata
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
24 .align 4 // 2^4 = 16-byte alignment, matching the :128 hint on the load in vp3_idct_start_neon
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
25 // NOTE(review): the table below holds 7 halfwords (14 bytes) but vp3_idct_start_neon loads 16 bytes from it
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
26 vp3_idct_constants: // entries >= 32768 are read as (value - 65536) by vmull.s16, so the core adds the operand back after the >> 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
27 .short 64277, 60547, 54491, 46341, 36410, 25080, 12785 // round(65536 * cos(k * pi / 16)) for k = 1..7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
28 // xCnS(8-n) names the lane that holds entry n of the table once it has been loaded into d0-d1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
29 #define xC1S7 d0[0]
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
30 #define xC2S6 d0[1]
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
31 #define xC3S5 d0[2]
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
32 #define xC4S4 d0[3]
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
33 #define xC5S3 d1[0]
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
34 #define xC6S2 d1[1]
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
35 #define xC7S1 d1[2]
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
36
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
37 .text
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
38
9693
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
39 .macro vp3_loop_filter // in: d16-d19 = four vectors of 8 unsigned bytes, r2 = bound; out: d0 = filtered d17, d1 = filtered d18; clobbers q0-q3, q9, q15
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
40 vsubl.u8 q3, d18, d17 // q3 = d18 - d17, widened to 16 bits
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
41 vsubl.u8 q2, d16, d19 // q2 = d16 - d19, widened to 16 bits
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
42 vadd.i16 q1, q3, q3 // q1 = 2 * (d18 - d17)
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
43 vadd.i16 q2, q2, q3 // q2 = (d16 - d19) + (d18 - d17)
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
44 vadd.i16 q0, q1, q2 // q0 = (d16 - d19) + 3 * (d18 - d17)
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
45 vrshr.s16 q0, q0, #3 // v = (q0 + 4) >> 3, the unbounded filter value
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
46 vmovl.u8 q9, d18 // widen d18 for the 16-bit subtract near the end
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
47 vdup.u16 q15, r2 // broadcast the bound in r2 to every 16-bit lane
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
48
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
49 vabs.s16 q1, q0 // q1 = |v|
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
50 vshr.s16 q0, q0, #15 // q0 = sign mask of v: 0 or -1 per lane
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
51 vqsub.u16 q2, q15, q1 // q2 = max(bound - |v|, 0) via unsigned saturation
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
52 vqsub.u16 q3, q2, q1 // q3 = max(q2 - |v|, 0)
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
53 vsub.i16 q1, q2, q3 // q1 = min(q2, |v|): the bounded magnitude
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
54 veor q1, q1, q0 // (q1 ^ sign) - sign gives the bounded
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
55 vsub.i16 q0, q1, q0 // magnitude the sign of v again
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
56
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
57 vaddw.u8 q2, q0, d17 // q2 = d17 + bounded v
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
58 vsub.i16 q3, q9, q0 // q3 = d18 - bounded v
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
59 vqmovun.s16 d0, q2 // saturate both results to 0..255 and
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
60 vqmovun.s16 d1, q3 // narrow them back to bytes
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
61 .endm
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
62
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
63 function ff_vp3_v_loop_filter_neon, export=1 // r0 = pixel pointer, r1 = line stride, r2 = table base (presumably the VP3 bounding-values array; confirm against the C caller)
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
64 sub ip, r0, r1 // ip = r0 - stride: first of the two lines that get rewritten
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
65 sub r0, r0, r1, lsl #1 // r0 -= 2 * stride: first of the four lines that get read
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
66 vld1.64 {d16}, [r0,:64], r1 // load four consecutive 8-byte lines into d16-d19;
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
67 vld1.64 {d17}, [r0,:64], r1 // the :64 hint requires each line to be 8-byte aligned
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
68 vld1.64 {d18}, [r0,:64], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
69 vld1.64 {d19}, [r0,:64], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
70 ldrb r2, [r2, #129*4] // r2 = byte at table offset 129*4, consumed as the bound by the macro
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
71
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
72 vp3_loop_filter // d0/d1 = filtered versions of d17/d18
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
73
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
74 vst1.64 {d0}, [ip,:64], r1 // write the two filtered lines back over
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
75 vst1.64 {d1}, [ip,:64], r1 // the lines that were loaded into d17 and d18
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
76 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 9916
diff changeset
77 endfunc
9693
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
78
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
79 function ff_vp3_h_loop_filter_neon, export=1 // r0 = pixel pointer, r1 = line stride, r2 = table base (presumably the VP3 bounding-values array; confirm against the C caller)
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
80 sub ip, r0, #1 // ip = r0 - 1: first of the two columns that get rewritten
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
81 sub r0, r0, #2 // r0 -= 2: first of the four columns that get read
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
82 vld1.32 {d16[]}, [r0], r1 // rows 0-3: 4 bytes each, replicated to both
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
83 vld1.32 {d17[]}, [r0], r1 // 32-bit lanes of d16-d19
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
84 vld1.32 {d18[]}, [r0], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
85 vld1.32 {d19[]}, [r0], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
86 vld1.32 {d16[1]}, [r0], r1 // rows 4-7: 4 bytes each, replacing the
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
87 vld1.32 {d17[1]}, [r0], r1 // upper 32-bit lane of d16-d19
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
88 vld1.32 {d18[1]}, [r0], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
89 vld1.32 {d19[1]}, [r0], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
90 ldrb r2, [r2, #129*4] // r2 = byte at table offset 129*4, consumed as the bound by the macro
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
91
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
92 vtrn.8 d16, d17 // transpose the loaded bytes so that d16, d17, d18, d19
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
93 vtrn.8 d18, d19 // each hold one pixel column (columns 0, 1, 2, 3)
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
94 vtrn.16 d16, d18 // with rows 0-7 in byte order
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
95 vtrn.16 d17, d19
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
96
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
97 vp3_loop_filter // d0/d1 = filtered versions of columns 1 and 2
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
98
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
99 vtrn.8 d0, d1 // pair up the two output pixels of each row: even rows land in d0, odd rows in d1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
100
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
101 vst1.16 {d0[0]}, [ip], r1 // store 2 bytes per row, rows 0-7 in order,
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
102 vst1.16 {d1[0]}, [ip], r1 // alternating between d0 and d1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
103 vst1.16 {d0[1]}, [ip], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
104 vst1.16 {d1[1]}, [ip], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
105 vst1.16 {d0[2]}, [ip], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
106 vst1.16 {d1[2]}, [ip], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
107 vst1.16 {d0[3]}, [ip], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
108 vst1.16 {d1[3]}, [ip], r1
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
109 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 9916
diff changeset
110 endfunc
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
111
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
112
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
113 function vp3_idct_start_neon // in: r2 = 16-byte-aligned block of 64 16-bit coefficients; loads it into q8-q15 and falls through into the core
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
114 vpush {d8-d15} // q4-q7 are used as scratch by the core; the exported entry points vpop them again
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
115 movrel r3, vp3_idct_constants
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
116 vld1.64 {d0-d1}, [r3,:128] // d0-d1 = constant table, addressed through the xCnSm lane names
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
117 vld1.64 {d16-d19}, [r2,:128]! // q8..q15 = the eight groups of 8 coefficients (ip[0]..ip[7]);
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
118 vld1.64 {d20-d23}, [r2,:128]! // r2 is advanced by 128 bytes in total
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
119 vld1.64 {d24-d27}, [r2,:128]!
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
120 vadd.s16 q1, q8, q12 // q1 = ip[0] + ip[4], the form the core expects
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
121 vsub.s16 q8, q8, q12 // q8 = ip[0] - ip[4]
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
122 vld1.64 {d28-d31}, [r2,:128]!
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 9916
diff changeset
123 endfunc // no return instruction above: execution continues into vp3_idct_core_neon, which follows directly
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
124
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
125 function vp3_idct_core_neon // one 1-D pass on 8 lanes; in: q1 = ip[0]+ip[4], q8 = ip[0]-ip[4], q9/q10/q11 = ip[1..3], q13/q14/q15 = ip[5..7], d0-d1 = constants; out: q12 = E, q8 = F, q11 = Cd, q13 = Dd, q15 = G, q10 = Ad, q14 = H, q9 = B - D, q5/q6 = unshifted (B - D) * C4 products; clobbers q1-q7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
126 vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
127 vmull.s16 q3, d19, xC1S7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
128 vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
129 vmull.s16 q5, d3, xC4S4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
130 vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
131 vmull.s16 q7, d17, xC4S4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
132 vshrn.s32 d4, q2, #16 // keep the high 16 bits of each 32-bit product
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
133 vshrn.s32 d5, q3, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
134 vshrn.s32 d6, q4, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
135 vshrn.s32 d7, q5, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
136 vshrn.s32 d8, q6, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
137 vshrn.s32 d9, q7, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
138 vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4 (operand added back because C4 >= 32768 was multiplied as a negative s16)
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
139 vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
140 vadd.s16 q1, q2, q9 // ip[1] * C1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
141
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
142 vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
143 vmull.s16 q3, d31, xC1S7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
144 vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
145 vmull.s16 q5, d31, xC7S1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
146 vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
147 vmull.s16 q7, d19, xC7S1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
148 vshrn.s32 d4, q2, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
149 vshrn.s32 d5, q3, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
150 vshrn.s32 d6, q4, #16 // ip[7] * C7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
151 vshrn.s32 d7, q5, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
152 vshrn.s32 d8, q6, #16 // ip[1] * C7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
153 vshrn.s32 d9, q7, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
154 vadd.s16 q2, q2, q15 // ip[7] * C1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
155 vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
156 vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
157
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
158 vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
159 vmull.s16 q3, d23, xC5S3
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
160 vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
161 vmull.s16 q5, d23, xC3S5
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
162 vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
163 vmull.s16 q7, d27, xC5S3
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
164 vshrn.s32 d4, q2, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
165 vshrn.s32 d5, q3, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
166 vshrn.s32 d6, q4, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
167 vshrn.s32 d7, q5, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
168 vshrn.s32 d8, q6, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
169 vshrn.s32 d9, q7, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
170 vadd.s16 q3, q3, q11 // ip[3] * C3
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
171 vadd.s16 q4, q4, q13 // ip[5] * C5
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
172 vadd.s16 q1, q2, q11 // ip[3] * C5
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
173 vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
174
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
175 vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
176 vmull.s16 q3, d27, xC3S5
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
177 vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
178 vmull.s16 q5, d21, xC2S6
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
179 vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
180 vmull.s16 q7, d29, xC6S2
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
181 vshrn.s32 d4, q2, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
182 vshrn.s32 d5, q3, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
183 vshrn.s32 d6, q4, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
184 vshrn.s32 d7, q5, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
185 vshrn.s32 d8, q6, #16 // ip[6] * C6
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
186 vshrn.s32 d9, q7, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
187 vadd.s16 q2, q2, q13 // ip[5] * C3
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
188 vadd.s16 q3, q3, q10 // ip[2] * C2
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
189 vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
190 vsub.s16 q1, q9, q11 // (A - C)
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
191 vadd.s16 q11, q9, q11 // Cd = A + C
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
192 vsub.s16 q9, q15, q13 // (B - D)
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
193 vadd.s16 q13, q15, q13 // Dd = B + D
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
194 vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
195
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
196 vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
197 vmull.s16 q3, d3, xC4S4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
198 vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
199 vmull.s16 q5, d29, xC2S6
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
200 vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
201 vmull.s16 q7, d21, xC6S2
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
202 vshrn.s32 d4, q2, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
203 vshrn.s32 d5, q3, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
204 vshrn.s32 d6, q4, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
205 vshrn.s32 d7, q5, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
206 vshrn.s32 d8, q6, #16 // ip[2] * C6
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
207 vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
208 vmull.s16 q6, d19, xC4S4 // q5/q6 stay as 32-bit products; vp3_idct_end_*_neon does the >> 16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
209 vshrn.s32 d9, q7, #16
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
210 vadd.s16 q3, q3, q14 // ip[6] * C2
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
211 vadd.s16 q10, q1, q2 // Ad = (A - C) * C4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
212 vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
213 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 9916
diff changeset
214 endfunc
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
215
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
216 .macro VP3_IDCT_END type // emits vp3_idct_end_<type>_neon; type is row or col
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
217 function vp3_idct_end_\type\()_neon // finishes the pass begun by vp3_idct_core_neon; outputs [0]..[7] end up in q8..q15
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
218 .ifc \type, col
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
219 vdup.16 q0, r3 // col only: broadcast the bias in r3 (this overwrites the constants in d0-d1, which are not needed below)
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
220 vadd.s16 q12, q12, q0 // E += bias
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
221 vadd.s16 q8, q8, q0 // F += bias, so every output carries the bias once
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
222 .endif
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
223
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
224 vshrn.s32 d2, q5, #16 // narrow the (B - D) * C4 products that the core
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
225 vshrn.s32 d3, q6, #16 // left unshifted in q5/q6
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
226 vadd.s16 q2, q12, q15 // Gd = E + G
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
227 vadd.s16 q9, q1, q9 // (B - D) * C4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
228 vsub.s16 q12, q12, q15 // Ed = E - G
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
229 vsub.s16 q3, q8, q10 // Fd = F - Ad
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
230 vadd.s16 q10, q8, q10 // Add = F + Ad
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
231 vadd.s16 q4, q9, q14 // Hd = Bd + H
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
232 vsub.s16 q14, q9, q14 // Bdd = Bd - H
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
233 vadd.s16 q8, q2, q11 // [0] = Gd + Cd
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
234 vsub.s16 q15, q2, q11 // [7] = Gd - Cd
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
235 vadd.s16 q9, q10, q4 // [1] = Add + Hd
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
236 vsub.s16 q10, q10, q4 // [2] = Add - Hd
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
237 vadd.s16 q11, q12, q13 // [3] = Ed + Dd
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
238 vsub.s16 q12, q12, q13 // [4] = Ed - Dd
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
239 .ifc \type, row
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
240 vtrn.16 q8, q9 // row only: first step of the 8x8 transpose, issued early since [0] and [1] are ready
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
241 .endif
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
242 vadd.s16 q13, q3, q14 // [5] = Fd + Bdd
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
243 vsub.s16 q14, q3, q14 // [6] = Fd - Bdd
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
244
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
245 .ifc \type, row
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
246 // 8x8 transpose
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
247 vtrn.16 q10, q11
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
248 vtrn.16 q12, q13
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
249 vtrn.16 q14, q15
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
250 vtrn.32 q8, q10
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
251 vtrn.32 q9, q11
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
252 vtrn.32 q12, q14
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
253 vtrn.32 q13, q15
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
254 vswp d17, d24 // after this swap q8 and q12 are final
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
255 vswp d19, d26
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
256 vadd.s16 q1, q8, q12 // q1 = ip[0] + ip[4] for the next core pass
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
257 vswp d21, d28
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
258 vsub.s16 q8, q8, q12 // q8 = ip[0] - ip[4] for the next core pass
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
259 vswp d23, d30
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
260 .endif
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
261 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 9916
diff changeset
262 endfunc
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
263 .endm
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
264
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
265 VP3_IDCT_END row
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
266 VP3_IDCT_END col
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
267
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
268 function ff_vp3_idct_neon, export=1 // in-place 8x8 IDCT; r0 = 16-byte-aligned block of 64 16-bit coefficients
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
269 mov ip, lr // keep the return address: the bl calls below overwrite lr
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
270 mov r2, r0 // the start helper reads (and advances) r2, leaving r0 for the stores
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
271 bl vp3_idct_start_neon // load block and constants, then fall through into the first core pass
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
272 bl vp3_idct_end_row_neon // finish the first pass and transpose
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
273 mov r3, #8 // bias for the second pass: rounding term for the final >> 4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
274 bl vp3_idct_core_neon // second pass
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
275 bl vp3_idct_end_col_neon
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
276 mov lr, ip
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
277 vpop {d8-d15} // matches the vpush in vp3_idct_start_neon
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
278
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
279 vshr.s16 q8, q8, #4 // scale all eight result vectors by >> 4,
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
280 vshr.s16 q9, q9, #4 // interleaved with the stores below
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
281 vshr.s16 q10, q10, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
282 vshr.s16 q11, q11, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
283 vshr.s16 q12, q12, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
284 vst1.64 {d16-d19}, [r0,:128]! // write the results back over the input block, 32 bytes at a time
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
285 vshr.s16 q13, q13, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
286 vshr.s16 q14, q14, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
287 vst1.64 {d20-d23}, [r0,:128]!
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
288 vshr.s16 q15, q15, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
289 vst1.64 {d24-d27}, [r0,:128]!
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
290 vst1.64 {d28-d31}, [r0,:128]!
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
291 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 9916
diff changeset
292 endfunc
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
293
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
294 function ff_vp3_idct_put_neon, export=1 // r0 = destination pixels (8-byte-aligned rows), r1 = destination stride, r2 = 16-byte-aligned block of 64 16-bit coefficients
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
295 mov ip, lr // keep the return address: the bl calls below overwrite lr
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
296 bl vp3_idct_start_neon // load block and constants, then fall through into the first core pass
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
297 bl vp3_idct_end_row_neon // finish the first pass and transpose
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
298 mov r3, #8 // rounding term for the final >> 4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
299 add r3, r3, #2048 // convert signed pixel to unsigned (2048 >> 4 = 128)
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
300 bl vp3_idct_core_neon // second pass
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
301 bl vp3_idct_end_col_neon // adds the bias in r3 to every output
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
302 mov lr, ip
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
303 vpop {d8-d15} // matches the vpush in vp3_idct_start_neon
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
304
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
305 vqshrun.s16 d0, q8, #4 // >> 4 with unsigned saturation to 8 bits: one
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
306 vqshrun.s16 d1, q9, #4 // 8-pixel row per d register, interleaved with the stores
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
307 vqshrun.s16 d2, q10, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
308 vqshrun.s16 d3, q11, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
309 vst1.64 {d0}, [r0,:64], r1 // store one row, then step r0 by the stride
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
310 vqshrun.s16 d4, q12, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
311 vst1.64 {d1}, [r0,:64], r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
312 vqshrun.s16 d5, q13, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
313 vst1.64 {d2}, [r0,:64], r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
314 vqshrun.s16 d6, q14, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
315 vst1.64 {d3}, [r0,:64], r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
316 vqshrun.s16 d7, q15, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
317 vst1.64 {d4}, [r0,:64], r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
318 vst1.64 {d5}, [r0,:64], r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
319 vst1.64 {d6}, [r0,:64], r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
320 vst1.64 {d7}, [r0,:64], r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
321 bx lr
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 9916
diff changeset
322 endfunc
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
323
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
// ff_vp3_idct_add_neon: calls the shared IDCT helper routines, then adds
// q8-q15 (each shifted right by 4) to eight rows of 8 bytes at r0 and
// writes the rows back with unsigned saturation.
//   r0 = first destination row; read, then rewritten in place. Every row
//        address is accessed with a :64 (8-byte) alignment qualifier.
//   r1 = byte offset added to the row pointer after each row.
//   r2 = not read in this block before being overwritten; presumably the
//        coefficient block consumed by the helpers -- TODO confirm.
// q8-q15 are used as rows 0-7 of the transform output; they are produced by
// the helpers, which are defined outside this block -- TODO confirm layout.
// Loads for later rows are interleaved with arithmetic on earlier rows,
// presumably for scheduling; the instruction order is kept as is.
324 function ff_vp3_idct_add_neon, export=1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
325 mov ip, lr // keep the return address: the bl calls below overwrite lr
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
326 bl vp3_idct_start_neon // helper defined outside this block
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
327 bl vp3_idct_end_row_neon // helper defined outside this block
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
328 mov r3, #8 // constant handed to the core helper; presumably the rounding bias for the >> 4 below -- TODO confirm
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
329 bl vp3_idct_core_neon // helper defined outside this block
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
330 bl vp3_idct_end_col_neon // helper defined outside this block
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
331 mov lr, ip // restore the return address (assumes the helpers leave ip intact -- TODO confirm)
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
332 vpop {d8-d15} // the matching push is not in this function; presumably done by vp3_idct_start_neon -- TODO confirm
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
333 mov r2, r0 // r0 walks the rows for loads, r2 walks the same rows for stores
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
334
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
335 vld1.64 {d0}, [r0,:64], r1 // d0 = 8 bytes of row 0; r0 += r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
336 vshr.s16 q8, q8, #4 // arithmetic >> 4 on each signed 16-bit lane of row 0
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
337 vld1.64 {d1}, [r0,:64], r1 // row 1 bytes
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
338 vshr.s16 q9, q9, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
339 vld1.64 {d2}, [r0,:64], r1 // row 2 bytes
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
340 vaddw.u8 q8, q8, d0 // widen the row 0 bytes (unsigned) to 16 bits and add
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
341 vld1.64 {d3}, [r0,:64], r1 // row 3 bytes
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
342 vaddw.u8 q9, q9, d1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
343 vld1.64 {d4}, [r0,:64], r1 // row 4 bytes
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
344 vshr.s16 q10, q10, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
345 vld1.64 {d5}, [r0,:64], r1 // row 5 bytes
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
346 vshr.s16 q11, q11, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
347 vld1.64 {d6}, [r0,:64], r1 // row 6 bytes
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
348 vqmovun.s16 d0, q8 // saturate row 0 to unsigned 8 bits; d0 is free again after the add above
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
349 vld1.64 {d7}, [r0,:64], r1 // row 7 bytes (last load through r0)
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
350 vqmovun.s16 d1, q9
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
351 vaddw.u8 q10, q10, d2
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
352 vaddw.u8 q11, q11, d3
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
353 vshr.s16 q12, q12, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
354 vshr.s16 q13, q13, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
355 vqmovun.s16 d2, q10
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
356 vqmovun.s16 d3, q11
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
357 vaddw.u8 q12, q12, d4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
358 vaddw.u8 q13, q13, d5
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
359 vshr.s16 q14, q14, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
360 vshr.s16 q15, q15, #4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
361 vst1.64 {d0}, [r2,:64], r1 // write row 0 back; r2 += r1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
362 vqmovun.s16 d4, q12
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
363 vst1.64 {d1}, [r2,:64], r1 // row 1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
364 vqmovun.s16 d5, q13
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
365 vst1.64 {d2}, [r2,:64], r1 // row 2
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
366 vaddw.u8 q14, q14, d6
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
367 vst1.64 {d3}, [r2,:64], r1 // row 3
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
368 vaddw.u8 q15, q15, d7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
369 vst1.64 {d4}, [r2,:64], r1 // row 4
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
370 vqmovun.s16 d6, q14
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
371 vst1.64 {d5}, [r2,:64], r1 // row 5
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
372 vqmovun.s16 d7, q15
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
373 vst1.64 {d6}, [r2,:64], r1 // row 6
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
374 vst1.64 {d7}, [r2,:64], r1 // row 7
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
375 bx lr // return via the lr restored from ip above
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 9916
diff changeset
376 endfunc
11637
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
377
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
// ff_vp3_idct_dc_add_neon: adds one value, derived from a single signed
// 16-bit coefficient, to eight rows of 8 bytes and writes the rows back
// with unsigned saturation.
//   r0 = first destination row; read, then rewritten in place. Every row
//        address is accessed with a :64 (8-byte) alignment qualifier.
//   r1 = byte offset added to the row pointer after each row.
//   r2 = address of the signed 16-bit coefficient (presumably the DC term
//        of the block, going by the function name -- TODO confirm).
// The value added to every byte is (coefficient + 15) >> 5.
// This is a leaf routine: it makes no calls and returns through lr.
378 function ff_vp3_idct_dc_add_neon, export=1
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
379 ldrsh r2, [r2] // r2 = sign-extended halfword at [r2]; the pointer itself is no longer needed
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
380 mov r3, r0 // r0 walks the rows for loads, r3 walks the same rows for stores
11789
2064f8a1691e vp3: 10l Fix DC-only IDCT for C and ARM too
conrad
parents: 11637
diff changeset
381 add r2, r2, #15 // bias applied before the >> 5 below
11637
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
382 vdup.16 q15, r2 // broadcast the low 16 bits of r2 to all eight lanes (the sum is truncated to 16 bits here)
11789
2064f8a1691e vp3: 10l Fix DC-only IDCT for C and ARM too
conrad
parents: 11637
diff changeset
383 vshr.s16 q15, q15, #5 // arithmetic >> 5: every lane now holds the value to add
11637
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
384
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
385 vld1.8 {d0}, [r0,:64], r1 // d0 = 8 bytes of row 0; r0 += r1
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
386 vld1.8 {d1}, [r0,:64], r1 // row 1 bytes
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
387 vld1.8 {d2}, [r0,:64], r1 // row 2 bytes
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
388 vaddw.u8 q8, q15, d0 // q8 = broadcast value + row 0 bytes widened (unsigned) to 16 bits
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
389 vld1.8 {d3}, [r0,:64], r1 // row 3 bytes
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
390 vaddw.u8 q9, q15, d1
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
391 vld1.8 {d4}, [r0,:64], r1 // row 4 bytes
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
392 vaddw.u8 q10, q15, d2
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
393 vld1.8 {d5}, [r0,:64], r1 // row 5 bytes
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
394 vaddw.u8 q11, q15, d3
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
395 vld1.8 {d6}, [r0,:64], r1 // row 6 bytes
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
396 vaddw.u8 q12, q15, d4
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
397 vld1.8 {d7}, [r0,:64], r1 // row 7 bytes (last load through r0)
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
398 vaddw.u8 q13, q15, d5
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
399 vqmovun.s16 d0, q8 // saturate row 0 to unsigned 8 bits; d0 is free again after the add above
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
400 vaddw.u8 q14, q15, d6
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
401 vqmovun.s16 d1, q9
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
402 vaddw.u8 q15, q15, d7 // last row: q15 is overwritten because the broadcast value is not read again
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
403 vqmovun.s16 d2, q10
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
404 vst1.8 {d0}, [r3,:64], r1 // write row 0 back; r3 += r1
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
405 vqmovun.s16 d3, q11
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
406 vst1.8 {d1}, [r3,:64], r1 // row 1
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
407 vqmovun.s16 d4, q12
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
408 vst1.8 {d2}, [r3,:64], r1 // row 2
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
409 vqmovun.s16 d5, q13
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
410 vst1.8 {d3}, [r3,:64], r1 // row 3
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
411 vqmovun.s16 d6, q14
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
412 vst1.8 {d4}, [r3,:64], r1 // row 4
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
413 vqmovun.s16 d7, q15
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
414 vst1.8 {d5}, [r3,:64], r1 // row 5
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
415 vst1.8 {d6}, [r3,:64], r1 // row 6
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
416 vst1.8 {d7}, [r3,:64], r1 // row 7
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
417 bx lr // lr is untouched in this routine, so return directly
f7281af560fe vp3: DC-only IDCT
conrad
parents: 11443
diff changeset
418 endfunc