annotate arm/vp3dsp_neon.S @ 11183:d1a855cb0a0c libavcodec

Split setting neighboring MBs from fill_decode_caches() no speed change.
author michael
date Mon, 15 Feb 2010 22:07:02 +0000
parents 2c1c28f26a27
children 361a5fcb4393
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9693
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
1 /*
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
2 * Copyright (c) 2009 David Conrad
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
3 *
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
4 * This file is part of FFmpeg.
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
5 *
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
10 *
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
14 * Lesser General Public License for more details.
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
15 *
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
19 */
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
20
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
21 #include "asm.S"
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
22
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
// IDCT cosine table.  Entry k-1 holds round(65536 * cos(k * pi / 16)) for
// k = 1..7, stored as raw 16-bit patterns.  The IDCT multiplies with
// vmull.s16, so every entry above 32767 (C1..C5) is seen as C - 65536; the
// code compensates by adding the multiplicand back after taking the high
// half of the product.  C6 and C7 fit in a signed halfword and need no fix.
//
// The table is fetched with a single 128-bit load into d0-d1, so it is
// aligned to 16 bytes and padded to a full 16 bytes: the eighth halfword is
// never used as a multiplier, it only keeps the load inside the object.
        .section .rodata
        .align 4                        // ARM gas: 2^4 = 16-byte alignment

vp3_idct_constants:
        .short 64277, 60547, 54491, 46341, 36410, 25080, 12785
        .short 0                        // padding, lands in d1[3]

// Scalar lanes of d0-d1 once the table has been loaded.
#define xC1S7 d0[0]
#define xC2S6 d0[1]
#define xC3S5 d0[2]
#define xC4S4 d0[3]
#define xC5S3 d1[0]
#define xC6S2 d1[1]
#define xC7S1 d1[2]

        .text
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
38
9693
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
// vp3_loop_filter: filter core shared by the vertical and horizontal
// edge filters.
//
// In:   d16, d17, d18, d19 = four vectors of 8 unsigned pixels lying across
//                            the edge (d17 and d18 are the two that change)
//       r2                 = filter limit (low 16 bits are broadcast)
// Out:  d0 = replacement for d17, d1 = replacement for d18
// Uses: q0-q3, q9 (overwrites d18/d19), q15
//
// Computation, per pixel:
//       f   = ((d16 - d19) + 3 * (d18 - d17) + 4) >> 3
//       m   = min(|f|, max(limit - |f|, 0))
//       f   = sign(f) * m
//       d0  = clip_u8(d17 + f),  d1 = clip_u8(d18 - f)
.macro vp3_loop_filter
        vsubl.u8        q3,  d18, d17           // d18 - d17, widened to 16 bit
        vsubl.u8        q2,  d16, d19           // d16 - d19, widened to 16 bit
        vadd.i16        q1,  q3,  q3            // 2 * (d18 - d17)
        vadd.i16        q2,  q2,  q3            // (d16 - d19) + (d18 - d17)
        vadd.i16        q0,  q1,  q2            // (d16 - d19) + 3 * (d18 - d17)
        vrshr.s16       q0,  q0,  #3            // rounding shift: (x + 4) >> 3
        vmovl.u8        q9,  d18                // keep a 16-bit copy of d18
        vdup.u16        q15, r2                 // limit in every lane

        vabs.s16        q1,  q0                 // |f|
        vshr.s16        q0,  q0,  #15           // sign mask: 0 or -1
        vqsub.u16       q2,  q15, q1            // max(limit - |f|, 0)
        vqsub.u16       q3,  q2,  q1            // max(that - |f|, 0)
        vsub.i16        q1,  q2,  q3            // min(|f|, max(limit - |f|, 0))
        veor            q1,  q1,  q0            // reapply the sign: flip the bits
        vsub.i16        q0,  q1,  q0            // ... and add one where negative

        vaddw.u8        q2,  q0,  d17           // d17 + f
        vsub.i16        q3,  q9,  q0            // d18 - f
        vqmovun.s16     d0,  q2                 // saturate back to unsigned 8 bit
        vqmovun.s16     d1,  q3
.endm
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
62
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
// ff_vp3_v_loop_filter_neon: filter 8 pixels across a horizontal edge.
//
// In:   r0 = address of the first row below the edge (8 bytes wide; the
//            loads and stores carry a 64-bit alignment hint)
//       r1 = distance between rows in bytes
//       r2 = base of a table; the limit is the byte at offset 129*4
//            (presumably entry 129 of an array of 32-bit bounding values,
//            confirm against the C caller)
// Out:  the rows at r0 - r1 and r0 are rewritten
// Uses: ip, r0, r2, q0-q3, q8, q9, q15
function ff_vp3_v_loop_filter_neon, export=1
        sub             ip,  r0,  r1            // ip = row -1, first row to store
        sub             r0,  r0,  r1,  lsl #1   // r0 = row -2, first row to load
        vld1.64         {d16}, [r0,:64], r1     // row -2
        vld1.64         {d17}, [r0,:64], r1     // row -1
        vld1.64         {d18}, [r0,:64], r1     // row  0
        vld1.64         {d19}, [r0,:64], r1     // row +1
        ldrb            r2,  [r2, #129*4]       // filter limit

        vp3_loop_filter

        vst1.64         {d0},  [ip,:64], r1     // new row -1
        vst1.64         {d1},  [ip,:64], r1     // new row  0
        bx              lr
.endfunc
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
78
feaf99ca99a6 ARM: actually add VP3 loop filter
conrad
parents:
diff changeset
// ff_vp3_h_loop_filter_neon: filter 8 pixels across a vertical edge.
//
// In:   r0 = address of the first pixel right of the edge in the top row
//       r1 = distance between rows in bytes
//       r2 = base of a table; the limit is the byte at offset 129*4
//            (presumably entry 129 of an array of 32-bit bounding values,
//            confirm against the C caller)
// Out:  in each of 8 rows the two pixels at r0 - 1 and r0 are rewritten
// Uses: ip, r0, r2, q0-q3, q8, q9, q15
//
// Four bytes (columns -2..+1) are read from each row, transposed so that
// d16..d19 each hold one column, filtered, and the two changed columns are
// transposed back into byte pairs that are stored row by row.
function ff_vp3_h_loop_filter_neon, export=1
        sub             ip,  r0,  #1            // ip = column -1, store pointer
        sub             r0,  r0,  #2            // r0 = column -2, load pointer
        vld1.32         {d16[]},  [r0], r1      // rows 0-3 go to all lanes ...
        vld1.32         {d17[]},  [r0], r1
        vld1.32         {d18[]},  [r0], r1
        vld1.32         {d19[]},  [r0], r1
        vld1.32         {d16[1]}, [r0], r1      // ... rows 4-7 replace the upper lane
        vld1.32         {d17[1]}, [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d19[1]}, [r0], r1
        ldrb            r2,  [r2, #129*4]       // filter limit

        vtrn.8          d16, d17                // 4x4 byte transpose of both halves:
        vtrn.8          d18, d19                // afterwards d16..d19 hold columns
        vtrn.16         d16, d18                // -2, -1, 0 and +1 for rows 0..7
        vtrn.16         d17, d19

        vp3_loop_filter

        vtrn.8          d0,  d1                 // pair up (column -1, column 0) per row

        vst1.16         {d0[0]}, [ip], r1       // row 0
        vst1.16         {d1[0]}, [ip], r1       // row 1
        vst1.16         {d0[1]}, [ip], r1       // row 2
        vst1.16         {d1[1]}, [ip], r1       // row 3
        vst1.16         {d0[2]}, [ip], r1       // row 4
        vst1.16         {d1[2]}, [ip], r1       // row 5
        vst1.16         {d0[3]}, [ip], r1       // row 6
        vst1.16         {d1[3]}, [ip], r1       // row 7
        bx              lr
.endfunc
9916
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
111
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
112
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
// vp3_idct_start_neon: common entry sequence of the three IDCT variants.
//
// In:   r2 = coefficient block, 64 signed 16-bit values (the loads carry a
//            128-bit alignment hint)
// Out:  d0-d1   = cosine table
//       q8..q15 = the eight 8-coefficient vectors ip[0]..ip[7], except that
//                 q8 already holds ip[0] - ip[4]
//       q1      = ip[0] + ip[4]
//       r2      = advanced by 128 bytes
// Stack: pushes d8-d15; the caller is expected to vpop them.
//
// There is deliberately no return here: execution runs straight into the
// code that follows (vp3_idct_core_neon), whose bx lr returns to the caller.
function vp3_idct_start_neon
        vpush           {d8-d15}                // q4-q7 are used as scratch below
        movrel          r3,  vp3_idct_constants
        vld1.64         {d0-d1},   [r3,:128]    // cosine table
        vld1.64         {d16-d19}, [r2,:128]!   // ip[0], ip[1]
        vld1.64         {d20-d23}, [r2,:128]!   // ip[2], ip[3]
        vld1.64         {d24-d27}, [r2,:128]!   // ip[4], ip[5]
        vadd.s16        q1,  q8,  q12           // ip[0] + ip[4]
        vsub.s16        q8,  q8,  q12           // ip[0] - ip[4]
        vld1.64         {d28-d31}, [r2,:128]!   // ip[6], ip[7]
.endfunc
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
124
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
// vp3_idct_core_neon: multiplication stage of one 1-D IDCT pass over eight
// lanes at once.
//
// In:   q1  = ip[0] + ip[4]      q8  = ip[0] - ip[4]
//       q9  = ip[1]              q10 = ip[2]
//       q11 = ip[3]              q13 = ip[5]
//       q14 = ip[6]              q15 = ip[7]
//       d0-d1 = cosine table (xC1S7 .. xC7S1)
// Out:  q12 = E    q8  = F     q11 = Cd = A + C    q13 = Dd = B + D
//       q15 = G    q14 = H     q10 = Ad            q9  = B - D
//       q5, q6 = 32-bit products of (B - D) with xC4S4, not yet narrowed
// Uses: q1-q7 as scratch
//
// Every product is formed as vmull.s16 (16x16 -> 32) followed by vshrn #16,
// i.e. the high half of the product.  xC1S7..xC5S3 exceed 32767 and are
// therefore multiplied as C - 65536; for those the multiplicand is added
// back afterwards, which yields exactly (x * C) >> 16.  xC6S2 and xC7S1
// need no such correction.
function vp3_idct_core_neon
        // ---- E, F and ip[1] * C1 --------------------------------------
        vmull.s16       q2,  d18, xC1S7         // ip[1] * C1, 32 bit
        vmull.s16       q3,  d19, xC1S7
        vmull.s16       q4,  d2,  xC4S4         // (ip[0] + ip[4]) * C4, 32 bit
        vmull.s16       q5,  d3,  xC4S4
        vmull.s16       q6,  d16, xC4S4         // (ip[0] - ip[4]) * C4, 32 bit
        vmull.s16       q7,  d17, xC4S4
        vshrn.s32       d4,  q2,  #16           // keep the high halves
        vshrn.s32       d5,  q3,  #16
        vshrn.s32       d6,  q4,  #16
        vshrn.s32       d7,  q5,  #16
        vshrn.s32       d8,  q6,  #16
        vshrn.s32       d9,  q7,  #16
        vadd.s16        q12, q1,  q3            // E = ((ip[0] + ip[4]) * C4) >> 16
        vadd.s16        q8,  q8,  q4            // F = ((ip[0] - ip[4]) * C4) >> 16
        vadd.s16        q1,  q2,  q9            // (ip[1] * C1) >> 16

        // ---- A and B --------------------------------------------------
        vmull.s16       q2,  d30, xC1S7         // ip[7] * C1, 32 bit
        vmull.s16       q3,  d31, xC1S7
        vmull.s16       q4,  d30, xC7S1         // ip[7] * C7, 32 bit
        vmull.s16       q5,  d31, xC7S1
        vmull.s16       q6,  d18, xC7S1         // ip[1] * C7, 32 bit
        vmull.s16       q7,  d19, xC7S1
        vshrn.s32       d4,  q2,  #16
        vshrn.s32       d5,  q3,  #16
        vshrn.s32       d6,  q4,  #16           // q3 = (ip[7] * C7) >> 16
        vshrn.s32       d7,  q5,  #16
        vshrn.s32       d8,  q6,  #16           // q4 = (ip[1] * C7) >> 16
        vshrn.s32       d9,  q7,  #16
        vadd.s16        q2,  q2,  q15           // (ip[7] * C1) >> 16
        vadd.s16        q9,  q1,  q3            // A = ip[1] * C1 + ip[7] * C7
        vsub.s16        q15, q4,  q2            // B = ip[1] * C7 - ip[7] * C1

        // ---- C and ip[3] * C5 -----------------------------------------
        vmull.s16       q2,  d22, xC5S3         // ip[3] * C5, 32 bit
        vmull.s16       q3,  d23, xC5S3
        vmull.s16       q4,  d22, xC3S5         // ip[3] * C3, 32 bit
        vmull.s16       q5,  d23, xC3S5
        vmull.s16       q6,  d26, xC5S3         // ip[5] * C5, 32 bit
        vmull.s16       q7,  d27, xC5S3
        vshrn.s32       d4,  q2,  #16
        vshrn.s32       d5,  q3,  #16
        vshrn.s32       d6,  q4,  #16
        vshrn.s32       d7,  q5,  #16
        vshrn.s32       d8,  q6,  #16
        vshrn.s32       d9,  q7,  #16
        vadd.s16        q3,  q3,  q11           // (ip[3] * C3) >> 16
        vadd.s16        q4,  q4,  q13           // (ip[5] * C5) >> 16
        vadd.s16        q1,  q2,  q11           // (ip[3] * C5) >> 16
        vadd.s16        q11, q3,  q4            // C = ip[3] * C3 + ip[5] * C5

        // ---- D, G and the first butterflies ---------------------------
        vmull.s16       q2,  d26, xC3S5         // ip[5] * C3, 32 bit
        vmull.s16       q3,  d27, xC3S5
        vmull.s16       q4,  d20, xC2S6         // ip[2] * C2, 32 bit
        vmull.s16       q5,  d21, xC2S6
        vmull.s16       q6,  d28, xC6S2         // ip[6] * C6, 32 bit
        vmull.s16       q7,  d29, xC6S2
        vshrn.s32       d4,  q2,  #16
        vshrn.s32       d5,  q3,  #16
        vshrn.s32       d6,  q4,  #16
        vshrn.s32       d7,  q5,  #16
        vshrn.s32       d8,  q6,  #16           // q4 = (ip[6] * C6) >> 16
        vshrn.s32       d9,  q7,  #16
        vadd.s16        q2,  q2,  q13           // (ip[5] * C3) >> 16
        vadd.s16        q3,  q3,  q10           // (ip[2] * C2) >> 16
        vsub.s16        q13, q2,  q1            // D = ip[5] * C3 - ip[3] * C5
        vsub.s16        q1,  q9,  q11           // A - C
        vadd.s16        q11, q9,  q11           // Cd = A + C
        vsub.s16        q9,  q15, q13           // B - D
        vadd.s16        q13, q15, q13           // Dd = B + D
        vadd.s16        q15, q3,  q4            // G = ip[2] * C2 + ip[6] * C6

        // ---- Ad, H and the (B - D) * C4 products ----------------------
        vmull.s16       q2,  d2,  xC4S4         // (A - C) * C4, 32 bit
        vmull.s16       q3,  d3,  xC4S4
        vmull.s16       q4,  d28, xC2S6         // ip[6] * C2, 32 bit
        vmull.s16       q5,  d29, xC2S6
        vmull.s16       q6,  d20, xC6S2         // ip[2] * C6, 32 bit
        vmull.s16       q7,  d21, xC6S2
        vshrn.s32       d4,  q2,  #16
        vshrn.s32       d5,  q3,  #16
        vshrn.s32       d6,  q4,  #16
        vshrn.s32       d7,  q5,  #16
        vshrn.s32       d8,  q6,  #16           // d8 = (ip[2] * C6) >> 16, low half
        vmull.s16       q5,  d18, xC4S4         // (B - D) * C4, 32 bit; narrowed
        vmull.s16       q6,  d19, xC4S4         // later by the end stage
        vshrn.s32       d9,  q7,  #16           // d9 = (ip[2] * C6) >> 16, high half
        vadd.s16        q3,  q3,  q14           // (ip[6] * C2) >> 16
        vadd.s16        q10, q1,  q2            // Ad = ((A - C) * C4) >> 16
        vsub.s16        q14, q4,  q3            // H = ip[2] * C6 - ip[6] * C2
        bx              lr
.endfunc
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
215
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
// VP3_IDCT_END type: emits vp3_idct_end_<type>_neon, the butterfly stage
// that finishes one 1-D pass started by vp3_idct_core_neon.  The macro is
// instantiated below for "row" and "col".
//
// In:   q12 = E, q8 = F, q11 = Cd, q13 = Dd, q15 = G, q14 = H, q10 = Ad,
//       q9 = B - D, q5/q6 = 32-bit products (B - D) * xC4S4
//       col only: r3 = bias, broadcast and added to E and F first
// Out:  q8..q15 = results [0]..[7] of the pass
//       row only: the 8x8 block of 16-bit values is transposed afterwards,
//       and q1 / q8 are set to [0] + [4] and [0] - [4] of the transposed
//       data, which is the input layout vp3_idct_core_neon expects
// Uses: q0-q4 as scratch
.macro VP3_IDCT_END type
function vp3_idct_end_\type\()_neon
.ifc \type, col
        vdup.16         q0,  r3                 // bias in every lane
        vadd.s16        q12, q12, q0            // E += bias
        vadd.s16        q8,  q8,  q0            // F += bias
.endif

        vshrn.s32       d2,  q5,  #16           // high halves of (B - D) * C4
        vshrn.s32       d3,  q6,  #16
        vadd.s16        q2,  q12, q15           // Gd  = E + G
        vadd.s16        q9,  q1,  q9            // Bd  = ((B - D) * C4) >> 16
        vsub.s16        q12, q12, q15           // Ed  = E - G
        vsub.s16        q3,  q8,  q10           // Fd  = F - Ad
        vadd.s16        q10, q8,  q10           // Add = F + Ad
        vadd.s16        q4,  q9,  q14           // Hd  = Bd + H
        vsub.s16        q14, q9,  q14           // Bdd = Bd - H
        vadd.s16        q8,  q2,  q11           // [0] = Gd + Cd
        vsub.s16        q15, q2,  q11           // [7] = Gd - Cd
        vadd.s16        q9,  q10, q4            // [1] = Add + Hd
        vsub.s16        q10, q10, q4            // [2] = Add - Hd
        vadd.s16        q11, q12, q13           // [3] = Ed + Dd
        vsub.s16        q12, q12, q13           // [4] = Ed - Dd
.ifc \type, row
        vtrn.16         q8,  q9                 // first step of the transpose
.endif
        vadd.s16        q13, q3,  q14           // [5] = Fd + Bdd
        vsub.s16        q14, q3,  q14           // [6] = Fd - Bdd

.ifc \type, row
        // Remainder of the 8x8 transpose: 16-bit pairs, 32-bit pairs, then
        // 64-bit halves.  The setup for the next core call (q1, q8) is
        // slotted in as soon as q8 and q12 are final.
        vtrn.16         q10, q11
        vtrn.16         q12, q13
        vtrn.16         q14, q15
        vtrn.32         q8,  q10
        vtrn.32         q9,  q11
        vtrn.32         q12, q14
        vtrn.32         q13, q15
        vswp            d17, d24
        vswp            d19, d26
        vadd.s16        q1,  q8,  q12           // [0] + [4] for the next pass
        vswp            d21, d28
        vsub.s16        q8,  q8,  q12           // [0] - [4] for the next pass
        vswp            d23, d30
.endif
        bx              lr
.endfunc
.endm

VP3_IDCT_END row
VP3_IDCT_END col
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
267
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
// ff_vp3_idct_neon: 8x8 inverse DCT, result written back over the input.
//
// In:   r0 = block of 64 signed 16-bit coefficients (accessed with a
//            128-bit alignment hint)
// Out:  the same 128 bytes hold the transformed block; r0 is advanced past it
// Uses: r2, r3, ip, q0-q3, q8-q15; d8-d15 are saved by the start routine
//       and restored here
//
// The second pass receives a bias of 8 in r3, so the final >> 4 rounds.
function ff_vp3_idct_neon, export=1
        mov             ip,  lr                 // the bl calls below clobber lr
        mov             r2,  r0                 // start routine reads through r2
        bl              vp3_idct_start_neon     // load + first pass core
        bl              vp3_idct_end_row_neon   // finish pass, transpose
        mov             r3,  #8                 // rounding bias for >> 4
        bl              vp3_idct_core_neon      // second pass core
        bl              vp3_idct_end_col_neon   // finish pass, add bias
        mov             lr,  ip
        vpop            {d8-d15}                // pushed by vp3_idct_start_neon

        // Scale each result vector by >> 4, interleaved with the stores.
        vshr.s16        q8,  q8,  #4
        vshr.s16        q9,  q9,  #4
        vshr.s16        q10, q10, #4
        vshr.s16        q11, q11, #4
        vshr.s16        q12, q12, #4
        vst1.64         {d16-d19}, [r0,:128]!
        vshr.s16        q13, q13, #4
        vshr.s16        q14, q14, #4
        vst1.64         {d20-d23}, [r0,:128]!
        vshr.s16        q15, q15, #4
        vst1.64         {d24-d27}, [r0,:128]!
        vst1.64         {d28-d31}, [r0,:128]!
        bx              lr
.endfunc
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
293
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
// ff_vp3_idct_put_neon: 8x8 inverse DCT, result stored as unsigned 8-bit
// pixels.
//
// In:   r0 = destination, 8 rows of 8 bytes (stores carry a 64-bit
//            alignment hint)
//       r1 = distance between destination rows in bytes
//       r2 = block of 64 signed 16-bit coefficients
// Out:  8x8 pixels written at r0; r0 and r2 are advanced
// Uses: r3, ip, q0-q3, q8-q15; d8-d15 are saved by the start routine and
//       restored here
//
// The second pass receives a bias of 8 + 2048 in r3: 8 rounds the final
// >> 4, and 2048 becomes +128 after that shift, moving the signed result
// into the unsigned pixel range before saturation.
function ff_vp3_idct_put_neon, export=1
        mov             ip,  lr                 // the bl calls below clobber lr
        bl              vp3_idct_start_neon     // load + first pass core
        bl              vp3_idct_end_row_neon   // finish pass, transpose
        mov             r3,  #8                 // rounding bias for >> 4
        add             r3,  r3,  #2048         // plus 128 << 4: signed -> unsigned
        bl              vp3_idct_core_neon      // second pass core
        bl              vp3_idct_end_col_neon   // finish pass, add bias
        mov             lr,  ip
        vpop            {d8-d15}                // pushed by vp3_idct_start_neon

        // Narrow with >> 4 and unsigned saturation, interleaved with the
        // row stores.
        vqshrun.s16     d0,  q8,  #4
        vqshrun.s16     d1,  q9,  #4
        vqshrun.s16     d2,  q10, #4
        vqshrun.s16     d3,  q11, #4
        vst1.64         {d0}, [r0,:64], r1
        vqshrun.s16     d4,  q12, #4
        vst1.64         {d1}, [r0,:64], r1
        vqshrun.s16     d5,  q13, #4
        vst1.64         {d2}, [r0,:64], r1
        vqshrun.s16     d6,  q14, #4
        vst1.64         {d3}, [r0,:64], r1
        vqshrun.s16     d7,  q15, #4
        vst1.64         {d4}, [r0,:64], r1
        vst1.64         {d5}, [r0,:64], r1
        vst1.64         {d6}, [r0,:64], r1
        vst1.64         {d7}, [r0,:64], r1
        bx              lr
.endfunc
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
323
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
/*
 * ff_vp3_idct_add_neon (exported symbol)
 *
 * Calls four vp3_idct_* helpers (defined outside this block), then takes the
 * eight vectors q8-q15, shifts every signed 16-bit lane right by 4, adds the
 * eight 8-byte rows read from memory at r0, saturates to unsigned 8 bit and
 * writes the rows back to the same addresses:
 *
 *     row[i] = sat_u8(row[i] + (q(8+i) >> 4))        i = 0..7, per lane
 *
 * Register use visible in this block:
 *   r0      read pointer for the 8 rows; advanced by r1 after each load
 *   r1      byte step between consecutive rows (post-increment of r0 / r2)
 *   r2      overwritten with a copy of r0 after the helpers return and used
 *           as the write pointer.
 *           NOTE(review): presumably holds the coefficient pointer on entry,
 *           consumed by the helpers - confirm against vp3_idct_start_neon.
 *   r3      set to 8 before vp3_idct_core_neon
 *   ip      keeps the caller's return address while lr is overwritten by bl
 *   q8-q15  read below without being written in this block, so they are
 *           expected to hold the helpers' output (one q register per row)
 *   d0-d7   the eight rows from memory, later reused for the narrowed result
 *
 * Every row access carries the :64 qualifier, i.e. each row address (r0 and
 * every r0 + k*r1) is declared to be 8-byte aligned.
 *
 * Loads, arithmetic and stores of different rows are interleaved on purpose;
 * the row index is noted on each line to make the data flow followable.
 */
324 function ff_vp3_idct_add_neon, export=1
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
325 mov ip, lr /* bl below overwrites lr: park the return address in ip (assumes the helpers leave ip untouched - TODO confirm) */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
326 bl vp3_idct_start_neon
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
327 bl vp3_idct_end_row_neon
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
328 mov r3, #8 /* NOTE(review): consumed by a helper not shown here; looks like the rounding term for the >>4 below - confirm */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
329 bl vp3_idct_core_neon
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
330 bl vp3_idct_end_col_neon
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
331 mov lr, ip /* restore the return address saved at entry */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
332 vpop {d8-d15} /* restore d8-d15; the matching push is not in this block (presumably in vp3_idct_start_neon - confirm) */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
333 mov r2, r0 /* r0 walks the rows for reading, r2 walks the same rows for writing */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
334
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
335 vld1.64 {d0}, [r0,:64], r1 /* load row 0 (8 bytes), r0 += r1 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
336 vshr.s16 q8, q8, #4 /* row 0: arithmetic >>4 on each 16-bit lane */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
337 vld1.64 {d1}, [r0,:64], r1 /* load row 1 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
338 vshr.s16 q9, q9, #4 /* row 1: >>4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
339 vld1.64 {d2}, [r0,:64], r1 /* load row 2 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
340 vaddw.u8 q8, q8, d0 /* row 0: zero-extend the 8 bytes to 16 bit and add */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
341 vld1.64 {d3}, [r0,:64], r1 /* load row 3 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
342 vaddw.u8 q9, q9, d1 /* row 1: widen and add */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
343 vld1.64 {d4}, [r0,:64], r1 /* load row 4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
344 vshr.s16 q10, q10, #4 /* row 2: >>4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
345 vld1.64 {d5}, [r0,:64], r1 /* load row 5 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
346 vshr.s16 q11, q11, #4 /* row 3: >>4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
347 vld1.64 {d6}, [r0,:64], r1 /* load row 6 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
348 vqmovun.s16 d0, q8 /* row 0: saturate signed 16 bit to unsigned 8 bit; d0 now holds the output row */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
349 vld1.64 {d7}, [r0,:64], r1 /* load row 7 (last read through r0) */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
350 vqmovun.s16 d1, q9 /* row 1: saturating narrow */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
351 vaddw.u8 q10, q10, d2 /* row 2: widen and add */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
352 vaddw.u8 q11, q11, d3 /* row 3: widen and add */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
353 vshr.s16 q12, q12, #4 /* row 4: >>4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
354 vshr.s16 q13, q13, #4 /* row 5: >>4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
355 vqmovun.s16 d2, q10 /* row 2: saturating narrow */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
356 vqmovun.s16 d3, q11 /* row 3: saturating narrow */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
357 vaddw.u8 q12, q12, d4 /* row 4: widen and add */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
358 vaddw.u8 q13, q13, d5 /* row 5: widen and add */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
359 vshr.s16 q14, q14, #4 /* row 6: >>4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
360 vshr.s16 q15, q15, #4 /* row 7: >>4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
361 vst1.64 {d0}, [r2,:64], r1 /* store row 0, r2 += r1 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
362 vqmovun.s16 d4, q12 /* row 4: saturating narrow */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
363 vst1.64 {d1}, [r2,:64], r1 /* store row 1 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
364 vqmovun.s16 d5, q13 /* row 5: saturating narrow */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
365 vst1.64 {d2}, [r2,:64], r1 /* store row 2 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
366 vaddw.u8 q14, q14, d6 /* row 6: widen and add */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
367 vst1.64 {d3}, [r2,:64], r1 /* store row 3 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
368 vaddw.u8 q15, q15, d7 /* row 7: widen and add */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
369 vst1.64 {d4}, [r2,:64], r1 /* store row 4 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
370 vqmovun.s16 d6, q14 /* row 6: saturating narrow */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
371 vst1.64 {d5}, [r2,:64], r1 /* store row 5 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
372 vqmovun.s16 d7, q15 /* row 7: saturating narrow */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
373 vst1.64 {d6}, [r2,:64], r1 /* store row 6 */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
374 vst1.64 {d7}, [r2,:64], r1 /* store row 7; r0 and r2 both end 8*r1 past the first row */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
375 bx lr /* return to the caller via the restored lr */
2c1c28f26a27 ARM: NEON VP3 IDCT
conrad
parents: 9693
diff changeset
376 .endfunc