annotate arm/h264dsp_neon.S @ 11040:f0b31169d04d libavcodec

Restructure check_mv() ~20 cpu cycles faster loopfilter
author michael
date Thu, 28 Jan 2010 11:12:46 +0000
parents 5506cbb012b4
children 361a5fcb4393
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
1 /*
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
3 *
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
4 * This file is part of FFmpeg.
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
5 *
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
10 *
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
14 * Lesser General Public License for more details.
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
15 *
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
19 */
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
20
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
21 #include "asm.S"
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
22
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/*
 * transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
 *
 * In-place transpose built from three rounds of pairwise VTRN on the eight
 * NEON registers passed by the caller (one matrix row per register).
 * Together the rounds transpose an 8x8 block of bytes; when 128-bit
 * registers are passed, each 64-bit half is handled as its own 8x8 block.
 * All eight arguments are overwritten; no other register is touched.
 */
23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Round 1: exchange 32-bit elements between rows that are four apart. */
24 vtrn.32 \r0, \r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
25 vtrn.32 \r1, \r5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
26 vtrn.32 \r2, \r6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
27 vtrn.32 \r3, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Round 2: exchange 16-bit elements between rows that are two apart. */
28 vtrn.16 \r0, \r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
29 vtrn.16 \r1, \r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
30 vtrn.16 \r4, \r6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
31 vtrn.16 \r5, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Round 3: exchange single bytes between adjacent rows. */
32 vtrn.8 \r0, \r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
33 vtrn.8 \r2, \r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
34 vtrn.8 \r4, \r5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
35 vtrn.8 \r6, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
36 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
37
9864
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
/*
 * transpose_4x4 r0 r1 r2 r3
 *
 * In-place transpose of 4x4 blocks of bytes spread over four NEON registers
 * (one row per register).  Only 16-bit and 8-bit VTRN steps are used, so
 * every aligned 32-bit group of the four registers is transposed as its own
 * independent 4x4 block.  All four arguments are overwritten.
 */
38 .macro transpose_4x4 r0 r1 r2 r3
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
/* Step 1: exchange 16-bit elements between rows that are two apart. */
39 vtrn.16 \r0, \r2
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
40 vtrn.16 \r1, \r3
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
/* Step 2: exchange single bytes between adjacent rows. */
41 vtrn.8 \r0, \r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
42 vtrn.8 \r2, \r3
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
43 .endm
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
44
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/*
 * swap4 r0 r1 r2 r3 r4 r5 r6 r7
 *
 * Exchanges the contents of four register pairs with VSWP:
 * r0 <-> r4, r1 <-> r5, r2 <-> r6, r3 <-> r7.
 * No data is modified, only moved; all eight arguments are written.
 */
45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
46 vswp \r0, \r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
47 vswp \r1, \r5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
48 vswp \r2, \r6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
49 vswp \r3, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
50 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
51
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/*
 * transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
 *
 * In-place transpose of 4x4 blocks of 16-bit elements.  The registers are
 * processed as two independent sets of four rows: (r0, r1, r2, r3) and
 * (r4, r5, r6, r7); no instruction mixes data between the two sets.
 * Within a set, each aligned 64-bit group of the four rows forms one 4x4
 * block.  All eight arguments are overwritten.
 */
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Step 1: exchange 32-bit elements between rows that are two apart. */
53 vtrn.32 \r0, \r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
54 vtrn.32 \r1, \r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
55 vtrn.32 \r4, \r6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
56 vtrn.32 \r5, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Step 2: exchange 16-bit elements between adjacent rows. */
57 vtrn.16 \r0, \r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
58 vtrn.16 \r2, \r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
59 vtrn.16 \r4, \r5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
60 vtrn.16 \r6, \r7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
61 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
62
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
/*
 * h264_chroma_mc8 type
 *
 * Expands to the exported function ff_<type>_h264_chroma_mc8_neon, an
 * 8-pixel-wide bilinear interpolation.  When type is "avg" the interpolated
 * rows are additionally combined with the bytes already at dst using a
 * rounding average (vrhadd.u8); for any other type they are stored as is.
 *
 * Register roles as used by the code.  Argument names follow the C prototype
 * comment that precedes this macro; the mapping assumes the standard ARM
 * procedure call order (first four arguments in r0-r3, the rest on the
 * stack) -- confirm against the C declaration:
 *   r0      dst, write pointer, advanced by stride after every stored row
 *   r1      src
 *   r2      stride
 *   r3      h, remaining rows; decremented by 2 per loop iteration
 *   r4, r5  x and y on entry (the two words above the five pushed registers);
 *           both are reused later as row step / second source pointer
 *   lr      read pointer into dst (avg only)
 *
 * Tap weights, computed once from x and y:
 *   A = (8-x)*(8-y)   B = x*(8-y)   C = (8-x)*y   D = x*y
 *   out[i] = (A*s[i] + B*s[i+1] + C*s[i+stride] + D*s[i+stride+1] + 32) >> 6
 *
 * Three code paths, chosen from the weights:
 *   labels 1:      x*y != 0              four taps
 *   label  3:      x*y == 0 and y != 0   two taps, vertical neighbours
 *   labels 4:/5:   y == 0                two taps, horizontal neighbours
 *
 * Every loop iteration produces two output rows and loops while the row
 * counter is still > 0 after subtracting 2, so h is presumably even and
 * positive -- confirm with callers.
 *
 * Saves and restores r4-r7 and lr.  Writes ip, d0-d7, q8, q9 and, for avg
 * only, q10.
 */
64 .macro h264_chroma_mc8 type
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
66 push {r4-r7, lr}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* The five registers pushed above occupy 20 bytes, so sp+20 is the first
 * word that was on the stack at entry: r4 = x, r5 = y. */
67 ldrd r4, [sp, #20]
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
68 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* avg only: separate pointer for reading back the existing dst rows. */
69 mov lr, r0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
70 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
71 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
72 pld [r1, r2]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
73
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* r7 = D = x*y.  This is the only flag-setting instruction before the
 * "beq 2f" below; the rsb/sub/add that follow leave the flags alone. */
74 muls r7, r4, r5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* r6 = C = 8*y - x*y */
75 rsb r6, r7, r5, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* ip = B = 8*x - x*y */
76 rsb ip, r7, r4, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Next three instructions: r4 = A = x*y - 8*x - 8*y + 64 = (8-x)*(8-y) */
77 sub r4, r7, r4, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
78 sub r4, r4, r5, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
79 add r4, r4, #64
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
80
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* x*y == 0 (Z flag from muls): take the cheaper two-tap paths at 2:. */
81 beq 2f
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
82
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Four-tap path.  r5 = address of the second source row. */
83 add r5, r1, r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
84
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Broadcast the weights to every byte lane: d0 = A, d1 = B, d2 = C, d3 = D.
 * The broadcasts are interleaved with the first two row loads. */
85 vdup.8 d0, r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* r4 is free once A is in d0; reuse it as the pointer step, 2*stride, since
 * r1 walks the even source rows and r5 the odd ones. */
86 lsl r4, r2, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
87 vdup.8 d1, ip
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* 16 bytes are loaded per row because the shifted copy needs a ninth pixel. */
88 vld1.64 {d4, d5}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
89 vdup.8 d2, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
90 vld1.64 {d6, d7}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
91 vdup.8 d3, r7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
92
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* d5 / d7 = the same row shifted left by one pixel (bytes 1..8). */
93 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
94 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
95
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Four-tap loop, two output rows per iteration.  On entry d4/d5 hold source
 * row n and d6/d7 row n+1 (plain and shifted).  q8 accumulates output row n
 * from rows n and n+1; q9 accumulates output row n+1 from rows n+1 and n+2.
 * The loads of the following rows are interleaved with the multiplies. */
96 1: pld [r5]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
97 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
98 vmlal.u8 q8, d5, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Row n is fully consumed by q8; reload d4/d5 with row n+2. */
99 vld1.64 {d4, d5}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
100 vmlal.u8 q8, d6, d2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
101 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
102 vmlal.u8 q8, d7, d3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
103 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Row counter update; these flags are the ones tested by "bgt 1b" at the
 * end of the loop (nothing in between writes the flags). */
104 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
105 vmlal.u8 q9, d7, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
106 vmlal.u8 q9, d4, d2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
107 vmlal.u8 q9, d5, d3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Rounding shift right by 6 and narrow to bytes: (sum + 32) >> 6. */
108 vrshrn.u16 d16, q8, #6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Row n+1 is fully consumed by q9; reload d6/d7 with row n+3. */
109 vld1.64 {d6, d7}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
110 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
111 vrshrn.u16 d17, q9, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
112 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* avg only: fetch the two existing dst rows into q10 and take the rounding
 * average with the two new rows in q8 (d16, d17). */
113 vld1.64 {d20}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
114 vld1.64 {d21}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
115 vrhadd.u8 q8, q8, q10
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
116 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
117 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Store the two finished rows; the accesses carry a :64 alignment qualifier. */
118 vst1.64 {d16}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
119 vst1.64 {d17}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
120 bgt 1b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
121
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Restore r4-r7 and return by popping the saved lr into pc. */
122 pop {r4-r7, pc}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
123
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Reached when x*y == 0, so r6 is just 8*y here: Z is set when y == 0.
 * The add and vdup instructions below do not write the flags, so this
 * result is still valid at "beq 4f". */
124 2: tst r6, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* With x*y == 0 at most one of B (ip) and C (r6) is non-zero, so their sum
 * is the single remaining second-tap weight. */
125 add ip, ip, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* d0 = first-tap weight A, d1 = second-tap weight. */
126 vdup.8 d0, r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
127 vdup.8 d1, ip
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
128
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* y == 0: horizontal-only path at 4:. */
129 beq 4f
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
130
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Vertical-only path: r5 = second source row, r4 = 2*stride step; only
 * 8 bytes per row are needed because there is no horizontal neighbour tap. */
131 add r5, r1, r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
132 lsl r4, r2, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
133 vld1.64 {d4}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
134 vld1.64 {d6}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
135
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Vertical two-tap loop: q8 = row n * d0 + row n+1 * d1,
 * q9 = row n+1 * d0 + row n+2 * d1. */
136 3: pld [r5]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
137 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
138 vmlal.u8 q8, d6, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* d4 now receives row n+2, used as the second tap of q9 below. */
139 vld1.64 {d4}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
140 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
141 vmlal.u8 q9, d4, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
142 vld1.64 {d6}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Rounding shift right by 6 and narrow to bytes. */
143 vrshrn.u16 d16, q8, #6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
144 vrshrn.u16 d17, q9, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
145 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* avg only: rounding average with the two existing dst rows. */
146 vld1.64 {d20}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
147 vld1.64 {d21}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
148 vrhadd.u8 q8, q8, q10
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
149 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Row counter update; flags are consumed by "bgt 3b". */
150 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
151 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
152 vst1.64 {d16}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
153 vst1.64 {d17}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
154 bgt 3b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
155
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
156 pop {r4-r7, pc}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
157
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Horizontal-only path (y == 0).  A single pointer, r1, walks the source
 * one row at a time; each row is loaded as 16 bytes so that the copy shifted
 * by one pixel (d5 / d7) can be formed. */
158 4: vld1.64 {d4, d5}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
159 vld1.64 {d6, d7}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
160 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
161 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
162
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Horizontal two-tap loop: each output row is pixel * d0 + next pixel * d1
 * of one source row (q8 from d4/d5, q9 from d6/d7). */
163 5: pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Row counter is updated at the top of this loop; no later instruction in
 * the loop body writes the flags, so "bgt 5b" still sees this result. */
164 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
165 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
166 vmlal.u8 q8, d5, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* d4/d5 are consumed; start loading the first row of the next iteration. */
167 vld1.64 {d4, d5}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
168 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
169 vmlal.u8 q9, d7, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
170 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
171 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Rounding shift right by 6 and narrow to bytes. */
172 vrshrn.u16 d16, q8, #6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
173 vrshrn.u16 d17, q9, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
174 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* avg only: rounding average with the two existing dst rows. */
175 vld1.64 {d20}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
176 vld1.64 {d21}, [lr,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
177 vrhadd.u8 q8, q8, q10
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
178 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
/* Second row of the next iteration and its shifted copy. */
179 vld1.64 {d6, d7}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
180 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
181 vst1.64 {d16}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
182 vst1.64 {d17}, [r0,:64], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
183 bgt 5b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
184
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
185 pop {r4-r7, pc}
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
186 .endfunc
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
187 .endm
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
188
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
190 .macro h264_chroma_mc4 type
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
191 function ff_\type\()_h264_chroma_mc4_neon, export=1
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
192 push {r4-r7, lr}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
193 ldrd r4, [sp, #20]
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
194 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
195 mov lr, r0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
196 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
197 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
198 pld [r1, r2]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
199
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
200 muls r7, r4, r5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
201 rsb r6, r7, r5, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
202 rsb ip, r7, r4, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
203 sub r4, r7, r4, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
204 sub r4, r4, r5, lsl #3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
205 add r4, r4, #64
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
206
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
207 beq 2f
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
208
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
209 add r5, r1, r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
210
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
211 vdup.8 d0, r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
212 lsl r4, r2, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
213 vdup.8 d1, ip
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
214 vld1.64 {d4}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
215 vdup.8 d2, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
216 vld1.64 {d6}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
217 vdup.8 d3, r7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
218
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
219 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
220 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
221 vtrn.32 d4, d5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
222 vtrn.32 d6, d7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
223
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
224 vtrn.32 d0, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
225 vtrn.32 d2, d3
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
226
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
227 1: pld [r5]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
228 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
229 vmlal.u8 q8, d6, d2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
230 vld1.64 {d4}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
231 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
232 vtrn.32 d4, d5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
233 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
234 vmlal.u8 q9, d4, d2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
235 vld1.64 {d6}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
236 vadd.i16 d16, d16, d17
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
237 vadd.i16 d17, d18, d19
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
238 vrshrn.u16 d16, q8, #6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
239 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
240 pld [r1]
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
241 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
242 vld1.32 {d20[0]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
243 vld1.32 {d20[1]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
244 vrhadd.u8 d16, d16, d20
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
245 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
246 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
247 vtrn.32 d6, d7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
248 vst1.32 {d16[0]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
249 vst1.32 {d16[1]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
250 bgt 1b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
251
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
252 pop {r4-r7, pc}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
253
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
254 2: tst r6, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
255 add ip, ip, r6
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
256 vdup.8 d0, r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
257 vdup.8 d1, ip
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
258 vtrn.32 d0, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
259
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
260 beq 4f
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
261
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
262 vext.32 d1, d0, d1, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
263 add r5, r1, r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
264 lsl r4, r2, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
265 vld1.32 {d4[0]}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
266 vld1.32 {d4[1]}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
267
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
268 3: pld [r5]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
269 vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
270 vld1.32 {d4[0]}, [r1], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
271 vmull.u8 q9, d4, d1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
272 vld1.32 {d4[1]}, [r5], r4
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
273 vadd.i16 d16, d16, d17
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
274 vadd.i16 d17, d18, d19
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
275 vrshrn.u16 d16, q8, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
276 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
277 vld1.32 {d20[0]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
278 vld1.32 {d20[1]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
279 vrhadd.u8 d16, d16, d20
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
280 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
281 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
282 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
283 vst1.32 {d16[0]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
284 vst1.32 {d16[1]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
285 bgt 3b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
286
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
287 pop {r4-r7, pc}
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
288
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
289 4: vld1.64 {d4}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
290 vld1.64 {d6}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
291 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
292 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
293 vtrn.32 d4, d5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
294 vtrn.32 d6, d7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
295
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
296 5: vmull.u8 q8, d4, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
297 vmull.u8 q9, d6, d0
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
298 subs r3, r3, #2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
299 vld1.64 {d4}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
300 vext.8 d5, d4, d5, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
301 vtrn.32 d4, d5
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
302 vadd.i16 d16, d16, d17
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
303 vadd.i16 d17, d18, d19
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
304 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
305 vrshrn.u16 d16, q8, #6
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
306 .ifc \type,avg
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
307 vld1.32 {d20[0]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
308 vld1.32 {d20[1]}, [lr,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
309 vrhadd.u8 d16, d16, d20
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
310 .endif
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
311 vld1.64 {d6}, [r1], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
312 vext.8 d7, d6, d7, #1
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
313 vtrn.32 d6, d7
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
314 pld [r1]
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
315 vst1.32 {d16[0]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
316 vst1.32 {d16[1]}, [r0,:32], r2
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
317 bgt 5b
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
318
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
319 pop {r4-r7, pc}
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
320 .endfunc
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
321 .endm
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
322
10617
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
@ h264_chroma_mc2 \type: emits ff_\type\()_h264_chroma_mc2_neon (\type = put or avg).
@ Register use visible below: r0 = destination, r1 = source, r2 = line stride
@ (post-increment for both pointers), r3 = row count (two rows per iteration).
@ The two words at [sp] and [sp, #4] on entry are the weights, called x and y in
@ the comments here (presumably 0..7 fractional offsets - TODO confirm with caller).
@ Derived weights: 64 - 8x - 8y + xy, 8x - xy, 8y - xy and xy.
@ The avg variant additionally does a rounding average with the bytes already at r0.
323 .macro h264_chroma_mc2 type
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
324 function ff_\type\()_h264_chroma_mc2_neon, export=1
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
325 push {r4-r6, lr}
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
326 ldr r4, [sp, #16] @ r4 = x: first stack word above the 16 bytes just pushed
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
327 ldr lr, [sp, #20] @ lr = y: second stack word
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
328 pld [r1]
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
329 pld [r1, r2]
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
330 orrs r5, r4, lr @ Z set when x and y are both zero
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
331 beq 2f @ no interpolation needed: plain copy / average path
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
332
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
333 mul r5, r4, lr @ r5 = x*y
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
334 rsb r6, r5, lr, lsl #3 @ r6 = 8*y - x*y
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
335 rsb r12, r5, r4, lsl #3 @ r12 = 8*x - x*y
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
336 sub r4, r5, r4, lsl #3 @ r4 = x*y - 8*x
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
337 sub r4, r4, lr, lsl #3 @ r4 = x*y - 8*x - 8*y
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
338 add r4, r4, #64 @ r4 = 64 - 8*x - 8*y + x*y
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
339 vdup.8 d0, r4
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
340 vdup.8 d2, r12
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
341 vdup.8 d1, r6
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
342 vdup.8 d3, r5
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
343 vtrn.16 q0, q1 @ interleave the four weights in 16-bit units to match the pixel layout built in the loop
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
344 1:
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
345 vld1.32 {d4[0]}, [r1], r2 @ 4 source bytes of row n
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
346 vld1.32 {d4[1]}, [r1], r2 @ 4 source bytes of row n+1
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
347 vrev64.32 d5, d4
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
348 vld1.32 {d5[1]}, [r1] @ row n+2, no post-increment: it is reloaded as row n next iteration
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
349 vext.8 q3, q2, q2, #1
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
350 vtrn.16 q2, q3
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
351 vmull.u8 q8, d4, d0
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
352 vmlal.u8 q8, d5, d1
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
353 .ifc \type,avg
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
354 vld1.16 {d18[0]}, [r0,:16], r2 @ avg only: fetch the two destination rows
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
355 vld1.16 {d18[1]}, [r0,:16]
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
356 sub r0, r0, r2 @ rewind r0 to the first of the two rows
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
357 .endif
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
358 vtrn.32 d16, d17
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
359 vadd.i16 d16, d16, d17 @ add the two partial weighted sums
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
360 vrshrn.u16 d16, q8, #6 @ rounding shift right by 6 and narrow to bytes
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
361 .ifc \type,avg
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
362 vrhadd.u8 d16, d16, d18 @ avg only: rounding average with the destination bytes
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
363 .endif
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
364 vst1.16 {d16[0]}, [r0,:16], r2 @ store 2 bytes for the first row
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
365 vst1.16 {d16[1]}, [r0,:16], r2 @ store 2 bytes for the second row
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
366 subs r3, r3, #2 @ two rows done
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
367 bgt 1b
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
368 pop {r4-r6, pc}
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
369 2:
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
370 .ifc \type,put
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
371 ldrh r5, [r1], r2 @ put with x = y = 0: copy one halfword per row
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
372 strh r5, [r0], r2
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
373 ldrh r6, [r1], r2
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
374 strh r6, [r0], r2
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
375 .else
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
376 vld1.16 {d16[0]}, [r1], r2 @ avg with x = y = 0: two source rows
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
377 vld1.16 {d16[1]}, [r1], r2
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
378 vld1.16 {d18[0]}, [r0,:16], r2 @ matching destination rows
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
379 vld1.16 {d18[1]}, [r0,:16]
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
380 sub r0, r0, r2 @ rewind r0 to the first of the two rows
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
381 vrhadd.u8 d16, d16, d18
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
382 vst1.16 {d16[0]}, [r0,:16], r2
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
383 vst1.16 {d16[1]}, [r0,:16], r2
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
384 .endif
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
385 subs r3, r3, #2
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
386 bgt 2b
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
387 pop {r4-r6, pc}
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
388 .endfunc
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
389 .endm
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
390
8336
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
391 .text
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
392 .align
c8401acb05d1 ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
mru
parents:
diff changeset
393
8626
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
@ Expand each chroma MC macro once with type = put and once with type = avg.
@ For h264_chroma_mc2 this emits ff_put_h264_chroma_mc2_neon and
@ ff_avg_h264_chroma_mc2_neon; the mc8 and mc4 macros are presumably analogous
@ (the mc8 definition is not in this excerpt, and only part of mc4 is).
394 h264_chroma_mc8 put
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
395 h264_chroma_mc8 avg
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
396 h264_chroma_mc4 put
8d425ee85ddb ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents: 8359
diff changeset
397 h264_chroma_mc4 avg
10617
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
398 h264_chroma_mc2 put
5506cbb012b4 ARM: NEON 2xN chroma MC
mru
parents: 10616
diff changeset
399 h264_chroma_mc2 avg
8337
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
400
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
401 /* H.264 loop filter */
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
402
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
@ h264_loop_filter_start: common entry checks for the loop filter functions.
@ In:  r2 = alpha, r3 = beta, [sp] = pointer to four bytes (presumably the
@      per-4-pixel tc0 values - TODO confirm against the C caller).
@ Out: d24[0] = those four bytes; ip is clobbered.
@ Returns to the caller (bx lr) without filtering when alpha == 0 or
@ beta == 0, or when all four bytes have their sign bit set.
@ Must run before the stack pointer is modified, since it reads [sp] and may return.
403 .macro h264_loop_filter_start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
404 ldr ip, [sp] @ ip = first stack argument (a pointer)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
405 tst r2, r2 @ Z = (alpha == 0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
406 ldr ip, [ip] @ ip = the four bytes it points to
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
407 tstne r3, r3 @ if alpha != 0: Z = (beta == 0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
408 vmov.32 d24[0], ip @ keep the four bytes for the filter macro
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
409 and ip, ip, ip, lsl #16 @ no S suffix: flags from the tst pair stay live
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
410 bxeq lr @ alpha == 0 or beta == 0: nothing to do
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
411 ands ip, ip, ip, lsl #8 @ bit 31 = AND of the sign bits of all four bytes
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
412 bxmi lr @ all four bytes negative: nothing to do. MI tests N only; LT would also read V, which ands/tst leave unchanged
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
413 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
414
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
@ align_push_regs: save d8-d15 (callee-saved under the AAPCS) on a 16-byte
@ aligned stack area. Leaves ip = (sp & 15) + 32, which align_pop_regs needs
@ unchanged in order to restore the original sp. Clobbers ip.
415 .macro align_push_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
416 and ip, sp, #15 @ ip = misalignment of sp within 16 bytes
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
417 add ip, ip, #32 @ plus room for d12-d15
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
418 sub sp, sp, ip @ sp is now 16-byte aligned
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
419 vst1.64 {d12-d15}, [sp,:128]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
420 sub sp, sp, #32 @ room for d8-d11, still aligned
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
421 vst1.64 {d8-d11}, [sp,:128]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
422 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
423
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
@ align_pop_regs: reload d8-d15 and restore sp; counterpart of align_push_regs.
@ Requires ip to still hold the value computed by align_push_regs.
424 .macro align_pop_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
425 vld1.64 {d8-d11}, [sp,:128]! @ writeback advances sp by the 32 bytes loaded
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
426 vld1.64 {d12-d15}, [sp,:128], ip @ post-increment by ip returns sp to its value before the push
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
427 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
428
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
@ h264_loop_filter_luma: filter 16 byte lanes across one luma edge.
@ In (names follow the operand comments below): q10 = p2, q9 = p1, q8 = p0,
@     q0 = q0, q1 = q1, q2 = q2; r2 = alpha, r3 = beta;
@     d24[0] = four bytes placed there by h264_loop_filter_start, called tc0
@     in the comments here (presumably the tc0 table - TODO confirm).
@ Out: q4 = filtered p1, q8 = filtered p0, q0 = filtered q0, q5 = filtered q1.
@ Scratch: q2, q6, q7, q10, q11, q12, q14, q15. Because q4-q7 (d8-d15) are
@ written, both callers in this file run align_push_regs first.
429 .macro h264_loop_filter_luma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
430 vdup.8 q11, r2 @ alpha
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
431 vmovl.u8 q12, d24
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
432 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
433 vmovl.u16 q12, d24
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
434 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
435 vsli.16 q12, q12, #8
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
436 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
437 vsli.32 q12, q12, #16 @ each tc0 byte now fills all four bytes of one 32-bit lane of q12
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
438 vclt.u8 q6, q6, q11 @ < alpha
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
439 vdup.8 q11, r3 @ beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
440 vclt.s8 q7, q12, #0 @ q7 = lanes whose tc0 is negative
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
441 vclt.u8 q14, q14, q11 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
442 vclt.u8 q15, q15, q11 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
443 vbic q6, q6, q7 @ exclude negative-tc0 lanes from the filter mask
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
444 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
445 vand q6, q6, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
446 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
447 vclt.u8 q4, q4, q11 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
448 vand q6, q6, q15 @ q6 = complete per-lane filter mask
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
449 vclt.u8 q5, q5, q11 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
450 vand q4, q4, q6 @ q4 = mask of lanes where p1 is modified
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
451 vand q5, q5, q6 @ q5 = mask of lanes where q1 is modified
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
452 vand q12, q12, q6 @ tc0 forced to 0 in unfiltered lanes
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
453 vrhadd.u8 q14, q8, q0 @ (p0 + q0 + 1) >> 1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
454 vsub.i8 q6, q12, q4 @ masks are all-ones (-1), so subtracting adds 1 where set
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
455 vqadd.u8 q7, q9, q12 @ p1 + tc0, saturated
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
456 vhadd.u8 q10, q10, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
457 vsub.i8 q6, q6, q5 @ q6 = tc = tc0 + one per true p1/q1 mask
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
458 vhadd.u8 q14, q2, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
459 vmin.u8 q7, q7, q10
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
460 vqsub.u8 q11, q9, q12 @ p1 - tc0, saturated
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
461 vqadd.u8 q2, q1, q12 @ q1 + tc0, saturated
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
462 vmax.u8 q7, q7, q11 @ q7 = candidate p1, clamped to p1 +/- tc0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
463 vqsub.u8 q11, q1, q12 @ q1 - tc0, saturated
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
464 vmin.u8 q14, q2, q14
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
465 vmovl.u8 q2, d0 @ start of delta: widen q0 (low half) to 16 bits
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
466 vmax.u8 q14, q14, q11 @ q14 = candidate q1, clamped to q1 +/- tc0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
467 vmovl.u8 q10, d1 @ same for the high half
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
468 vsubw.u8 q2, q2, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
469 vsubw.u8 q10, q10, d17
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
470 vshl.i16 q2, q2, #2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
471 vshl.i16 q10, q10, #2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
472 vaddw.u8 q2, q2, d18
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
473 vaddw.u8 q10, q10, d19
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
474 vsubw.u8 q2, q2, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
475 vsubw.u8 q10, q10, d3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
476 vrshrn.i16 d4, q2, #3 @ delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3, narrowed to 8 bits
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
477 vrshrn.i16 d5, q10, #3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
478 vbsl q4, q7, q9 @ q4 = candidate p1 where its mask is set, else original p1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
479 vbsl q5, q14, q1 @ q5 = candidate q1 where its mask is set, else original q1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
480 vneg.s8 q7, q6 @ q7 = -tc
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
481 vmovl.u8 q14, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
482 vmin.s8 q2, q2, q6
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
483 vmovl.u8 q6, d17
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
484 vmax.s8 q2, q2, q7 @ delta clamped to [-tc, tc]
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
485 vmovl.u8 q11, d0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
486 vmovl.u8 q12, d1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
487 vaddw.s8 q14, q14, d4 @ p0 + delta (16-bit)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
488 vaddw.s8 q6, q6, d5
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
489 vsubw.s8 q11, q11, d4 @ q0 - delta (16-bit)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
490 vsubw.s8 q12, q12, d5
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
491 vqmovun.s16 d16, q14 @ saturate to unsigned bytes: q8 = filtered p0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
492 vqmovun.s16 d17, q6
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
493 vqmovun.s16 d0, q11 @ q0 = filtered q0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
494 vqmovun.s16 d1, q12
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
495 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
496
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
@ ff_h264_v_loop_filter_luma_neon: luma loop filter across an edge between rows.
@ In: r0 = pointer to the first row below the edge (16-byte aligned, per the
@     :128 qualifiers), r1 = line stride, r2 = alpha, r3 = beta,
@     [sp] = pointer consumed by h264_loop_filter_start.
@ Reads rows -3..+2 relative to r0, 16 bytes each, and rewrites rows -2..+1.
497 function ff_h264_v_loop_filter_luma_neon, export=1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
498 h264_loop_filter_start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
499
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
500 vld1.64 {d0, d1}, [r0,:128], r1 @ row 0 -> q0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
501 vld1.64 {d2, d3}, [r0,:128], r1 @ row 1 -> q1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
502 vld1.64 {d4, d5}, [r0,:128], r1 @ row 2 -> q2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
503 sub r0, r0, r1, lsl #2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
504 sub r0, r0, r1, lsl #1 @ back 6 rows in total: r0 -> row -3
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
505 vld1.64 {d20,d21}, [r0,:128], r1 @ row -3 -> q10 (p2)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
506 vld1.64 {d18,d19}, [r0,:128], r1 @ row -2 -> q9 (p1)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
507 vld1.64 {d16,d17}, [r0,:128], r1 @ row -1 -> q8 (p0); r0 is back at row 0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
508
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
509 align_push_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
510
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
511 h264_loop_filter_luma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
512
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
513 sub r0, r0, r1, lsl #1 @ r0 -> row -2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
514 vst1.64 {d8, d9}, [r0,:128], r1 @ row -2 <- q4
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
515 vst1.64 {d16,d17}, [r0,:128], r1 @ row -1 <- q8
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
516 vst1.64 {d0, d1}, [r0,:128], r1 @ row 0 <- q0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
517 vst1.64 {d10,d11}, [r0,:128] @ row 1 <- q5
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
518
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
519 align_pop_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
520 bx lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
521 .endfunc
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
522
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
@ ff_h264_h_loop_filter_luma_neon: luma loop filter across an edge between columns.
@ In: r0 = pointer to the first byte right of the edge in the top row,
@     r1 = line stride, r2 = alpha, r3 = beta,
@     [sp] = pointer consumed by h264_loop_filter_start.
@ Loads 8 bytes (columns -4..+3) from each of 16 rows, transposes them so the
@ filter macro sees one register per column, then writes columns -2..+1 back.
@ NOTE(review): transpose_8x8 and transpose_4x4 are defined outside this
@ excerpt; transpose_4x4 is assumed to leave ip untouched, because
@ align_pop_regs needs the value set by align_push_regs - confirm.
523 function ff_h264_h_loop_filter_luma_neon, export=1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
524 h264_loop_filter_start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
525
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
526 sub r0, r0, #4 @ start 4 bytes left of the edge
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
527 vld1.64 {d6}, [r0], r1 @ rows 0-7 go to the low halves of q3, q10, q9, q8, q0, q1, q2, q13
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
528 vld1.64 {d20}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
529 vld1.64 {d18}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
530 vld1.64 {d16}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
531 vld1.64 {d0}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
532 vld1.64 {d2}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
533 vld1.64 {d4}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
534 vld1.64 {d26}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
535 vld1.64 {d7}, [r0], r1 @ rows 8-15 go to the high halves of the same registers
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
536 vld1.64 {d21}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
537 vld1.64 {d19}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
538 vld1.64 {d17}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
539 vld1.64 {d1}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
540 vld1.64 {d3}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
541 vld1.64 {d5}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
542 vld1.64 {d27}, [r0], r1 @ r0 now points 16 rows below the start
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
543
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
544 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
8337
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
545
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
546 align_push_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
547
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
548 h264_loop_filter_luma
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
549
9864
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
550 transpose_4x4 q4, q8, q0, q5
8337
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
551
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
552 sub r0, r0, r1, lsl #4 @ back 16 rows to the top row
9864
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
553 add r0, r0, #2 @ r0 is at column -4 after the earlier sub; move to column -2
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
554 vst1.32 {d8[0]}, [r0], r1 @ 4 bytes per row, 16 rows in total
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
555 vst1.32 {d16[0]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
556 vst1.32 {d0[0]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
557 vst1.32 {d10[0]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
558 vst1.32 {d8[1]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
559 vst1.32 {d16[1]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
560 vst1.32 {d0[1]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
561 vst1.32 {d10[1]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
562 vst1.32 {d9[0]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
563 vst1.32 {d17[0]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
564 vst1.32 {d1[0]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
565 vst1.32 {d11[0]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
566 vst1.32 {d9[1]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
567 vst1.32 {d17[1]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
568 vst1.32 {d1[1]}, [r0], r1
f5ffd813dc7f ARM: slightly faster NEON H264 horizontal loop filter
mru
parents: 9072
diff changeset
569 vst1.32 {d11[1]}, [r0], r1
8337
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
570
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
571 align_pop_regs
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
572 bx lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
573 .endfunc
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
574
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
575 .macro h264_loop_filter_chroma @ chroma edge filter on 8 pixels: in d18=p1 d16=p0 d0=q0 d2=q1, d24=clip bytes, r2=alpha, r3=beta; out d16=p0', d0=q0'; clobbers q2, q11-q15
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
576 vdup.8 d22, r2 @ alpha
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
577 vmovl.u8 q12, d24 @ widen clip bytes: d24 = low four values as 16-bit lanes
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
578 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
579 vmovl.u8 q2, d0 @ q2 = q0 (16-bit)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
580 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
581 vsubw.u8 q2, q2, d16 @ q2 = q0 - p0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
582 vsli.16 d24, d24, #8 @ copy each clip value into both bytes of its lane (one value per 2 pixels)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
583 vshl.i16 q2, q2, #2 @ q2 = 4*(q0 - p0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
584 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
585 vaddw.u8 q2, q2, d18 @ q2 += p1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
586 vclt.u8 d26, d26, d22 @ < alpha
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
587 vsubw.u8 q2, q2, d2 @ q2 -= q1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
588 vdup.8 d22, r3 @ beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
589 vclt.s8 d25, d24, #0 @ d25 = lanes whose clip value is negative
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
590 vrshrn.i16 d4, q2, #3 @ delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3, narrowed to 8 bits
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
591 vclt.u8 d28, d28, d22 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
592 vbic d26, d26, d25 @ drop lanes with a negative clip value from the mask
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
593 vclt.u8 d30, d30, d22 @ < beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
594 vand d26, d26, d28
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
595 vneg.s8 d25, d24 @ d25 = -clip
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
596 vand d26, d26, d30 @ d26 = final per-pixel filter mask
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
597 vmin.s8 d4, d4, d24 @ delta = min(delta, clip)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
598 vmovl.u8 q14, d16 @ q14 = p0 (16-bit)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
599 vmax.s8 d4, d4, d25 @ delta = max(delta, -clip)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
600 vand d4, d4, d26 @ mask last: masking before vmax would turn 0 into -clip where clip < 0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
601 vmovl.u8 q11, d0 @ q11 = q0 (16-bit)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
602 vaddw.s8 q14, q14, d4 @ p0 + delta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
603 vsubw.s8 q11, q11, d4 @ q0 - delta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
604 vqmovun.s16 d16, q14 @ p0' saturated to u8
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
605 vqmovun.s16 d0, q11 @ q0' saturated to u8
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
606 .endm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
607
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
608 function ff_h264_v_loop_filter_chroma_neon, export=1 @ filter 8 pixels across a horizontal edge: r0 = row q0, r1 = stride, r2 = alpha, r3 = beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
609 h264_loop_filter_start @ defined outside this view; presumably sets up d24 and may return early - confirm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
610
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
611 sub r0, r0, r1, lsl #1 @ r0 -> row p1, two rows above the edge
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
612 vld1.64 {d18}, [r0,:64], r1 @ p1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
613 vld1.64 {d16}, [r0,:64], r1 @ p0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
614 vld1.64 {d0}, [r0,:64], r1 @ q0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
615 vld1.64 {d2}, [r0,:64] @ q1 (no post-increment: r0 stays on row q1)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
616
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
617 h264_loop_filter_chroma @ updates d16 (p0) and d0 (q0) in place
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
618
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
619 sub r0, r0, r1, lsl #1 @ from row q1 back to row p0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
620 vst1.64 {d16}, [r0,:64], r1 @ store filtered p0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
621 vst1.64 {d0}, [r0,:64], r1 @ store filtered q0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
622
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
623 bx lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
624 .endfunc
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
625
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
626 function ff_h264_h_loop_filter_chroma_neon, export=1 @ filter 8 rows across a vertical edge: r0 = pixel q0 of the first row, r1 = stride, r2 = alpha, r3 = beta
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
627 h264_loop_filter_start @ defined outside this view; presumably sets up d24 and may return early - confirm
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
628
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
629 sub r0, r0, #2 @ r0 -> p1, two pixels left of the edge
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
630 vld1.32 {d18[0]}, [r0], r1 @ rows 0-3: four pixels (p1 p0 q0 q1) of each row into lane 0
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
631 vld1.32 {d16[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
632 vld1.32 {d0[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
633 vld1.32 {d2[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
634 vld1.32 {d18[1]}, [r0], r1 @ rows 4-7 into lane 1 of the same registers
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
635 vld1.32 {d16[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
636 vld1.32 {d0[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
637 vld1.32 {d2[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
638
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
639 vtrn.16 d18, d0 @ transpose so that each register holds one pixel column
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
640 vtrn.16 d16, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
641 vtrn.8 d18, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
642 vtrn.8 d0, d2 @ now d18=p1, d16=p0, d0=q0, d2=q1, one byte per row
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
643
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
644 h264_loop_filter_chroma @ updates d16 (p0) and d0 (q0) in place
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
645
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
646 vtrn.16 d18, d0 @ the same four transposes restore the per-row layout
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
647 vtrn.16 d16, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
648 vtrn.8 d18, d16
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
649 vtrn.8 d0, d2
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
650
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
651 sub r0, r0, r1, lsl #3 @ rewind the 8 rows advanced by the loads
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
652 vst1.32 {d18[0]}, [r0], r1 @ write all four pixels of each row back (the macro only reads p1/q1)
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
653 vst1.32 {d16[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
654 vst1.32 {d0[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
655 vst1.32 {d2[0]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
656 vst1.32 {d18[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
657 vst1.32 {d16[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
658 vst1.32 {d0[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
659 vst1.32 {d2[1]}, [r0], r1
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
660
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
661 bx lr
d43b7f4c5c1c ARM: NEON optimised H.264 loop filter
mru
parents: 8336
diff changeset
662 .endfunc
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
663
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
664 /* H.264 qpel MC */
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
665
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
666 .macro lowpass_const r @ load the lowpass multipliers into d6 via core register \r (clobbers \r)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
667 movw \r, #5 @ low halfword = 5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
668 movt \r, #20 @ high halfword = 20
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
669 vmov.32 d6[0], \r @ as 16-bit lanes: d6[0] = 5, d6[1] = 20
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
670 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
671
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
672 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 @ 6-tap lowpass of two rows: \r0:\r1 and \r2:\r3 are 16 source bytes each; d6[0]/d6[1] must hold the multipliers (lowpass_const loads 5 and 20); clobbers q1, q2, q9, q10, q15 (+ q0, q8 when narrow)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
673 .if \narrow @ narrow: accumulate in q0/q8, then narrow to u8 in \d0/\d1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
674 t0 .req q0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
675 t1 .req q8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
676 .else @ otherwise the 16-bit sums are left in \d0/\d1 themselves
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
677 t0 .req \d0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
678 t1 .req \d1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
679 .endif
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
680 vext.8 d2, \r0, \r1, #2 @ first row: x[2..9]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
681 vext.8 d3, \r0, \r1, #3 @ x[3..10]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
682 vaddl.u8 q1, d2, d3 @ q1 = x2 + x3 (16-bit)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
683 vext.8 d4, \r0, \r1, #1 @ x[1..8]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
684 vext.8 d5, \r0, \r1, #4 @ x[4..11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
685 vaddl.u8 q2, d4, d5 @ q2 = x1 + x4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
686 vext.8 d30, \r0, \r1, #5 @ x[5..12]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
687 vaddl.u8 t0, \r0, d30 @ t0 = x0 + x5; t0 may alias \r0:\r1 (callers pass d0, d1), so every other first-row read is above
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
688 vext.8 d18, \r2, \r3, #2 @ second row starts here, interleaved with the first row's multiply-accumulates
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
689 vmla.i16 t0, q1, d6[1] @ t0 += d6[1] * (x2 + x3)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
690 vext.8 d19, \r2, \r3, #3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
691 vaddl.u8 q9, d18, d19 @ q9 = x2 + x3 (second row)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
692 vext.8 d20, \r2, \r3, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
693 vmls.i16 t0, q2, d6[0] @ t0 -= d6[0] * (x1 + x4)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
694 vext.8 d21, \r2, \r3, #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
695 vaddl.u8 q10, d20, d21 @ q10 = x1 + x4 (second row)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
696 vext.8 d31, \r2, \r3, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
697 vaddl.u8 t1, \r2, d31 @ t1 = x0 + x5 (second row); all \r2/\r3 reads via vext are above
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
698 vmla.i16 t1, q9, d6[1]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
699 vmls.i16 t1, q10, d6[0]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
700 .if \narrow
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
701 vqrshrun.s16 \d0, t0, #5 @ rounding shift right by 5, saturated to u8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
702 vqrshrun.s16 \d1, t1, #5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
703 .endif
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
704 .unreq t0 @ release the aliases so the next expansion can redefine them
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
705 .unreq t1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
706 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
707
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
708 .macro lowpass_8_1 r0, r1, d0, narrow=1 @ single-row 6-tap lowpass: \r0:\r1 = 16 source bytes; d6[0]/d6[1] must hold the multipliers (lowpass_const loads 5 and 20); clobbers q1, q2, d30 (+ q0 when narrow)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
709 .if \narrow @ narrow: accumulate in q0, then narrow to u8 in \d0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
710 t0 .req q0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
711 .else @ otherwise the 16-bit sums are left in \d0 itself
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
712 t0 .req \d0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
713 .endif
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
714 vext.8 d2, \r0, \r1, #2 @ x[2..9]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
715 vext.8 d3, \r0, \r1, #3 @ x[3..10]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
716 vaddl.u8 q1, d2, d3 @ q1 = x2 + x3 (16-bit)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
717 vext.8 d4, \r0, \r1, #1 @ x[1..8]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
718 vext.8 d5, \r0, \r1, #4 @ x[4..11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
719 vaddl.u8 q2, d4, d5 @ q2 = x1 + x4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
720 vext.8 d30, \r0, \r1, #5 @ x[5..12]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
721 vaddl.u8 t0, \r0, d30 @ t0 = x0 + x5
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
722 vmla.i16 t0, q1, d6[1] @ t0 += d6[1] * (x2 + x3)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
723 vmls.i16 t0, q2, d6[0] @ t0 -= d6[0] * (x1 + x4)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
724 .if \narrow
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
725 vqrshrun.s16 \d0, t0, #5 @ rounding shift right by 5, saturated to u8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
726 .endif
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
727 .unreq t0 @ release the alias so the next expansion can redefine it
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
728 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
729
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
730 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d @ 6-tap lowpass over 16-bit elements of q regs \r0:\r1, 8 results narrowed to u8 in \d; clobbers q0-q3 (so d6 too), q8-q10, q15 and \r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
731 vext.16 q1, \r0, \r1, #2 @ x[2..9]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
732 vext.16 q0, \r0, \r1, #3 @ x[3..10]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
733 vaddl.s16 q9, d2, d0 @ x2 + x3 for results 0-3 (32-bit)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
734 vext.16 q2, \r0, \r1, #1 @ x[1..8]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
735 vaddl.s16 q1, d3, d1 @ x2 + x3 for results 4-7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
736 vext.16 q3, \r0, \r1, #4 @ x[4..11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
737 vaddl.s16 q10, d4, d6 @ x1 + x4 for results 0-3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
738 vext.16 \r1, \r0, \r1, #5 @ \r1 is overwritten with x[5..12]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
739 vaddl.s16 q2, d5, d7 @ x1 + x4 for results 4-7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
740 vaddl.s16 q0, \h0, \h1 @ NOTE(review): presumably x0 + x5 for results 4-7, with \h0/\h1 the high halves of \r0/\r1 - confirm at call sites
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
741 vaddl.s16 q8, \l0, \l1 @ likewise from \l0/\l1, used for results 0-3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
742
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
743 vshl.i32 q3, q9, #4 @ 16*(x2 + x3)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
744 vshl.i32 q9, q9, #2 @ 4*(x2 + x3)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
745 vshl.i32 q15, q10, #2 @ 4*(x1 + x4)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
746 vadd.i32 q9, q9, q3 @ q9 = 20*(x2 + x3)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
747 vadd.i32 q10, q10, q15 @ q10 = 5*(x1 + x4)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
748
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
749 vshl.i32 q3, q1, #4 @ same scaling for results 4-7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
750 vshl.i32 q1, q1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
751 vshl.i32 q15, q2, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
752 vadd.i32 q1, q1, q3 @ q1 = 20*(x2 + x3)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
753 vadd.i32 q2, q2, q15 @ q2 = 5*(x1 + x4)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
754
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
755 vadd.i32 q9, q9, q8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
756 vsub.i32 q9, q9, q10 @ results 0-3: q8 + 20*(x2 + x3) - 5*(x1 + x4)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
757
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
758 vadd.i32 q1, q1, q0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
759 vsub.i32 q1, q1, q2 @ results 4-7: q0 + 20*(x2 + x3) - 5*(x1 + x4)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
760
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
761 vrshrn.s32 d18, q9, #10 @ rounding shift right by 10, narrowed to 16-bit
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
762 vrshrn.s32 d19, q1, #10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
763
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
764 vqmovun.s16 \d, q9 @ saturate the 8 results to u8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
765 .endm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
766
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
767 function put_h264_qpel16_h_lowpass_neon_packed @ 16-wide horizontal lowpass as two 8-wide passes with output rows 8 bytes apart; r0 = dst, r1 = src, r2 = src stride; clobbers r3, r4, ip
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
768 mov r4, lr @ keep the return address in r4 (r4 itself is not saved here)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
769 mov ip, #16 @ row count for the callee's loop
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
770 mov r3, #8 @ destination stride = 8 bytes
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
771 bl put_h264_qpel8_h_lowpass_neon @ left 8 columns, 16 rows
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
772 sub r1, r1, r2, lsl #4 @ rewind the source by 16 rows
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
773 add r1, r1, #8 @ move to the right 8 columns
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
774 mov ip, #16 @ the first pass counted ip down to 0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
775 mov lr, r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
776 b put_h264_qpel8_h_lowpass_neon @ tail call for the right half; r0 is not rewound, so this output follows the left half's
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
777 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
778
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
779 .macro h264_qpel_h_lowpass type @ emits \type_h264_qpel16_h_lowpass_neon and \type_h264_qpel8_h_lowpass_neon; r0 = dst, r1 = src, r2 = src stride, r3 = dst stride
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
780 function \type\()_h264_qpel16_h_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
781 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
782 mov ip, #16 @ 16 rows
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
783 bl \type\()_h264_qpel8_h_lowpass_neon @ left 8 columns
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
784 sub r0, r0, r3, lsl #4 @ rewind dst by 16 rows
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
785 sub r1, r1, r2, lsl #4 @ rewind src by 16 rows
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
786 add r0, r0, #8 @ right 8 columns of dst
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
787 add r1, r1, #8 @ right 8 columns of src
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
788 mov ip, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
789 pop {lr} @ no return here: execution continues into the 8-wide function emitted next
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
790 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
791
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
792 function \type\()_h264_qpel8_h_lowpass_neon @ ip = number of rows (two are processed per iteration)
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
793 1: vld1.64 {d0, d1}, [r1], r2 @ 16 source bytes of one row
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
794 vld1.64 {d16,d17}, [r1], r2 @ and of the next row
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
795 subs ip, ip, #2 @ sets the flags for the bne below; nothing in between writes them
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
796 lowpass_8 d0, d1, d16, d17, d0, d16 @ filtered rows, narrowed to u8, in d0 and d16
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
797 .ifc \type,avg @ avg variant: rounding average with what is already in dst
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
798 vld1.8 {d2}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
799 vrhadd.u8 d0, d0, d2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
800 vld1.8 {d3}, [r0,:64]
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
801 vrhadd.u8 d16, d16, d3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
802 sub r0, r0, r3 @ back to the first of the two dst rows
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
803 .endif
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
804 vst1.64 {d0}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
805 vst1.64 {d16}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
806 bne 1b @ loop until ip reaches 0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
807 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
808 .endfunc
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
809 .endm
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
810
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
811 h264_qpel_h_lowpass put
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
812 h264_qpel_h_lowpass avg
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
813
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
814 .macro h264_qpel_h_lowpass_l2 type @ emits the 16- and 8-wide h_lowpass_l2 functions: lowpass of [r1] averaged with a second source [r3]; r0 = dst; r2 is the stride used for all three
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
815 function \type\()_h264_qpel16_h_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
816 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
817 mov ip, #16 @ 16 rows
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
818 bl \type\()_h264_qpel8_h_lowpass_l2_neon @ left 8 columns
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
819 sub r0, r0, r2, lsl #4 @ rewind dst, src and second source by 16 rows
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
820 sub r1, r1, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
821 sub r3, r3, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
822 add r0, r0, #8 @ then move all three to the right 8 columns
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
823 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
824 add r3, r3, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
825 mov ip, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
826 pop {lr} @ no return here: execution continues into the 8-wide function emitted next
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
827 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
828
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
829 function \type\()_h264_qpel8_h_lowpass_l2_neon @ ip = number of rows (two are processed per iteration)
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
830 1: vld1.64 {d0, d1}, [r1], r2 @ 16 source bytes of one row
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
831 vld1.64 {d16,d17}, [r1], r2 @ and of the next row
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
832 vld1.64 {d28}, [r3], r2 @ matching two rows of the second source -> q14
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
833 vld1.64 {d29}, [r3], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
834 subs ip, ip, #2 @ sets the flags for the bne below; nothing in between writes them
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
835 lowpass_8 d0, d1, d16, d17, d0, d1 @ filtered rows, narrowed to u8, in d0 and d1 (= q0)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
836 vrhadd.u8 q0, q0, q14 @ rounding average with the second source
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
837 .ifc \type,avg @ avg variant: additionally average with what is already in dst
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
838 vld1.8 {d2}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
839 vrhadd.u8 d0, d0, d2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
840 vld1.8 {d3}, [r0,:64]
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
841 vrhadd.u8 d1, d1, d3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
842 sub r0, r0, r2 @ back to the first of the two dst rows
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
843 .endif
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
844 vst1.64 {d0}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
845 vst1.64 {d1}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
846 bne 1b @ loop until ip reaches 0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
847 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
848 .endfunc
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
849 .endm
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
850
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
851 h264_qpel_h_lowpass_l2 put
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
852 h264_qpel_h_lowpass_l2 avg
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
853
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
854 function put_h264_qpel16_v_lowpass_neon_packed @ four calls to put_h264_qpel8_v_lowpass_neon (not visible in this view) with r2 = 8; clobbers r2 and r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
855 mov r4, lr @ keep the return address in r4 (r4 itself is not saved here)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
856 mov r2, #8 @ NOTE(review): presumably the destination stride, like r3 = 8 in the h_lowpass packed variant - confirm against the callee
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
857 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
858 sub r1, r1, r3, lsl #2 @ step r1 back by 4*r3 before the next block
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
859 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
860 sub r1, r1, r3, lsl #4 @ r1 -= 16*r3 ...
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
861 sub r1, r1, r3, lsl #2 @ ... and a further 4*r3 (20*r3 in total)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
862 add r1, r1, #8 @ r1 += 8 bytes; presumably the right 8 columns - confirm
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
863 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
864 sub r1, r1, r3, lsl #2 @ step r1 back by 4*r3 again
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
865 mov lr, r4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
866 b put_h264_qpel8_v_lowpass_neon @ tail call: the fourth block returns straight to our caller
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
867 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
868
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * Vertical 16x16 and 8x8 lowpass, expanded once per variant.  The macro
 * argument is put (store the filtered block) or avg (rounding-average it
 * with the bytes already at the destination, then store).
 *
 * Register use, as seen in the code:
 *   r0  destination pointer, stepped by r2 per row (":64" alignment hint)
 *   r1  source pointer, stepped by r3 per row
 *   r2  destination stride
 *   r3  source stride
 *   r4  16x16 entry only: keeps the return address across the bl calls
 * NEON d8-d15 and d22-d29 are overwritten; the lowpass_8 and transpose_8x8
 * macros are defined elsewhere and may touch more.
 */
.macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon      /* top-left 8x8 */
        /* The 8x8 code leaves r1 12 rows below where it started and r0 8
         * rows below.  Backing r1 up by 4 rows lines it up with r0. */
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon      /* bottom-left 8x8 */
        /* Go to the top of the right-hand half.  r1 is 20 rows down and r0
         * is 16 rows down at this point. */
        add             r1,  r1,  #8
        sub             r1,  r1,  r3, lsl #2
        sub             r1,  r1,  r3, lsl #4
        add             r0,  r0,  #8
        sub             r0,  r0,  r2, lsl #4
        bl              \type\()_h264_qpel8_v_lowpass_neon      /* top-right 8x8 */
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        /* No return instruction here: execution runs on into the 8x8
         * routine that follows, which does the bottom-right 8x8 and whose
         * bx lr goes back to our caller.  This relies on that routine
         * being emitted directly after this one. */
        .endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        /* Fetch 13 rows of 8 bytes.  Rows 0-7 fill the low halves of q4-q7
         * and q11-q14, rows 8-12 the high halves of q4-q7 and q11.  The
         * last load has no writeback, so r1 stays on row 12. */
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        /* Presumably: transpose so that every column sits along a
         * register, filter along it, then transpose back.  After the
         * second transpose the result rows are d8, d10, d12, d14, d22,
         * d24, d26, d28, which is the order they are stored in below. */
        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

.ifc \type,avg
        /* avg only: read the 8 destination rows into the spare odd
         * registers and take the rounding average with the result. */
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3   /* back to the first row */
.endif

        /* Write the 8 rows.  r0 ends up 8 rows below where it started. */
        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
942
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * Vertical 16x16 and 8x8 lowpass whose result is rounding-averaged with a
 * second 8-byte-wide source before being written, expanded once per
 * variant.  The macro argument is put or avg.  The avg variant averages
 * once more with the bytes already at the destination.
 *
 * Register use, as seen in the code:
 *   r0  destination pointer, stepped by r3 per row (":64" alignment hint)
 *   r1  source pointer for the filter, stepped by r3 per row
 *   ip  pointer to the second source, stepped by r2 per row
 *   r2  stride of the second source
 *   r3  stride shared by r1 and r0
 *   r4  16x16 entry only: keeps the return address across the bl calls
 * NEON d0-d5, d8-d15 and d22-d29 are overwritten, plus d16-d17 in the avg
 * variant; the lowpass_8 and transpose_8x8 macros are defined elsewhere
 * and may touch more.
 */
.macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon   /* top-left 8x8 */
        /* The 8x8 code leaves r1 12 rows down, r0 and ip 8 rows down.
         * Backing r1 up by 4 rows lines it up with the other two. */
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon   /* bottom-left 8x8 */
        /* Go to the top of the right-hand half.  r0 and ip are 16 rows
         * down and r1 is 20 rows down at this point. */
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        sub             ip,  ip,  r2, lsl #4
        add             ip,  ip,  #8
        add             r1,  r1,  #8
        sub             r1,  r1,  r3, lsl #2
        sub             r1,  r1,  r3, lsl #4
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon   /* top-right 8x8 */
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        /* No return instruction here: execution runs on into the 8x8
         * routine that follows, which does the bottom-right 8x8 and whose
         * bx lr goes back to our caller.  This relies on that routine
         * being emitted directly after this one. */
        .endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        /* Fetch 13 rows of 8 bytes.  Rows 0-7 fill the low halves of q4-q7
         * and q11-q14, rows 8-12 the high halves of q4-q7 and q11.  The
         * last load has no writeback, so r1 stays on row 12. */
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        /* Presumably: transpose, filter along each column, transpose
         * back.  Here the results are packed into register pairs, so the
         * rows come out as q4, q6, q11, q13 (two rows per q register). */
        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        /* Read 8 rows of the second source and average them with the
         * filtered rows, two rows per vrhadd.  d10 and d11 no longer hold
         * anything needed, so they take rows 6 and 7. */
        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2
        vrhadd.u8       q5,  q5,  q13

.ifc \type,avg
        /* avg only: average once more with the 8 destination rows, using
         * d16 and d17 alternately as the load target. */
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3   /* back to the first row */
.endif

        /* Write the 8 rows.  r0 ends up 8 rows below where it started. */
        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1031
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/*
 * put_h264_qpel8_hv_lowpass_neon_top
 *
 * Shared front end of the 8x8 "hv" lowpass routines: it filters in both
 * directions and hands the finished 8x8 block back in registers, leaving
 * the store (and any averaging) to the caller.
 *
 * In:   r1 = source pointer, r3 = source stride
 *       r4 = scratch buffer, 16-byte aligned (":128" hints), of which the
 *            first 192 bytes are written
 * Out:  result rows in d12, d13, d14, d15, d8, d9, d10, d11, which is the
 *       order the callers store them in
 *       r1 = entry value + 12 rows, r4 = entry value, ip = -16
 * Uses: q0-q15, ip, flags.  The helper macros used here (lowpass_const,
 *       lowpass_8, lowpass_8_1, lowpass_8.16, swap4, transpose16_4x4,
 *       transpose_8x8) are defined elsewhere in this file.
 */
function put_h264_qpel8_hv_lowpass_neon_top
        /* Whatever lowpass_const leaves in ip is not kept: ip is reloaded
         * on the next line. */
        lowpass_const   ip
        mov             ip,  #12                /* rows for the loop, two per pass */
        /* First pass: each iteration reads two 16-byte source rows, runs
         * them through lowpass_8 with narrow=0 (presumably keeping 16-bit
         * results) and appends the 32 bytes of output to the scratch
         * buffer.  The bne uses the flags set by subs, which assumes that
         * lowpass_8 leaves the flags alone. */
1:      vld1.64         {d0-d1},   [r1], r3
        vld1.64         {d16-d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        /* Row 12 is handled on its own.  Its result is presumably left in
         * q12 and is not written to the buffer. */
        vld1.64         {d0-d1},   [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        /* Walk the buffer backwards in 16-byte steps, pulling rows 11 down
         * to 0 into q15, q10, q9, q8, q7 ... q0.  The last load has no
         * writeback, which leaves r4 at the start of the buffer. */
        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30-d31}, [r4,:128], ip
        vld1.64         {d20-d21}, [r4,:128], ip
        vld1.64         {d18-d19}, [r4,:128], ip
        vld1.64         {d16-d17}, [r4,:128], ip
        vld1.64         {d14-d15}, [r4,:128], ip
        vld1.64         {d12-d13}, [r4,:128], ip
        vld1.64         {d10-d11}, [r4,:128], ip
        vld1.64         {d8-d9},   [r4,:128], ip
        vld1.64         {d6-d7},   [r4,:128], ip
        vld1.64         {d4-d5},   [r4,:128], ip
        vld1.64         {d2-d3},   [r4,:128], ip
        vld1.64         {d0-d1},   [r4,:128]

        /* Presumably a transpose of the 16-bit intermediates, done in two
         * groups of eight q registers, so that the second filter pass can
         * run along registers as well. */
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        /* Park eight of the sixteen registers in the scratch buffer to
         * free them for the filter.  The last store has no writeback, so
         * r4 stops on the final 16-byte slot. */
        vst1.64         {d30-d31}, [r4,:128]!
        vst1.64         {d6-d7},   [r4,:128]!
        vst1.64         {d20-d21}, [r4,:128]!
        vst1.64         {d4-d5},   [r4,:128]!
        vst1.64         {d18-d19}, [r4,:128]!
        vst1.64         {d2-d3},   [r4,:128]!
        vst1.64         {d16-d17}, [r4,:128]!
        vst1.64         {d0-d1},   [r4,:128]

        /* Second pass on the registers that stayed live: results go to
         * d8, d9, d10, d11. */
        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        /* Second pass on the parked registers: reload them pair by pair,
         * last slot first (ip is still -16); results go to d12, d13, d14,
         * d15.  The final load has no writeback, so r4 is back at the start
         * of the buffer. */
        vld1.64         {d16-d17}, [r4,:128], ip
        vld1.64         {d30-d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16-d17}, [r4,:128], ip
        vld1.64         {d30-d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16-d17}, [r4,:128], ip
        vld1.64         {d30-d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16-d17}, [r4,:128], ip
        vld1.64         {d30-d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        /* Presumably turns the filtered data back into rows, one per
         * d register in the order listed. */
        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
        .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1097
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1098 .macro h264_qpel8_hv_lowpass type
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1099 function \type\()_h264_qpel8_hv_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1100 mov r10, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1101 bl put_h264_qpel8_hv_lowpass_neon_top
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1102 .ifc \type,avg
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1103 vld1.8 {d0}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1104 vrhadd.u8 d12, d12, d0
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1105 vld1.8 {d1}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1106 vrhadd.u8 d13, d13, d1
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1107 vld1.8 {d2}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1108 vrhadd.u8 d14, d14, d2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1109 vld1.8 {d3}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1110 vrhadd.u8 d15, d15, d3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1111 vld1.8 {d4}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1112 vrhadd.u8 d8, d8, d4
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1113 vld1.8 {d5}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1114 vrhadd.u8 d9, d9, d5
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1115 vld1.8 {d6}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1116 vrhadd.u8 d10, d10, d6
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1117 vld1.8 {d7}, [r0,:64], r2
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1118 vrhadd.u8 d11, d11, d7
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1119 sub r0, r0, r2, lsl #3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1120 .endif
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1121 vst1.64 {d12}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1122 vst1.64 {d13}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1123 vst1.64 {d14}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1124 vst1.64 {d15}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1125 vst1.64 {d8}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1126 vst1.64 {d9}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1127 vst1.64 {d10}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1128 vst1.64 {d11}, [r0,:64], r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1129
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1130 mov lr, r10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1131 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1132 .endfunc
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1133 .endm
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1134
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/* Expand the h264_qpel8_hv_lowpass macro twice: once for "put", once for "avg". */
1135 h264_qpel8_hv_lowpass put
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1136 h264_qpel8_hv_lowpass avg
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1137
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * <type>_h264_qpel8_hv_lowpass_l2_neon   (type = put or avg)
 *
 * Calls put_h264_qpel8_hv_lowpass_neon_top (defined outside this macro),
 * averages the eight rows it leaves in q4-q7 with a second 8x8 block read
 * from r2, and writes the result to r0.
 *
 * In:   r0 = destination, r3 = destination stride
 *       r2 = second source: 64 contiguous bytes, read with a 128-bit
 *            alignment hint
 *       everything else is whatever the _top helper expects (not visible here)
 * Out:  r0 advanced by 8 * r3, r2 advanced by 64
 * Clobbers: r10 (return address is parked there), q0-q3, and q8-q11 in the
 *       avg variant, plus whatever the helper clobbers.
 */
1138 .macro h264_qpel8_hv_lowpass_l2 type
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1139 function \type\()_h264_qpel8_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* The bl below overwrites lr, so keep the return address in r10. */
1140 mov r10, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1141 bl put_h264_qpel8_hv_lowpass_neon_top
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1142
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/*
 * Load the second source into q0-q3 (16 bytes per load) and take the
 * rounding average with q6, q7, q4, q5.  That register order (d12-d15,
 * d8-d11) is the same row order the non-l2 hv_lowpass function stores.
 * Loads and averages are interleaved.
 */
1143 vld1.64 {d0, d1}, [r2,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1144 vld1.64 {d2, d3}, [r2,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1145 vrhadd.u8 q0, q0, q6
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1146 vld1.64 {d4, d5}, [r2,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1147 vrhadd.u8 q1, q1, q7
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1148 vld1.64 {d6, d7}, [r2,:128]!
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1149 vrhadd.u8 q2, q2, q4
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1150 vrhadd.u8 q3, q3, q5
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * avg only: read the eight rows already at the destination, average them
 * into d0-d7, then step r0 back by 8 * r3 to the first row.
 */
1151 .ifc \type,avg
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1152 vld1.8 {d16}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1153 vrhadd.u8 d0, d0, d16
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1154 vld1.8 {d17}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1155 vrhadd.u8 d1, d1, d17
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1156 vld1.8 {d18}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1157 vrhadd.u8 d2, d2, d18
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1158 vld1.8 {d19}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1159 vrhadd.u8 d3, d3, d19
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1160 vld1.8 {d20}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1161 vrhadd.u8 d4, d4, d20
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1162 vld1.8 {d21}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1163 vrhadd.u8 d5, d5, d21
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1164 vld1.8 {d22}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1165 vrhadd.u8 d6, d6, d22
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1166 vld1.8 {d23}, [r0,:64], r3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1167 vrhadd.u8 d7, d7, d23
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1168 sub r0, r0, r3, lsl #3
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1169 .endif
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Write the eight result rows (8 bytes each) to the destination, stride r3. */
1170 vst1.64 {d0}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1171 vst1.64 {d1}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1172 vst1.64 {d2}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1173 vst1.64 {d3}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1174 vst1.64 {d4}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1175 vst1.64 {d5}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1176 vst1.64 {d6}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1177 vst1.64 {d7}, [r0,:64], r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1178
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Return through the address saved in r10 at entry. */
1179 mov lr, r10
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1180 bx lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1181 .endfunc
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1182 .endm
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1183
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/* Expand the h264_qpel8_hv_lowpass_l2 macro for both output types (put, avg). */
1184 h264_qpel8_hv_lowpass_l2 put
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1185 h264_qpel8_hv_lowpass_l2 avg
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1186
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 16x16 wrappers around the 8x8 hv kernels (type = put or avg).
 *
 * Each function calls the matching 8x8 routine three times with bl and
 * tail-branches to it a fourth time, adjusting r0 and r1 in between.  The
 * return address is parked in r9 meanwhile, so r9 is clobbered (and r10 as
 * well, through the 8x8 routines).
 *
 * The r1 rewind amounts depend on how far the 8x8 routines advance r1; that
 * code is outside this macro - TODO confirm before changing any constant.
 */
1187 .macro h264_qpel16_hv type
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/* Plain variant: the 8x8 routine stores with stride r2, so r2 is the destination stride here. */
1188 function \type\()_h264_qpel16_hv_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1189 mov r9, lr
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1190 bl \type\()_h264_qpel8_hv_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Step the source pointer back by 4 * r3 before the second 8x8 call. */
1191 sub r1, r1, r3, lsl #2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1192 bl \type\()_h264_qpel8_hv_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r1 = r1 - 20 * r3 + 8: presumably back to the first source row, 8 pixels to the right - confirm. */
1193 sub r1, r1, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1194 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1195 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Two 8x8 calls moved r0 down 16 rows; go back up 16 * r2 and 8 bytes right. */
1196 sub r0, r0, r2, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1197 add r0, r0, #8
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1198 bl \type\()_h264_qpel8_hv_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1199 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Restore lr and tail-branch: the fourth 8x8 call returns straight to our caller. */
1200 mov lr, r9
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1201 b \type\()_h264_qpel8_hv_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1202 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1203
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * l2 variant: the 8x8 l2 routine stores with stride r3 and reads 64 bytes of
 * second source from r2 per call (post-incrementing r2).  r2 is not adjusted
 * between calls, so the four calls consume 256 consecutive bytes.
 */
1204 function \type\()_h264_qpel16_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1205 mov r9, lr
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Second source starts 256 bytes below r4; presumably the caller placed a 16x16 block there - confirm. */
1206 sub r2, r4, #256
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1207 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1208 sub r1, r1, r3, lsl #2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1209 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Same pointer rewinds as the plain variant, except the destination stride is r3. */
1210 sub r1, r1, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1211 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1212 add r1, r1, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1213 sub r0, r0, r3, lsl #4
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1214 add r0, r0, #8
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1215 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1216 sub r1, r1, r3, lsl #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1217 mov lr, r9
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1218 b \type\()_h264_qpel8_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1219 .endfunc
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1220 .endm
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1221
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/* Expand the h264_qpel16_hv macro for both output types (put, avg). */
1222 h264_qpel16_hv put
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1223 h264_qpel16_hv avg
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1224
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1225 .macro h264_qpel8 type
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc10.  Sets up arguments and tail-branches to the
 * horizontal lowpass "l2" routine, which returns directly to our caller.
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1226 function ff_\type\()_h264_qpel8_mc10_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* lowpass_const is a macro defined elsewhere; r3 is only a scratch argument to it, it is overwritten next. */
1227 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r3 = unmodified src (second input of the l2 routine, presumably averaged with the filtered result - confirm). */
1228 mov r3, r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Filter input starts 2 bytes to the left of src. */
1229 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* ip = 8; presumably the row count for the helper - confirm. */
1230 mov ip, #8
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1231 b \type\()_h264_qpel8_h_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1232 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1233
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc20.  Sets up arguments and tail-branches to the
 * plain horizontal lowpass routine (no second source).
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1234 function ff_\type\()_h264_qpel8_mc20_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* lowpass_const is a macro defined elsewhere; r3 is overwritten again below. */
1235 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Filter input starts 2 bytes to the left of src. */
1236 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r3 = r2: the helper receives the same value in both registers (presumably dst stride = src stride - confirm). */
1237 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* ip = 8; presumably the row count for the helper - confirm. */
1238 mov ip, #8
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1239 b \type\()_h264_qpel8_h_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1240 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1241
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc30.  Identical to mc10 except that the second input
 * handed to the l2 routine is src + 1 instead of src.
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1242 function ff_\type\()_h264_qpel8_mc30_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* lowpass_const is a macro defined elsewhere; r3 is overwritten next. */
1243 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r3 = src + 1 (second input), computed before r1 is moved. */
1244 add r3, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Filter input starts 2 bytes to the left of src. */
1245 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* ip = 8; presumably the row count for the helper - confirm. */
1246 mov ip, #8
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1247 b \type\()_h264_qpel8_h_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1248 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1249
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc01.  Calls the vertical lowpass "l2" routine with
 * ip = src as the second input.
 *
 * The local label <type>_h264_qpel8_mc01 is a shared entry point: mc03
 * pushes lr, sets ip = src + stride and branches here.  Anything entering
 * at the label must have pushed exactly {lr} and set ip.
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1250 function ff_\type\()_h264_qpel8_mc01_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1251 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* ip = unmodified src: second input for the l2 helper. */
1252 mov ip, r1
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1253 \type\()_h264_qpel8_mc01:
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* lowpass_const is a macro defined elsewhere; r3 is overwritten next. */
1254 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1255 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Filter input starts two rows above src. */
1256 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Preserve d8-d15 (callee-saved under the AAPCS) around the helper call. */
1257 vpush {d8-d15}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1258 bl \type\()_h264_qpel8_v_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1259 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Return by popping the saved lr into pc. */
1260 pop {pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1261 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1262
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc11.  Two passes:
 *   1. put_h264_qpel8_h_lowpass_neon writes into a 64-byte, 16-byte-aligned
 *      temporary on the stack;
 *   2. the vertical lowpass "l2" routine is called with ip pointing at that
 *      temporary and writes to the real destination.
 *
 * The local label <type>_h264_qpel8_mc11 is a shared entry point used by
 * mc31, mc13 and mc33.  Those push {r0, r1, r11, lr} themselves and arrive
 * with r1 set for pass 1; pass 2 reloads r0/r1 from the pushed copy.
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1263 function ff_\type\()_h264_qpel8_mc11_neon, export=1
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1264 push {r0, r1, r11, lr}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1265 \type\()_h264_qpel8_mc11:
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* lowpass_const is a macro defined elsewhere; r3 is overwritten below. */
1266 lowpass_const r3
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* r11 remembers the sp right after the push; sp is then aligned down to 16 bytes. */
1267 mov r11, sp
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1268 bic sp, sp, #15
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Reserve the 64-byte temporary and use it as the pass-1 destination with r3 = 8. */
1269 sub sp, sp, #64
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1270 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1271 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1272 mov r3, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1273 mov ip, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Saves d8-d15 (callee-saved under the AAPCS); this moves sp down another 64 bytes. */
1274 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1275 bl put_h264_qpel8_h_lowpass_neon
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* Reload r0 and r1 from the two words saved by the push. */
1276 ldrd r0, [r11]
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1277 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* sp + 64 skips the vpush area, so ip = start of the temporary. */
1278 add ip, sp, #64
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Pass-2 input starts two rows above the saved src. */
1279 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r2 = 8; presumably the stride of the temporary for the l2 helper - confirm. */
1280 mov r2, #8
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1281 bl \type\()_h264_qpel8_v_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1282 vpop {d8-d15}
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* Drop the frame and the saved r0/r1 (8 bytes), then restore r11 and return. */
1283 add sp, r11, #8
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1284 pop {r11, pc}
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1285 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1286
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc21.  Two passes:
 *   1. put_h264_qpel8_h_lowpass_neon writes into the bottom of a 256-byte,
 *      16-byte-aligned stack area (8*8 bytes, with 16*12 bytes above it);
 *   2. the hv lowpass "l2" routine is called with r2 = r4 - 64 and writes to
 *      the real destination.
 *
 * The local label <type>_h264_qpel8_mc21 is a shared entry point used by
 * mc23, which pushes the same register list and arrives with r1 moved one
 * row down.  r10 is in the push list because the hv routine parks lr there.
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1287 function ff_\type\()_h264_qpel8_mc21_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1288 push {r0, r1, r4, r10, r11, lr}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1289 \type\()_h264_qpel8_mc21:
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* lowpass_const is a macro defined elsewhere; r3 is overwritten below. */
1290 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r11 remembers the sp right after the push; sp is then aligned down to 16 bytes. */
1291 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1292 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* 64 + 192 = 256 bytes of scratch. */
1293 sub sp, sp, #(8*8+16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1294 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1295 mov r3, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Pass-1 destination = bottom of the scratch area. */
1296 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1297 mov ip, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Saves d8-d15 (callee-saved under the AAPCS). */
1298 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1299 bl put_h264_qpel8_h_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/*
 * r4 = r0 as left by the helper.  Presumably r0 has advanced past the
 * 64 bytes just written, so r4 is the 16*12-byte region and r4 - 64 (below)
 * is the pass-1 result - TODO confirm against the helper.
 */
1300 mov r4, r0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Reload r0 and r1 from the two words saved by the push. */
1301 ldrd r0, [r11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Pass-2 input starts two rows up and two bytes left of the saved src. */
1302 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1303 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1304 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1305 sub r2, r4, #64
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1306 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1307 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Drop the frame and the saved r0/r1 (8 bytes), then restore the rest and return. */
1308 add sp, r11, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1309 pop {r4, r10, r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1310 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1311
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc31.  Reuses the mc11 body: the value pushed for r1
 * is src + 1 (reloaded by mc11 for its second pass), while the live r1 is
 * put back to src for the first pass.
 */
1312 function ff_\type\()_h264_qpel8_mc31_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Bump r1 first so that the pushed copy is src + 1. */
1313 add r1, r1, #1
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1314 push {r0, r1, r11, lr}
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Undo the bump for the live register. */
1315 sub r1, r1, #1
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1316 b \type\()_h264_qpel8_mc11
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1317 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1318
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc02.  Calls the plain vertical lowpass routine
 * (no second source) with the input starting two rows above src.
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1319 function ff_\type\()_h264_qpel8_mc02_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1320 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* lowpass_const is a macro defined elsewhere; r3 is overwritten below. */
1321 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1322 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r3 = r2: the helper receives the same value in both registers. */
1323 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Preserve d8-d15 (callee-saved under the AAPCS) around the helper call. */
1324 vpush {d8-d15}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1325 bl \type\()_h264_qpel8_v_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1326 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1327 pop {pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1328 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1329
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc12.  Two passes:
 *   1. put_h264_qpel8_v_lowpass_neon writes into the bottom of a 256-byte,
 *      16-byte-aligned stack area (8*8 bytes, with 16*12 bytes above it);
 *   2. the hv lowpass "l2" routine is called with r2 = r4 - 64 and writes to
 *      the real destination.
 *
 * The local label <type>_h264_qpel8_mc12 is a shared entry point used by
 * mc32, which pushes the same register list and arrives with r1 = src + 1.
 * The original stride is moved to r3 before r2 is reused, so pass 2 takes it
 * from r3.  r10 is saved because the hv routine parks lr there.
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1330 function ff_\type\()_h264_qpel8_mc12_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1331 push {r0, r1, r4, r10, r11, lr}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1332 \type\()_h264_qpel8_mc12:
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* lowpass_const is a macro defined elsewhere; r3 is overwritten below. */
1333 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r11 remembers the sp right after the push; sp is then aligned down to 16 bytes. */
1334 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1335 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* 64 + 192 = 256 bytes of scratch. */
1336 sub sp, sp, #(8*8+16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Pass-1 input starts two rows above r1. */
1337 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Keep the incoming r2 in r3, then set r2 = 8 for the write into the temporary. */
1338 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1339 mov r2, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Pass-1 destination = bottom of the scratch area. */
1340 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Saves d8-d15 (callee-saved under the AAPCS). */
1341 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1342 bl put_h264_qpel8_v_lowpass_neon
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/*
 * r4 = r0 as left by the helper.  Presumably r0 has advanced past the
 * 64 bytes just written, so r4 is the 16*12-byte region and r4 - 64 (below)
 * is the pass-1 result - TODO confirm against the helper.
 */
1343 mov r4, r0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Reload r0 and r1 from the two words saved by the push. */
1344 ldrd r0, [r11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Pass-2 input starts two rows up (r3 holds the value copied from r2 above) and two bytes left. */
1345 sub r1, r1, r3, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1346 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1347 sub r2, r4, #64
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1348 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1349 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Drop the frame and the saved r0/r1 (8 bytes), then restore the rest and return. */
1350 add sp, r11, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1351 pop {r4, r10, r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1352 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1353
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc22.  Single call to the hv lowpass routine, handing
 * it a 16*12-byte, 16-byte-aligned stack buffer in r4.
 *
 * r10 is saved because the hv routine parks lr there.
 * NOTE(review): unlike the other entry points there is no lowpass_const
 * here; presumably the hv helper sets the constants up itself - confirm.
 * Register roles on entry (r0 = dst, r1 = src, r2 = stride) are assumed
 * from how they are used - confirm against the C prototype.
 */
1354 function ff_\type\()_h264_qpel8_mc22_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1355 push {r4, r10, r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r11 remembers the sp right after the push; sp is then aligned down to 16 bytes. */
1356 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1357 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Input starts two rows up and two bytes left of src. */
1358 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1359 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1360 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* 192-byte scratch buffer; r4 points at it. */
1361 sub sp, sp, #(16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1362 mov r4, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Preserve d8-d15 (callee-saved under the AAPCS) around the helper call. */
1363 vpush {d8-d15}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1364 bl \type\()_h264_qpel8_hv_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1365 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Nothing extra was pushed here, so sp goes straight back to r11. */
1366 mov sp, r11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1367 pop {r4, r10, r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1368 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1369
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc32.  Reuses the mc12 body: the pushed r1 is the
 * original src (reloaded by mc12 for its second pass), while the live r1 is
 * src + 1 for the first pass.
 */
1370 function ff_\type\()_h264_qpel8_mc32_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Same register list as mc12, so its epilogue unwinds this frame correctly. */
1371 push {r0, r1, r4, r10, r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1372 add r1, r1, #1
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1373 b \type\()_h264_qpel8_mc12
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1374 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1375
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc03.  Reuses the mc01 body with the second input
 * (ip) set to r1 + r2, i.e. one row below src if r2 is the stride.
 */
1376 function ff_\type\()_h264_qpel8_mc03_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Same push as mc01, so its "pop {pc}" returns to our caller. */
1377 push {lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1378 add ip, r1, r2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1379 b \type\()_h264_qpel8_mc01
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1380 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1381
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc13.  Reuses the mc11 body: the pushed r1 is the
 * original src (reloaded for the second pass), while the live r1 is
 * r1 + r2 for the first pass.
 */
1382 function ff_\type\()_h264_qpel8_mc13_neon, export=1
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* Same register list as mc11, so its epilogue unwinds this frame correctly. */
1383 push {r0, r1, r11, lr}
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1384 add r1, r1, r2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1385 b \type\()_h264_qpel8_mc11
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1386 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1387
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc23.  Reuses the mc21 body: the pushed r1 is the
 * original src (reloaded for the second pass), while the live r1 is
 * r1 + r2 for the first pass.
 */
1388 function ff_\type\()_h264_qpel8_mc23_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Same register list as mc21, so its epilogue unwinds this frame correctly. */
1389 push {r0, r1, r4, r10, r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1390 add r1, r1, r2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1391 b \type\()_h264_qpel8_mc21
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1392 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1393
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * 8x8 qpel, position mc33.  Reuses the mc11 body: the pushed r1 is src + 1
 * (reloaded for the second pass), while the live r1 ends up as src + r2
 * for the first pass.
 */
1394 function ff_\type\()_h264_qpel8_mc33_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Bump r1 first so that the pushed copy is src + 1. */
1395 add r1, r1, #1
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1396 push {r0, r1, r11, lr}
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Live r1 = (src + 1) + r2 - 1 = src + r2. */
1397 add r1, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1398 sub r1, r1, #1
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1399 b \type\()_h264_qpel8_mc11
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1400 .endfunc
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1401 .endm
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1402
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * Expand the h264_qpel8 macro twice, once with "put" and once with "avg"
 * as its argument, which the functions above splice into their names
 * through \type\().
 */
1403 h264_qpel8 put
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1404 h264_qpel8 avg
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1405
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1406 .macro h264_qpel16 type
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc10" entry point.
 * Runs the lowpass_const macro (defined elsewhere in the file), then
 * tail-branches to \type\()_h264_qpel16_h_lowpass_l2_neon with
 *   r3 = the unmodified r1
 *   r1 = r1 - 2
 * r3 is overwritten right after lowpass_const, so whatever the macro left
 * in it is discarded.
 * NOTE(review): presumably r1 = src and the "_l2" helper averages the
 * filtered result with the rows at r3 - confirm against the helper.
 */
1407 function ff_\type\()_h264_qpel16_mc10_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1408 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1409 mov r3, r1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1410 sub r1, r1, #2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1411 b \type\()_h264_qpel16_h_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1412 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1413
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc20" entry point.
 * Runs lowpass_const, then tail-branches to
 * \type\()_h264_qpel16_h_lowpass_neon with
 *   r1 = r1 - 2
 *   r3 = r2
 * NOTE(review): presumably r1 = src and r2 = line stride, so r3 carries a
 * second copy of the stride for the helper - confirm against the helper.
 */
1414 function ff_\type\()_h264_qpel16_mc20_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1415 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1416 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1417 mov r3, r2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1418 b \type\()_h264_qpel16_h_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1419 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1420
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc30" entry point.
 * Same shape as mc10, except that r3 is set to r1 + 1 instead of r1 before
 * r1 is moved back by 2 and control is tail-branched to
 * \type\()_h264_qpel16_h_lowpass_l2_neon.
 * NOTE(review): presumably r3 is the second source for the "_l2" helper,
 * here taken one byte to the right - confirm against the helper.
 */
1421 function ff_\type\()_h264_qpel16_mc30_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1422 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1423 add r3, r1, #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1424 sub r1, r1, #2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1425 b \type\()_h264_qpel16_h_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1426 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1427
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc01" entry point.
 * Saves r4 and lr, copies the unmodified r1 into ip, then falls into the
 * shared body at the \type\()_h264_qpel16_mc01 label. The mc03 entry point
 * jumps to that label with ip = r1 + r2 instead.
 * The shared body calls \type\()_h264_qpel16_v_lowpass_l2_neon with
 *   r1 = r1 - 2 * r2
 *   r3 = r2
 *   ip = as set by the entry point
 * NOTE(review): presumably r1 = src, r2 = line stride and ip = second source
 * averaged in by the "_l2" helper - confirm against the helper.
 */
1428 function ff_\type\()_h264_qpel16_mc01_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1429 push {r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1430 mov ip, r1
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/* Shared body; entered here by mc03 after its own push {r4, lr}. */
1431 \type\()_h264_qpel16_mc01:
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1432 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1433 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r1 -= 2 * r2 */
1434 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* d8-d15 are callee-saved under the AAPCS; preserve them around the helper. */
1435 vpush {d8-d15}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1436 bl \type\()_h264_qpel16_v_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1437 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Restore r4 and return by popping the saved lr into pc. */
1438 pop {r4, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1439 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1440
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc11" entry point and shared body.
 * mc31, mc13 and mc33 push the same register list and then jump to the
 * \type\()_h264_qpel16_mc11 label with an adjusted r1.
 *
 * Two passes:
 *   1. put_h264_qpel16_h_lowpass_neon writes into a 256-byte, 16-byte
 *      aligned scratch area on the stack (r0 = scratch, r3 = 16).
 *   2. \type\()_h264_qpel16_v_lowpass_l2_neon is called with r0/r1 reloaded
 *      from the stack, ip = scratch area and r2 = 16.
 * NOTE(review): presumably r0 = dst, r1 = src, r2 = line stride on entry,
 * and the second pass averages its result with the scratch rows - confirm
 * against the helpers, which are outside this excerpt.
 */
1441 function ff_\type\()_h264_qpel16_mc11_neon, export=1
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* r0 and r1 are stacked so they can be reloaded with ldrd after pass 1. */
1442 push {r0, r1, r4, r11, lr}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1443 \type\()_h264_qpel16_mc11:
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1444 lowpass_const r3
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* r11 remembers the unaligned sp (pointing at the saved r0/r1). */
1445 mov r11, sp
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* Round sp down to a 16-byte boundary. */
1446 bic sp, sp, #15
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Reserve the 256-byte scratch area. */
1447 sub sp, sp, #256
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1448 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1449 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1450 mov r3, #16
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Pushes 64 bytes below the scratch area. */
1451 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1452 bl put_h264_qpel16_h_lowpass_neon
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* Reload the r0/r1 values stacked by the entry point's push. */
1453 ldrd r0, [r11]
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1454 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* ip = scratch area: skip the 64 bytes of saved d8-d15. */
1455 add ip, sp, #64
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r1 -= 2 * r2 */
1456 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1457 mov r2, #16
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1458 bl \type\()_h264_qpel16_v_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1459 vpop {d8-d15}
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
/* Drop the scratch area and the two stacked words (r0, r1). */
1460 add sp, r11, #8
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1461 pop {r4, r11, pc}
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1462 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1463
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc21" entry point and shared body.
 * mc23 pushes the same register list and jumps to the
 * \type\()_h264_qpel16_mc21 label with r1 advanced by r2.
 *
 * Two passes:
 *   1. put_h264_qpel16_h_lowpass_neon_packed is called with r0 pointing at
 *      a 16-byte aligned stack area of 16*16 + 16*12 bytes and r1 = r1 - 2.
 *   2. \type\()_h264_qpel16_hv_lowpass_l2_neon is called with r0/r1
 *      reloaded from the stack, r1 moved back by 2 * r2 + 2, r3 = r2 and
 *      r4 = the value r0 held when the first helper returned.
 * NOTE(review): presumably r0 = dst, r1 = src, r2 = line stride on entry;
 * what r0 points at after the first helper is not visible here - confirm.
 */
1464 function ff_\type\()_h264_qpel16_mc21_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r0 and r1 are stacked so they can be reloaded with ldrd after pass 1. */
1465 push {r0, r1, r4-r5, r9-r11, lr}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1466 \type\()_h264_qpel16_mc21:
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1467 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r11 remembers the unaligned sp (pointing at the saved r0/r1). */
1468 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Round sp down to a 16-byte boundary. */
1469 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1470 sub sp, sp, #(16*16+16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1471 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r0 = base of the stack scratch area. */
1472 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1473 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1474 bl put_h264_qpel16_h_lowpass_neon_packed
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Keep r0 as left by the helper before r0 is reloaded below. */
1475 mov r4, r0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Reload the r0/r1 values stacked by the entry point's push. */
1476 ldrd r0, [r11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r1 -= 2 * r2, then r1 -= 2 */
1477 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1478 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1479 mov r3, r2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1480 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1481 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Drop the scratch area and the two stacked words (r0, r1). */
1482 add sp, r11, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1483 pop {r4-r5, r9-r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1484 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1485
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc31" entry point.
 * r1 is incremented before the push and decremented again after it, so the
 * stacked copy is r1 + 1 while the live r1 is unchanged on entry to the
 * shared \type\()_h264_qpel16_mc11 body. That body reloads r0/r1 from the
 * stack for its second pass, so only the second pass sees the +1 offset.
 */
1486 function ff_\type\()_h264_qpel16_mc31_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1487 add r1, r1, #1
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1488 push {r0, r1, r4, r11, lr}
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1489 sub r1, r1, #1
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1490 b \type\()_h264_qpel16_mc11
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1491 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1492
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc02" entry point.
 * Saves r4 and lr, runs lowpass_const, then calls
 * \type\()_h264_qpel16_v_lowpass_neon with
 *   r1 = r1 - 2 * r2
 *   r3 = r2
 * d8-d15 are saved and restored around the call.
 * NOTE(review): presumably r1 = src and r2 = line stride - confirm.
 */
1493 function ff_\type\()_h264_qpel16_mc02_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1494 push {r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1495 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r1 -= 2 * r2 */
1496 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1497 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1498 vpush {d8-d15}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1499 bl \type\()_h264_qpel16_v_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1500 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Restore r4 and return by popping the saved lr into pc. */
1501 pop {r4, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1502 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1503
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc12" entry point and shared body.
 * mc32 pushes the same register list and jumps to the
 * \type\()_h264_qpel16_mc12 label with r1 incremented by 1.
 *
 * Two passes:
 *   1. put_h264_qpel16_v_lowpass_neon_packed is called with r0 pointing at
 *      a 16-byte aligned stack area of 16*16 + 16*12 bytes,
 *      r1 = r1 - 2 * r2 and r3 = r2.
 *   2. \type\()_h264_qpel16_hv_lowpass_l2_neon is called with r0/r1
 *      reloaded from the stack, r1 moved back by 2 * r3 + 2, r2 = r3 and
 *      r4 = the value r0 held when the first helper returned.
 * After the first call the code uses r3, not r2, as the row step.
 * NOTE(review): presumably the first helper preserves r3 but not r2 -
 * confirm against the helper, which is outside this excerpt.
 */
1504 function ff_\type\()_h264_qpel16_mc12_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r0 and r1 are stacked so they can be reloaded with ldrd after pass 1. */
1505 push {r0, r1, r4-r5, r9-r11, lr}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1506 \type\()_h264_qpel16_mc12:
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1507 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r11 remembers the unaligned sp (pointing at the saved r0/r1). */
1508 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Round sp down to a 16-byte boundary. */
1509 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1510 sub sp, sp, #(16*16+16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r1 -= 2 * r2 */
1511 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r0 = base of the stack scratch area. */
1512 mov r0, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1513 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1514 vpush {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1515 bl put_h264_qpel16_v_lowpass_neon_packed
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Keep r0 as left by the helper before r0 is reloaded below. */
1516 mov r4, r0
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Reload the r0/r1 values stacked by the entry point's push. */
1517 ldrd r0, [r11]
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r1 -= 2 * r3, then r1 -= 2 */
1518 sub r1, r1, r3, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1519 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1520 mov r2, r3
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1521 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1522 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Drop the scratch area and the two stacked words (r0, r1). */
1523 add sp, r11, #8
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1524 pop {r4-r5, r9-r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1525 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1526
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc22" entry point.
 * Calls \type\()_h264_qpel16_hv_lowpass_neon with
 *   r1 = r1 - 2 * r2 - 2
 *   r3 = r2
 *   r4 = base of a 16-byte aligned, 16*12-byte scratch area on the stack
 * r0 and r1 are not stacked here, so sp is restored straight from r11.
 * NOTE(review): presumably r1 = src and r2 = line stride - confirm.
 */
1527 function ff_\type\()_h264_qpel16_mc22_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1528 push {r4, r9-r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1529 lowpass_const r3
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r11 remembers the unaligned sp for the epilogue. */
1530 mov r11, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Round sp down to a 16-byte boundary. */
1531 bic sp, sp, #15
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r1 -= 2 * r2, then r1 -= 2 */
1532 sub r1, r1, r2, lsl #1
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1533 sub r1, r1, #2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1534 mov r3, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1535 sub sp, sp, #(16*12)
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* r4 = base of the scratch area just reserved. */
1536 mov r4, sp
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1537 vpush {d8-d15}
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1538 bl \type\()_h264_qpel16_hv_lowpass_neon
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1539 vpop {d8-d15}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
/* Drop the scratch area and undo the alignment in one step. */
1540 mov sp, r11
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1541 pop {r4, r9-r11, pc}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1542 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1543
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc32" entry point.
 * Pushes the same register list as mc12, increments r1 by 1 and jumps to
 * the shared \type\()_h264_qpel16_mc12 body. The push comes first, so the
 * stacked r1 (reloaded by the shared body for its second pass) is the
 * unmodified value; only the first pass sees r1 + 1.
 */
1544 function ff_\type\()_h264_qpel16_mc32_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1545 push {r0, r1, r4-r5, r9-r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1546 add r1, r1, #1
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1547 b \type\()_h264_qpel16_mc12
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1548 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1549
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc03" entry point.
 * Pushes the same registers as mc01, sets ip = r1 + r2 (mc01 uses ip = r1)
 * and jumps to the shared \type\()_h264_qpel16_mc01 body, which pops
 * {r4, pc} to return.
 * NOTE(review): presumably ip is the second source for the "_l2" helper,
 * here taken one row lower - confirm against the helper.
 */
1550 function ff_\type\()_h264_qpel16_mc03_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1551 push {r4, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1552 add ip, r1, r2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1553 b \type\()_h264_qpel16_mc01
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1554 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1555
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc13" entry point.
 * Pushes the same register list as mc11, adds r2 to r1 and jumps to the
 * shared \type\()_h264_qpel16_mc11 body. The push comes first, so the
 * stacked r1 (reloaded by the shared body for its second pass) is the
 * unmodified value; only the first pass sees r1 + r2.
 * NOTE(review): presumably r1 = src and r2 = line stride - confirm.
 */
1556 function ff_\type\()_h264_qpel16_mc13_neon, export=1
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1557 push {r0, r1, r4, r11, lr}
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1558 add r1, r1, r2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1559 b \type\()_h264_qpel16_mc11
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1560 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1561
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc23" entry point.
 * Pushes the same register list as mc21, adds r2 to r1 and jumps to the
 * shared \type\()_h264_qpel16_mc21 body. The push comes first, so the
 * stacked r1 (reloaded by the shared body for its second pass) is the
 * unmodified value; only the first pass sees r1 + r2.
 * NOTE(review): presumably r1 = src and r2 = line stride - confirm.
 */
1562 function ff_\type\()_h264_qpel16_mc23_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1563 push {r0, r1, r4-r5, r9-r11, lr}
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1564 add r1, r1, r2
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1565 b \type\()_h264_qpel16_mc21
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1566 .endfunc
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1567
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * qpel16 "mc33" entry point.
 * r1 is incremented by 1 before the push, so the stacked copy (reloaded by
 * the shared mc11 body for its second pass) is r1 + 1. After the push, r2
 * is added and the +1 is removed, so the first pass of the shared
 * \type\()_h264_qpel16_mc11 body sees (original r1 + r2).
 * NOTE(review): presumably r1 = src and r2 = line stride - confirm.
 */
1568 function ff_\type\()_h264_qpel16_mc33_neon, export=1
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1569 add r1, r1, #1
10385
bc98e5724513 ARM: align stack in NEON h264 mc functions
mru
parents: 10349
diff changeset
1570 push {r0, r1, r4, r11, lr}
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1571 add r1, r1, r2
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1572 sub r1, r1, #1
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1573 b \type\()_h264_qpel16_mc11
8338
b294a0d5bc50 ARM: NEON optimised H.264 8x8 and 16x16 qpel MC
mru
parents: 8337
diff changeset
1574 .endfunc
10616
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1575 .endm
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1576
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
/*
 * Expand the h264_qpel16 macro twice, with \type bound to "put" and then
 * to "avg", emitting one full set of ff_<type>_h264_qpel16_mcXY_neon
 * functions for each.
 */
1577 h264_qpel16 put
d3b98479ef62 ARM: NEON 16x16 and 8x8 avg qpel MC
mru
parents: 10385
diff changeset
1578 h264_qpel16 avg
8663
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1579
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1580 @ Biweighted prediction
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1581
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/*
 * biweight_16 macs, macd
 * Loop body for 16-byte-wide rows, two rows per iteration.
 *   macs, macd - instruction mnemonics; macd is applied to the data loaded
 *                through r0 (with d0), macs to the data loaded through r1
 *                (with d1).
 * Registers read, all set up by the invoking code outside this excerpt:
 *   r0, r1 - load pointers, post-incremented by r2 after every row
 *   r6     - store pointer, post-incremented by r2 after every row
 *   r4, r5 - low bytes are broadcast into d0 and d1
 *   ip     - row counter, decremented by 2 per iteration
 *   q8     - initial value of every 16-bit accumulator
 *   q9     - per-lane shift counts for vshl.s16
 * The macro ends with pop {r4-r6, pc}, so it also returns from the
 * enclosing function.
 * NOTE(review): presumably the invoking function pushed {r4-r6, lr}, q8 is
 * a rounding offset, q9 holds negative counts (right shift), and ip is a
 * positive multiple of 2 - confirm against the caller.
 */
1582 .macro biweight_16 macs, macd
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1583 vdup.8 d0, r4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1584 vdup.8 d1, r5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Seed the first row's accumulators (q2 = low 8 pixels, q3 = high 8). */
1585 vmov q2, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1586 vmov q3, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* The flags set here are the ones consumed by the bne at the loop end. */
1587 1: subs ip, ip, #2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* First row: 16 bytes via r0, then 16 bytes via r1. */
1588 vld1.8 {d20-d21},[r0,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1589 \macd q2, d0, d20
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1590 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1591 \macd q3, d0, d21
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1592 vld1.8 {d22-d23},[r1,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1593 \macs q2, d1, d22
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1594 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1595 \macs q3, d1, d23
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Second row: accumulators q12/q13, seeded from q8 as well. */
1596 vmov q12, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1597 vld1.8 {d28-d29},[r0,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1598 vmov q13, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1599 \macd q12, d0, d28
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1600 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1601 \macd q13, d0, d29
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1602 vld1.8 {d30-d31},[r1,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1603 \macs q12, d1, d30
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1604 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1605 \macs q13, d1, d31
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Shift each 16-bit sum by q9, then narrow to unsigned 8-bit with saturation. */
1606 vshl.s16 q2, q2, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1607 vshl.s16 q3, q3, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1608 vqmovun.s16 d4, q2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1609 vqmovun.s16 d5, q3
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1610 vshl.s16 q12, q12, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1611 vshl.s16 q13, q13, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1612 vqmovun.s16 d24, q12
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1613 vqmovun.s16 d25, q13
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Re-seed q3 now; q2 (= d4/d5) still holds the row that is stored next. */
1614 vmov q3, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1615 vst1.8 {d4- d5}, [r6,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1616 vmov q2, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1617 vst1.8 {d24-d25},[r6,:128], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Loop until the subs at label 1 has counted ip down to zero. */
1618 bne 1b
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1619 pop {r4-r6, pc}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1620 .endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1621
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/*
 * biweight_8 macs, macd
 * Loop body for 8-byte-wide rows, two rows per iteration.
 *   macs, macd - instruction mnemonics; macd is applied to the data loaded
 *                through r0 (with d0), macs to the data loaded through r1
 *                (with d1).
 * Registers read, all set up by the invoking code outside this excerpt:
 *   r0, r1 - load pointers, post-incremented by r2 after every row
 *   r6     - store pointer, post-incremented by r2 after every row
 *   r4, r5 - low bytes are broadcast into d0 and d1
 *   ip     - row counter, decremented by 2 per iteration
 *   q8     - initial value of every 16-bit accumulator
 *   q9     - per-lane shift counts for vshl.s16
 * The macro ends with pop {r4-r6, pc}, so it also returns from the
 * enclosing function.
 * NOTE(review): presumably the invoking function pushed {r4-r6, lr}, q8 is
 * a rounding offset, q9 holds negative counts (right shift), and ip is a
 * positive multiple of 2 - confirm against the caller.
 */
1622 .macro biweight_8 macs, macd
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1623 vdup.8 d0, r4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1624 vdup.8 d1, r5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Seed the accumulators: q1 for the first row, q10 for the second. */
1625 vmov q1, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1626 vmov q10, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* The flags set here are the ones consumed by the bne at the loop end. */
1627 1: subs ip, ip, #2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* First row: 8 bytes via r0, then 8 bytes via r1. */
1628 vld1.8 {d4},[r0,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1629 \macd q1, d0, d4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1630 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1631 vld1.8 {d5},[r1,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1632 \macs q1, d1, d5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1633 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Second row. */
1634 vld1.8 {d6},[r0,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1635 \macd q10, d0, d6
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1636 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1637 vld1.8 {d7},[r1,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1638 \macs q10, d1, d7
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1639 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Shift each 16-bit sum by q9, then narrow to unsigned 8-bit with saturation. */
1640 vshl.s16 q1, q1, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1641 vqmovun.s16 d2, q1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1642 vshl.s16 q10, q10, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* d4 is reused for the narrowed second row; its loaded data is no longer needed. */
1643 vqmovun.s16 d4, q10
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Re-seed the accumulators for the next iteration around the stores. */
1644 vmov q10, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1645 vst1.8 {d2},[r6,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1646 vmov q1, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1647 vst1.8 {d4},[r6,:64], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
/* Loop until the subs at label 1 has counted ip down to zero. */
1648 bne 1b
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1649 pop {r4-r6, pc}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1650 .endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1651
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1652 .macro biweight_4 macs, macd
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1653 vdup.8 d0, r4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1654 vdup.8 d1, r5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1655 vmov q1, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1656 vmov q10, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1657 1: subs ip, ip, #4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1658 vld1.32 {d4[0]},[r0,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1659 vld1.32 {d4[1]},[r0,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1660 \macd q1, d0, d4
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1661 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1662 vld1.32 {d5[0]},[r1,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1663 vld1.32 {d5[1]},[r1,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1664 \macs q1, d1, d5
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1665 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1666 blt 2f
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1667 vld1.32 {d6[0]},[r0,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1668 vld1.32 {d6[1]},[r0,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1669 \macd q10, d0, d6
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1670 pld [r0]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1671 vld1.32 {d7[0]},[r1,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1672 vld1.32 {d7[1]},[r1,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1673 \macs q10, d1, d7
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1674 pld [r1]
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1675 vshl.s16 q1, q1, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1676 vqmovun.s16 d2, q1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1677 vshl.s16 q10, q10, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1678 vqmovun.s16 d4, q10
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1679 vmov q10, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1680 vst1.32 {d2[0]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1681 vst1.32 {d2[1]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1682 vmov q1, q8
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1683 vst1.32 {d4[0]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1684 vst1.32 {d4[1]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1685 bne 1b
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1686 pop {r4-r6, pc}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1687 2: vshl.s16 q1, q1, q9
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1688 vqmovun.s16 d2, q1
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1689 vst1.32 {d2[0]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1690 vst1.32 {d2[1]},[r6,:32], r2
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1691 pop {r4-r6, pc}
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1692 .endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1693
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
@ Shared body behind every ff_biweight_h264_pixels_<w>x<h>_neon entry point.
@
@ On entry:
@   r0, r1  the two pixel pointers read by the biweight_<w> loops
@   r2      row stride (post-increment applied to r0, r1 and r6)
@   r3      shift amount
@   ip      number of rows, loaded by biweight_entry
@   stack   three further word arguments, fetched into r4, r5, r6
@ NOTE(review): r4/r5 look like the two signed weights and r6 like the
@ rounding offset of the usual biweight prototype -- confirm against the
@ C caller.
@
@ The multiply-accumulate instructions work on unsigned bytes, so the sign
@ of each weight is folded into the choice of vmlal (add) or vmlsl
@ (subtract) and the weight itself is made non-negative before the loop
@ macro is expanded.  Each expansion returns on its own (biweight_4 does so
@ with pop {r4-r6, pc}), so the four cases never run into each other.
.macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16           @ step over the four words just pushed
        ldm             r4,  {r4-r6}            @ fetch the three stack arguments

        @ q9 = ~r3 in every 16-bit lane, that is -(r3 + 1); the loop macros
        @ pass it to vshl.s16, where a negative count shifts right.
        vdup.16         q9,  r3
        vmvn            q9,  q9

        @ q8 = ((r6 + 1) | 1) << r3 in every lane: start value of each
        @ accumulator.
        add             r6,  r6,  #1
        orr             r6,  r6,  #1
        lsl             r6,  r6,  r3
        vdup.16         q8,  r6
        mov             r6,  r0                 @ r6 is the store pointer from here on

        @ Selector from the top bits of r4 and r5.  Provided a negative r5
        @ also has bit 30 set (any value >= -2^30):
        @   0  r4 >= 0, r5 >= 0        1  r4 <  0, r5 >= 0
        @   2  r4 <  0, r5 <  0        3  r4 >= 0, r5 <  0
        @ eors is the only flag-setting instruction before the first beq.
        lsr             lr,  r4,  #31
        eors            lr,  lr,  r5,  lsr #30
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f

        @ Macro arguments are (macs, macd): macs is applied with d1 (from r5),
        @ macd with d0.
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0            @ r4 = -r4
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0            @ r5 = -r5
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
        .endfunc
.endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1724
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
@ Emit the exported symbol ff_biweight_h264_pixels_<w>x<h>_neon.
@
@   w  block width, selects the shared body biweight_h264_pixels_<w>_neon
@   h  block height, loaded into ip as the row counter
@   b  non-zero (default): finish with a branch to the shared body
@      zero: emit no branch; the entry then has to be placed directly in
@      front of "biweight_func <w>" so that execution continues into it.
@      NOTE(review): this relies on the function/.endfunc helpers (defined
@      elsewhere) inserting no padding between the two symbols.
.macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
        .endfunc
.endm
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
1733
23f7711e777e ARM: NEON optimised H.264 biweighted prediction
mru
parents: 8626
diff changeset
@ Instantiate the biweight entry points.  Within each width group the last
@ entry is emitted with b=0 and must stay immediately in front of the
@ matching biweight_func, because it continues into that body without a
@ branch.  Do not reorder the final two lines of a group.

        @ 16 pixels wide: 16x8, 16x16
        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        @ 8 pixels wide: 8x16, 8x4, 8x8
        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        @ 4 pixels wide: 4x8, 4x2, 4x4
        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4
8664
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1747
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1748 @ Weighted prediction
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1749
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
@ Weighting loop for 16-pixel-wide blocks, two rows per iteration.
@
@   add  NEON instruction that combines q8 with each 16-bit product
@        (weight_func passes vhadd/vhsub/vadd/vsub .s16)
@
@ Expects, as set up by weight_func and weight_entry:
@   r0  read pointer, 128-bit aligned     r4  write pointer (copy of r0)
@   r1  row stride                        r3  non-negative byte multiplier
@   ip  rows remaining                    q8  per-lane addend
@   q9  per-lane shift count for vrshl (negative means rounding right shift)
@ Leaves through pop {r4, pc}, which undoes the push done in weight_func.
@
@ The instruction order is hand-interleaved (loads, multiplies and
@ prefetches overlap); keep it as is.
.macro  weight_16 add
        vdup.8          d0,  r3                 @ multiplier in all eight byte lanes
1:      subs            ip,  ip,  #2            @ flags are consumed by bne at the bottom
        vld1.8          {d20-d21},[r0,:128], r1 @ first row
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1 @ second row
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2                 @ saturate to unsigned bytes
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4-d5},  [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
.endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1778
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
@ Weighting loop for 8-pixel-wide blocks, two rows per iteration.
@
@   add  NEON instruction that combines q8 with each 16-bit product
@
@ Register contract is the same as for weight_16: r0 read pointer (64-bit
@ aligned here), r4 write pointer, r1 stride, r3 non-negative byte
@ multiplier, ip rows remaining, q8 addend, q9 shift count for vrshl.
@ Leaves through pop {r4, pc}.
@
@ The second row is loaded and multiplied while the first one is still
@ being finished; keep the interleaving as is.
.macro  weight_8 add
        vdup.8          d0,  r3                 @ multiplier in all eight byte lanes
1:      subs            ip,  ip,  #2            @ flags are consumed by bne at the bottom
        vld1.8          {d4},[r0,:64], r1       @ first row
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1       @ second row
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1                 @ saturate to unsigned bytes
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
.endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1799
9072
d56b711c6c5d ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents: 8664
diff changeset
@ Weighting loop for 4-pixel-wide blocks.  Two 4-byte rows are packed into
@ one d register, so a full iteration handles four rows; a block of only
@ two rows takes the short tail at label 2.
@
@   add  NEON instruction that combines q8 with each 16-bit product
@
@ Expects, as set up by weight_func and weight_entry:
@   r0  read pointer, 32-bit aligned      r4  write pointer (copy of r0)
@   r1  row stride                        r3  non-negative byte multiplier
@   ip  rows remaining                    q8  per-lane addend
@   q9  per-lane shift count for vrshl (negative means rounding right shift)
@ Leaves through pop {r4, pc}.
@
@ q1 and q10 are NOT pre-loaded with q8 here: vmull.u8 overwrites its whole
@ destination (unlike the accumulating vmlal), and q8 is applied afterwards
@ by the add instruction, so seeding the registers would be a dead store.
.macro  weight_4 add
        vdup.8          d0,  r3                 @ multiplier in all eight byte lanes
1:      subs            ip,  ip,  #4            @ flags feed both blt and the final bne
        vld1.32         {d4[0]},[r0,:32], r1    @ rows 0 and 1 -> d4
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f                      @ fewer than four rows were left
        vld1.32         {d6[0]},[r0,:32], r1    @ rows 2 and 3 -> d6
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1                 @ saturate to unsigned bytes
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}

        @ Tail: only the two rows already multiplied into q1 remain.
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
.endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1835
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
@ Shared body behind every ff_weight_h264_pixels_<w>x<h>_neon entry point.
@
@ On entry:
@   r0      pixel block, read and rewritten in place (r4 gets a copy to
@           serve as the write pointer)
@   r1      row stride
@   r2      shift amount
@   r3      signed multiplier
@   ip      number of rows, loaded by weight_entry
@   [sp]    one further word argument, shifted left by r2 into q8
@ NOTE(review): r2/r3/[sp] look like log2_denom, weight and offset of the
@ usual weight prototype -- confirm against the C caller.
@
@ The loop macros multiply unsigned bytes, so a negative r3 is negated and
@ the subtracting variant of the combine instruction is used instead.
@ Each weight_<w> expansion returns by itself with pop {r4, pc}.
.macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]           @ first stack argument (two words were pushed)
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4                 @ q8 = stack argument << r2, every lane
        mov             r4,  r0                 @ r4 = write pointer
        cmp             r2,  #1
        ble             20f

        @ r2 > 1: combine with a halving add/sub and round-shift right by
        @ the remaining r2 - 1 bits.  The blame text attributes this split
        @ to a corner-case overflow fix; presumably the halving keeps the
        @ intermediate sum inside 16 bits.
        rsb             lr,  r2,  #1            @ lr = 1 - r2
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0            @ r3 = -r3
        weight_\w       vhsub.s16

        @ r2 <= 1: plain add/sub, then round-shift right by r2.
20:     rsb             lr,  r2,  #0            @ lr = -r2
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             30f
        weight_\w       vadd.s16
30:     rsb             r3,  r3,  #0            @ r3 = -r3
        weight_\w       vsub.s16
        .endfunc
.endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1861
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
@ Emit the exported symbol ff_weight_h264_pixels_<w>x<h>_neon.
@
@   w  block width, selects the shared body weight_h264_pixels_<w>_neon
@   h  block height, loaded into ip as the row counter
@   b  non-zero (default): finish with a branch to the shared body
@      zero: emit no branch; the entry then has to be placed directly in
@      front of "weight_func <w>" so that execution continues into it.
@      NOTE(review): this relies on the function/.endfunc helpers (defined
@      elsewhere) inserting no padding between the two symbols.
.macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
        .endfunc
.endm
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
1870
882c351e69c2 ARM: NEON optimised H.264 weighted prediction
mru
parents: 8663
diff changeset
@ Instantiate the weight entry points.  Within each width group the last
@ entry is emitted with b=0 and must stay immediately in front of the
@ matching weight_func, because it continues into that body without a
@ branch.  Do not reorder the final two lines of a group.

        @ 16 pixels wide: 16x8, 16x16
        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        @ 8 pixels wide: 8x16, 8x4, 8x8
        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        @ 4 pixels wide: 4x8, 4x2, 4x4
        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4